@goshenkata/dryscan-core 1.2.5 → 1.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,13 +1,7 @@
1
- var __defProp = Object.defineProperty;
2
- var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
3
- var __decorateClass = (decorators, target, key, kind) => {
4
- var result = kind > 1 ? void 0 : kind ? __getOwnPropDesc(target, key) : target;
5
- for (var i = decorators.length - 1, decorator; i >= 0; i--)
6
- if (decorator = decorators[i])
7
- result = (kind ? decorator(target, key, result) : decorator(result)) || result;
8
- if (kind && result) __defProp(target, key, result);
9
- return result;
10
- };
1
+ import {
2
+ __decorateClass,
3
+ parallelCosineSimilarity
4
+ } from "./chunk-ZZC4V5LV.js";
11
5
 
12
6
  // src/DryScan.ts
13
7
  import upath6 from "upath";
@@ -61,7 +55,7 @@ var DEFAULT_CONFIG = {
61
55
  excludedPairs: [],
62
56
  minLines: 3,
63
57
  minBlockLines: 5,
64
- threshold: 0.88,
58
+ threshold: 0.8,
65
59
  embeddingSource: "http://localhost:11434",
66
60
  contextLength: 2048
67
61
  };
@@ -244,7 +238,8 @@ var JavaExtractor = class {
244
238
  const fnUnit = this.buildFunctionUnit(node, source, fileRelPath, currentClass);
245
239
  const fnLength = fnUnit.endLine - fnUnit.startLine;
246
240
  const bodyNode = this.getFunctionBody(node);
247
- const skipFunction = this.shouldSkip("function" /* FUNCTION */, fnUnit.name, fnLength);
241
+ const fnArity = this.getNodeArity(node);
242
+ const skipFunction = this.shouldSkip("function" /* FUNCTION */, fnUnit.name, fnLength, fnArity);
248
243
  if (skipFunction) {
249
244
  return;
250
245
  }
@@ -311,22 +306,34 @@ var JavaExtractor = class {
311
306
  const normalized = this.normalizeCode(unit.code);
312
307
  return crypto.createHash(BLOCK_HASH_ALGO).update(normalized).digest("hex");
313
308
  }
314
- shouldSkip(unitType, name, lineCount) {
309
+ shouldSkip(unitType, name, lineCount, arity) {
315
310
  if (!this.config) {
316
311
  throw new Error("Config not loaded before skip evaluation");
317
312
  }
318
313
  const config = this.config;
319
314
  const minLines = unitType === "block" /* BLOCK */ ? Math.max(indexConfig.blockMinLines, config.minBlockLines ?? 0) : config.minLines;
320
315
  const belowMin = minLines > 0 && lineCount < minLines;
321
- const trivial = unitType === "function" /* FUNCTION */ && this.isTrivialFunction(name);
316
+ const trivial = unitType === "function" /* FUNCTION */ && this.isTrivialFunction(name, arity ?? 0);
322
317
  return belowMin || trivial;
323
318
  }
324
- isTrivialFunction(fullName) {
319
+ /**
320
+ * A function is trivial if it follows a simple accessor pattern:
321
+ * - getters/isers: name matches get[A-Z] or is[A-Z] with exactly 0 parameters
322
+ * - setters: name matches set[A-Z] with at most 1 parameter
323
+ * Methods like getUserById(Long id) have arity > 0 and are NOT trivial.
324
+ */
325
+ isTrivialFunction(fullName, arity) {
325
326
  const simpleName = fullName.split(".").pop() || fullName;
326
- const isGetter = /^(get|is)[A-Z]/.test(simpleName);
327
- const isSetter = /^set[A-Z]/.test(simpleName);
327
+ const isGetter = /^(get|is)[A-Z]/.test(simpleName) && arity === 0;
328
+ const isSetter = /^set[A-Z]/.test(simpleName) && arity <= 1;
328
329
  return isGetter || isSetter;
329
330
  }
331
+ /** Counts the formal parameters of a method or constructor node. */
332
+ getNodeArity(node) {
333
+ const params = node.childForFieldName?.("parameters");
334
+ if (!params) return 0;
335
+ return params.namedChildren.filter((c) => c.type === "formal_parameter" || c.type === "spread_parameter").length;
336
+ }
330
337
  isDtoClass(node, source, className) {
331
338
  const classBody = node.children.find((child) => child.type === "class_body");
332
339
  if (!classBody) return false;
@@ -344,7 +351,8 @@ var JavaExtractor = class {
344
351
  if (child.type === "method_declaration" || child.type === "constructor_declaration") {
345
352
  const simpleName = this.getSimpleFunctionName(child, source);
346
353
  const fullName = `${className}.${simpleName}`;
347
- if (!this.isTrivialFunction(fullName)) {
354
+ const arity = this.getNodeArity(child);
355
+ if (!this.isTrivialFunction(fullName, arity)) {
348
356
  return false;
349
357
  }
350
358
  continue;
@@ -1021,7 +1029,7 @@ var RepositoryInitializer = class {
1021
1029
  };
1022
1030
 
1023
1031
  // src/services/UpdateService.ts
1024
- import debug4 from "debug";
1032
+ import debug5 from "debug";
1025
1033
 
1026
1034
  // src/DryScanUpdater.ts
1027
1035
  import path4 from "path";
@@ -1141,11 +1149,19 @@ async function performIncrementalUpdate(repoPath, extractor, db) {
1141
1149
  }
1142
1150
 
1143
1151
  // src/services/DuplicationCache.ts
1152
+ import debug4 from "debug";
1153
+ var log4 = debug4("DryScan:DuplicationCache");
1144
1154
  var DuplicationCache = class _DuplicationCache {
1145
1155
  static instance = null;
1146
1156
  comparisons = /* @__PURE__ */ new Map();
1147
1157
  fileIndex = /* @__PURE__ */ new Map();
1148
1158
  initialized = false;
1159
+ /** Per-run similarity matrix from a single batched library call (reset each run). */
1160
+ embSimMatrix = [];
1161
+ /** Maps unit ID to its row/column index in embSimMatrix. */
1162
+ embSimIndex = /* @__PURE__ */ new Map();
1163
+ /** Per-run memoization of parent unit similarity scores (reset each run). */
1164
+ parentSimCache = /* @__PURE__ */ new Map();
1149
1165
  static getInstance() {
1150
1166
  if (!_DuplicationCache.instance) {
1151
1167
  _DuplicationCache.instance = new _DuplicationCache();
@@ -1208,6 +1224,84 @@ var DuplicationCache = class _DuplicationCache {
1208
1224
  this.comparisons.clear();
1209
1225
  this.fileIndex.clear();
1210
1226
  this.initialized = false;
1227
+ this.embSimMatrix = [];
1228
+ this.embSimIndex.clear();
1229
+ this.clearRunCaches();
1230
+ }
1231
+ /**
1232
+ * Resets per-run memoization (parent similarities).
1233
+ * The embedding matrix is intentionally preserved so incremental runs can
1234
+ * reuse clean×clean values across calls.
1235
+ */
1236
+ clearRunCaches() {
1237
+ this.parentSimCache.clear();
1238
+ }
1239
+ /**
1240
+ * Builds or incrementally updates the embedding similarity matrix.
1241
+ *
1242
+ * Full rebuild (default): replaces the entire matrix — O(n²).
1243
+ * Incremental (dirtyPaths provided + prior matrix exists): copies clean×clean
1244
+ * cells from the old matrix and recomputes only dirty rows via one batched
1245
+ * cosineSimilarity call — O(d·n) where d = number of dirty units.
1246
+ */
1247
+ async buildEmbSimCache(units, dirtyPaths) {
1248
+ const embedded = units.filter((u) => Array.isArray(u.embedding) && u.embedding.length > 0);
1249
+ if (embedded.length < 2) {
1250
+ this.embSimMatrix = [];
1251
+ this.embSimIndex.clear();
1252
+ return;
1253
+ }
1254
+ const embeddings = embedded.map((u) => u.embedding);
1255
+ const newIndex = new Map(embedded.map((u, i) => [u.id, i]));
1256
+ const dirtySet = dirtyPaths ? new Set(dirtyPaths) : null;
1257
+ const hasPriorMatrix = this.embSimMatrix.length > 0;
1258
+ if (!dirtySet || !hasPriorMatrix) {
1259
+ this.embSimIndex = newIndex;
1260
+ this.embSimMatrix = await parallelCosineSimilarity(embeddings, embeddings);
1261
+ log4("Built full embedding similarity matrix: %d units", embedded.length);
1262
+ return;
1263
+ }
1264
+ const dirtyIds = new Set(embedded.filter((u) => dirtySet.has(u.filePath)).map((u) => u.id));
1265
+ if (dirtyIds.size === 0) {
1266
+ log4("Matrix reused: no dirty units detected");
1267
+ return;
1268
+ }
1269
+ const n = embedded.length;
1270
+ const newMatrix = Array.from({ length: n }, () => new Array(n).fill(0));
1271
+ for (let i = 0; i < n; i++) {
1272
+ for (let j = 0; j < n; j++) {
1273
+ if (dirtyIds.has(embedded[i].id) || dirtyIds.has(embedded[j].id)) continue;
1274
+ const oi = this.embSimIndex.get(embedded[i].id);
1275
+ const oj = this.embSimIndex.get(embedded[j].id);
1276
+ if (oi !== void 0 && oj !== void 0) newMatrix[i][j] = this.embSimMatrix[oi][oj];
1277
+ }
1278
+ }
1279
+ const dirtyIndices = embedded.reduce((acc, u, i) => dirtyIds.has(u.id) ? [...acc, i] : acc, []);
1280
+ const dirtyRows = await parallelCosineSimilarity(dirtyIndices.map((i) => embeddings[i]), embeddings);
1281
+ dirtyIndices.forEach((rowIdx, di) => {
1282
+ for (let j = 0; j < n; j++) {
1283
+ newMatrix[rowIdx][j] = dirtyRows[di][j];
1284
+ newMatrix[j][rowIdx] = dirtyRows[di][j];
1285
+ }
1286
+ });
1287
+ this.embSimIndex = newIndex;
1288
+ this.embSimMatrix = newMatrix;
1289
+ log4("Incremental matrix update: %d dirty unit(s) out of %d total", dirtyIds.size, n);
1290
+ }
1291
+ /** Returns the pre-computed cosine similarity for a pair of unit IDs, if available. */
1292
+ getEmbSim(id1, id2) {
1293
+ const i = this.embSimIndex.get(id1);
1294
+ const j = this.embSimIndex.get(id2);
1295
+ if (i === void 0 || j === void 0) return void 0;
1296
+ return this.embSimMatrix[i][j];
1297
+ }
1298
+ /** Returns the memoized parent similarity for the given stable key, if available. */
1299
+ getParentSim(key) {
1300
+ return this.parentSimCache.get(key);
1301
+ }
1302
+ /** Stores a memoized parent similarity for the given stable key. */
1303
+ setParentSim(key, sim) {
1304
+ this.parentSimCache.set(key, sim);
1211
1305
  }
1212
1306
  addKeyForFile(filePath, key) {
1213
1307
  const current = this.fileIndex.get(filePath) ?? /* @__PURE__ */ new Set();
@@ -1224,145 +1318,106 @@ var DuplicationCache = class _DuplicationCache {
1224
1318
  };
1225
1319
 
1226
1320
  // src/services/UpdateService.ts
1227
- var log4 = debug4("DryScan:UpdateService");
1321
+ var log5 = debug5("DryScan:UpdateService");
1228
1322
  var UpdateService = class {
1229
1323
  constructor(deps, exclusionService) {
1230
1324
  this.deps = deps;
1231
1325
  this.exclusionService = exclusionService;
1232
1326
  }
1327
+ /** Returns the list of file paths that were modified or deleted (dirty). */
1233
1328
  async updateIndex() {
1234
1329
  const extractor = this.deps.extractor;
1235
1330
  const cache = DuplicationCache.getInstance();
1236
1331
  try {
1237
1332
  const changeSet = await performIncrementalUpdate(this.deps.repoPath, extractor, this.deps.db);
1238
1333
  await this.exclusionService.cleanupExcludedFiles();
1239
- await cache.invalidate([...changeSet.changed, ...changeSet.deleted]);
1334
+ const dirtyPaths = [...changeSet.changed, ...changeSet.deleted, ...changeSet.added];
1335
+ await cache.invalidate(dirtyPaths);
1336
+ return dirtyPaths;
1240
1337
  } catch (err) {
1241
- log4("Error during index update:", err);
1338
+ log5("Error during index update:", err);
1242
1339
  throw err;
1243
1340
  }
1244
1341
  }
1245
1342
  };
1246
1343
 
1247
1344
  // src/services/DuplicateService.ts
1248
- import debug5 from "debug";
1345
+ import debug6 from "debug";
1249
1346
  import shortUuid from "short-uuid";
1250
- import { cosineSimilarity } from "@langchain/core/utils/math";
1251
- var log5 = debug5("DryScan:DuplicateService");
1347
+ var log6 = debug6("DryScan:DuplicateService");
1252
1348
  var DuplicateService = class {
1253
1349
  constructor(deps) {
1254
1350
  this.deps = deps;
1255
1351
  }
1256
1352
  config;
1257
1353
  cache = DuplicationCache.getInstance();
1258
- //todo vetter optimisation
1259
- async findDuplicates(config) {
1354
+ /**
1355
+ * @param dirtyPaths - File paths changed since last run. When provided, only
1356
+ * dirty×all similarities are recomputed; clean×clean values are reused from
1357
+ * the existing matrix. Pass undefined (or omit) for a full rebuild.
1358
+ */
1359
+ async findDuplicates(config, dirtyPaths) {
1260
1360
  this.config = config;
1261
1361
  const t0 = performance.now();
1262
1362
  const allUnits = await this.deps.db.getAllUnits();
1263
- log5("Starting duplicate analysis on %d units", allUnits.length);
1363
+ log6("Starting duplicate analysis on %d units", allUnits.length);
1264
1364
  if (allUnits.length < 2) {
1265
- log5("Not enough units to compare, returning empty result");
1266
- const score2 = this.computeDuplicationScore([], allUnits);
1267
- return { duplicates: [], score: score2 };
1365
+ return { duplicates: [], score: this.computeDuplicationScore([], allUnits) };
1268
1366
  }
1269
1367
  const thresholds = this.resolveThresholds(config.threshold);
1270
- log5("Resolved thresholds: function=%d, block=%d, class=%d", thresholds.function, thresholds.block, thresholds.class);
1271
- const duplicates = this.computeDuplicates(allUnits, thresholds);
1272
- const filteredDuplicates = duplicates.filter((group) => !this.isGroupExcluded(group));
1273
- log5("Found %d duplicate groups (%d excluded)", filteredDuplicates.length, duplicates.length - filteredDuplicates.length);
1274
- this.cache.update(filteredDuplicates).catch((err) => log5("Cache update failed: %O", err));
1275
- const score = this.computeDuplicationScore(filteredDuplicates, allUnits);
1276
- log5("findDuplicates completed in %dms", (performance.now() - t0).toFixed(2));
1277
- return { duplicates: filteredDuplicates, score };
1368
+ const duplicates = await this.computeDuplicates(allUnits, thresholds, dirtyPaths);
1369
+ const filtered = duplicates.filter((g) => !this.isGroupExcluded(g));
1370
+ log6("Found %d duplicate groups (%d excluded)", filtered.length, duplicates.length - filtered.length);
1371
+ this.cache.update(filtered).catch((err) => log6("Cache update failed: %O", err));
1372
+ const score = this.computeDuplicationScore(filtered, allUnits);
1373
+ log6("findDuplicates completed in %dms", (performance.now() - t0).toFixed(2));
1374
+ return { duplicates: filtered, score };
1278
1375
  }
1279
1376
  resolveThresholds(functionThreshold) {
1280
- const defaults = indexConfig.thresholds;
1281
- const clamp = (value) => Math.min(1, Math.max(0, value));
1282
- const base = functionThreshold ?? defaults.function;
1283
- const blockOffset = defaults.block - defaults.function;
1284
- const classOffset = defaults.class - defaults.function;
1285
- const functionThresholdValue = clamp(base);
1377
+ const d = indexConfig.thresholds;
1378
+ const clamp = (v) => Math.min(1, Math.max(0, v));
1379
+ const fn = clamp(functionThreshold ?? d.function);
1286
1380
  return {
1287
- function: functionThresholdValue,
1288
- block: clamp(functionThresholdValue + blockOffset),
1289
- class: clamp(functionThresholdValue + classOffset)
1381
+ function: fn,
1382
+ block: clamp(fn + d.block - d.function),
1383
+ class: clamp(fn + d.class - d.function)
1290
1384
  };
1291
1385
  }
1292
- computeDuplicates(units, thresholds) {
1386
+ async computeDuplicates(units, thresholds, dirtyPaths) {
1387
+ this.cache.clearRunCaches();
1388
+ await this.cache.buildEmbSimCache(units, dirtyPaths);
1293
1389
  const duplicates = [];
1294
- const byType = /* @__PURE__ */ new Map();
1295
- for (const unit of units) {
1296
- const list = byType.get(unit.unitType) ?? [];
1297
- list.push(unit);
1298
- byType.set(unit.unitType, list);
1299
- }
1300
1390
  const t0 = performance.now();
1301
- for (const [type, typedUnits] of byType.entries()) {
1391
+ for (const [type, typedUnits] of this.groupByType(units)) {
1302
1392
  const threshold = this.getThreshold(type, thresholds);
1303
- log5("Comparing %d units of type '%s' with threshold %d", typedUnits.length, type, threshold);
1304
- const typeStart = performance.now();
1393
+ log6("Comparing %d %s units (threshold=%.3f)", typedUnits.length, type, threshold);
1305
1394
  for (let i = 0; i < typedUnits.length; i++) {
1306
1395
  for (let j = i + 1; j < typedUnits.length; j++) {
1307
- const left = typedUnits[i];
1308
- const right = typedUnits[j];
1309
- if (this.shouldSkipComparison(left, right)) {
1310
- log5("Skipping nested block comparison: '%s' and '%s'", left.name, right.name);
1311
- continue;
1312
- }
1396
+ const left = typedUnits[i], right = typedUnits[j];
1397
+ if (this.shouldSkipComparison(left, right)) continue;
1313
1398
  const cached = this.cache.get(left.id, right.id, left.filePath, right.filePath);
1314
- let similarity = null;
1315
- if (cached !== null) {
1316
- log5("Cache hit for '%s' <-> '%s': similarity=%d", left.name, right.name, cached);
1317
- similarity = cached;
1318
- } else {
1319
- if (!left.embedding || !right.embedding) {
1320
- log5("Skipping '%s' <-> '%s': missing embedding", left.name, right.name);
1321
- continue;
1322
- }
1323
- similarity = this.computeWeightedSimilarity(left, right);
1324
- log5("Computed similarity for '%s' <-> '%s': %d", left.name, right.name, similarity);
1325
- }
1326
- if (similarity === null) continue;
1327
- if (similarity >= threshold) {
1328
- const exclusionString = this.deps.pairing.pairKeyForUnits(left, right);
1329
- if (!exclusionString) continue;
1330
- log5("Duplicate found: '%s' <-> '%s' (similarity=%d)", left.name, right.name, similarity);
1331
- duplicates.push({
1332
- id: `${left.id}::${right.id}`,
1333
- similarity,
1334
- shortId: shortUuid.generate(),
1335
- exclusionString,
1336
- left: {
1337
- id: left.id,
1338
- name: left.name,
1339
- filePath: left.filePath,
1340
- startLine: left.startLine,
1341
- endLine: left.endLine,
1342
- code: left.code,
1343
- unitType: left.unitType
1344
- },
1345
- right: {
1346
- id: right.id,
1347
- name: right.name,
1348
- filePath: right.filePath,
1349
- startLine: right.startLine,
1350
- endLine: right.endLine,
1351
- code: right.code,
1352
- unitType: right.unitType
1353
- }
1354
- });
1355
- }
1399
+ const hasEmbeddings = left.embedding?.length && right.embedding?.length;
1400
+ const similarity = cached ?? (hasEmbeddings ? this.computeWeightedSimilarity(left, right, threshold) : 0);
1401
+ if (similarity < threshold) continue;
1402
+ const exclusionString = this.deps.pairing.pairKeyForUnits(left, right);
1403
+ if (!exclusionString) continue;
1404
+ duplicates.push({
1405
+ id: `${left.id}::${right.id}`,
1406
+ similarity,
1407
+ shortId: shortUuid.generate(),
1408
+ exclusionString,
1409
+ left: this.toMember(left),
1410
+ right: this.toMember(right)
1411
+ });
1356
1412
  }
1357
1413
  }
1358
- log5("Type '%s' comparisons completed in %dms", type, (performance.now() - typeStart).toFixed(2));
1359
1414
  }
1360
- log5("computeDuplicates completed in %dms, found %d raw duplicates", (performance.now() - t0).toFixed(2), duplicates.length);
1415
+ log6("computeDuplicates: %d duplicates in %dms", duplicates.length, (performance.now() - t0).toFixed(2));
1361
1416
  return duplicates.sort((a, b) => b.similarity - a.similarity);
1362
1417
  }
1363
1418
  isGroupExcluded(group) {
1364
1419
  const config = this.config;
1365
- if (!config || !config.excludedPairs || config.excludedPairs.length === 0) return false;
1420
+ if (!config?.excludedPairs?.length) return false;
1366
1421
  const key = this.deps.pairing.pairKeyForUnits(group.left, group.right);
1367
1422
  if (!key) return false;
1368
1423
  const actual = this.deps.pairing.parsePairKey(key);
@@ -1377,109 +1432,108 @@ var DuplicateService = class {
1377
1432
  if (type === "block" /* BLOCK */) return thresholds.block;
1378
1433
  return thresholds.function;
1379
1434
  }
1380
- computeWeightedSimilarity(left, right) {
1381
- const selfSimilarity = this.similarityWithFallback(left, right);
1435
+ computeWeightedSimilarity(left, right, threshold) {
1436
+ const selfSim = this.similarity(left, right);
1382
1437
  if (left.unitType === "class" /* CLASS */) {
1383
- return selfSimilarity * indexConfig.weights.class.self;
1438
+ return selfSim * indexConfig.weights.class.self;
1384
1439
  }
1385
1440
  if (left.unitType === "function" /* FUNCTION */) {
1386
- const weights2 = indexConfig.weights.function;
1387
- const hasParentClass2 = !!this.findParentOfType(left, "class" /* CLASS */) && !!this.findParentOfType(right, "class" /* CLASS */);
1388
- const parentClassSimilarity = hasParentClass2 ? this.parentSimilarity(left, right, "class" /* CLASS */) : 0;
1389
- const totalWeight2 = weights2.self + (hasParentClass2 ? weights2.parentClass : 0);
1390
- return (weights2.self * selfSimilarity + (hasParentClass2 ? weights2.parentClass * parentClassSimilarity : 0)) / totalWeight2;
1441
+ const w2 = indexConfig.weights.function;
1442
+ const hasPC2 = this.bothHaveParent(left, right, "class" /* CLASS */);
1443
+ const total2 = w2.self + (hasPC2 ? w2.parentClass : 0);
1444
+ if ((w2.self * selfSim + (hasPC2 ? w2.parentClass : 0)) / total2 < threshold) return 0;
1445
+ return (w2.self * selfSim + (hasPC2 ? w2.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total2;
1391
1446
  }
1392
- const weights = indexConfig.weights.block;
1393
- const hasParentFunction = !!this.findParentOfType(left, "function" /* FUNCTION */) && !!this.findParentOfType(right, "function" /* FUNCTION */);
1394
- const hasParentClass = !!this.findParentOfType(left, "class" /* CLASS */) && !!this.findParentOfType(right, "class" /* CLASS */);
1395
- const parentFuncSim = hasParentFunction ? this.parentSimilarity(left, right, "function" /* FUNCTION */) : 0;
1396
- const parentClassSim = hasParentClass ? this.parentSimilarity(left, right, "class" /* CLASS */) : 0;
1397
- const totalWeight = weights.self + (hasParentFunction ? weights.parentFunction : 0) + (hasParentClass ? weights.parentClass : 0);
1398
- return (weights.self * selfSimilarity + (hasParentFunction ? weights.parentFunction * parentFuncSim : 0) + (hasParentClass ? weights.parentClass * parentClassSim : 0)) / totalWeight;
1399
- }
1400
- parentSimilarity(left, right, targetType) {
1401
- const leftParent = this.findParentOfType(left, targetType);
1402
- const rightParent = this.findParentOfType(right, targetType);
1403
- if (!leftParent || !rightParent) return 0;
1404
- return this.similarityWithFallback(leftParent, rightParent);
1405
- }
1406
- similarityWithFallback(left, right) {
1407
- const leftHasEmbedding = this.hasVector(left);
1408
- const rightHasEmbedding = this.hasVector(right);
1409
- if (leftHasEmbedding && rightHasEmbedding) {
1410
- return cosineSimilarity([left.embedding], [right.embedding])[0][0];
1447
+ const w = indexConfig.weights.block;
1448
+ const hasPF = this.bothHaveParent(left, right, "function" /* FUNCTION */);
1449
+ const hasPC = this.bothHaveParent(left, right, "class" /* CLASS */);
1450
+ const total = w.self + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0);
1451
+ if ((w.self * selfSim + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0)) / total < threshold) return 0;
1452
+ return (w.self * selfSim + (hasPF ? w.parentFunction * this.parentSimilarity(left, right, "function" /* FUNCTION */) : 0) + (hasPC ? w.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total;
1453
+ }
1454
+ /** Groups all units by type for the comparison loop. Units without embeddings are included
1455
+ * so that cache hits can still be returned for pairs whose embeddings were cleared. */
1456
+ groupByType(units) {
1457
+ const byType = /* @__PURE__ */ new Map();
1458
+ for (const unit of units) {
1459
+ const list = byType.get(unit.unitType) ?? [];
1460
+ list.push(unit);
1461
+ byType.set(unit.unitType, list);
1411
1462
  }
1412
- return this.childSimilarity(left, right);
1463
+ return byType;
1464
+ }
1465
+ toMember(unit) {
1466
+ return {
1467
+ id: unit.id,
1468
+ name: unit.name,
1469
+ filePath: unit.filePath,
1470
+ startLine: unit.startLine,
1471
+ endLine: unit.endLine,
1472
+ code: unit.code,
1473
+ unitType: unit.unitType
1474
+ };
1475
+ }
1476
+ bothHaveParent(left, right, type) {
1477
+ return !!this.findParent(left, type) && !!this.findParent(right, type);
1478
+ }
1479
+ parentSimilarity(left, right, type) {
1480
+ const lp = this.findParent(left, type), rp = this.findParent(right, type);
1481
+ if (!lp || !rp) return 0;
1482
+ const key = lp.id < rp.id ? `${lp.id}::${rp.id}` : `${rp.id}::${lp.id}`;
1483
+ const cached = this.cache.getParentSim(key);
1484
+ if (cached !== void 0) return cached;
1485
+ const sim = this.similarity(lp, rp);
1486
+ this.cache.setParentSim(key, sim);
1487
+ return sim;
1488
+ }
1489
+ /** Resolves similarity via the pre-computed embedding matrix, falling back to best child match. */
1490
+ similarity(left, right) {
1491
+ return this.cache.getEmbSim(left.id, right.id) ?? this.childSimilarity(left, right);
1413
1492
  }
1414
1493
  childSimilarity(left, right) {
1415
- const leftChildren = left.children ?? [];
1416
- const rightChildren = right.children ?? [];
1417
- if (leftChildren.length === 0 || rightChildren.length === 0) return 0;
1494
+ const lc = left.children ?? [], rc = right.children ?? [];
1495
+ if (!lc.length || !rc.length) return 0;
1418
1496
  let best = 0;
1419
- for (const lChild of leftChildren) {
1420
- for (const rChild of rightChildren) {
1421
- if (lChild.unitType !== rChild.unitType) continue;
1422
- const sim = this.similarityWithFallback(lChild, rChild);
1497
+ for (const l of lc) {
1498
+ for (const r of rc) {
1499
+ if (l.unitType !== r.unitType) continue;
1500
+ const sim = this.similarity(l, r);
1423
1501
  if (sim > best) best = sim;
1424
1502
  }
1425
1503
  }
1426
1504
  return best;
1427
1505
  }
1428
- hasVector(unit) {
1429
- return Array.isArray(unit.embedding) && unit.embedding.length > 0;
1430
- }
1431
1506
  shouldSkipComparison(left, right) {
1432
- if (left.unitType !== "block" /* BLOCK */ || right.unitType !== "block" /* BLOCK */) {
1433
- return false;
1434
- }
1435
- if (left.filePath !== right.filePath) {
1436
- return false;
1437
- }
1438
- const leftContainsRight = left.startLine <= right.startLine && left.endLine >= right.endLine;
1439
- const rightContainsLeft = right.startLine <= left.startLine && right.endLine >= left.endLine;
1440
- return leftContainsRight || rightContainsLeft;
1441
- }
1442
- findParentOfType(unit, targetType) {
1443
- let current = unit.parent;
1444
- while (current) {
1445
- if (current.unitType === targetType) return current;
1446
- current = current.parent;
1507
+ if (left.unitType !== "block" /* BLOCK */ || right.unitType !== "block" /* BLOCK */) return false;
1508
+ if (left.filePath !== right.filePath) return false;
1509
+ return left.startLine <= right.startLine && left.endLine >= right.endLine || right.startLine <= left.startLine && right.endLine >= left.endLine;
1510
+ }
1511
+ findParent(unit, type) {
1512
+ let p = unit.parent;
1513
+ while (p) {
1514
+ if (p.unitType === type) return p;
1515
+ p = p.parent;
1447
1516
  }
1448
1517
  return null;
1449
1518
  }
1450
1519
  computeDuplicationScore(duplicates, allUnits) {
1451
- const totalLines = this.calculateTotalLines(allUnits);
1452
- if (totalLines === 0 || duplicates.length === 0) {
1453
- return {
1454
- score: 0,
1455
- grade: "Excellent",
1456
- totalLines,
1457
- duplicateLines: 0,
1458
- duplicateGroups: 0
1459
- };
1520
+ const totalLines = allUnits.reduce((sum, u) => sum + u.endLine - u.startLine + 1, 0);
1521
+ if (!totalLines || !duplicates.length) {
1522
+ return { score: 0, grade: "Excellent", totalLines, duplicateLines: 0, duplicateGroups: 0 };
1460
1523
  }
1461
- const weightedDuplicateLines = duplicates.reduce((sum, group) => {
1462
- const leftLines = group.left.endLine - group.left.startLine + 1;
1463
- const rightLines = group.right.endLine - group.right.startLine + 1;
1464
- const avgLines = (leftLines + rightLines) / 2;
1465
- return sum + group.similarity * avgLines;
1524
+ const duplicateLines = duplicates.reduce((sum, g) => {
1525
+ const avg = (g.left.endLine - g.left.startLine + 1 + (g.right.endLine - g.right.startLine + 1)) / 2;
1526
+ return sum + g.similarity * avg;
1466
1527
  }, 0);
1467
- const score = weightedDuplicateLines / totalLines * 100;
1468
- const grade = this.getScoreGrade(score);
1528
+ const score = duplicateLines / totalLines * 100;
1469
1529
  return {
1470
1530
  score,
1471
- grade,
1531
+ grade: this.getScoreGrade(score),
1472
1532
  totalLines,
1473
- duplicateLines: Math.round(weightedDuplicateLines),
1533
+ duplicateLines: Math.round(duplicateLines),
1474
1534
  duplicateGroups: duplicates.length
1475
1535
  };
1476
1536
  }
1477
- calculateTotalLines(units) {
1478
- return units.reduce((sum, unit) => {
1479
- const lines = unit.endLine - unit.startLine + 1;
1480
- return sum + lines;
1481
- }, 0);
1482
- }
1483
1537
  getScoreGrade(score) {
1484
1538
  if (score < 5) return "Excellent";
1485
1539
  if (score < 15) return "Good";
@@ -1575,9 +1629,9 @@ var ExclusionService = class {
1575
1629
 
1576
1630
  // src/services/PairingService.ts
1577
1631
  import crypto3 from "crypto";
1578
- import debug6 from "debug";
1632
+ import debug7 from "debug";
1579
1633
  import { minimatch as minimatch2 } from "minimatch";
1580
- var log6 = debug6("DryScan:pairs");
1634
+ var log7 = debug7("DryScan:pairs");
1581
1635
  var PairingService = class {
1582
1636
  constructor(indexUnitExtractor) {
1583
1637
  this.indexUnitExtractor = indexUnitExtractor;
@@ -1588,7 +1642,7 @@ var PairingService = class {
1588
1642
  */
1589
1643
  pairKeyForUnits(left, right) {
1590
1644
  if (left.unitType !== right.unitType) {
1591
- log6("Skipping pair with mismatched types: %s vs %s", left.unitType, right.unitType);
1645
+ log7("Skipping pair with mismatched types: %s vs %s", left.unitType, right.unitType);
1592
1646
  return null;
1593
1647
  }
1594
1648
  const type = left.unitType;
@@ -1604,13 +1658,13 @@ var PairingService = class {
1604
1658
  parsePairKey(value) {
1605
1659
  const parts = value.split("|");
1606
1660
  if (parts.length !== 3) {
1607
- log6("Invalid pair key format: %s", value);
1661
+ log7("Invalid pair key format: %s", value);
1608
1662
  return null;
1609
1663
  }
1610
1664
  const [typeRaw, leftRaw, rightRaw] = parts;
1611
1665
  const type = this.stringToUnitType(typeRaw);
1612
1666
  if (!type) {
1613
- log6("Unknown unit type in pair key: %s", typeRaw);
1667
+ log7("Unknown unit type in pair key: %s", typeRaw);
1614
1668
  return null;
1615
1669
  }
1616
1670
  const [left, right] = [leftRaw, rightRaw].sort();
@@ -1744,9 +1798,10 @@ var DryScan = class {
1744
1798
  console.log("[DryScan] Checking for file changes...");
1745
1799
  const start = Date.now();
1746
1800
  await this.ensureDatabase();
1747
- await this.services.updater.updateIndex();
1801
+ const dirtyPaths = await this.services.updater.updateIndex();
1748
1802
  const duration = Date.now() - start;
1749
1803
  console.log(`[DryScan] Index update complete. Took ${duration}ms.`);
1804
+ return dirtyPaths;
1750
1805
  }
1751
1806
  /**
1752
1807
  * Runs duplicate detection and returns a normalized report payload ready for persistence or display.
@@ -1775,12 +1830,12 @@ var DryScan = class {
1775
1830
  await this.ensureDatabase();
1776
1831
  console.log("[DryScan] Updating index...");
1777
1832
  const updateStart = Date.now();
1778
- await this.updateIndex();
1833
+ const dirtyPaths = await this.updateIndex();
1779
1834
  const updateDuration = Date.now() - updateStart;
1780
1835
  console.log(`[DryScan] Index update took ${updateDuration}ms.`);
1781
1836
  console.log("[DryScan] Detecting duplicates...");
1782
1837
  const dupStart = Date.now();
1783
- const result = await this.services.duplicate.findDuplicates(config);
1838
+ const result = await this.services.duplicate.findDuplicates(config, dirtyPaths);
1784
1839
  const dupDuration = Date.now() - dupStart;
1785
1840
  console.log(`[DryScan] Duplicate detection took ${dupDuration}ms.`);
1786
1841
  return result;