@goshenkata/dryscan-core 1.2.4 → 1.2.6

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -221,7 +221,7 @@ declare class DryScan {
221
221
  * 6. Recompute embeddings for affected units
222
222
  * 7. Update file tracking metadata
223
223
  */
224
- updateIndex(): Promise<void>;
224
+ updateIndex(): Promise<string[]>;
225
225
  /**
226
226
  * Runs duplicate detection and returns a normalized report payload ready for persistence or display.
227
227
  */
package/dist/index.js CHANGED
@@ -61,7 +61,7 @@ var DEFAULT_CONFIG = {
61
61
  excludedPairs: [],
62
62
  minLines: 3,
63
63
  minBlockLines: 5,
64
- threshold: 0.88,
64
+ threshold: 0.8,
65
65
  embeddingSource: "http://localhost:11434",
66
66
  contextLength: 2048
67
67
  };
@@ -244,7 +244,8 @@ var JavaExtractor = class {
244
244
  const fnUnit = this.buildFunctionUnit(node, source, fileRelPath, currentClass);
245
245
  const fnLength = fnUnit.endLine - fnUnit.startLine;
246
246
  const bodyNode = this.getFunctionBody(node);
247
- const skipFunction = this.shouldSkip("function" /* FUNCTION */, fnUnit.name, fnLength);
247
+ const fnArity = this.getNodeArity(node);
248
+ const skipFunction = this.shouldSkip("function" /* FUNCTION */, fnUnit.name, fnLength, fnArity);
248
249
  if (skipFunction) {
249
250
  return;
250
251
  }
@@ -311,22 +312,34 @@ var JavaExtractor = class {
311
312
  const normalized = this.normalizeCode(unit.code);
312
313
  return crypto.createHash(BLOCK_HASH_ALGO).update(normalized).digest("hex");
313
314
  }
314
- shouldSkip(unitType, name, lineCount) {
315
+ shouldSkip(unitType, name, lineCount, arity) {
315
316
  if (!this.config) {
316
317
  throw new Error("Config not loaded before skip evaluation");
317
318
  }
318
319
  const config = this.config;
319
320
  const minLines = unitType === "block" /* BLOCK */ ? Math.max(indexConfig.blockMinLines, config.minBlockLines ?? 0) : config.minLines;
320
321
  const belowMin = minLines > 0 && lineCount < minLines;
321
- const trivial = unitType === "function" /* FUNCTION */ && this.isTrivialFunction(name);
322
+ const trivial = unitType === "function" /* FUNCTION */ && this.isTrivialFunction(name, arity ?? 0);
322
323
  return belowMin || trivial;
323
324
  }
324
- isTrivialFunction(fullName) {
325
+ /**
326
+ * A function is trivial if it follows a simple accessor pattern:
327
+ * - getters/isers: name matches get[A-Z] or is[A-Z] with exactly 0 parameters
328
+ * - setters: name matches set[A-Z] with at most 1 parameter
329
+ * Methods like getUserById(Long id) have arity > 0 and are NOT trivial.
330
+ */
331
+ isTrivialFunction(fullName, arity) {
325
332
  const simpleName = fullName.split(".").pop() || fullName;
326
- const isGetter = /^(get|is)[A-Z]/.test(simpleName);
327
- const isSetter = /^set[A-Z]/.test(simpleName);
333
+ const isGetter = /^(get|is)[A-Z]/.test(simpleName) && arity === 0;
334
+ const isSetter = /^set[A-Z]/.test(simpleName) && arity <= 1;
328
335
  return isGetter || isSetter;
329
336
  }
337
+ /** Counts the formal parameters of a method or constructor node. */
338
+ getNodeArity(node) {
339
+ const params = node.childForFieldName?.("parameters");
340
+ if (!params) return 0;
341
+ return params.namedChildren.filter((c) => c.type === "formal_parameter" || c.type === "spread_parameter").length;
342
+ }
330
343
  isDtoClass(node, source, className) {
331
344
  const classBody = node.children.find((child) => child.type === "class_body");
332
345
  if (!classBody) return false;
@@ -344,7 +357,8 @@ var JavaExtractor = class {
344
357
  if (child.type === "method_declaration" || child.type === "constructor_declaration") {
345
358
  const simpleName = this.getSimpleFunctionName(child, source);
346
359
  const fullName = `${className}.${simpleName}`;
347
- if (!this.isTrivialFunction(fullName)) {
360
+ const arity = this.getNodeArity(child);
361
+ if (!this.isTrivialFunction(fullName, arity)) {
348
362
  return false;
349
363
  }
350
364
  continue;
@@ -368,6 +382,7 @@ var JavaExtractor = class {
368
382
  filePath: file,
369
383
  startLine,
370
384
  endLine,
385
+ children: [],
371
386
  code: this.stripComments(source.slice(node.startIndex, node.endIndex)),
372
387
  unitType: "function" /* FUNCTION */,
373
388
  parentId: parentClass?.id,
@@ -402,9 +417,11 @@ var JavaExtractor = class {
402
417
  parentId: parentFunction.id,
403
418
  parent: parentFunction
404
419
  };
420
+ const contextLength = this.config?.contextLength ?? 2048;
421
+ const splitBlocks = this.textSplitBlockIfOverContextLimit(blockUnit, contextLength);
405
422
  parentFunction.children = parentFunction.children || [];
406
- parentFunction.children.push(blockUnit);
407
- blocks.push(blockUnit);
423
+ parentFunction.children.push(...splitBlocks);
424
+ blocks.push(...splitBlocks);
408
425
  }
409
426
  }
410
427
  for (let i = 0; i < n.namedChildCount; i++) {
@@ -450,6 +467,21 @@ var JavaExtractor = class {
450
467
  removeDuplicates(units) {
451
468
  return Array.from(new Map(units.map((u) => [u.id, u])).values());
452
469
  }
470
+ /** Splits a block unit's code into chunks if it exceeds the context length limit. */
471
+ textSplitBlockIfOverContextLimit(unit, contextLength) {
472
+ if (unit.code.length <= contextLength) return [unit];
473
+ const chunks = [];
474
+ let chunkIndex = 0;
475
+ for (let i = 0; i < unit.code.length; i += contextLength) {
476
+ chunks.push({
477
+ ...unit,
478
+ id: `${unit.id}:chunk${chunkIndex}`,
479
+ code: unit.code.slice(i, i + contextLength)
480
+ });
481
+ chunkIndex++;
482
+ }
483
+ return chunks;
484
+ }
453
485
  };
454
486
 
455
487
  // src/Gitignore.ts
@@ -1003,7 +1035,7 @@ var RepositoryInitializer = class {
1003
1035
  };
1004
1036
 
1005
1037
  // src/services/UpdateService.ts
1006
- import debug4 from "debug";
1038
+ import debug5 from "debug";
1007
1039
 
1008
1040
  // src/DryScanUpdater.ts
1009
1041
  import path4 from "path";
@@ -1123,11 +1155,20 @@ async function performIncrementalUpdate(repoPath, extractor, db) {
1123
1155
  }
1124
1156
 
1125
1157
  // src/services/DuplicationCache.ts
1158
+ import debug4 from "debug";
1159
+ import { cosineSimilarity } from "@langchain/core/utils/math";
1160
+ var log4 = debug4("DryScan:DuplicationCache");
1126
1161
  var DuplicationCache = class _DuplicationCache {
1127
1162
  static instance = null;
1128
1163
  comparisons = /* @__PURE__ */ new Map();
1129
1164
  fileIndex = /* @__PURE__ */ new Map();
1130
1165
  initialized = false;
1166
+ /** Per-run similarity matrix from a single batched library call (reset each run). */
1167
+ embSimMatrix = [];
1168
+ /** Maps unit ID to its row/column index in embSimMatrix. */
1169
+ embSimIndex = /* @__PURE__ */ new Map();
1170
+ /** Per-run memoization of parent unit similarity scores (reset each run). */
1171
+ parentSimCache = /* @__PURE__ */ new Map();
1131
1172
  static getInstance() {
1132
1173
  if (!_DuplicationCache.instance) {
1133
1174
  _DuplicationCache.instance = new _DuplicationCache();
@@ -1190,6 +1231,84 @@ var DuplicationCache = class _DuplicationCache {
1190
1231
  this.comparisons.clear();
1191
1232
  this.fileIndex.clear();
1192
1233
  this.initialized = false;
1234
+ this.embSimMatrix = [];
1235
+ this.embSimIndex.clear();
1236
+ this.clearRunCaches();
1237
+ }
1238
+ /**
1239
+ * Resets per-run memoization (parent similarities).
1240
+ * The embedding matrix is intentionally preserved so incremental runs can
1241
+ * reuse clean×clean values across calls.
1242
+ */
1243
+ clearRunCaches() {
1244
+ this.parentSimCache.clear();
1245
+ }
1246
+ /**
1247
+ * Builds or incrementally updates the embedding similarity matrix.
1248
+ *
1249
+ * Full rebuild (default): replaces the entire matrix — O(n²).
1250
+ * Incremental (dirtyPaths provided + prior matrix exists): copies clean×clean
1251
+ * cells from the old matrix and recomputes only dirty rows via one batched
1252
+ * cosineSimilarity call — O(d·n) where d = number of dirty units.
1253
+ */
1254
+ buildEmbSimCache(units, dirtyPaths) {
1255
+ const embedded = units.filter((u) => Array.isArray(u.embedding) && u.embedding.length > 0);
1256
+ if (embedded.length < 2) {
1257
+ this.embSimMatrix = [];
1258
+ this.embSimIndex.clear();
1259
+ return;
1260
+ }
1261
+ const embeddings = embedded.map((u) => u.embedding);
1262
+ const newIndex = new Map(embedded.map((u, i) => [u.id, i]));
1263
+ const dirtySet = dirtyPaths ? new Set(dirtyPaths) : null;
1264
+ const hasPriorMatrix = this.embSimMatrix.length > 0;
1265
+ if (!dirtySet || !hasPriorMatrix) {
1266
+ this.embSimIndex = newIndex;
1267
+ this.embSimMatrix = cosineSimilarity(embeddings, embeddings);
1268
+ log4("Built full embedding similarity matrix: %d units", embedded.length);
1269
+ return;
1270
+ }
1271
+ const dirtyIds = new Set(embedded.filter((u) => dirtySet.has(u.filePath)).map((u) => u.id));
1272
+ if (dirtyIds.size === 0) {
1273
+ log4("Matrix reused: no dirty units detected");
1274
+ return;
1275
+ }
1276
+ const n = embedded.length;
1277
+ const newMatrix = Array.from({ length: n }, () => new Array(n).fill(0));
1278
+ for (let i = 0; i < n; i++) {
1279
+ for (let j = 0; j < n; j++) {
1280
+ if (dirtyIds.has(embedded[i].id) || dirtyIds.has(embedded[j].id)) continue;
1281
+ const oi = this.embSimIndex.get(embedded[i].id);
1282
+ const oj = this.embSimIndex.get(embedded[j].id);
1283
+ if (oi !== void 0 && oj !== void 0) newMatrix[i][j] = this.embSimMatrix[oi][oj];
1284
+ }
1285
+ }
1286
+ const dirtyIndices = embedded.reduce((acc, u, i) => dirtyIds.has(u.id) ? [...acc, i] : acc, []);
1287
+ const dirtyRows = cosineSimilarity(dirtyIndices.map((i) => embeddings[i]), embeddings);
1288
+ dirtyIndices.forEach((rowIdx, di) => {
1289
+ for (let j = 0; j < n; j++) {
1290
+ newMatrix[rowIdx][j] = dirtyRows[di][j];
1291
+ newMatrix[j][rowIdx] = dirtyRows[di][j];
1292
+ }
1293
+ });
1294
+ this.embSimIndex = newIndex;
1295
+ this.embSimMatrix = newMatrix;
1296
+ log4("Incremental matrix update: %d dirty unit(s) out of %d total", dirtyIds.size, n);
1297
+ }
1298
+ /** Returns the pre-computed cosine similarity for a pair of unit IDs, if available. */
1299
+ getEmbSim(id1, id2) {
1300
+ const i = this.embSimIndex.get(id1);
1301
+ const j = this.embSimIndex.get(id2);
1302
+ if (i === void 0 || j === void 0) return void 0;
1303
+ return this.embSimMatrix[i][j];
1304
+ }
1305
+ /** Returns the memoized parent similarity for the given stable key, if available. */
1306
+ getParentSim(key) {
1307
+ return this.parentSimCache.get(key);
1308
+ }
1309
+ /** Stores a memoized parent similarity for the given stable key. */
1310
+ setParentSim(key, sim) {
1311
+ this.parentSimCache.set(key, sim);
1193
1312
  }
1194
1313
  addKeyForFile(filePath, key) {
1195
1314
  const current = this.fileIndex.get(filePath) ?? /* @__PURE__ */ new Set();
@@ -1206,125 +1325,106 @@ var DuplicationCache = class _DuplicationCache {
1206
1325
  };
1207
1326
 
1208
1327
  // src/services/UpdateService.ts
1209
- var log4 = debug4("DryScan:UpdateService");
1328
+ var log5 = debug5("DryScan:UpdateService");
1210
1329
  var UpdateService = class {
1211
1330
  constructor(deps, exclusionService) {
1212
1331
  this.deps = deps;
1213
1332
  this.exclusionService = exclusionService;
1214
1333
  }
1334
+ /** Returns the list of file paths that were modified or deleted (dirty). */
1215
1335
  async updateIndex() {
1216
1336
  const extractor = this.deps.extractor;
1217
1337
  const cache = DuplicationCache.getInstance();
1218
1338
  try {
1219
1339
  const changeSet = await performIncrementalUpdate(this.deps.repoPath, extractor, this.deps.db);
1220
1340
  await this.exclusionService.cleanupExcludedFiles();
1221
- await cache.invalidate([...changeSet.changed, ...changeSet.deleted]);
1341
+ const dirtyPaths = [...changeSet.changed, ...changeSet.deleted, ...changeSet.added];
1342
+ await cache.invalidate(dirtyPaths);
1343
+ return dirtyPaths;
1222
1344
  } catch (err) {
1223
- log4("Error during index update:", err);
1345
+ log5("Error during index update:", err);
1224
1346
  throw err;
1225
1347
  }
1226
1348
  }
1227
1349
  };
1228
1350
 
1229
1351
  // src/services/DuplicateService.ts
1230
- import debug5 from "debug";
1352
+ import debug6 from "debug";
1231
1353
  import shortUuid from "short-uuid";
1232
- import { cosineSimilarity } from "@langchain/core/utils/math";
1233
- var log5 = debug5("DryScan:DuplicateService");
1354
+ var log6 = debug6("DryScan:DuplicateService");
1234
1355
  var DuplicateService = class {
1235
1356
  constructor(deps) {
1236
1357
  this.deps = deps;
1237
1358
  }
1238
1359
  config;
1239
1360
  cache = DuplicationCache.getInstance();
1240
- async findDuplicates(config) {
1361
+ /**
1362
+ * @param dirtyPaths - File paths changed since last run. When provided, only
1363
+ * dirty×all similarities are recomputed; clean×clean values are reused from
1364
+ * the existing matrix. Pass undefined (or omit) for a full rebuild.
1365
+ */
1366
+ async findDuplicates(config, dirtyPaths) {
1241
1367
  this.config = config;
1368
+ const t0 = performance.now();
1242
1369
  const allUnits = await this.deps.db.getAllUnits();
1370
+ log6("Starting duplicate analysis on %d units", allUnits.length);
1243
1371
  if (allUnits.length < 2) {
1244
- const score2 = this.computeDuplicationScore([], allUnits);
1245
- return { duplicates: [], score: score2 };
1372
+ return { duplicates: [], score: this.computeDuplicationScore([], allUnits) };
1246
1373
  }
1247
1374
  const thresholds = this.resolveThresholds(config.threshold);
1248
- const duplicates = this.computeDuplicates(allUnits, thresholds);
1249
- const filteredDuplicates = duplicates.filter((group) => !this.isGroupExcluded(group));
1250
- log5("Found %d duplicate groups", filteredDuplicates.length);
1251
- this.cache.update(filteredDuplicates).catch((err) => log5("Cache update failed: %O", err));
1252
- const score = this.computeDuplicationScore(filteredDuplicates, allUnits);
1253
- return { duplicates: filteredDuplicates, score };
1375
+ const duplicates = this.computeDuplicates(allUnits, thresholds, dirtyPaths);
1376
+ const filtered = duplicates.filter((g) => !this.isGroupExcluded(g));
1377
+ log6("Found %d duplicate groups (%d excluded)", filtered.length, duplicates.length - filtered.length);
1378
+ this.cache.update(filtered).catch((err) => log6("Cache update failed: %O", err));
1379
+ const score = this.computeDuplicationScore(filtered, allUnits);
1380
+ log6("findDuplicates completed in %dms", (performance.now() - t0).toFixed(2));
1381
+ return { duplicates: filtered, score };
1254
1382
  }
1255
1383
  resolveThresholds(functionThreshold) {
1256
- const defaults = indexConfig.thresholds;
1257
- const clamp = (value) => Math.min(1, Math.max(0, value));
1258
- const base = functionThreshold ?? defaults.function;
1259
- const blockOffset = defaults.block - defaults.function;
1260
- const classOffset = defaults.class - defaults.function;
1261
- const functionThresholdValue = clamp(base);
1384
+ const d = indexConfig.thresholds;
1385
+ const clamp = (v) => Math.min(1, Math.max(0, v));
1386
+ const fn = clamp(functionThreshold ?? d.function);
1262
1387
  return {
1263
- function: functionThresholdValue,
1264
- block: clamp(functionThresholdValue + blockOffset),
1265
- class: clamp(functionThresholdValue + classOffset)
1388
+ function: fn,
1389
+ block: clamp(fn + d.block - d.function),
1390
+ class: clamp(fn + d.class - d.function)
1266
1391
  };
1267
1392
  }
1268
- computeDuplicates(units, thresholds) {
1393
+ computeDuplicates(units, thresholds, dirtyPaths) {
1394
+ this.cache.clearRunCaches();
1395
+ this.cache.buildEmbSimCache(units, dirtyPaths);
1269
1396
  const duplicates = [];
1270
- const byType = /* @__PURE__ */ new Map();
1271
- for (const unit of units) {
1272
- const list = byType.get(unit.unitType) ?? [];
1273
- list.push(unit);
1274
- byType.set(unit.unitType, list);
1275
- }
1276
- for (const [type, typedUnits] of byType.entries()) {
1397
+ const t0 = performance.now();
1398
+ for (const [type, typedUnits] of this.groupByType(units)) {
1277
1399
  const threshold = this.getThreshold(type, thresholds);
1400
+ log6("Comparing %d %s units (threshold=%.3f)", typedUnits.length, type, threshold);
1278
1401
  for (let i = 0; i < typedUnits.length; i++) {
1279
1402
  for (let j = i + 1; j < typedUnits.length; j++) {
1280
- const left = typedUnits[i];
1281
- const right = typedUnits[j];
1403
+ const left = typedUnits[i], right = typedUnits[j];
1282
1404
  if (this.shouldSkipComparison(left, right)) continue;
1283
1405
  const cached = this.cache.get(left.id, right.id, left.filePath, right.filePath);
1284
- let similarity = null;
1285
- if (cached !== null) {
1286
- similarity = cached;
1287
- } else {
1288
- if (!left.embedding || !right.embedding) continue;
1289
- similarity = this.computeWeightedSimilarity(left, right);
1290
- }
1291
- if (similarity === null) continue;
1292
- if (similarity >= threshold) {
1293
- const exclusionString = this.deps.pairing.pairKeyForUnits(left, right);
1294
- if (!exclusionString) continue;
1295
- duplicates.push({
1296
- id: `${left.id}::${right.id}`,
1297
- similarity,
1298
- shortId: shortUuid.generate(),
1299
- exclusionString,
1300
- left: {
1301
- id: left.id,
1302
- name: left.name,
1303
- filePath: left.filePath,
1304
- startLine: left.startLine,
1305
- endLine: left.endLine,
1306
- code: left.code,
1307
- unitType: left.unitType
1308
- },
1309
- right: {
1310
- id: right.id,
1311
- name: right.name,
1312
- filePath: right.filePath,
1313
- startLine: right.startLine,
1314
- endLine: right.endLine,
1315
- code: right.code,
1316
- unitType: right.unitType
1317
- }
1318
- });
1319
- }
1406
+ const hasEmbeddings = left.embedding?.length && right.embedding?.length;
1407
+ const similarity = cached ?? (hasEmbeddings ? this.computeWeightedSimilarity(left, right, threshold) : 0);
1408
+ if (similarity < threshold) continue;
1409
+ const exclusionString = this.deps.pairing.pairKeyForUnits(left, right);
1410
+ if (!exclusionString) continue;
1411
+ duplicates.push({
1412
+ id: `${left.id}::${right.id}`,
1413
+ similarity,
1414
+ shortId: shortUuid.generate(),
1415
+ exclusionString,
1416
+ left: this.toMember(left),
1417
+ right: this.toMember(right)
1418
+ });
1320
1419
  }
1321
1420
  }
1322
1421
  }
1422
+ log6("computeDuplicates: %d duplicates in %dms", duplicates.length, (performance.now() - t0).toFixed(2));
1323
1423
  return duplicates.sort((a, b) => b.similarity - a.similarity);
1324
1424
  }
1325
1425
  isGroupExcluded(group) {
1326
1426
  const config = this.config;
1327
- if (!config || !config.excludedPairs || config.excludedPairs.length === 0) return false;
1427
+ if (!config?.excludedPairs?.length) return false;
1328
1428
  const key = this.deps.pairing.pairKeyForUnits(group.left, group.right);
1329
1429
  if (!key) return false;
1330
1430
  const actual = this.deps.pairing.parsePairKey(key);
@@ -1339,109 +1439,108 @@ var DuplicateService = class {
1339
1439
  if (type === "block" /* BLOCK */) return thresholds.block;
1340
1440
  return thresholds.function;
1341
1441
  }
1342
- computeWeightedSimilarity(left, right) {
1343
- const selfSimilarity = this.similarityWithFallback(left, right);
1442
+ computeWeightedSimilarity(left, right, threshold) {
1443
+ const selfSim = this.similarity(left, right);
1344
1444
  if (left.unitType === "class" /* CLASS */) {
1345
- return selfSimilarity * indexConfig.weights.class.self;
1445
+ return selfSim * indexConfig.weights.class.self;
1346
1446
  }
1347
1447
  if (left.unitType === "function" /* FUNCTION */) {
1348
- const weights2 = indexConfig.weights.function;
1349
- const hasParentClass2 = !!this.findParentOfType(left, "class" /* CLASS */) && !!this.findParentOfType(right, "class" /* CLASS */);
1350
- const parentClassSimilarity = hasParentClass2 ? this.parentSimilarity(left, right, "class" /* CLASS */) : 0;
1351
- const totalWeight2 = weights2.self + (hasParentClass2 ? weights2.parentClass : 0);
1352
- return (weights2.self * selfSimilarity + (hasParentClass2 ? weights2.parentClass * parentClassSimilarity : 0)) / totalWeight2;
1448
+ const w2 = indexConfig.weights.function;
1449
+ const hasPC2 = this.bothHaveParent(left, right, "class" /* CLASS */);
1450
+ const total2 = w2.self + (hasPC2 ? w2.parentClass : 0);
1451
+ if ((w2.self * selfSim + (hasPC2 ? w2.parentClass : 0)) / total2 < threshold) return 0;
1452
+ return (w2.self * selfSim + (hasPC2 ? w2.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total2;
1353
1453
  }
1354
- const weights = indexConfig.weights.block;
1355
- const hasParentFunction = !!this.findParentOfType(left, "function" /* FUNCTION */) && !!this.findParentOfType(right, "function" /* FUNCTION */);
1356
- const hasParentClass = !!this.findParentOfType(left, "class" /* CLASS */) && !!this.findParentOfType(right, "class" /* CLASS */);
1357
- const parentFuncSim = hasParentFunction ? this.parentSimilarity(left, right, "function" /* FUNCTION */) : 0;
1358
- const parentClassSim = hasParentClass ? this.parentSimilarity(left, right, "class" /* CLASS */) : 0;
1359
- const totalWeight = weights.self + (hasParentFunction ? weights.parentFunction : 0) + (hasParentClass ? weights.parentClass : 0);
1360
- return (weights.self * selfSimilarity + (hasParentFunction ? weights.parentFunction * parentFuncSim : 0) + (hasParentClass ? weights.parentClass * parentClassSim : 0)) / totalWeight;
1361
- }
1362
- parentSimilarity(left, right, targetType) {
1363
- const leftParent = this.findParentOfType(left, targetType);
1364
- const rightParent = this.findParentOfType(right, targetType);
1365
- if (!leftParent || !rightParent) return 0;
1366
- return this.similarityWithFallback(leftParent, rightParent);
1367
- }
1368
- similarityWithFallback(left, right) {
1369
- const leftHasEmbedding = this.hasVector(left);
1370
- const rightHasEmbedding = this.hasVector(right);
1371
- if (leftHasEmbedding && rightHasEmbedding) {
1372
- return cosineSimilarity([left.embedding], [right.embedding])[0][0];
1454
+ const w = indexConfig.weights.block;
1455
+ const hasPF = this.bothHaveParent(left, right, "function" /* FUNCTION */);
1456
+ const hasPC = this.bothHaveParent(left, right, "class" /* CLASS */);
1457
+ const total = w.self + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0);
1458
+ if ((w.self * selfSim + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0)) / total < threshold) return 0;
1459
+ return (w.self * selfSim + (hasPF ? w.parentFunction * this.parentSimilarity(left, right, "function" /* FUNCTION */) : 0) + (hasPC ? w.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total;
1460
+ }
1461
+ /** Groups all units by type for the comparison loop. Units without embeddings are included
1462
+ * so that cache hits can still be returned for pairs whose embeddings were cleared. */
1463
+ groupByType(units) {
1464
+ const byType = /* @__PURE__ */ new Map();
1465
+ for (const unit of units) {
1466
+ const list = byType.get(unit.unitType) ?? [];
1467
+ list.push(unit);
1468
+ byType.set(unit.unitType, list);
1373
1469
  }
1374
- return this.childSimilarity(left, right);
1470
+ return byType;
1471
+ }
1472
+ toMember(unit) {
1473
+ return {
1474
+ id: unit.id,
1475
+ name: unit.name,
1476
+ filePath: unit.filePath,
1477
+ startLine: unit.startLine,
1478
+ endLine: unit.endLine,
1479
+ code: unit.code,
1480
+ unitType: unit.unitType
1481
+ };
1482
+ }
1483
+ bothHaveParent(left, right, type) {
1484
+ return !!this.findParent(left, type) && !!this.findParent(right, type);
1485
+ }
1486
+ parentSimilarity(left, right, type) {
1487
+ const lp = this.findParent(left, type), rp = this.findParent(right, type);
1488
+ if (!lp || !rp) return 0;
1489
+ const key = lp.id < rp.id ? `${lp.id}::${rp.id}` : `${rp.id}::${lp.id}`;
1490
+ const cached = this.cache.getParentSim(key);
1491
+ if (cached !== void 0) return cached;
1492
+ const sim = this.similarity(lp, rp);
1493
+ this.cache.setParentSim(key, sim);
1494
+ return sim;
1495
+ }
1496
+ /** Resolves similarity via the pre-computed embedding matrix, falling back to best child match. */
1497
+ similarity(left, right) {
1498
+ return this.cache.getEmbSim(left.id, right.id) ?? this.childSimilarity(left, right);
1375
1499
  }
1376
1500
  childSimilarity(left, right) {
1377
- const leftChildren = left.children ?? [];
1378
- const rightChildren = right.children ?? [];
1379
- if (leftChildren.length === 0 || rightChildren.length === 0) return 0;
1501
+ const lc = left.children ?? [], rc = right.children ?? [];
1502
+ if (!lc.length || !rc.length) return 0;
1380
1503
  let best = 0;
1381
- for (const lChild of leftChildren) {
1382
- for (const rChild of rightChildren) {
1383
- if (lChild.unitType !== rChild.unitType) continue;
1384
- const sim = this.similarityWithFallback(lChild, rChild);
1504
+ for (const l of lc) {
1505
+ for (const r of rc) {
1506
+ if (l.unitType !== r.unitType) continue;
1507
+ const sim = this.similarity(l, r);
1385
1508
  if (sim > best) best = sim;
1386
1509
  }
1387
1510
  }
1388
1511
  return best;
1389
1512
  }
1390
- hasVector(unit) {
1391
- return Array.isArray(unit.embedding) && unit.embedding.length > 0;
1392
- }
1393
1513
  shouldSkipComparison(left, right) {
1394
- if (left.unitType !== "block" /* BLOCK */ || right.unitType !== "block" /* BLOCK */) {
1395
- return false;
1396
- }
1397
- if (left.filePath !== right.filePath) {
1398
- return false;
1399
- }
1400
- const leftContainsRight = left.startLine <= right.startLine && left.endLine >= right.endLine;
1401
- const rightContainsLeft = right.startLine <= left.startLine && right.endLine >= left.endLine;
1402
- return leftContainsRight || rightContainsLeft;
1403
- }
1404
- findParentOfType(unit, targetType) {
1405
- let current = unit.parent;
1406
- while (current) {
1407
- if (current.unitType === targetType) return current;
1408
- current = current.parent;
1514
+ if (left.unitType !== "block" /* BLOCK */ || right.unitType !== "block" /* BLOCK */) return false;
1515
+ if (left.filePath !== right.filePath) return false;
1516
+ return left.startLine <= right.startLine && left.endLine >= right.endLine || right.startLine <= left.startLine && right.endLine >= left.endLine;
1517
+ }
1518
+ findParent(unit, type) {
1519
+ let p = unit.parent;
1520
+ while (p) {
1521
+ if (p.unitType === type) return p;
1522
+ p = p.parent;
1409
1523
  }
1410
1524
  return null;
1411
1525
  }
1412
1526
  computeDuplicationScore(duplicates, allUnits) {
1413
- const totalLines = this.calculateTotalLines(allUnits);
1414
- if (totalLines === 0 || duplicates.length === 0) {
1415
- return {
1416
- score: 0,
1417
- grade: "Excellent",
1418
- totalLines,
1419
- duplicateLines: 0,
1420
- duplicateGroups: 0
1421
- };
1527
+ const totalLines = allUnits.reduce((sum, u) => sum + u.endLine - u.startLine + 1, 0);
1528
+ if (!totalLines || !duplicates.length) {
1529
+ return { score: 0, grade: "Excellent", totalLines, duplicateLines: 0, duplicateGroups: 0 };
1422
1530
  }
1423
- const weightedDuplicateLines = duplicates.reduce((sum, group) => {
1424
- const leftLines = group.left.endLine - group.left.startLine + 1;
1425
- const rightLines = group.right.endLine - group.right.startLine + 1;
1426
- const avgLines = (leftLines + rightLines) / 2;
1427
- return sum + group.similarity * avgLines;
1531
+ const duplicateLines = duplicates.reduce((sum, g) => {
1532
+ const avg = (g.left.endLine - g.left.startLine + 1 + (g.right.endLine - g.right.startLine + 1)) / 2;
1533
+ return sum + g.similarity * avg;
1428
1534
  }, 0);
1429
- const score = weightedDuplicateLines / totalLines * 100;
1430
- const grade = this.getScoreGrade(score);
1535
+ const score = duplicateLines / totalLines * 100;
1431
1536
  return {
1432
1537
  score,
1433
- grade,
1538
+ grade: this.getScoreGrade(score),
1434
1539
  totalLines,
1435
- duplicateLines: Math.round(weightedDuplicateLines),
1540
+ duplicateLines: Math.round(duplicateLines),
1436
1541
  duplicateGroups: duplicates.length
1437
1542
  };
1438
1543
  }
1439
- calculateTotalLines(units) {
1440
- return units.reduce((sum, unit) => {
1441
- const lines = unit.endLine - unit.startLine + 1;
1442
- return sum + lines;
1443
- }, 0);
1444
- }
1445
1544
  getScoreGrade(score) {
1446
1545
  if (score < 5) return "Excellent";
1447
1546
  if (score < 15) return "Good";
@@ -1537,9 +1636,9 @@ var ExclusionService = class {
1537
1636
 
1538
1637
  // src/services/PairingService.ts
1539
1638
  import crypto3 from "crypto";
1540
- import debug6 from "debug";
1639
+ import debug7 from "debug";
1541
1640
  import { minimatch as minimatch2 } from "minimatch";
1542
- var log6 = debug6("DryScan:pairs");
1641
+ var log7 = debug7("DryScan:pairs");
1543
1642
  var PairingService = class {
1544
1643
  constructor(indexUnitExtractor) {
1545
1644
  this.indexUnitExtractor = indexUnitExtractor;
@@ -1550,7 +1649,7 @@ var PairingService = class {
1550
1649
  */
1551
1650
  pairKeyForUnits(left, right) {
1552
1651
  if (left.unitType !== right.unitType) {
1553
- log6("Skipping pair with mismatched types: %s vs %s", left.unitType, right.unitType);
1652
+ log7("Skipping pair with mismatched types: %s vs %s", left.unitType, right.unitType);
1554
1653
  return null;
1555
1654
  }
1556
1655
  const type = left.unitType;
@@ -1566,13 +1665,13 @@ var PairingService = class {
1566
1665
  parsePairKey(value) {
1567
1666
  const parts = value.split("|");
1568
1667
  if (parts.length !== 3) {
1569
- log6("Invalid pair key format: %s", value);
1668
+ log7("Invalid pair key format: %s", value);
1570
1669
  return null;
1571
1670
  }
1572
1671
  const [typeRaw, leftRaw, rightRaw] = parts;
1573
1672
  const type = this.stringToUnitType(typeRaw);
1574
1673
  if (!type) {
1575
- log6("Unknown unit type in pair key: %s", typeRaw);
1674
+ log7("Unknown unit type in pair key: %s", typeRaw);
1576
1675
  return null;
1577
1676
  }
1578
1677
  const [left, right] = [leftRaw, rightRaw].sort();
@@ -1706,9 +1805,10 @@ var DryScan = class {
1706
1805
  console.log("[DryScan] Checking for file changes...");
1707
1806
  const start = Date.now();
1708
1807
  await this.ensureDatabase();
1709
- await this.services.updater.updateIndex();
1808
+ const dirtyPaths = await this.services.updater.updateIndex();
1710
1809
  const duration = Date.now() - start;
1711
1810
  console.log(`[DryScan] Index update complete. Took ${duration}ms.`);
1811
+ return dirtyPaths;
1712
1812
  }
1713
1813
  /**
1714
1814
  * Runs duplicate detection and returns a normalized report payload ready for persistence or display.
@@ -1737,12 +1837,12 @@ var DryScan = class {
1737
1837
  await this.ensureDatabase();
1738
1838
  console.log("[DryScan] Updating index...");
1739
1839
  const updateStart = Date.now();
1740
- await this.updateIndex();
1840
+ const dirtyPaths = await this.updateIndex();
1741
1841
  const updateDuration = Date.now() - updateStart;
1742
1842
  console.log(`[DryScan] Index update took ${updateDuration}ms.`);
1743
1843
  console.log("[DryScan] Detecting duplicates...");
1744
1844
  const dupStart = Date.now();
1745
- const result = await this.services.duplicate.findDuplicates(config);
1845
+ const result = await this.services.duplicate.findDuplicates(config, dirtyPaths);
1746
1846
  const dupDuration = Date.now() - dupStart;
1747
1847
  console.log(`[DryScan] Duplicate detection took ${dupDuration}ms.`);
1748
1848
  return result;