@goshenkata/dryscan-core 1.2.8 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/dist/chunk-EUXUH3YW.js +15 -0
  2. package/dist/chunk-EUXUH3YW.js.map +1 -0
  3. package/dist/index.d.ts +2 -0
  4. package/dist/index.js +306 -386
  5. package/dist/index.js.map +1 -1
  6. package/dist/services/ParallelSimilarity.js +37 -3
  7. package/dist/services/ParallelSimilarity.js.map +1 -1
  8. package/package.json +4 -1
  9. package/dist/chunk-ZZC4V5LV.js +0 -52
  10. package/dist/chunk-ZZC4V5LV.js.map +0 -1
  11. package/src/DryScan.ts +0 -166
  12. package/src/DryScanUpdater.ts +0 -236
  13. package/src/Gitignore.ts +0 -71
  14. package/src/IndexUnitExtractor.ts +0 -208
  15. package/src/config/configStore.ts +0 -55
  16. package/src/config/dryconfig.ts +0 -115
  17. package/src/config/indexConfig.ts +0 -13
  18. package/src/const.ts +0 -5
  19. package/src/db/DryScanDatabase.ts +0 -133
  20. package/src/db/entities/FileEntity.ts +0 -29
  21. package/src/db/entities/IndexUnitEntity.ts +0 -50
  22. package/src/extractors/LanguageExtractor.ts +0 -9
  23. package/src/extractors/java.ts +0 -376
  24. package/src/index.ts +0 -9
  25. package/src/services/DuplicateService.ts +0 -257
  26. package/src/services/DuplicationCache.ts +0 -210
  27. package/src/services/EmbeddingService.ts +0 -81
  28. package/src/services/ExclusionService.ts +0 -102
  29. package/src/services/PairingService.ts +0 -145
  30. package/src/services/ParallelSimilarity.ts +0 -59
  31. package/src/services/RepositoryInitializer.ts +0 -93
  32. package/src/services/UpdateService.ts +0 -31
  33. package/src/services/cosineSimilarityWorker.ts +0 -20
  34. package/src/services/types.ts +0 -10
  35. package/src/types/glob-gitignore.d.ts +0 -7
  36. package/src/types/short-uuid.d.ts +0 -7
  37. package/src/types/tree-sitter-langs.d.ts +0 -4
  38. package/src/types.ts +0 -76
  39. package/tsup.config.ts +0 -15
package/dist/index.js CHANGED
@@ -1,7 +1,6 @@
1
1
  import {
2
- __decorateClass,
3
- parallelCosineSimilarity
4
- } from "./chunk-ZZC4V5LV.js";
2
+ __decorateClass
3
+ } from "./chunk-EUXUH3YW.js";
5
4
 
6
5
  // src/DryScan.ts
7
6
  import upath6 from "upath";
@@ -10,6 +9,7 @@ import fs7 from "fs/promises";
10
9
  // src/const.ts
11
10
  var DRYSCAN_DIR = ".dry";
12
11
  var INDEX_DB = "index.db";
12
+ var REPORTS_DIR = "reports";
13
13
  var FILE_CHECKSUM_ALGO = "md5";
14
14
  var BLOCK_HASH_ALGO = "sha1";
15
15
 
@@ -1029,7 +1029,7 @@ var RepositoryInitializer = class {
1029
1029
  };
1030
1030
 
1031
1031
  // src/services/UpdateService.ts
1032
- import debug5 from "debug";
1032
+ import debug4 from "debug";
1033
1033
 
1034
1034
  // src/DryScanUpdater.ts
1035
1035
  import path4 from "path";
@@ -1148,177 +1148,8 @@ async function performIncrementalUpdate(repoPath, extractor, db) {
1148
1148
  return changeSet;
1149
1149
  }
1150
1150
 
1151
- // src/services/DuplicationCache.ts
1152
- import debug4 from "debug";
1153
- var log4 = debug4("DryScan:DuplicationCache");
1154
- var DuplicationCache = class _DuplicationCache {
1155
- static instance = null;
1156
- comparisons = /* @__PURE__ */ new Map();
1157
- fileIndex = /* @__PURE__ */ new Map();
1158
- initialized = false;
1159
- /** Per-run similarity matrix from a single batched library call (reset each run). */
1160
- embSimMatrix = [];
1161
- /** Maps unit ID to its row/column index in embSimMatrix. */
1162
- embSimIndex = /* @__PURE__ */ new Map();
1163
- /** Per-run memoization of parent unit similarity scores (reset each run). */
1164
- parentSimCache = /* @__PURE__ */ new Map();
1165
- static getInstance() {
1166
- if (!_DuplicationCache.instance) {
1167
- _DuplicationCache.instance = new _DuplicationCache();
1168
- }
1169
- return _DuplicationCache.instance;
1170
- }
1171
- /**
1172
- * Updates the cache with fresh duplicate groups. Not awaited by callers to avoid blocking.
1173
- */
1174
- async update(groups) {
1175
- if (!groups) return;
1176
- for (const group of groups) {
1177
- const key = this.makeKey(group.left.id, group.right.id);
1178
- this.comparisons.set(key, group.similarity);
1179
- this.addKeyForFile(group.left.filePath, key);
1180
- this.addKeyForFile(group.right.filePath, key);
1181
- }
1182
- this.initialized = this.initialized || groups.length > 0;
1183
- }
1184
- /**
1185
- * Retrieves a cached similarity if present and valid for both file paths.
1186
- * Returns null when the cache has not been initialized or when the pair is missing.
1187
- */
1188
- get(leftId, rightId, leftFilePath, rightFilePath) {
1189
- if (!this.initialized) return null;
1190
- const key = this.makeKey(leftId, rightId);
1191
- if (!this.fileHasKey(leftFilePath, key) || !this.fileHasKey(rightFilePath, key)) {
1192
- return null;
1193
- }
1194
- const value = this.comparisons.get(key);
1195
- return typeof value === "number" ? value : null;
1196
- }
1197
- /**
1198
- * Invalidates all cached comparisons involving the provided file paths.
1199
- */
1200
- async invalidate(paths) {
1201
- if (!this.initialized || !paths || paths.length === 0) return;
1202
- const unique = new Set(paths);
1203
- for (const filePath of unique) {
1204
- const keys = this.fileIndex.get(filePath);
1205
- if (!keys) continue;
1206
- for (const key of keys) {
1207
- this.comparisons.delete(key);
1208
- for (const [otherPath, otherKeys] of this.fileIndex.entries()) {
1209
- if (otherKeys.delete(key) && otherKeys.size === 0) {
1210
- this.fileIndex.delete(otherPath);
1211
- }
1212
- }
1213
- }
1214
- this.fileIndex.delete(filePath);
1215
- }
1216
- if (this.comparisons.size === 0) {
1217
- this.initialized = false;
1218
- }
1219
- }
1220
- /**
1221
- * Clears all cached data. Intended for test setup.
1222
- */
1223
- clear() {
1224
- this.comparisons.clear();
1225
- this.fileIndex.clear();
1226
- this.initialized = false;
1227
- this.embSimMatrix = [];
1228
- this.embSimIndex.clear();
1229
- this.clearRunCaches();
1230
- }
1231
- /**
1232
- * Resets per-run memoization (parent similarities).
1233
- * The embedding matrix is intentionally preserved so incremental runs can
1234
- * reuse clean×clean values across calls.
1235
- */
1236
- clearRunCaches() {
1237
- this.parentSimCache.clear();
1238
- }
1239
- /**
1240
- * Builds or incrementally updates the embedding similarity matrix.
1241
- *
1242
- * Full rebuild (default): replaces the entire matrix — O(n²).
1243
- * Incremental (dirtyPaths provided + prior matrix exists): copies clean×clean
1244
- * cells from the old matrix and recomputes only dirty rows via one batched
1245
- * cosineSimilarity call — O(d·n) where d = number of dirty units.
1246
- */
1247
- async buildEmbSimCache(units, dirtyPaths) {
1248
- const embedded = units.filter((u) => Array.isArray(u.embedding) && u.embedding.length > 0);
1249
- if (embedded.length < 2) {
1250
- this.embSimMatrix = [];
1251
- this.embSimIndex.clear();
1252
- return;
1253
- }
1254
- const embeddings = embedded.map((u) => u.embedding);
1255
- const newIndex = new Map(embedded.map((u, i) => [u.id, i]));
1256
- const dirtySet = dirtyPaths ? new Set(dirtyPaths) : null;
1257
- const hasPriorMatrix = this.embSimMatrix.length > 0;
1258
- if (!dirtySet || !hasPriorMatrix) {
1259
- this.embSimIndex = newIndex;
1260
- this.embSimMatrix = await parallelCosineSimilarity(embeddings, embeddings);
1261
- log4("Built full embedding similarity matrix: %d units", embedded.length);
1262
- return;
1263
- }
1264
- const dirtyIds = new Set(embedded.filter((u) => dirtySet.has(u.filePath)).map((u) => u.id));
1265
- if (dirtyIds.size === 0) {
1266
- log4("Matrix reused: no dirty units detected");
1267
- return;
1268
- }
1269
- const n = embedded.length;
1270
- const newMatrix = Array.from({ length: n }, () => new Array(n).fill(0));
1271
- for (let i = 0; i < n; i++) {
1272
- for (let j = 0; j < n; j++) {
1273
- if (dirtyIds.has(embedded[i].id) || dirtyIds.has(embedded[j].id)) continue;
1274
- const oi = this.embSimIndex.get(embedded[i].id);
1275
- const oj = this.embSimIndex.get(embedded[j].id);
1276
- if (oi !== void 0 && oj !== void 0) newMatrix[i][j] = this.embSimMatrix[oi][oj];
1277
- }
1278
- }
1279
- const dirtyIndices = embedded.reduce((acc, u, i) => dirtyIds.has(u.id) ? [...acc, i] : acc, []);
1280
- const dirtyRows = await parallelCosineSimilarity(dirtyIndices.map((i) => embeddings[i]), embeddings);
1281
- dirtyIndices.forEach((rowIdx, di) => {
1282
- for (let j = 0; j < n; j++) {
1283
- newMatrix[rowIdx][j] = dirtyRows[di][j];
1284
- newMatrix[j][rowIdx] = dirtyRows[di][j];
1285
- }
1286
- });
1287
- this.embSimIndex = newIndex;
1288
- this.embSimMatrix = newMatrix;
1289
- log4("Incremental matrix update: %d dirty unit(s) out of %d total", dirtyIds.size, n);
1290
- }
1291
- /** Returns the pre-computed cosine similarity for a pair of unit IDs, if available. */
1292
- getEmbSim(id1, id2) {
1293
- const i = this.embSimIndex.get(id1);
1294
- const j = this.embSimIndex.get(id2);
1295
- if (i === void 0 || j === void 0) return void 0;
1296
- return this.embSimMatrix[i][j];
1297
- }
1298
- /** Returns the memoized parent similarity for the given stable key, if available. */
1299
- getParentSim(key) {
1300
- return this.parentSimCache.get(key);
1301
- }
1302
- /** Stores a memoized parent similarity for the given stable key. */
1303
- setParentSim(key, sim) {
1304
- this.parentSimCache.set(key, sim);
1305
- }
1306
- addKeyForFile(filePath, key) {
1307
- const current = this.fileIndex.get(filePath) ?? /* @__PURE__ */ new Set();
1308
- current.add(key);
1309
- this.fileIndex.set(filePath, current);
1310
- }
1311
- fileHasKey(filePath, key) {
1312
- const keys = this.fileIndex.get(filePath);
1313
- return keys ? keys.has(key) : false;
1314
- }
1315
- makeKey(leftId, rightId) {
1316
- return [leftId, rightId].sort().join("::");
1317
- }
1318
- };
1319
-
1320
1151
  // src/services/UpdateService.ts
1321
- var log5 = debug5("DryScan:UpdateService");
1152
+ var log4 = debug4("DryScan:UpdateService");
1322
1153
  var UpdateService = class {
1323
1154
  constructor(deps, exclusionService) {
1324
1155
  this.deps = deps;
@@ -1327,222 +1158,18 @@ var UpdateService = class {
1327
1158
  /** Returns the list of file paths that were modified or deleted (dirty). */
1328
1159
  async updateIndex() {
1329
1160
  const extractor = this.deps.extractor;
1330
- const cache = DuplicationCache.getInstance();
1331
1161
  try {
1332
1162
  const changeSet = await performIncrementalUpdate(this.deps.repoPath, extractor, this.deps.db);
1333
1163
  await this.exclusionService.cleanupExcludedFiles();
1334
1164
  const dirtyPaths = [...changeSet.changed, ...changeSet.deleted, ...changeSet.added];
1335
- await cache.invalidate(dirtyPaths);
1336
1165
  return dirtyPaths;
1337
1166
  } catch (err) {
1338
- log5("Error during index update:", err);
1167
+ log4("Error during index update:", err);
1339
1168
  throw err;
1340
1169
  }
1341
1170
  }
1342
1171
  };
1343
1172
 
1344
- // src/services/DuplicateService.ts
1345
- import debug6 from "debug";
1346
- import shortUuid from "short-uuid";
1347
- var log6 = debug6("DryScan:DuplicateService");
1348
- var DuplicateService = class {
1349
- constructor(deps) {
1350
- this.deps = deps;
1351
- }
1352
- config;
1353
- cache = DuplicationCache.getInstance();
1354
- /**
1355
- * @param dirtyPaths - File paths changed since last run. When provided, only
1356
- * dirty×all similarities are recomputed; clean×clean values are reused from
1357
- * the existing matrix. Pass undefined (or omit) for a full rebuild.
1358
- */
1359
- async findDuplicates(config, dirtyPaths) {
1360
- this.config = config;
1361
- const t0 = performance.now();
1362
- const allUnits = await this.deps.db.getAllUnits();
1363
- log6("Starting duplicate analysis on %d units", allUnits.length);
1364
- if (allUnits.length < 2) {
1365
- return { duplicates: [], score: this.computeDuplicationScore([], allUnits) };
1366
- }
1367
- const thresholds = this.resolveThresholds(config.threshold);
1368
- const duplicates = await this.computeDuplicates(allUnits, thresholds, dirtyPaths);
1369
- const filtered = duplicates.filter((g) => !this.isGroupExcluded(g));
1370
- log6("Found %d duplicate groups (%d excluded)", filtered.length, duplicates.length - filtered.length);
1371
- this.cache.update(filtered).catch((err) => log6("Cache update failed: %O", err));
1372
- const score = this.computeDuplicationScore(filtered, allUnits);
1373
- log6("findDuplicates completed in %dms", (performance.now() - t0).toFixed(2));
1374
- return { duplicates: filtered, score };
1375
- }
1376
- resolveThresholds(functionThreshold) {
1377
- const d = indexConfig.thresholds;
1378
- const clamp = (v) => Math.min(1, Math.max(0, v));
1379
- const fn = clamp(functionThreshold ?? d.function);
1380
- return {
1381
- function: fn,
1382
- block: clamp(fn + d.block - d.function),
1383
- class: clamp(fn + d.class - d.function)
1384
- };
1385
- }
1386
- async computeDuplicates(units, thresholds, dirtyPaths) {
1387
- this.cache.clearRunCaches();
1388
- await this.cache.buildEmbSimCache(units, dirtyPaths);
1389
- const duplicates = [];
1390
- const t0 = performance.now();
1391
- for (const [type, typedUnits] of this.groupByType(units)) {
1392
- const threshold = this.getThreshold(type, thresholds);
1393
- log6("Comparing %d %s units (threshold=%.3f)", typedUnits.length, type, threshold);
1394
- for (let i = 0; i < typedUnits.length; i++) {
1395
- for (let j = i + 1; j < typedUnits.length; j++) {
1396
- const left = typedUnits[i], right = typedUnits[j];
1397
- if (this.shouldSkipComparison(left, right)) continue;
1398
- const cached = this.cache.get(left.id, right.id, left.filePath, right.filePath);
1399
- const hasEmbeddings = left.embedding?.length && right.embedding?.length;
1400
- const similarity = cached ?? (hasEmbeddings ? this.computeWeightedSimilarity(left, right, threshold) : 0);
1401
- if (similarity < threshold) continue;
1402
- const exclusionString = this.deps.pairing.pairKeyForUnits(left, right);
1403
- if (!exclusionString) continue;
1404
- duplicates.push({
1405
- id: `${left.id}::${right.id}`,
1406
- similarity,
1407
- shortId: shortUuid.generate(),
1408
- exclusionString,
1409
- left: this.toMember(left),
1410
- right: this.toMember(right)
1411
- });
1412
- }
1413
- }
1414
- }
1415
- log6("computeDuplicates: %d duplicates in %dms", duplicates.length, (performance.now() - t0).toFixed(2));
1416
- return duplicates.sort((a, b) => b.similarity - a.similarity);
1417
- }
1418
- isGroupExcluded(group) {
1419
- const config = this.config;
1420
- if (!config?.excludedPairs?.length) return false;
1421
- const key = this.deps.pairing.pairKeyForUnits(group.left, group.right);
1422
- if (!key) return false;
1423
- const actual = this.deps.pairing.parsePairKey(key);
1424
- if (!actual) return false;
1425
- return config.excludedPairs.some((entry) => {
1426
- const parsed = this.deps.pairing.parsePairKey(entry);
1427
- return parsed ? this.deps.pairing.pairKeyMatches(actual, parsed) : false;
1428
- });
1429
- }
1430
- getThreshold(type, thresholds) {
1431
- if (type === "class" /* CLASS */) return thresholds.class;
1432
- if (type === "block" /* BLOCK */) return thresholds.block;
1433
- return thresholds.function;
1434
- }
1435
- computeWeightedSimilarity(left, right, threshold) {
1436
- const selfSim = this.similarity(left, right);
1437
- if (left.unitType === "class" /* CLASS */) {
1438
- return selfSim * indexConfig.weights.class.self;
1439
- }
1440
- if (left.unitType === "function" /* FUNCTION */) {
1441
- const w2 = indexConfig.weights.function;
1442
- const hasPC2 = this.bothHaveParent(left, right, "class" /* CLASS */);
1443
- const total2 = w2.self + (hasPC2 ? w2.parentClass : 0);
1444
- if ((w2.self * selfSim + (hasPC2 ? w2.parentClass : 0)) / total2 < threshold) return 0;
1445
- return (w2.self * selfSim + (hasPC2 ? w2.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total2;
1446
- }
1447
- const w = indexConfig.weights.block;
1448
- const hasPF = this.bothHaveParent(left, right, "function" /* FUNCTION */);
1449
- const hasPC = this.bothHaveParent(left, right, "class" /* CLASS */);
1450
- const total = w.self + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0);
1451
- if ((w.self * selfSim + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0)) / total < threshold) return 0;
1452
- return (w.self * selfSim + (hasPF ? w.parentFunction * this.parentSimilarity(left, right, "function" /* FUNCTION */) : 0) + (hasPC ? w.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total;
1453
- }
1454
- /** Groups all units by type for the comparison loop. Units without embeddings are included
1455
- * so that cache hits can still be returned for pairs whose embeddings were cleared. */
1456
- groupByType(units) {
1457
- const byType = /* @__PURE__ */ new Map();
1458
- for (const unit of units) {
1459
- const list = byType.get(unit.unitType) ?? [];
1460
- list.push(unit);
1461
- byType.set(unit.unitType, list);
1462
- }
1463
- return byType;
1464
- }
1465
- toMember(unit) {
1466
- return {
1467
- id: unit.id,
1468
- name: unit.name,
1469
- filePath: unit.filePath,
1470
- startLine: unit.startLine,
1471
- endLine: unit.endLine,
1472
- code: unit.code,
1473
- unitType: unit.unitType
1474
- };
1475
- }
1476
- bothHaveParent(left, right, type) {
1477
- return !!this.findParent(left, type) && !!this.findParent(right, type);
1478
- }
1479
- parentSimilarity(left, right, type) {
1480
- const lp = this.findParent(left, type), rp = this.findParent(right, type);
1481
- if (!lp || !rp) return 0;
1482
- const key = lp.id < rp.id ? `${lp.id}::${rp.id}` : `${rp.id}::${lp.id}`;
1483
- const cached = this.cache.getParentSim(key);
1484
- if (cached !== void 0) return cached;
1485
- const sim = this.similarity(lp, rp);
1486
- this.cache.setParentSim(key, sim);
1487
- return sim;
1488
- }
1489
- /** Resolves similarity via the pre-computed embedding matrix, falling back to best child match. */
1490
- similarity(left, right) {
1491
- return this.cache.getEmbSim(left.id, right.id) ?? this.childSimilarity(left, right);
1492
- }
1493
- childSimilarity(left, right) {
1494
- const lc = left.children ?? [], rc = right.children ?? [];
1495
- if (!lc.length || !rc.length) return 0;
1496
- let best = 0;
1497
- for (const l of lc) {
1498
- for (const r of rc) {
1499
- if (l.unitType !== r.unitType) continue;
1500
- const sim = this.similarity(l, r);
1501
- if (sim > best) best = sim;
1502
- }
1503
- }
1504
- return best;
1505
- }
1506
- shouldSkipComparison(left, right) {
1507
- if (left.unitType !== "block" /* BLOCK */ || right.unitType !== "block" /* BLOCK */) return false;
1508
- if (left.filePath !== right.filePath) return false;
1509
- return left.startLine <= right.startLine && left.endLine >= right.endLine || right.startLine <= left.startLine && right.endLine >= left.endLine;
1510
- }
1511
- findParent(unit, type) {
1512
- let p = unit.parent;
1513
- while (p) {
1514
- if (p.unitType === type) return p;
1515
- p = p.parent;
1516
- }
1517
- return null;
1518
- }
1519
- computeDuplicationScore(duplicates, allUnits) {
1520
- const totalLines = allUnits.reduce((sum, u) => sum + u.endLine - u.startLine + 1, 0);
1521
- if (!totalLines || !duplicates.length) {
1522
- return { score: 0, grade: "Excellent", totalLines, duplicateLines: 0, duplicateGroups: 0 };
1523
- }
1524
- const duplicateLines = duplicates.reduce((sum, g) => {
1525
- const avg = (g.left.endLine - g.left.startLine + 1 + (g.right.endLine - g.right.startLine + 1)) / 2;
1526
- return sum + g.similarity * avg;
1527
- }, 0);
1528
- const score = duplicateLines / totalLines * 100;
1529
- return {
1530
- score,
1531
- grade: this.getScoreGrade(score),
1532
- totalLines,
1533
- duplicateLines: Math.round(duplicateLines),
1534
- duplicateGroups: duplicates.length
1535
- };
1536
- }
1537
- getScoreGrade(score) {
1538
- if (score < 5) return "Excellent";
1539
- if (score < 15) return "Good";
1540
- if (score < 30) return "Fair";
1541
- if (score < 50) return "Poor";
1542
- return "Critical";
1543
- }
1544
- };
1545
-
1546
1173
  // src/services/ExclusionService.ts
1547
1174
  import { minimatch } from "minimatch";
1548
1175
  var ExclusionService = class {
@@ -1629,9 +1256,9 @@ var ExclusionService = class {
1629
1256
 
1630
1257
  // src/services/PairingService.ts
1631
1258
  import crypto3 from "crypto";
1632
- import debug7 from "debug";
1259
+ import debug5 from "debug";
1633
1260
  import { minimatch as minimatch2 } from "minimatch";
1634
- var log7 = debug7("DryScan:pairs");
1261
+ var log5 = debug5("DryScan:pairs");
1635
1262
  var PairingService = class {
1636
1263
  constructor(indexUnitExtractor) {
1637
1264
  this.indexUnitExtractor = indexUnitExtractor;
@@ -1642,7 +1269,7 @@ var PairingService = class {
1642
1269
  */
1643
1270
  pairKeyForUnits(left, right) {
1644
1271
  if (left.unitType !== right.unitType) {
1645
- log7("Skipping pair with mismatched types: %s vs %s", left.unitType, right.unitType);
1272
+ log5("Skipping pair with mismatched types: %s vs %s", left.unitType, right.unitType);
1646
1273
  return null;
1647
1274
  }
1648
1275
  const type = left.unitType;
@@ -1658,13 +1285,13 @@ var PairingService = class {
1658
1285
  parsePairKey(value) {
1659
1286
  const parts = value.split("|");
1660
1287
  if (parts.length !== 3) {
1661
- log7("Invalid pair key format: %s", value);
1288
+ log5("Invalid pair key format: %s", value);
1662
1289
  return null;
1663
1290
  }
1664
1291
  const [typeRaw, leftRaw, rightRaw] = parts;
1665
1292
  const type = this.stringToUnitType(typeRaw);
1666
1293
  if (!type) {
1667
- log7("Unknown unit type in pair key: %s", typeRaw);
1294
+ log5("Unknown unit type in pair key: %s", typeRaw);
1668
1295
  return null;
1669
1296
  }
1670
1297
  const [left, right] = [leftRaw, rightRaw].sort();
@@ -1736,6 +1363,259 @@ var PairingService = class {
1736
1363
 
1737
1364
  // src/DryScan.ts
1738
1365
  import { existsSync } from "fs";
1366
+
1367
+ // src/services/DuplicateService.ts
1368
+ import debug6 from "debug";
1369
+ import shortUuid from "short-uuid";
1370
+ import { cosineSimilarity } from "@langchain/core/utils/math";
1371
+ var log6 = debug6("DryScan:DuplicateService");
1372
+ var DuplicateService = class {
1373
+ constructor(deps) {
1374
+ this.deps = deps;
1375
+ }
1376
+ config;
1377
+ similarityCache = /* @__PURE__ */ new Map();
1378
+ parentSimCache = /* @__PURE__ */ new Map();
1379
+ async findDuplicates(config, dirtyPaths = [], previousReport) {
1380
+ this.config = config;
1381
+ this.similarityCache = /* @__PURE__ */ new Map();
1382
+ this.parentSimCache = /* @__PURE__ */ new Map();
1383
+ const t0 = performance.now();
1384
+ const allUnits = await this.deps.db.getAllUnits();
1385
+ log6("Starting duplicate analysis on %d units", allUnits.length);
1386
+ if (allUnits.length < 2) {
1387
+ return { duplicates: [], score: this.computeDuplicationScore([], allUnits) };
1388
+ }
1389
+ const thresholds = this.resolveThresholds(config.threshold);
1390
+ const dirtySet = new Set(dirtyPaths);
1391
+ const canReuseFromReport = Boolean(previousReport && previousReport.threshold === config.threshold);
1392
+ const reusableClean = canReuseFromReport ? this.reuseCleanPairsFromPreviousReport(previousReport, allUnits, dirtySet) : [];
1393
+ const recomputed = this.computeDuplicates(
1394
+ allUnits,
1395
+ thresholds,
1396
+ canReuseFromReport ? dirtySet : null
1397
+ );
1398
+ const merged = this.mergeDuplicates(reusableClean, recomputed);
1399
+ const filtered = merged.filter((g) => !this.isGroupExcluded(g));
1400
+ log6(
1401
+ "Found %d duplicate groups (%d excluded, %d reused)",
1402
+ filtered.length,
1403
+ merged.length - filtered.length,
1404
+ reusableClean.length
1405
+ );
1406
+ const score = this.computeDuplicationScore(filtered, allUnits);
1407
+ log6("findDuplicates completed in %dms", (performance.now() - t0).toFixed(2));
1408
+ return { duplicates: filtered, score };
1409
+ }
1410
+ resolveThresholds(functionThreshold) {
1411
+ const d = indexConfig.thresholds;
1412
+ const clamp = (v) => Math.min(1, Math.max(0, v));
1413
+ const fn = clamp(functionThreshold ?? d.function);
1414
+ return {
1415
+ function: fn,
1416
+ block: clamp(fn + d.block - d.function),
1417
+ class: clamp(fn + d.class - d.function)
1418
+ };
1419
+ }
1420
+ computeDuplicates(units, thresholds, dirtySet) {
1421
+ if (dirtySet && dirtySet.size === 0) {
1422
+ log6("Skipping recomputation: no dirty files and previous report threshold matches");
1423
+ return [];
1424
+ }
1425
+ const duplicates = [];
1426
+ const t0 = performance.now();
1427
+ for (const [type, typedUnits] of this.groupByType(units)) {
1428
+ const threshold = this.getThreshold(type, thresholds);
1429
+ log6("Comparing %d %s units (threshold=%.3f)", typedUnits.length, type, threshold);
1430
+ for (let i = 0; i < typedUnits.length; i++) {
1431
+ for (let j = i + 1; j < typedUnits.length; j++) {
1432
+ const left = typedUnits[i];
1433
+ const right = typedUnits[j];
1434
+ if (this.shouldSkipComparison(left, right)) continue;
1435
+ if (dirtySet && !dirtySet.has(left.filePath) && !dirtySet.has(right.filePath)) {
1436
+ continue;
1437
+ }
1438
+ const hasEmbeddings = left.embedding?.length && right.embedding?.length;
1439
+ const similarity = hasEmbeddings ? this.computeWeightedSimilarity(left, right, threshold) : 0;
1440
+ if (similarity < threshold) continue;
1441
+ const exclusionString = this.deps.pairing.pairKeyForUnits(left, right);
1442
+ if (!exclusionString) continue;
1443
+ duplicates.push({
1444
+ id: `${left.id}::${right.id}`,
1445
+ similarity,
1446
+ shortId: shortUuid.generate(),
1447
+ exclusionString,
1448
+ left: this.toMember(left),
1449
+ right: this.toMember(right)
1450
+ });
1451
+ }
1452
+ }
1453
+ }
1454
+ log6("computeDuplicates: %d duplicates in %dms", duplicates.length, (performance.now() - t0).toFixed(2));
1455
+ return duplicates.sort((a, b) => b.similarity - a.similarity);
1456
+ }
1457
+ reuseCleanPairsFromPreviousReport(report, units, dirtySet) {
1458
+ const unitIds = new Set(units.map((u) => u.id));
1459
+ const reusable = report.duplicates.filter((group) => {
1460
+ const leftDirty = dirtySet.has(group.left.filePath);
1461
+ const rightDirty = dirtySet.has(group.right.filePath);
1462
+ if (leftDirty || rightDirty) return false;
1463
+ return unitIds.has(group.left.id) && unitIds.has(group.right.id);
1464
+ });
1465
+ log6("Reused %d clean-clean duplicate groups from previous report", reusable.length);
1466
+ return reusable;
1467
+ }
1468
+ mergeDuplicates(reused, recomputed) {
1469
+ const merged = /* @__PURE__ */ new Map();
1470
+ for (const group of reused) {
1471
+ merged.set(this.groupKey(group), group);
1472
+ }
1473
+ for (const group of recomputed) {
1474
+ merged.set(this.groupKey(group), group);
1475
+ }
1476
+ return Array.from(merged.values()).sort((a, b) => b.similarity - a.similarity);
1477
+ }
1478
+ groupKey(group) {
1479
+ return [group.left.id, group.right.id].sort().join("::");
1480
+ }
1481
+ isGroupExcluded(group) {
1482
+ const config = this.config;
1483
+ if (!config?.excludedPairs?.length) return false;
1484
+ const key = this.deps.pairing.pairKeyForUnits(group.left, group.right);
1485
+ if (!key) return false;
1486
+ const actual = this.deps.pairing.parsePairKey(key);
1487
+ if (!actual) return false;
1488
+ return config.excludedPairs.some((entry) => {
1489
+ const parsed = this.deps.pairing.parsePairKey(entry);
1490
+ return parsed ? this.deps.pairing.pairKeyMatches(actual, parsed) : false;
1491
+ });
1492
+ }
1493
+ getThreshold(type, thresholds) {
1494
+ if (type === "class" /* CLASS */) return thresholds.class;
1495
+ if (type === "block" /* BLOCK */) return thresholds.block;
1496
+ return thresholds.function;
1497
+ }
1498
+ computeWeightedSimilarity(left, right, threshold) {
1499
+ const selfSim = this.similarity(left, right);
1500
+ if (left.unitType === "class" /* CLASS */) {
1501
+ return selfSim * indexConfig.weights.class.self;
1502
+ }
1503
+ if (left.unitType === "function" /* FUNCTION */) {
1504
+ const w2 = indexConfig.weights.function;
1505
+ const hasPC2 = this.bothHaveParent(left, right, "class" /* CLASS */);
1506
+ const total2 = w2.self + (hasPC2 ? w2.parentClass : 0);
1507
+ if ((w2.self * selfSim + (hasPC2 ? w2.parentClass : 0)) / total2 < threshold) return 0;
1508
+ return (w2.self * selfSim + (hasPC2 ? w2.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total2;
1509
+ }
1510
+ const w = indexConfig.weights.block;
1511
+ const hasPF = this.bothHaveParent(left, right, "function" /* FUNCTION */);
1512
+ const hasPC = this.bothHaveParent(left, right, "class" /* CLASS */);
1513
+ const total = w.self + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0);
1514
+ if ((w.self * selfSim + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0)) / total < threshold) return 0;
1515
+ return (w.self * selfSim + (hasPF ? w.parentFunction * this.parentSimilarity(left, right, "function" /* FUNCTION */) : 0) + (hasPC ? w.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total;
1516
+ }
1517
+ groupByType(units) {
1518
+ const byType = /* @__PURE__ */ new Map();
1519
+ for (const unit of units) {
1520
+ const list = byType.get(unit.unitType) ?? [];
1521
+ list.push(unit);
1522
+ byType.set(unit.unitType, list);
1523
+ }
1524
+ return byType;
1525
+ }
1526
+ toMember(unit) {
1527
+ return {
1528
+ id: unit.id,
1529
+ name: unit.name,
1530
+ filePath: unit.filePath,
1531
+ startLine: unit.startLine,
1532
+ endLine: unit.endLine,
1533
+ code: unit.code,
1534
+ unitType: unit.unitType
1535
+ };
1536
+ }
1537
+ bothHaveParent(left, right, type) {
1538
+ return !!this.findParent(left, type) && !!this.findParent(right, type);
1539
+ }
1540
+ parentSimilarity(left, right, type) {
1541
+ const lp = this.findParent(left, type);
1542
+ const rp = this.findParent(right, type);
1543
+ if (!lp || !rp) return 0;
1544
+ const key = lp.id < rp.id ? `${lp.id}::${rp.id}` : `${rp.id}::${lp.id}`;
1545
+ const cached = this.parentSimCache.get(key);
1546
+ if (cached !== void 0) return cached;
1547
+ const sim = this.similarity(lp, rp);
1548
+ this.parentSimCache.set(key, sim);
1549
+ return sim;
1550
+ }
1551
+ similarity(left, right) {
1552
+ const key = left.id < right.id ? `${left.id}::${right.id}` : `${right.id}::${left.id}`;
1553
+ const cached = this.similarityCache.get(key);
1554
+ if (cached !== void 0) return cached;
1555
+ let value = 0;
1556
+ if (left.embedding?.length && right.embedding?.length) {
1557
+ value = cosineSimilarity([left.embedding], [right.embedding])[0][0] ?? 0;
1558
+ } else {
1559
+ value = this.childSimilarity(left, right);
1560
+ }
1561
+ this.similarityCache.set(key, value);
1562
+ return value;
1563
+ }
1564
+ childSimilarity(left, right) {
1565
+ const lc = left.children ?? [];
1566
+ const rc = right.children ?? [];
1567
+ if (!lc.length || !rc.length) return 0;
1568
+ let best = 0;
1569
+ for (const l of lc) {
1570
+ for (const r of rc) {
1571
+ if (l.unitType !== r.unitType) continue;
1572
+ const sim = this.similarity(l, r);
1573
+ if (sim > best) best = sim;
1574
+ }
1575
+ }
1576
+ return best;
1577
+ }
1578
+ shouldSkipComparison(left, right) {
1579
+ if (left.unitType !== "block" /* BLOCK */ || right.unitType !== "block" /* BLOCK */) return false;
1580
+ if (left.filePath !== right.filePath) return false;
1581
+ return left.startLine <= right.startLine && left.endLine >= right.endLine || right.startLine <= left.startLine && right.endLine >= left.endLine;
1582
+ }
1583
+ findParent(unit, type) {
1584
+ let p = unit.parent;
1585
+ while (p) {
1586
+ if (p.unitType === type) return p;
1587
+ p = p.parent;
1588
+ }
1589
+ return null;
1590
+ }
1591
+ computeDuplicationScore(duplicates, allUnits) {
1592
+ const totalLines = allUnits.reduce((sum, u) => sum + u.endLine - u.startLine + 1, 0);
1593
+ if (!totalLines || !duplicates.length) {
1594
+ return { score: 0, grade: "Excellent", totalLines, duplicateLines: 0, duplicateGroups: 0 };
1595
+ }
1596
+ const duplicateLines = duplicates.reduce((sum, g) => {
1597
+ const avg = (g.left.endLine - g.left.startLine + 1 + (g.right.endLine - g.right.startLine + 1)) / 2;
1598
+ return sum + g.similarity * avg;
1599
+ }, 0);
1600
+ const score = duplicateLines / totalLines * 100;
1601
+ return {
1602
+ score,
1603
+ grade: this.getScoreGrade(score),
1604
+ totalLines,
1605
+ duplicateLines: Math.round(duplicateLines),
1606
+ duplicateGroups: duplicates.length
1607
+ };
1608
+ }
1609
+ getScoreGrade(score) {
1610
+ if (score < 5) return "Excellent";
1611
+ if (score < 15) return "Good";
1612
+ if (score < 30) return "Fair";
1613
+ if (score < 50) return "Poor";
1614
+ return "Critical";
1615
+ }
1616
+ };
1617
+
1618
+ // src/DryScan.ts
1739
1619
  var DryScan = class {
1740
1620
  repoPath;
1741
1621
  extractor;
@@ -1809,7 +1689,7 @@ var DryScan = class {
1809
1689
  async buildDuplicateReport() {
1810
1690
  const config = await this.loadConfig();
1811
1691
  const analysis = await this.findDuplicates(config);
1812
- return {
1692
+ const report = {
1813
1693
  version: 1,
1814
1694
  generatedAt: (/* @__PURE__ */ new Date()).toISOString(),
1815
1695
  threshold: config.threshold,
@@ -1817,6 +1697,8 @@ var DryScan = class {
1817
1697
  score: analysis.score,
1818
1698
  duplicates: analysis.duplicates
1819
1699
  };
1700
+ await this.saveReport(report);
1701
+ return report;
1820
1702
  }
1821
1703
  /**
1822
1704
  * Finds duplicate code blocks using cosine similarity on embeddings.
@@ -1833,9 +1715,13 @@ var DryScan = class {
1833
1715
  const dirtyPaths = await this.updateIndex();
1834
1716
  const updateDuration = Date.now() - updateStart;
1835
1717
  console.log(`[DryScan] Index update took ${updateDuration}ms.`);
1718
+ const previousReport = await this.loadLatestReport();
1719
+ if (previousReport?.threshold === config.threshold) {
1720
+ console.log("[DryScan] Reusing clean-clean duplicates from latest report (threshold unchanged).");
1721
+ }
1836
1722
  console.log("[DryScan] Detecting duplicates...");
1837
1723
  const dupStart = Date.now();
1838
- const result = await this.services.duplicate.findDuplicates(config, dirtyPaths);
1724
+ const result = await this.services.duplicate.findDuplicates(config, dirtyPaths, previousReport);
1839
1725
  const dupDuration = Date.now() - dupStart;
1840
1726
  console.log(`[DryScan] Duplicate detection took ${dupDuration}ms.`);
1841
1727
  return result;
@@ -1857,6 +1743,40 @@ var DryScan = class {
1857
1743
  async loadConfig() {
1858
1744
  return configStore.get(this.repoPath);
1859
1745
  }
1746
+ async saveReport(report) {
1747
+ const reportDir = upath6.join(this.repoPath, DRYSCAN_DIR, REPORTS_DIR);
1748
+ await fs7.mkdir(reportDir, { recursive: true });
1749
+ const safeTimestamp = report.generatedAt.replace(/[:.]/g, "-");
1750
+ const reportPath = upath6.join(reportDir, `dupes-${safeTimestamp}.json`);
1751
+ await fs7.writeFile(reportPath, JSON.stringify(report, null, 2), "utf8");
1752
+ }
1753
+ async loadLatestReport() {
1754
+ const reportDir = upath6.join(this.repoPath, DRYSCAN_DIR, REPORTS_DIR);
1755
+ let entries;
1756
+ try {
1757
+ entries = await fs7.readdir(reportDir);
1758
+ } catch (err) {
1759
+ if (err?.code === "ENOENT") return null;
1760
+ throw err;
1761
+ }
1762
+ const jsonReports = entries.filter((name) => name.endsWith(".json"));
1763
+ if (jsonReports.length === 0) return null;
1764
+ const withStats = await Promise.all(
1765
+ jsonReports.map(async (name) => {
1766
+ const fullPath = upath6.join(reportDir, name);
1767
+ const stat = await fs7.stat(fullPath);
1768
+ return { fullPath, mtimeMs: stat.mtimeMs };
1769
+ })
1770
+ );
1771
+ withStats.sort((a, b) => b.mtimeMs - a.mtimeMs);
1772
+ const latest = withStats[0];
1773
+ const raw = await fs7.readFile(latest.fullPath, "utf8");
1774
+ const parsed = JSON.parse(raw);
1775
+ if (!parsed || !Array.isArray(parsed.duplicates) || typeof parsed.threshold !== "number") {
1776
+ return null;
1777
+ }
1778
+ return parsed;
1779
+ }
1860
1780
  };
1861
1781
  export {
1862
1782
  DryScan,