@goshenkata/dryscan-core 1.2.5 → 1.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-ZZC4V5LV.js +52 -0
- package/dist/chunk-ZZC4V5LV.js.map +1 -0
- package/dist/index.d.ts +1 -1
- package/dist/index.js +249 -194
- package/dist/index.js.map +1 -1
- package/dist/services/ParallelSimilarity.d.ts +8 -0
- package/dist/services/ParallelSimilarity.js +7 -0
- package/dist/services/ParallelSimilarity.js.map +1 -0
- package/dist/services/cosineSimilarityWorker.d.ts +2 -0
- package/dist/services/cosineSimilarityWorker.js +12 -0
- package/dist/services/cosineSimilarityWorker.js.map +1 -0
- package/package.json +1 -1
- package/src/DryScan.ts +5 -4
- package/src/config/dryconfig.ts +1 -1
- package/src/extractors/java.ts +22 -7
- package/src/services/DuplicateService.ts +135 -186
- package/src/services/DuplicationCache.ts +107 -1
- package/src/services/ParallelSimilarity.ts +59 -0
- package/src/services/UpdateService.ts +5 -2
- package/src/services/cosineSimilarityWorker.ts +20 -0
- package/tsup.config.ts +1 -1
package/dist/index.js
CHANGED
|
@@ -1,13 +1,7 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
for (var i = decorators.length - 1, decorator; i >= 0; i--)
|
|
6
|
-
if (decorator = decorators[i])
|
|
7
|
-
result = (kind ? decorator(target, key, result) : decorator(result)) || result;
|
|
8
|
-
if (kind && result) __defProp(target, key, result);
|
|
9
|
-
return result;
|
|
10
|
-
};
|
|
1
|
+
import {
|
|
2
|
+
__decorateClass,
|
|
3
|
+
parallelCosineSimilarity
|
|
4
|
+
} from "./chunk-ZZC4V5LV.js";
|
|
11
5
|
|
|
12
6
|
// src/DryScan.ts
|
|
13
7
|
import upath6 from "upath";
|
|
@@ -61,7 +55,7 @@ var DEFAULT_CONFIG = {
|
|
|
61
55
|
excludedPairs: [],
|
|
62
56
|
minLines: 3,
|
|
63
57
|
minBlockLines: 5,
|
|
64
|
-
threshold: 0.
|
|
58
|
+
threshold: 0.8,
|
|
65
59
|
embeddingSource: "http://localhost:11434",
|
|
66
60
|
contextLength: 2048
|
|
67
61
|
};
|
|
@@ -244,7 +238,8 @@ var JavaExtractor = class {
|
|
|
244
238
|
const fnUnit = this.buildFunctionUnit(node, source, fileRelPath, currentClass);
|
|
245
239
|
const fnLength = fnUnit.endLine - fnUnit.startLine;
|
|
246
240
|
const bodyNode = this.getFunctionBody(node);
|
|
247
|
-
const
|
|
241
|
+
const fnArity = this.getNodeArity(node);
|
|
242
|
+
const skipFunction = this.shouldSkip("function" /* FUNCTION */, fnUnit.name, fnLength, fnArity);
|
|
248
243
|
if (skipFunction) {
|
|
249
244
|
return;
|
|
250
245
|
}
|
|
@@ -311,22 +306,34 @@ var JavaExtractor = class {
|
|
|
311
306
|
const normalized = this.normalizeCode(unit.code);
|
|
312
307
|
return crypto.createHash(BLOCK_HASH_ALGO).update(normalized).digest("hex");
|
|
313
308
|
}
|
|
314
|
-
shouldSkip(unitType, name, lineCount) {
|
|
309
|
+
shouldSkip(unitType, name, lineCount, arity) {
|
|
315
310
|
if (!this.config) {
|
|
316
311
|
throw new Error("Config not loaded before skip evaluation");
|
|
317
312
|
}
|
|
318
313
|
const config = this.config;
|
|
319
314
|
const minLines = unitType === "block" /* BLOCK */ ? Math.max(indexConfig.blockMinLines, config.minBlockLines ?? 0) : config.minLines;
|
|
320
315
|
const belowMin = minLines > 0 && lineCount < minLines;
|
|
321
|
-
const trivial = unitType === "function" /* FUNCTION */ && this.isTrivialFunction(name);
|
|
316
|
+
const trivial = unitType === "function" /* FUNCTION */ && this.isTrivialFunction(name, arity ?? 0);
|
|
322
317
|
return belowMin || trivial;
|
|
323
318
|
}
|
|
324
|
-
|
|
319
|
+
/**
|
|
320
|
+
* A function is trivial if it follows a simple accessor pattern:
|
|
321
|
+
* - getters and boolean "is" accessors: name matches get[A-Z] or is[A-Z] with exactly 0 parameters
|
|
322
|
+
* - setters: name matches set[A-Z] with at most 1 parameter
|
|
323
|
+
* Methods like getUserById(Long id) have arity > 0 and are NOT trivial.
|
|
324
|
+
*/
|
|
325
|
+
isTrivialFunction(fullName, arity) {
|
|
325
326
|
const simpleName = fullName.split(".").pop() || fullName;
|
|
326
|
-
const isGetter = /^(get|is)[A-Z]/.test(simpleName);
|
|
327
|
-
const isSetter = /^set[A-Z]/.test(simpleName);
|
|
327
|
+
const isGetter = /^(get|is)[A-Z]/.test(simpleName) && arity === 0;
|
|
328
|
+
const isSetter = /^set[A-Z]/.test(simpleName) && arity <= 1;
|
|
328
329
|
return isGetter || isSetter;
|
|
329
330
|
}
|
|
331
|
+
/** Counts the formal parameters of a method or constructor node. */
|
|
332
|
+
getNodeArity(node) {
|
|
333
|
+
const params = node.childForFieldName?.("parameters");
|
|
334
|
+
if (!params) return 0;
|
|
335
|
+
return params.namedChildren.filter((c) => c.type === "formal_parameter" || c.type === "spread_parameter").length;
|
|
336
|
+
}
|
|
330
337
|
isDtoClass(node, source, className) {
|
|
331
338
|
const classBody = node.children.find((child) => child.type === "class_body");
|
|
332
339
|
if (!classBody) return false;
|
|
@@ -344,7 +351,8 @@ var JavaExtractor = class {
|
|
|
344
351
|
if (child.type === "method_declaration" || child.type === "constructor_declaration") {
|
|
345
352
|
const simpleName = this.getSimpleFunctionName(child, source);
|
|
346
353
|
const fullName = `${className}.${simpleName}`;
|
|
347
|
-
|
|
354
|
+
const arity = this.getNodeArity(child);
|
|
355
|
+
if (!this.isTrivialFunction(fullName, arity)) {
|
|
348
356
|
return false;
|
|
349
357
|
}
|
|
350
358
|
continue;
|
|
@@ -1021,7 +1029,7 @@ var RepositoryInitializer = class {
|
|
|
1021
1029
|
};
|
|
1022
1030
|
|
|
1023
1031
|
// src/services/UpdateService.ts
|
|
1024
|
-
import
|
|
1032
|
+
import debug5 from "debug";
|
|
1025
1033
|
|
|
1026
1034
|
// src/DryScanUpdater.ts
|
|
1027
1035
|
import path4 from "path";
|
|
@@ -1141,11 +1149,19 @@ async function performIncrementalUpdate(repoPath, extractor, db) {
|
|
|
1141
1149
|
}
|
|
1142
1150
|
|
|
1143
1151
|
// src/services/DuplicationCache.ts
|
|
1152
|
+
import debug4 from "debug";
|
|
1153
|
+
var log4 = debug4("DryScan:DuplicationCache");
|
|
1144
1154
|
var DuplicationCache = class _DuplicationCache {
|
|
1145
1155
|
static instance = null;
|
|
1146
1156
|
comparisons = /* @__PURE__ */ new Map();
|
|
1147
1157
|
fileIndex = /* @__PURE__ */ new Map();
|
|
1148
1158
|
initialized = false;
|
|
1159
|
+
/** Similarity matrix from a single batched library call; kept across runs (cleared only by clear() or a rebuild) so incremental updates can reuse clean×clean cells. */
|
|
1160
|
+
embSimMatrix = [];
|
|
1161
|
+
/** Maps unit ID to its row/column index in embSimMatrix. */
|
|
1162
|
+
embSimIndex = /* @__PURE__ */ new Map();
|
|
1163
|
+
/** Per-run memoization of parent unit similarity scores (reset each run). */
|
|
1164
|
+
parentSimCache = /* @__PURE__ */ new Map();
|
|
1149
1165
|
static getInstance() {
|
|
1150
1166
|
if (!_DuplicationCache.instance) {
|
|
1151
1167
|
_DuplicationCache.instance = new _DuplicationCache();
|
|
@@ -1208,6 +1224,84 @@ var DuplicationCache = class _DuplicationCache {
|
|
|
1208
1224
|
this.comparisons.clear();
|
|
1209
1225
|
this.fileIndex.clear();
|
|
1210
1226
|
this.initialized = false;
|
|
1227
|
+
this.embSimMatrix = [];
|
|
1228
|
+
this.embSimIndex.clear();
|
|
1229
|
+
this.clearRunCaches();
|
|
1230
|
+
}
|
|
1231
|
+
/**
|
|
1232
|
+
* Resets per-run memoization (parent similarities).
|
|
1233
|
+
* The embedding matrix is intentionally preserved so incremental runs can
|
|
1234
|
+
* reuse clean×clean values across calls.
|
|
1235
|
+
*/
|
|
1236
|
+
clearRunCaches() {
|
|
1237
|
+
this.parentSimCache.clear();
|
|
1238
|
+
}
|
|
1239
|
+
/**
|
|
1240
|
+
* Builds or incrementally updates the embedding similarity matrix.
|
|
1241
|
+
*
|
|
1242
|
+
* Full rebuild (default): replaces the entire matrix — O(n²).
|
|
1243
|
+
* Incremental (dirtyPaths provided + prior matrix exists): copies clean×clean
|
|
1244
|
+
* cells from the old matrix and recomputes only dirty rows via one batched
|
|
1245
|
+
* parallelCosineSimilarity call — O(d·n) similarity computations where d = number of dirty units (copying the clean cells is still an O(n²) pass).
|
|
1246
|
+
*/
|
|
1247
|
+
async buildEmbSimCache(units, dirtyPaths) {
|
|
1248
|
+
const embedded = units.filter((u) => Array.isArray(u.embedding) && u.embedding.length > 0);
|
|
1249
|
+
if (embedded.length < 2) {
|
|
1250
|
+
this.embSimMatrix = [];
|
|
1251
|
+
this.embSimIndex.clear();
|
|
1252
|
+
return;
|
|
1253
|
+
}
|
|
1254
|
+
const embeddings = embedded.map((u) => u.embedding);
|
|
1255
|
+
const newIndex = new Map(embedded.map((u, i) => [u.id, i]));
|
|
1256
|
+
const dirtySet = dirtyPaths ? new Set(dirtyPaths) : null;
|
|
1257
|
+
const hasPriorMatrix = this.embSimMatrix.length > 0;
|
|
1258
|
+
if (!dirtySet || !hasPriorMatrix) {
|
|
1259
|
+
this.embSimIndex = newIndex;
|
|
1260
|
+
this.embSimMatrix = await parallelCosineSimilarity(embeddings, embeddings);
|
|
1261
|
+
log4("Built full embedding similarity matrix: %d units", embedded.length);
|
|
1262
|
+
return;
|
|
1263
|
+
}
|
|
1264
|
+
const dirtyIds = new Set(embedded.filter((u) => dirtySet.has(u.filePath)).map((u) => u.id));
|
|
1265
|
+
if (dirtyIds.size === 0) {
|
|
1266
|
+
log4("Matrix reused: no dirty units detected");
|
|
1267
|
+
return;
|
|
1268
|
+
}
|
|
1269
|
+
const n = embedded.length;
|
|
1270
|
+
const newMatrix = Array.from({ length: n }, () => new Array(n).fill(0));
|
|
1271
|
+
for (let i = 0; i < n; i++) {
|
|
1272
|
+
for (let j = 0; j < n; j++) {
|
|
1273
|
+
if (dirtyIds.has(embedded[i].id) || dirtyIds.has(embedded[j].id)) continue;
|
|
1274
|
+
const oi = this.embSimIndex.get(embedded[i].id);
|
|
1275
|
+
const oj = this.embSimIndex.get(embedded[j].id);
|
|
1276
|
+
if (oi !== void 0 && oj !== void 0) newMatrix[i][j] = this.embSimMatrix[oi][oj];
|
|
1277
|
+
}
|
|
1278
|
+
}
|
|
1279
|
+
const dirtyIndices = embedded.reduce((acc, u, i) => dirtyIds.has(u.id) ? [...acc, i] : acc, []);
|
|
1280
|
+
const dirtyRows = await parallelCosineSimilarity(dirtyIndices.map((i) => embeddings[i]), embeddings);
|
|
1281
|
+
dirtyIndices.forEach((rowIdx, di) => {
|
|
1282
|
+
for (let j = 0; j < n; j++) {
|
|
1283
|
+
newMatrix[rowIdx][j] = dirtyRows[di][j];
|
|
1284
|
+
newMatrix[j][rowIdx] = dirtyRows[di][j];
|
|
1285
|
+
}
|
|
1286
|
+
});
|
|
1287
|
+
this.embSimIndex = newIndex;
|
|
1288
|
+
this.embSimMatrix = newMatrix;
|
|
1289
|
+
log4("Incremental matrix update: %d dirty unit(s) out of %d total", dirtyIds.size, n);
|
|
1290
|
+
}
|
|
1291
|
+
/** Returns the pre-computed cosine similarity for a pair of unit IDs, if available. */
|
|
1292
|
+
getEmbSim(id1, id2) {
|
|
1293
|
+
const i = this.embSimIndex.get(id1);
|
|
1294
|
+
const j = this.embSimIndex.get(id2);
|
|
1295
|
+
if (i === void 0 || j === void 0) return void 0;
|
|
1296
|
+
return this.embSimMatrix[i][j];
|
|
1297
|
+
}
|
|
1298
|
+
/** Returns the memoized parent similarity for the given stable key, if available. */
|
|
1299
|
+
getParentSim(key) {
|
|
1300
|
+
return this.parentSimCache.get(key);
|
|
1301
|
+
}
|
|
1302
|
+
/** Stores a memoized parent similarity for the given stable key. */
|
|
1303
|
+
setParentSim(key, sim) {
|
|
1304
|
+
this.parentSimCache.set(key, sim);
|
|
1211
1305
|
}
|
|
1212
1306
|
addKeyForFile(filePath, key) {
|
|
1213
1307
|
const current = this.fileIndex.get(filePath) ?? /* @__PURE__ */ new Set();
|
|
@@ -1224,145 +1318,106 @@ var DuplicationCache = class _DuplicationCache {
|
|
|
1224
1318
|
};
|
|
1225
1319
|
|
|
1226
1320
|
// src/services/UpdateService.ts
|
|
1227
|
-
var
|
|
1321
|
+
var log5 = debug5("DryScan:UpdateService");
|
|
1228
1322
|
var UpdateService = class {
|
|
1229
1323
|
constructor(deps, exclusionService) {
|
|
1230
1324
|
this.deps = deps;
|
|
1231
1325
|
this.exclusionService = exclusionService;
|
|
1232
1326
|
}
|
|
1327
|
+
/** Returns the list of file paths that were modified, deleted, or added (dirty). */
|
|
1233
1328
|
async updateIndex() {
|
|
1234
1329
|
const extractor = this.deps.extractor;
|
|
1235
1330
|
const cache = DuplicationCache.getInstance();
|
|
1236
1331
|
try {
|
|
1237
1332
|
const changeSet = await performIncrementalUpdate(this.deps.repoPath, extractor, this.deps.db);
|
|
1238
1333
|
await this.exclusionService.cleanupExcludedFiles();
|
|
1239
|
-
|
|
1334
|
+
const dirtyPaths = [...changeSet.changed, ...changeSet.deleted, ...changeSet.added];
|
|
1335
|
+
await cache.invalidate(dirtyPaths);
|
|
1336
|
+
return dirtyPaths;
|
|
1240
1337
|
} catch (err) {
|
|
1241
|
-
|
|
1338
|
+
log5("Error during index update:", err);
|
|
1242
1339
|
throw err;
|
|
1243
1340
|
}
|
|
1244
1341
|
}
|
|
1245
1342
|
};
|
|
1246
1343
|
|
|
1247
1344
|
// src/services/DuplicateService.ts
|
|
1248
|
-
import
|
|
1345
|
+
import debug6 from "debug";
|
|
1249
1346
|
import shortUuid from "short-uuid";
|
|
1250
|
-
|
|
1251
|
-
var log5 = debug5("DryScan:DuplicateService");
|
|
1347
|
+
var log6 = debug6("DryScan:DuplicateService");
|
|
1252
1348
|
var DuplicateService = class {
|
|
1253
1349
|
constructor(deps) {
|
|
1254
1350
|
this.deps = deps;
|
|
1255
1351
|
}
|
|
1256
1352
|
config;
|
|
1257
1353
|
cache = DuplicationCache.getInstance();
|
|
1258
|
-
|
|
1259
|
-
|
|
1354
|
+
/**
|
|
1355
|
+
* @param dirtyPaths - File paths changed since last run. When provided, only
|
|
1356
|
+
* dirty×all similarities are recomputed; clean×clean values are reused from
|
|
1357
|
+
* the existing matrix. Pass undefined (or omit) for a full rebuild.
|
|
1358
|
+
*/
|
|
1359
|
+
async findDuplicates(config, dirtyPaths) {
|
|
1260
1360
|
this.config = config;
|
|
1261
1361
|
const t0 = performance.now();
|
|
1262
1362
|
const allUnits = await this.deps.db.getAllUnits();
|
|
1263
|
-
|
|
1363
|
+
log6("Starting duplicate analysis on %d units", allUnits.length);
|
|
1264
1364
|
if (allUnits.length < 2) {
|
|
1265
|
-
|
|
1266
|
-
const score2 = this.computeDuplicationScore([], allUnits);
|
|
1267
|
-
return { duplicates: [], score: score2 };
|
|
1365
|
+
return { duplicates: [], score: this.computeDuplicationScore([], allUnits) };
|
|
1268
1366
|
}
|
|
1269
1367
|
const thresholds = this.resolveThresholds(config.threshold);
|
|
1270
|
-
|
|
1271
|
-
const
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
this.
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
return { duplicates: filteredDuplicates, score };
|
|
1368
|
+
const duplicates = await this.computeDuplicates(allUnits, thresholds, dirtyPaths);
|
|
1369
|
+
const filtered = duplicates.filter((g) => !this.isGroupExcluded(g));
|
|
1370
|
+
log6("Found %d duplicate groups (%d excluded)", filtered.length, duplicates.length - filtered.length);
|
|
1371
|
+
this.cache.update(filtered).catch((err) => log6("Cache update failed: %O", err));
|
|
1372
|
+
const score = this.computeDuplicationScore(filtered, allUnits);
|
|
1373
|
+
log6("findDuplicates completed in %dms", (performance.now() - t0).toFixed(2));
|
|
1374
|
+
return { duplicates: filtered, score };
|
|
1278
1375
|
}
|
|
1279
1376
|
resolveThresholds(functionThreshold) {
|
|
1280
|
-
const
|
|
1281
|
-
const clamp = (
|
|
1282
|
-
const
|
|
1283
|
-
const blockOffset = defaults.block - defaults.function;
|
|
1284
|
-
const classOffset = defaults.class - defaults.function;
|
|
1285
|
-
const functionThresholdValue = clamp(base);
|
|
1377
|
+
const d = indexConfig.thresholds;
|
|
1378
|
+
const clamp = (v) => Math.min(1, Math.max(0, v));
|
|
1379
|
+
const fn = clamp(functionThreshold ?? d.function);
|
|
1286
1380
|
return {
|
|
1287
|
-
function:
|
|
1288
|
-
block: clamp(
|
|
1289
|
-
class: clamp(
|
|
1381
|
+
function: fn,
|
|
1382
|
+
block: clamp(fn + d.block - d.function),
|
|
1383
|
+
class: clamp(fn + d.class - d.function)
|
|
1290
1384
|
};
|
|
1291
1385
|
}
|
|
1292
|
-
computeDuplicates(units, thresholds) {
|
|
1386
|
+
async computeDuplicates(units, thresholds, dirtyPaths) {
|
|
1387
|
+
this.cache.clearRunCaches();
|
|
1388
|
+
await this.cache.buildEmbSimCache(units, dirtyPaths);
|
|
1293
1389
|
const duplicates = [];
|
|
1294
|
-
const byType = /* @__PURE__ */ new Map();
|
|
1295
|
-
for (const unit of units) {
|
|
1296
|
-
const list = byType.get(unit.unitType) ?? [];
|
|
1297
|
-
list.push(unit);
|
|
1298
|
-
byType.set(unit.unitType, list);
|
|
1299
|
-
}
|
|
1300
1390
|
const t0 = performance.now();
|
|
1301
|
-
for (const [type, typedUnits] of
|
|
1391
|
+
for (const [type, typedUnits] of this.groupByType(units)) {
|
|
1302
1392
|
const threshold = this.getThreshold(type, thresholds);
|
|
1303
|
-
|
|
1304
|
-
const typeStart = performance.now();
|
|
1393
|
+
log6("Comparing %d %s units (threshold=%.3f)", typedUnits.length, type, threshold);
|
|
1305
1394
|
for (let i = 0; i < typedUnits.length; i++) {
|
|
1306
1395
|
for (let j = i + 1; j < typedUnits.length; j++) {
|
|
1307
|
-
const left = typedUnits[i];
|
|
1308
|
-
|
|
1309
|
-
if (this.shouldSkipComparison(left, right)) {
|
|
1310
|
-
log5("Skipping nested block comparison: '%s' and '%s'", left.name, right.name);
|
|
1311
|
-
continue;
|
|
1312
|
-
}
|
|
1396
|
+
const left = typedUnits[i], right = typedUnits[j];
|
|
1397
|
+
if (this.shouldSkipComparison(left, right)) continue;
|
|
1313
1398
|
const cached = this.cache.get(left.id, right.id, left.filePath, right.filePath);
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
|
|
1325
|
-
|
|
1326
|
-
|
|
1327
|
-
if (similarity >= threshold) {
|
|
1328
|
-
const exclusionString = this.deps.pairing.pairKeyForUnits(left, right);
|
|
1329
|
-
if (!exclusionString) continue;
|
|
1330
|
-
log5("Duplicate found: '%s' <-> '%s' (similarity=%d)", left.name, right.name, similarity);
|
|
1331
|
-
duplicates.push({
|
|
1332
|
-
id: `${left.id}::${right.id}`,
|
|
1333
|
-
similarity,
|
|
1334
|
-
shortId: shortUuid.generate(),
|
|
1335
|
-
exclusionString,
|
|
1336
|
-
left: {
|
|
1337
|
-
id: left.id,
|
|
1338
|
-
name: left.name,
|
|
1339
|
-
filePath: left.filePath,
|
|
1340
|
-
startLine: left.startLine,
|
|
1341
|
-
endLine: left.endLine,
|
|
1342
|
-
code: left.code,
|
|
1343
|
-
unitType: left.unitType
|
|
1344
|
-
},
|
|
1345
|
-
right: {
|
|
1346
|
-
id: right.id,
|
|
1347
|
-
name: right.name,
|
|
1348
|
-
filePath: right.filePath,
|
|
1349
|
-
startLine: right.startLine,
|
|
1350
|
-
endLine: right.endLine,
|
|
1351
|
-
code: right.code,
|
|
1352
|
-
unitType: right.unitType
|
|
1353
|
-
}
|
|
1354
|
-
});
|
|
1355
|
-
}
|
|
1399
|
+
const hasEmbeddings = left.embedding?.length && right.embedding?.length;
|
|
1400
|
+
const similarity = cached ?? (hasEmbeddings ? this.computeWeightedSimilarity(left, right, threshold) : 0);
|
|
1401
|
+
if (similarity < threshold) continue;
|
|
1402
|
+
const exclusionString = this.deps.pairing.pairKeyForUnits(left, right);
|
|
1403
|
+
if (!exclusionString) continue;
|
|
1404
|
+
duplicates.push({
|
|
1405
|
+
id: `${left.id}::${right.id}`,
|
|
1406
|
+
similarity,
|
|
1407
|
+
shortId: shortUuid.generate(),
|
|
1408
|
+
exclusionString,
|
|
1409
|
+
left: this.toMember(left),
|
|
1410
|
+
right: this.toMember(right)
|
|
1411
|
+
});
|
|
1356
1412
|
}
|
|
1357
1413
|
}
|
|
1358
|
-
log5("Type '%s' comparisons completed in %dms", type, (performance.now() - typeStart).toFixed(2));
|
|
1359
1414
|
}
|
|
1360
|
-
|
|
1415
|
+
log6("computeDuplicates: %d duplicates in %dms", duplicates.length, (performance.now() - t0).toFixed(2));
|
|
1361
1416
|
return duplicates.sort((a, b) => b.similarity - a.similarity);
|
|
1362
1417
|
}
|
|
1363
1418
|
isGroupExcluded(group) {
|
|
1364
1419
|
const config = this.config;
|
|
1365
|
-
if (!config
|
|
1420
|
+
if (!config?.excludedPairs?.length) return false;
|
|
1366
1421
|
const key = this.deps.pairing.pairKeyForUnits(group.left, group.right);
|
|
1367
1422
|
if (!key) return false;
|
|
1368
1423
|
const actual = this.deps.pairing.parsePairKey(key);
|
|
@@ -1377,109 +1432,108 @@ var DuplicateService = class {
|
|
|
1377
1432
|
if (type === "block" /* BLOCK */) return thresholds.block;
|
|
1378
1433
|
return thresholds.function;
|
|
1379
1434
|
}
|
|
1380
|
-
computeWeightedSimilarity(left, right) {
|
|
1381
|
-
const
|
|
1435
|
+
computeWeightedSimilarity(left, right, threshold) {
|
|
1436
|
+
const selfSim = this.similarity(left, right);
|
|
1382
1437
|
if (left.unitType === "class" /* CLASS */) {
|
|
1383
|
-
return
|
|
1438
|
+
return selfSim * indexConfig.weights.class.self;
|
|
1384
1439
|
}
|
|
1385
1440
|
if (left.unitType === "function" /* FUNCTION */) {
|
|
1386
|
-
const
|
|
1387
|
-
const
|
|
1388
|
-
const
|
|
1389
|
-
|
|
1390
|
-
return (
|
|
1441
|
+
const w2 = indexConfig.weights.function;
|
|
1442
|
+
const hasPC2 = this.bothHaveParent(left, right, "class" /* CLASS */);
|
|
1443
|
+
const total2 = w2.self + (hasPC2 ? w2.parentClass : 0);
|
|
1444
|
+
if ((w2.self * selfSim + (hasPC2 ? w2.parentClass : 0)) / total2 < threshold) return 0;
|
|
1445
|
+
return (w2.self * selfSim + (hasPC2 ? w2.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total2;
|
|
1391
1446
|
}
|
|
1392
|
-
const
|
|
1393
|
-
const
|
|
1394
|
-
const
|
|
1395
|
-
const
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
const
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
const leftHasEmbedding = this.hasVector(left);
|
|
1408
|
-
const rightHasEmbedding = this.hasVector(right);
|
|
1409
|
-
if (leftHasEmbedding && rightHasEmbedding) {
|
|
1410
|
-
return cosineSimilarity([left.embedding], [right.embedding])[0][0];
|
|
1447
|
+
const w = indexConfig.weights.block;
|
|
1448
|
+
const hasPF = this.bothHaveParent(left, right, "function" /* FUNCTION */);
|
|
1449
|
+
const hasPC = this.bothHaveParent(left, right, "class" /* CLASS */);
|
|
1450
|
+
const total = w.self + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0);
|
|
1451
|
+
if ((w.self * selfSim + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0)) / total < threshold) return 0;
|
|
1452
|
+
return (w.self * selfSim + (hasPF ? w.parentFunction * this.parentSimilarity(left, right, "function" /* FUNCTION */) : 0) + (hasPC ? w.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total;
|
|
1453
|
+
}
|
|
1454
|
+
/** Groups all units by type for the comparison loop. Units without embeddings are included
|
|
1455
|
+
* so that cache hits can still be returned for pairs whose embeddings were cleared. */
|
|
1456
|
+
groupByType(units) {
|
|
1457
|
+
const byType = /* @__PURE__ */ new Map();
|
|
1458
|
+
for (const unit of units) {
|
|
1459
|
+
const list = byType.get(unit.unitType) ?? [];
|
|
1460
|
+
list.push(unit);
|
|
1461
|
+
byType.set(unit.unitType, list);
|
|
1411
1462
|
}
|
|
1412
|
-
return
|
|
1463
|
+
return byType;
|
|
1464
|
+
}
|
|
1465
|
+
toMember(unit) {
|
|
1466
|
+
return {
|
|
1467
|
+
id: unit.id,
|
|
1468
|
+
name: unit.name,
|
|
1469
|
+
filePath: unit.filePath,
|
|
1470
|
+
startLine: unit.startLine,
|
|
1471
|
+
endLine: unit.endLine,
|
|
1472
|
+
code: unit.code,
|
|
1473
|
+
unitType: unit.unitType
|
|
1474
|
+
};
|
|
1475
|
+
}
|
|
1476
|
+
bothHaveParent(left, right, type) {
|
|
1477
|
+
return !!this.findParent(left, type) && !!this.findParent(right, type);
|
|
1478
|
+
}
|
|
1479
|
+
parentSimilarity(left, right, type) {
|
|
1480
|
+
const lp = this.findParent(left, type), rp = this.findParent(right, type);
|
|
1481
|
+
if (!lp || !rp) return 0;
|
|
1482
|
+
const key = lp.id < rp.id ? `${lp.id}::${rp.id}` : `${rp.id}::${lp.id}`;
|
|
1483
|
+
const cached = this.cache.getParentSim(key);
|
|
1484
|
+
if (cached !== void 0) return cached;
|
|
1485
|
+
const sim = this.similarity(lp, rp);
|
|
1486
|
+
this.cache.setParentSim(key, sim);
|
|
1487
|
+
return sim;
|
|
1488
|
+
}
|
|
1489
|
+
/** Resolves similarity via the pre-computed embedding matrix, falling back to best child match. */
|
|
1490
|
+
similarity(left, right) {
|
|
1491
|
+
return this.cache.getEmbSim(left.id, right.id) ?? this.childSimilarity(left, right);
|
|
1413
1492
|
}
|
|
1414
1493
|
childSimilarity(left, right) {
|
|
1415
|
-
const
|
|
1416
|
-
|
|
1417
|
-
if (leftChildren.length === 0 || rightChildren.length === 0) return 0;
|
|
1494
|
+
const lc = left.children ?? [], rc = right.children ?? [];
|
|
1495
|
+
if (!lc.length || !rc.length) return 0;
|
|
1418
1496
|
let best = 0;
|
|
1419
|
-
for (const
|
|
1420
|
-
for (const
|
|
1421
|
-
if (
|
|
1422
|
-
const sim = this.
|
|
1497
|
+
for (const l of lc) {
|
|
1498
|
+
for (const r of rc) {
|
|
1499
|
+
if (l.unitType !== r.unitType) continue;
|
|
1500
|
+
const sim = this.similarity(l, r);
|
|
1423
1501
|
if (sim > best) best = sim;
|
|
1424
1502
|
}
|
|
1425
1503
|
}
|
|
1426
1504
|
return best;
|
|
1427
1505
|
}
|
|
1428
|
-
hasVector(unit) {
|
|
1429
|
-
return Array.isArray(unit.embedding) && unit.embedding.length > 0;
|
|
1430
|
-
}
|
|
1431
1506
|
shouldSkipComparison(left, right) {
|
|
1432
|
-
if (left.unitType !== "block" /* BLOCK */ || right.unitType !== "block" /* BLOCK */)
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
}
|
|
1442
|
-
findParentOfType(unit, targetType) {
|
|
1443
|
-
let current = unit.parent;
|
|
1444
|
-
while (current) {
|
|
1445
|
-
if (current.unitType === targetType) return current;
|
|
1446
|
-
current = current.parent;
|
|
1507
|
+
if (left.unitType !== "block" /* BLOCK */ || right.unitType !== "block" /* BLOCK */) return false;
|
|
1508
|
+
if (left.filePath !== right.filePath) return false;
|
|
1509
|
+
return left.startLine <= right.startLine && left.endLine >= right.endLine || right.startLine <= left.startLine && right.endLine >= left.endLine;
|
|
1510
|
+
}
|
|
1511
|
+
findParent(unit, type) {
|
|
1512
|
+
let p = unit.parent;
|
|
1513
|
+
while (p) {
|
|
1514
|
+
if (p.unitType === type) return p;
|
|
1515
|
+
p = p.parent;
|
|
1447
1516
|
}
|
|
1448
1517
|
return null;
|
|
1449
1518
|
}
|
|
1450
1519
|
computeDuplicationScore(duplicates, allUnits) {
|
|
1451
|
-
const totalLines =
|
|
1452
|
-
if (totalLines
|
|
1453
|
-
return {
|
|
1454
|
-
score: 0,
|
|
1455
|
-
grade: "Excellent",
|
|
1456
|
-
totalLines,
|
|
1457
|
-
duplicateLines: 0,
|
|
1458
|
-
duplicateGroups: 0
|
|
1459
|
-
};
|
|
1520
|
+
const totalLines = allUnits.reduce((sum, u) => sum + u.endLine - u.startLine + 1, 0);
|
|
1521
|
+
if (!totalLines || !duplicates.length) {
|
|
1522
|
+
return { score: 0, grade: "Excellent", totalLines, duplicateLines: 0, duplicateGroups: 0 };
|
|
1460
1523
|
}
|
|
1461
|
-
const
|
|
1462
|
-
const
|
|
1463
|
-
|
|
1464
|
-
const avgLines = (leftLines + rightLines) / 2;
|
|
1465
|
-
return sum + group.similarity * avgLines;
|
|
1524
|
+
const duplicateLines = duplicates.reduce((sum, g) => {
|
|
1525
|
+
const avg = (g.left.endLine - g.left.startLine + 1 + (g.right.endLine - g.right.startLine + 1)) / 2;
|
|
1526
|
+
return sum + g.similarity * avg;
|
|
1466
1527
|
}, 0);
|
|
1467
|
-
const score =
|
|
1468
|
-
const grade = this.getScoreGrade(score);
|
|
1528
|
+
const score = duplicateLines / totalLines * 100;
|
|
1469
1529
|
return {
|
|
1470
1530
|
score,
|
|
1471
|
-
grade,
|
|
1531
|
+
grade: this.getScoreGrade(score),
|
|
1472
1532
|
totalLines,
|
|
1473
|
-
duplicateLines: Math.round(
|
|
1533
|
+
duplicateLines: Math.round(duplicateLines),
|
|
1474
1534
|
duplicateGroups: duplicates.length
|
|
1475
1535
|
};
|
|
1476
1536
|
}
|
|
1477
|
-
calculateTotalLines(units) {
|
|
1478
|
-
return units.reduce((sum, unit) => {
|
|
1479
|
-
const lines = unit.endLine - unit.startLine + 1;
|
|
1480
|
-
return sum + lines;
|
|
1481
|
-
}, 0);
|
|
1482
|
-
}
|
|
1483
1537
|
getScoreGrade(score) {
|
|
1484
1538
|
if (score < 5) return "Excellent";
|
|
1485
1539
|
if (score < 15) return "Good";
|
|
@@ -1575,9 +1629,9 @@ var ExclusionService = class {
|
|
|
1575
1629
|
|
|
1576
1630
|
// src/services/PairingService.ts
|
|
1577
1631
|
import crypto3 from "crypto";
|
|
1578
|
-
import
|
|
1632
|
+
import debug7 from "debug";
|
|
1579
1633
|
import { minimatch as minimatch2 } from "minimatch";
|
|
1580
|
-
var
|
|
1634
|
+
var log7 = debug7("DryScan:pairs");
|
|
1581
1635
|
var PairingService = class {
|
|
1582
1636
|
constructor(indexUnitExtractor) {
|
|
1583
1637
|
this.indexUnitExtractor = indexUnitExtractor;
|
|
@@ -1588,7 +1642,7 @@ var PairingService = class {
|
|
|
1588
1642
|
*/
|
|
1589
1643
|
pairKeyForUnits(left, right) {
|
|
1590
1644
|
if (left.unitType !== right.unitType) {
|
|
1591
|
-
|
|
1645
|
+
log7("Skipping pair with mismatched types: %s vs %s", left.unitType, right.unitType);
|
|
1592
1646
|
return null;
|
|
1593
1647
|
}
|
|
1594
1648
|
const type = left.unitType;
|
|
@@ -1604,13 +1658,13 @@ var PairingService = class {
|
|
|
1604
1658
|
parsePairKey(value) {
|
|
1605
1659
|
const parts = value.split("|");
|
|
1606
1660
|
if (parts.length !== 3) {
|
|
1607
|
-
|
|
1661
|
+
log7("Invalid pair key format: %s", value);
|
|
1608
1662
|
return null;
|
|
1609
1663
|
}
|
|
1610
1664
|
const [typeRaw, leftRaw, rightRaw] = parts;
|
|
1611
1665
|
const type = this.stringToUnitType(typeRaw);
|
|
1612
1666
|
if (!type) {
|
|
1613
|
-
|
|
1667
|
+
log7("Unknown unit type in pair key: %s", typeRaw);
|
|
1614
1668
|
return null;
|
|
1615
1669
|
}
|
|
1616
1670
|
const [left, right] = [leftRaw, rightRaw].sort();
|
|
@@ -1744,9 +1798,10 @@ var DryScan = class {
|
|
|
1744
1798
|
console.log("[DryScan] Checking for file changes...");
|
|
1745
1799
|
const start = Date.now();
|
|
1746
1800
|
await this.ensureDatabase();
|
|
1747
|
-
await this.services.updater.updateIndex();
|
|
1801
|
+
const dirtyPaths = await this.services.updater.updateIndex();
|
|
1748
1802
|
const duration = Date.now() - start;
|
|
1749
1803
|
console.log(`[DryScan] Index update complete. Took ${duration}ms.`);
|
|
1804
|
+
return dirtyPaths;
|
|
1750
1805
|
}
|
|
1751
1806
|
/**
|
|
1752
1807
|
* Runs duplicate detection and returns a normalized report payload ready for persistence or display.
|
|
@@ -1775,12 +1830,12 @@ var DryScan = class {
|
|
|
1775
1830
|
await this.ensureDatabase();
|
|
1776
1831
|
console.log("[DryScan] Updating index...");
|
|
1777
1832
|
const updateStart = Date.now();
|
|
1778
|
-
await this.updateIndex();
|
|
1833
|
+
const dirtyPaths = await this.updateIndex();
|
|
1779
1834
|
const updateDuration = Date.now() - updateStart;
|
|
1780
1835
|
console.log(`[DryScan] Index update took ${updateDuration}ms.`);
|
|
1781
1836
|
console.log("[DryScan] Detecting duplicates...");
|
|
1782
1837
|
const dupStart = Date.now();
|
|
1783
|
-
const result = await this.services.duplicate.findDuplicates(config);
|
|
1838
|
+
const result = await this.services.duplicate.findDuplicates(config, dirtyPaths);
|
|
1784
1839
|
const dupDuration = Date.now() - dupStart;
|
|
1785
1840
|
console.log(`[DryScan] Duplicate detection took ${dupDuration}ms.`);
|
|
1786
1841
|
return result;
|