@goshenkata/dryscan-core 1.2.4 → 1.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +1 -1
- package/dist/index.js +267 -167
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/src/DryScan.ts +5 -4
- package/src/config/dryconfig.ts +1 -1
- package/src/db/DryScanDatabase.ts +1 -1
- package/src/extractors/java.ts +47 -12
- package/src/services/DuplicateService.ts +136 -166
- package/src/services/DuplicationCache.ts +107 -1
- package/src/services/UpdateService.ts +5 -2
package/dist/index.d.ts
CHANGED
|
@@ -221,7 +221,7 @@ declare class DryScan {
|
|
|
221
221
|
* 6. Recompute embeddings for affected units
|
|
222
222
|
* 7. Update file tracking metadata
|
|
223
223
|
*/
|
|
224
|
-
updateIndex(): Promise<
|
|
224
|
+
updateIndex(): Promise<string[]>;
|
|
225
225
|
/**
|
|
226
226
|
* Runs duplicate detection and returns a normalized report payload ready for persistence or display.
|
|
227
227
|
*/
|
package/dist/index.js
CHANGED
|
@@ -61,7 +61,7 @@ var DEFAULT_CONFIG = {
|
|
|
61
61
|
excludedPairs: [],
|
|
62
62
|
minLines: 3,
|
|
63
63
|
minBlockLines: 5,
|
|
64
|
-
threshold: 0.
|
|
64
|
+
threshold: 0.8,
|
|
65
65
|
embeddingSource: "http://localhost:11434",
|
|
66
66
|
contextLength: 2048
|
|
67
67
|
};
|
|
@@ -244,7 +244,8 @@ var JavaExtractor = class {
|
|
|
244
244
|
const fnUnit = this.buildFunctionUnit(node, source, fileRelPath, currentClass);
|
|
245
245
|
const fnLength = fnUnit.endLine - fnUnit.startLine;
|
|
246
246
|
const bodyNode = this.getFunctionBody(node);
|
|
247
|
-
const
|
|
247
|
+
const fnArity = this.getNodeArity(node);
|
|
248
|
+
const skipFunction = this.shouldSkip("function" /* FUNCTION */, fnUnit.name, fnLength, fnArity);
|
|
248
249
|
if (skipFunction) {
|
|
249
250
|
return;
|
|
250
251
|
}
|
|
@@ -311,22 +312,34 @@ var JavaExtractor = class {
|
|
|
311
312
|
const normalized = this.normalizeCode(unit.code);
|
|
312
313
|
return crypto.createHash(BLOCK_HASH_ALGO).update(normalized).digest("hex");
|
|
313
314
|
}
|
|
314
|
-
shouldSkip(unitType, name, lineCount) {
|
|
315
|
+
shouldSkip(unitType, name, lineCount, arity) {
|
|
315
316
|
if (!this.config) {
|
|
316
317
|
throw new Error("Config not loaded before skip evaluation");
|
|
317
318
|
}
|
|
318
319
|
const config = this.config;
|
|
319
320
|
const minLines = unitType === "block" /* BLOCK */ ? Math.max(indexConfig.blockMinLines, config.minBlockLines ?? 0) : config.minLines;
|
|
320
321
|
const belowMin = minLines > 0 && lineCount < minLines;
|
|
321
|
-
const trivial = unitType === "function" /* FUNCTION */ && this.isTrivialFunction(name);
|
|
322
|
+
const trivial = unitType === "function" /* FUNCTION */ && this.isTrivialFunction(name, arity ?? 0);
|
|
322
323
|
return belowMin || trivial;
|
|
323
324
|
}
|
|
324
|
-
|
|
325
|
+
/**
|
|
326
|
+
* A function is trivial if it follows a simple accessor pattern:
|
|
327
|
+
* - getters/isers: name matches get[A-Z] or is[A-Z] with exactly 0 parameters
|
|
328
|
+
* - setters: name matches set[A-Z] with at most 1 parameter
|
|
329
|
+
* Methods like getUserById(Long id) have arity > 0 and are NOT trivial.
|
|
330
|
+
*/
|
|
331
|
+
isTrivialFunction(fullName, arity) {
|
|
325
332
|
const simpleName = fullName.split(".").pop() || fullName;
|
|
326
|
-
const isGetter = /^(get|is)[A-Z]/.test(simpleName);
|
|
327
|
-
const isSetter = /^set[A-Z]/.test(simpleName);
|
|
333
|
+
const isGetter = /^(get|is)[A-Z]/.test(simpleName) && arity === 0;
|
|
334
|
+
const isSetter = /^set[A-Z]/.test(simpleName) && arity <= 1;
|
|
328
335
|
return isGetter || isSetter;
|
|
329
336
|
}
|
|
337
|
+
/** Counts the formal parameters of a method or constructor node. */
|
|
338
|
+
getNodeArity(node) {
|
|
339
|
+
const params = node.childForFieldName?.("parameters");
|
|
340
|
+
if (!params) return 0;
|
|
341
|
+
return params.namedChildren.filter((c) => c.type === "formal_parameter" || c.type === "spread_parameter").length;
|
|
342
|
+
}
|
|
330
343
|
isDtoClass(node, source, className) {
|
|
331
344
|
const classBody = node.children.find((child) => child.type === "class_body");
|
|
332
345
|
if (!classBody) return false;
|
|
@@ -344,7 +357,8 @@ var JavaExtractor = class {
|
|
|
344
357
|
if (child.type === "method_declaration" || child.type === "constructor_declaration") {
|
|
345
358
|
const simpleName = this.getSimpleFunctionName(child, source);
|
|
346
359
|
const fullName = `${className}.${simpleName}`;
|
|
347
|
-
|
|
360
|
+
const arity = this.getNodeArity(child);
|
|
361
|
+
if (!this.isTrivialFunction(fullName, arity)) {
|
|
348
362
|
return false;
|
|
349
363
|
}
|
|
350
364
|
continue;
|
|
@@ -368,6 +382,7 @@ var JavaExtractor = class {
|
|
|
368
382
|
filePath: file,
|
|
369
383
|
startLine,
|
|
370
384
|
endLine,
|
|
385
|
+
children: [],
|
|
371
386
|
code: this.stripComments(source.slice(node.startIndex, node.endIndex)),
|
|
372
387
|
unitType: "function" /* FUNCTION */,
|
|
373
388
|
parentId: parentClass?.id,
|
|
@@ -402,9 +417,11 @@ var JavaExtractor = class {
|
|
|
402
417
|
parentId: parentFunction.id,
|
|
403
418
|
parent: parentFunction
|
|
404
419
|
};
|
|
420
|
+
const contextLength = this.config?.contextLength ?? 2048;
|
|
421
|
+
const splitBlocks = this.textSplitBlockIfOverContextLimit(blockUnit, contextLength);
|
|
405
422
|
parentFunction.children = parentFunction.children || [];
|
|
406
|
-
parentFunction.children.push(
|
|
407
|
-
blocks.push(
|
|
423
|
+
parentFunction.children.push(...splitBlocks);
|
|
424
|
+
blocks.push(...splitBlocks);
|
|
408
425
|
}
|
|
409
426
|
}
|
|
410
427
|
for (let i = 0; i < n.namedChildCount; i++) {
|
|
@@ -450,6 +467,21 @@ var JavaExtractor = class {
|
|
|
450
467
|
removeDuplicates(units) {
|
|
451
468
|
return Array.from(new Map(units.map((u) => [u.id, u])).values());
|
|
452
469
|
}
|
|
470
|
+
/** Splits a block unit's code into chunks if it exceeds the context length limit. */
|
|
471
|
+
textSplitBlockIfOverContextLimit(unit, contextLength) {
|
|
472
|
+
if (unit.code.length <= contextLength) return [unit];
|
|
473
|
+
const chunks = [];
|
|
474
|
+
let chunkIndex = 0;
|
|
475
|
+
for (let i = 0; i < unit.code.length; i += contextLength) {
|
|
476
|
+
chunks.push({
|
|
477
|
+
...unit,
|
|
478
|
+
id: `${unit.id}:chunk${chunkIndex}`,
|
|
479
|
+
code: unit.code.slice(i, i + contextLength)
|
|
480
|
+
});
|
|
481
|
+
chunkIndex++;
|
|
482
|
+
}
|
|
483
|
+
return chunks;
|
|
484
|
+
}
|
|
453
485
|
};
|
|
454
486
|
|
|
455
487
|
// src/Gitignore.ts
|
|
@@ -1003,7 +1035,7 @@ var RepositoryInitializer = class {
|
|
|
1003
1035
|
};
|
|
1004
1036
|
|
|
1005
1037
|
// src/services/UpdateService.ts
|
|
1006
|
-
import
|
|
1038
|
+
import debug5 from "debug";
|
|
1007
1039
|
|
|
1008
1040
|
// src/DryScanUpdater.ts
|
|
1009
1041
|
import path4 from "path";
|
|
@@ -1123,11 +1155,20 @@ async function performIncrementalUpdate(repoPath, extractor, db) {
|
|
|
1123
1155
|
}
|
|
1124
1156
|
|
|
1125
1157
|
// src/services/DuplicationCache.ts
|
|
1158
|
+
import debug4 from "debug";
|
|
1159
|
+
import { cosineSimilarity } from "@langchain/core/utils/math";
|
|
1160
|
+
var log4 = debug4("DryScan:DuplicationCache");
|
|
1126
1161
|
var DuplicationCache = class _DuplicationCache {
|
|
1127
1162
|
static instance = null;
|
|
1128
1163
|
comparisons = /* @__PURE__ */ new Map();
|
|
1129
1164
|
fileIndex = /* @__PURE__ */ new Map();
|
|
1130
1165
|
initialized = false;
|
|
1166
|
+
/** Per-run similarity matrix from a single batched library call (reset each run). */
|
|
1167
|
+
embSimMatrix = [];
|
|
1168
|
+
/** Maps unit ID to its row/column index in embSimMatrix. */
|
|
1169
|
+
embSimIndex = /* @__PURE__ */ new Map();
|
|
1170
|
+
/** Per-run memoization of parent unit similarity scores (reset each run). */
|
|
1171
|
+
parentSimCache = /* @__PURE__ */ new Map();
|
|
1131
1172
|
static getInstance() {
|
|
1132
1173
|
if (!_DuplicationCache.instance) {
|
|
1133
1174
|
_DuplicationCache.instance = new _DuplicationCache();
|
|
@@ -1190,6 +1231,84 @@ var DuplicationCache = class _DuplicationCache {
|
|
|
1190
1231
|
this.comparisons.clear();
|
|
1191
1232
|
this.fileIndex.clear();
|
|
1192
1233
|
this.initialized = false;
|
|
1234
|
+
this.embSimMatrix = [];
|
|
1235
|
+
this.embSimIndex.clear();
|
|
1236
|
+
this.clearRunCaches();
|
|
1237
|
+
}
|
|
1238
|
+
/**
|
|
1239
|
+
* Resets per-run memoization (parent similarities).
|
|
1240
|
+
* The embedding matrix is intentionally preserved so incremental runs can
|
|
1241
|
+
* reuse clean×clean values across calls.
|
|
1242
|
+
*/
|
|
1243
|
+
clearRunCaches() {
|
|
1244
|
+
this.parentSimCache.clear();
|
|
1245
|
+
}
|
|
1246
|
+
/**
|
|
1247
|
+
* Builds or incrementally updates the embedding similarity matrix.
|
|
1248
|
+
*
|
|
1249
|
+
* Full rebuild (default): replaces the entire matrix — O(n²).
|
|
1250
|
+
* Incremental (dirtyPaths provided + prior matrix exists): copies clean×clean
|
|
1251
|
+
* cells from the old matrix and recomputes only dirty rows via one batched
|
|
1252
|
+
* cosineSimilarity call — O(d·n) where d = number of dirty units.
|
|
1253
|
+
*/
|
|
1254
|
+
buildEmbSimCache(units, dirtyPaths) {
|
|
1255
|
+
const embedded = units.filter((u) => Array.isArray(u.embedding) && u.embedding.length > 0);
|
|
1256
|
+
if (embedded.length < 2) {
|
|
1257
|
+
this.embSimMatrix = [];
|
|
1258
|
+
this.embSimIndex.clear();
|
|
1259
|
+
return;
|
|
1260
|
+
}
|
|
1261
|
+
const embeddings = embedded.map((u) => u.embedding);
|
|
1262
|
+
const newIndex = new Map(embedded.map((u, i) => [u.id, i]));
|
|
1263
|
+
const dirtySet = dirtyPaths ? new Set(dirtyPaths) : null;
|
|
1264
|
+
const hasPriorMatrix = this.embSimMatrix.length > 0;
|
|
1265
|
+
if (!dirtySet || !hasPriorMatrix) {
|
|
1266
|
+
this.embSimIndex = newIndex;
|
|
1267
|
+
this.embSimMatrix = cosineSimilarity(embeddings, embeddings);
|
|
1268
|
+
log4("Built full embedding similarity matrix: %d units", embedded.length);
|
|
1269
|
+
return;
|
|
1270
|
+
}
|
|
1271
|
+
const dirtyIds = new Set(embedded.filter((u) => dirtySet.has(u.filePath)).map((u) => u.id));
|
|
1272
|
+
if (dirtyIds.size === 0) {
|
|
1273
|
+
log4("Matrix reused: no dirty units detected");
|
|
1274
|
+
return;
|
|
1275
|
+
}
|
|
1276
|
+
const n = embedded.length;
|
|
1277
|
+
const newMatrix = Array.from({ length: n }, () => new Array(n).fill(0));
|
|
1278
|
+
for (let i = 0; i < n; i++) {
|
|
1279
|
+
for (let j = 0; j < n; j++) {
|
|
1280
|
+
if (dirtyIds.has(embedded[i].id) || dirtyIds.has(embedded[j].id)) continue;
|
|
1281
|
+
const oi = this.embSimIndex.get(embedded[i].id);
|
|
1282
|
+
const oj = this.embSimIndex.get(embedded[j].id);
|
|
1283
|
+
if (oi !== void 0 && oj !== void 0) newMatrix[i][j] = this.embSimMatrix[oi][oj];
|
|
1284
|
+
}
|
|
1285
|
+
}
|
|
1286
|
+
const dirtyIndices = embedded.reduce((acc, u, i) => dirtyIds.has(u.id) ? [...acc, i] : acc, []);
|
|
1287
|
+
const dirtyRows = cosineSimilarity(dirtyIndices.map((i) => embeddings[i]), embeddings);
|
|
1288
|
+
dirtyIndices.forEach((rowIdx, di) => {
|
|
1289
|
+
for (let j = 0; j < n; j++) {
|
|
1290
|
+
newMatrix[rowIdx][j] = dirtyRows[di][j];
|
|
1291
|
+
newMatrix[j][rowIdx] = dirtyRows[di][j];
|
|
1292
|
+
}
|
|
1293
|
+
});
|
|
1294
|
+
this.embSimIndex = newIndex;
|
|
1295
|
+
this.embSimMatrix = newMatrix;
|
|
1296
|
+
log4("Incremental matrix update: %d dirty unit(s) out of %d total", dirtyIds.size, n);
|
|
1297
|
+
}
|
|
1298
|
+
/** Returns the pre-computed cosine similarity for a pair of unit IDs, if available. */
|
|
1299
|
+
getEmbSim(id1, id2) {
|
|
1300
|
+
const i = this.embSimIndex.get(id1);
|
|
1301
|
+
const j = this.embSimIndex.get(id2);
|
|
1302
|
+
if (i === void 0 || j === void 0) return void 0;
|
|
1303
|
+
return this.embSimMatrix[i][j];
|
|
1304
|
+
}
|
|
1305
|
+
/** Returns the memoized parent similarity for the given stable key, if available. */
|
|
1306
|
+
getParentSim(key) {
|
|
1307
|
+
return this.parentSimCache.get(key);
|
|
1308
|
+
}
|
|
1309
|
+
/** Stores a memoized parent similarity for the given stable key. */
|
|
1310
|
+
setParentSim(key, sim) {
|
|
1311
|
+
this.parentSimCache.set(key, sim);
|
|
1193
1312
|
}
|
|
1194
1313
|
addKeyForFile(filePath, key) {
|
|
1195
1314
|
const current = this.fileIndex.get(filePath) ?? /* @__PURE__ */ new Set();
|
|
@@ -1206,125 +1325,106 @@ var DuplicationCache = class _DuplicationCache {
|
|
|
1206
1325
|
};
|
|
1207
1326
|
|
|
1208
1327
|
// src/services/UpdateService.ts
|
|
1209
|
-
var
|
|
1328
|
+
var log5 = debug5("DryScan:UpdateService");
|
|
1210
1329
|
var UpdateService = class {
|
|
1211
1330
|
constructor(deps, exclusionService) {
|
|
1212
1331
|
this.deps = deps;
|
|
1213
1332
|
this.exclusionService = exclusionService;
|
|
1214
1333
|
}
|
|
1334
|
+
/** Returns the list of file paths that were modified or deleted (dirty). */
|
|
1215
1335
|
async updateIndex() {
|
|
1216
1336
|
const extractor = this.deps.extractor;
|
|
1217
1337
|
const cache = DuplicationCache.getInstance();
|
|
1218
1338
|
try {
|
|
1219
1339
|
const changeSet = await performIncrementalUpdate(this.deps.repoPath, extractor, this.deps.db);
|
|
1220
1340
|
await this.exclusionService.cleanupExcludedFiles();
|
|
1221
|
-
|
|
1341
|
+
const dirtyPaths = [...changeSet.changed, ...changeSet.deleted, ...changeSet.added];
|
|
1342
|
+
await cache.invalidate(dirtyPaths);
|
|
1343
|
+
return dirtyPaths;
|
|
1222
1344
|
} catch (err) {
|
|
1223
|
-
|
|
1345
|
+
log5("Error during index update:", err);
|
|
1224
1346
|
throw err;
|
|
1225
1347
|
}
|
|
1226
1348
|
}
|
|
1227
1349
|
};
|
|
1228
1350
|
|
|
1229
1351
|
// src/services/DuplicateService.ts
|
|
1230
|
-
import
|
|
1352
|
+
import debug6 from "debug";
|
|
1231
1353
|
import shortUuid from "short-uuid";
|
|
1232
|
-
|
|
1233
|
-
var log5 = debug5("DryScan:DuplicateService");
|
|
1354
|
+
var log6 = debug6("DryScan:DuplicateService");
|
|
1234
1355
|
var DuplicateService = class {
|
|
1235
1356
|
constructor(deps) {
|
|
1236
1357
|
this.deps = deps;
|
|
1237
1358
|
}
|
|
1238
1359
|
config;
|
|
1239
1360
|
cache = DuplicationCache.getInstance();
|
|
1240
|
-
|
|
1361
|
+
/**
|
|
1362
|
+
* @param dirtyPaths - File paths changed since last run. When provided, only
|
|
1363
|
+
* dirty×all similarities are recomputed; clean×clean values are reused from
|
|
1364
|
+
* the existing matrix. Pass undefined (or omit) for a full rebuild.
|
|
1365
|
+
*/
|
|
1366
|
+
async findDuplicates(config, dirtyPaths) {
|
|
1241
1367
|
this.config = config;
|
|
1368
|
+
const t0 = performance.now();
|
|
1242
1369
|
const allUnits = await this.deps.db.getAllUnits();
|
|
1370
|
+
log6("Starting duplicate analysis on %d units", allUnits.length);
|
|
1243
1371
|
if (allUnits.length < 2) {
|
|
1244
|
-
|
|
1245
|
-
return { duplicates: [], score: score2 };
|
|
1372
|
+
return { duplicates: [], score: this.computeDuplicationScore([], allUnits) };
|
|
1246
1373
|
}
|
|
1247
1374
|
const thresholds = this.resolveThresholds(config.threshold);
|
|
1248
|
-
const duplicates = this.computeDuplicates(allUnits, thresholds);
|
|
1249
|
-
const
|
|
1250
|
-
|
|
1251
|
-
this.cache.update(
|
|
1252
|
-
const score = this.computeDuplicationScore(
|
|
1253
|
-
|
|
1375
|
+
const duplicates = this.computeDuplicates(allUnits, thresholds, dirtyPaths);
|
|
1376
|
+
const filtered = duplicates.filter((g) => !this.isGroupExcluded(g));
|
|
1377
|
+
log6("Found %d duplicate groups (%d excluded)", filtered.length, duplicates.length - filtered.length);
|
|
1378
|
+
this.cache.update(filtered).catch((err) => log6("Cache update failed: %O", err));
|
|
1379
|
+
const score = this.computeDuplicationScore(filtered, allUnits);
|
|
1380
|
+
log6("findDuplicates completed in %dms", (performance.now() - t0).toFixed(2));
|
|
1381
|
+
return { duplicates: filtered, score };
|
|
1254
1382
|
}
|
|
1255
1383
|
resolveThresholds(functionThreshold) {
|
|
1256
|
-
const
|
|
1257
|
-
const clamp = (
|
|
1258
|
-
const
|
|
1259
|
-
const blockOffset = defaults.block - defaults.function;
|
|
1260
|
-
const classOffset = defaults.class - defaults.function;
|
|
1261
|
-
const functionThresholdValue = clamp(base);
|
|
1384
|
+
const d = indexConfig.thresholds;
|
|
1385
|
+
const clamp = (v) => Math.min(1, Math.max(0, v));
|
|
1386
|
+
const fn = clamp(functionThreshold ?? d.function);
|
|
1262
1387
|
return {
|
|
1263
|
-
function:
|
|
1264
|
-
block: clamp(
|
|
1265
|
-
class: clamp(
|
|
1388
|
+
function: fn,
|
|
1389
|
+
block: clamp(fn + d.block - d.function),
|
|
1390
|
+
class: clamp(fn + d.class - d.function)
|
|
1266
1391
|
};
|
|
1267
1392
|
}
|
|
1268
|
-
computeDuplicates(units, thresholds) {
|
|
1393
|
+
computeDuplicates(units, thresholds, dirtyPaths) {
|
|
1394
|
+
this.cache.clearRunCaches();
|
|
1395
|
+
this.cache.buildEmbSimCache(units, dirtyPaths);
|
|
1269
1396
|
const duplicates = [];
|
|
1270
|
-
const
|
|
1271
|
-
for (const
|
|
1272
|
-
const list = byType.get(unit.unitType) ?? [];
|
|
1273
|
-
list.push(unit);
|
|
1274
|
-
byType.set(unit.unitType, list);
|
|
1275
|
-
}
|
|
1276
|
-
for (const [type, typedUnits] of byType.entries()) {
|
|
1397
|
+
const t0 = performance.now();
|
|
1398
|
+
for (const [type, typedUnits] of this.groupByType(units)) {
|
|
1277
1399
|
const threshold = this.getThreshold(type, thresholds);
|
|
1400
|
+
log6("Comparing %d %s units (threshold=%.3f)", typedUnits.length, type, threshold);
|
|
1278
1401
|
for (let i = 0; i < typedUnits.length; i++) {
|
|
1279
1402
|
for (let j = i + 1; j < typedUnits.length; j++) {
|
|
1280
|
-
const left = typedUnits[i];
|
|
1281
|
-
const right = typedUnits[j];
|
|
1403
|
+
const left = typedUnits[i], right = typedUnits[j];
|
|
1282
1404
|
if (this.shouldSkipComparison(left, right)) continue;
|
|
1283
1405
|
const cached = this.cache.get(left.id, right.id, left.filePath, right.filePath);
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
similarity,
|
|
1298
|
-
shortId: shortUuid.generate(),
|
|
1299
|
-
exclusionString,
|
|
1300
|
-
left: {
|
|
1301
|
-
id: left.id,
|
|
1302
|
-
name: left.name,
|
|
1303
|
-
filePath: left.filePath,
|
|
1304
|
-
startLine: left.startLine,
|
|
1305
|
-
endLine: left.endLine,
|
|
1306
|
-
code: left.code,
|
|
1307
|
-
unitType: left.unitType
|
|
1308
|
-
},
|
|
1309
|
-
right: {
|
|
1310
|
-
id: right.id,
|
|
1311
|
-
name: right.name,
|
|
1312
|
-
filePath: right.filePath,
|
|
1313
|
-
startLine: right.startLine,
|
|
1314
|
-
endLine: right.endLine,
|
|
1315
|
-
code: right.code,
|
|
1316
|
-
unitType: right.unitType
|
|
1317
|
-
}
|
|
1318
|
-
});
|
|
1319
|
-
}
|
|
1406
|
+
const hasEmbeddings = left.embedding?.length && right.embedding?.length;
|
|
1407
|
+
const similarity = cached ?? (hasEmbeddings ? this.computeWeightedSimilarity(left, right, threshold) : 0);
|
|
1408
|
+
if (similarity < threshold) continue;
|
|
1409
|
+
const exclusionString = this.deps.pairing.pairKeyForUnits(left, right);
|
|
1410
|
+
if (!exclusionString) continue;
|
|
1411
|
+
duplicates.push({
|
|
1412
|
+
id: `${left.id}::${right.id}`,
|
|
1413
|
+
similarity,
|
|
1414
|
+
shortId: shortUuid.generate(),
|
|
1415
|
+
exclusionString,
|
|
1416
|
+
left: this.toMember(left),
|
|
1417
|
+
right: this.toMember(right)
|
|
1418
|
+
});
|
|
1320
1419
|
}
|
|
1321
1420
|
}
|
|
1322
1421
|
}
|
|
1422
|
+
log6("computeDuplicates: %d duplicates in %dms", duplicates.length, (performance.now() - t0).toFixed(2));
|
|
1323
1423
|
return duplicates.sort((a, b) => b.similarity - a.similarity);
|
|
1324
1424
|
}
|
|
1325
1425
|
isGroupExcluded(group) {
|
|
1326
1426
|
const config = this.config;
|
|
1327
|
-
if (!config
|
|
1427
|
+
if (!config?.excludedPairs?.length) return false;
|
|
1328
1428
|
const key = this.deps.pairing.pairKeyForUnits(group.left, group.right);
|
|
1329
1429
|
if (!key) return false;
|
|
1330
1430
|
const actual = this.deps.pairing.parsePairKey(key);
|
|
@@ -1339,109 +1439,108 @@ var DuplicateService = class {
|
|
|
1339
1439
|
if (type === "block" /* BLOCK */) return thresholds.block;
|
|
1340
1440
|
return thresholds.function;
|
|
1341
1441
|
}
|
|
1342
|
-
computeWeightedSimilarity(left, right) {
|
|
1343
|
-
const
|
|
1442
|
+
computeWeightedSimilarity(left, right, threshold) {
|
|
1443
|
+
const selfSim = this.similarity(left, right);
|
|
1344
1444
|
if (left.unitType === "class" /* CLASS */) {
|
|
1345
|
-
return
|
|
1445
|
+
return selfSim * indexConfig.weights.class.self;
|
|
1346
1446
|
}
|
|
1347
1447
|
if (left.unitType === "function" /* FUNCTION */) {
|
|
1348
|
-
const
|
|
1349
|
-
const
|
|
1350
|
-
const
|
|
1351
|
-
|
|
1352
|
-
return (
|
|
1448
|
+
const w2 = indexConfig.weights.function;
|
|
1449
|
+
const hasPC2 = this.bothHaveParent(left, right, "class" /* CLASS */);
|
|
1450
|
+
const total2 = w2.self + (hasPC2 ? w2.parentClass : 0);
|
|
1451
|
+
if ((w2.self * selfSim + (hasPC2 ? w2.parentClass : 0)) / total2 < threshold) return 0;
|
|
1452
|
+
return (w2.self * selfSim + (hasPC2 ? w2.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total2;
|
|
1353
1453
|
}
|
|
1354
|
-
const
|
|
1355
|
-
const
|
|
1356
|
-
const
|
|
1357
|
-
const
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
const
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
const leftHasEmbedding = this.hasVector(left);
|
|
1370
|
-
const rightHasEmbedding = this.hasVector(right);
|
|
1371
|
-
if (leftHasEmbedding && rightHasEmbedding) {
|
|
1372
|
-
return cosineSimilarity([left.embedding], [right.embedding])[0][0];
|
|
1454
|
+
const w = indexConfig.weights.block;
|
|
1455
|
+
const hasPF = this.bothHaveParent(left, right, "function" /* FUNCTION */);
|
|
1456
|
+
const hasPC = this.bothHaveParent(left, right, "class" /* CLASS */);
|
|
1457
|
+
const total = w.self + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0);
|
|
1458
|
+
if ((w.self * selfSim + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0)) / total < threshold) return 0;
|
|
1459
|
+
return (w.self * selfSim + (hasPF ? w.parentFunction * this.parentSimilarity(left, right, "function" /* FUNCTION */) : 0) + (hasPC ? w.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total;
|
|
1460
|
+
}
|
|
1461
|
+
/** Groups all units by type for the comparison loop. Units without embeddings are included
|
|
1462
|
+
* so that cache hits can still be returned for pairs whose embeddings were cleared. */
|
|
1463
|
+
groupByType(units) {
|
|
1464
|
+
const byType = /* @__PURE__ */ new Map();
|
|
1465
|
+
for (const unit of units) {
|
|
1466
|
+
const list = byType.get(unit.unitType) ?? [];
|
|
1467
|
+
list.push(unit);
|
|
1468
|
+
byType.set(unit.unitType, list);
|
|
1373
1469
|
}
|
|
1374
|
-
return
|
|
1470
|
+
return byType;
|
|
1471
|
+
}
|
|
1472
|
+
toMember(unit) {
|
|
1473
|
+
return {
|
|
1474
|
+
id: unit.id,
|
|
1475
|
+
name: unit.name,
|
|
1476
|
+
filePath: unit.filePath,
|
|
1477
|
+
startLine: unit.startLine,
|
|
1478
|
+
endLine: unit.endLine,
|
|
1479
|
+
code: unit.code,
|
|
1480
|
+
unitType: unit.unitType
|
|
1481
|
+
};
|
|
1482
|
+
}
|
|
1483
|
+
bothHaveParent(left, right, type) {
|
|
1484
|
+
return !!this.findParent(left, type) && !!this.findParent(right, type);
|
|
1485
|
+
}
|
|
1486
|
+
parentSimilarity(left, right, type) {
|
|
1487
|
+
const lp = this.findParent(left, type), rp = this.findParent(right, type);
|
|
1488
|
+
if (!lp || !rp) return 0;
|
|
1489
|
+
const key = lp.id < rp.id ? `${lp.id}::${rp.id}` : `${rp.id}::${lp.id}`;
|
|
1490
|
+
const cached = this.cache.getParentSim(key);
|
|
1491
|
+
if (cached !== void 0) return cached;
|
|
1492
|
+
const sim = this.similarity(lp, rp);
|
|
1493
|
+
this.cache.setParentSim(key, sim);
|
|
1494
|
+
return sim;
|
|
1495
|
+
}
|
|
1496
|
+
/** Resolves similarity via the pre-computed embedding matrix, falling back to best child match. */
|
|
1497
|
+
similarity(left, right) {
|
|
1498
|
+
return this.cache.getEmbSim(left.id, right.id) ?? this.childSimilarity(left, right);
|
|
1375
1499
|
}
|
|
1376
1500
|
childSimilarity(left, right) {
|
|
1377
|
-
const
|
|
1378
|
-
|
|
1379
|
-
if (leftChildren.length === 0 || rightChildren.length === 0) return 0;
|
|
1501
|
+
const lc = left.children ?? [], rc = right.children ?? [];
|
|
1502
|
+
if (!lc.length || !rc.length) return 0;
|
|
1380
1503
|
let best = 0;
|
|
1381
|
-
for (const
|
|
1382
|
-
for (const
|
|
1383
|
-
if (
|
|
1384
|
-
const sim = this.
|
|
1504
|
+
for (const l of lc) {
|
|
1505
|
+
for (const r of rc) {
|
|
1506
|
+
if (l.unitType !== r.unitType) continue;
|
|
1507
|
+
const sim = this.similarity(l, r);
|
|
1385
1508
|
if (sim > best) best = sim;
|
|
1386
1509
|
}
|
|
1387
1510
|
}
|
|
1388
1511
|
return best;
|
|
1389
1512
|
}
|
|
1390
|
-
hasVector(unit) {
|
|
1391
|
-
return Array.isArray(unit.embedding) && unit.embedding.length > 0;
|
|
1392
|
-
}
|
|
1393
1513
|
shouldSkipComparison(left, right) {
|
|
1394
|
-
if (left.unitType !== "block" /* BLOCK */ || right.unitType !== "block" /* BLOCK */)
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
|
|
1403
|
-
}
|
|
1404
|
-
findParentOfType(unit, targetType) {
|
|
1405
|
-
let current = unit.parent;
|
|
1406
|
-
while (current) {
|
|
1407
|
-
if (current.unitType === targetType) return current;
|
|
1408
|
-
current = current.parent;
|
|
1514
|
+
if (left.unitType !== "block" /* BLOCK */ || right.unitType !== "block" /* BLOCK */) return false;
|
|
1515
|
+
if (left.filePath !== right.filePath) return false;
|
|
1516
|
+
return left.startLine <= right.startLine && left.endLine >= right.endLine || right.startLine <= left.startLine && right.endLine >= left.endLine;
|
|
1517
|
+
}
|
|
1518
|
+
findParent(unit, type) {
|
|
1519
|
+
let p = unit.parent;
|
|
1520
|
+
while (p) {
|
|
1521
|
+
if (p.unitType === type) return p;
|
|
1522
|
+
p = p.parent;
|
|
1409
1523
|
}
|
|
1410
1524
|
return null;
|
|
1411
1525
|
}
|
|
1412
1526
|
computeDuplicationScore(duplicates, allUnits) {
|
|
1413
|
-
const totalLines =
|
|
1414
|
-
if (totalLines
|
|
1415
|
-
return {
|
|
1416
|
-
score: 0,
|
|
1417
|
-
grade: "Excellent",
|
|
1418
|
-
totalLines,
|
|
1419
|
-
duplicateLines: 0,
|
|
1420
|
-
duplicateGroups: 0
|
|
1421
|
-
};
|
|
1527
|
+
const totalLines = allUnits.reduce((sum, u) => sum + u.endLine - u.startLine + 1, 0);
|
|
1528
|
+
if (!totalLines || !duplicates.length) {
|
|
1529
|
+
return { score: 0, grade: "Excellent", totalLines, duplicateLines: 0, duplicateGroups: 0 };
|
|
1422
1530
|
}
|
|
1423
|
-
const
|
|
1424
|
-
const
|
|
1425
|
-
|
|
1426
|
-
const avgLines = (leftLines + rightLines) / 2;
|
|
1427
|
-
return sum + group.similarity * avgLines;
|
|
1531
|
+
const duplicateLines = duplicates.reduce((sum, g) => {
|
|
1532
|
+
const avg = (g.left.endLine - g.left.startLine + 1 + (g.right.endLine - g.right.startLine + 1)) / 2;
|
|
1533
|
+
return sum + g.similarity * avg;
|
|
1428
1534
|
}, 0);
|
|
1429
|
-
const score =
|
|
1430
|
-
const grade = this.getScoreGrade(score);
|
|
1535
|
+
const score = duplicateLines / totalLines * 100;
|
|
1431
1536
|
return {
|
|
1432
1537
|
score,
|
|
1433
|
-
grade,
|
|
1538
|
+
grade: this.getScoreGrade(score),
|
|
1434
1539
|
totalLines,
|
|
1435
|
-
duplicateLines: Math.round(
|
|
1540
|
+
duplicateLines: Math.round(duplicateLines),
|
|
1436
1541
|
duplicateGroups: duplicates.length
|
|
1437
1542
|
};
|
|
1438
1543
|
}
|
|
1439
|
-
calculateTotalLines(units) {
|
|
1440
|
-
return units.reduce((sum, unit) => {
|
|
1441
|
-
const lines = unit.endLine - unit.startLine + 1;
|
|
1442
|
-
return sum + lines;
|
|
1443
|
-
}, 0);
|
|
1444
|
-
}
|
|
1445
1544
|
getScoreGrade(score) {
|
|
1446
1545
|
if (score < 5) return "Excellent";
|
|
1447
1546
|
if (score < 15) return "Good";
|
|
@@ -1537,9 +1636,9 @@ var ExclusionService = class {
|
|
|
1537
1636
|
|
|
1538
1637
|
// src/services/PairingService.ts
|
|
1539
1638
|
import crypto3 from "crypto";
|
|
1540
|
-
import
|
|
1639
|
+
import debug7 from "debug";
|
|
1541
1640
|
import { minimatch as minimatch2 } from "minimatch";
|
|
1542
|
-
var
|
|
1641
|
+
var log7 = debug7("DryScan:pairs");
|
|
1543
1642
|
var PairingService = class {
|
|
1544
1643
|
constructor(indexUnitExtractor) {
|
|
1545
1644
|
this.indexUnitExtractor = indexUnitExtractor;
|
|
@@ -1550,7 +1649,7 @@ var PairingService = class {
|
|
|
1550
1649
|
*/
|
|
1551
1650
|
pairKeyForUnits(left, right) {
|
|
1552
1651
|
if (left.unitType !== right.unitType) {
|
|
1553
|
-
|
|
1652
|
+
log7("Skipping pair with mismatched types: %s vs %s", left.unitType, right.unitType);
|
|
1554
1653
|
return null;
|
|
1555
1654
|
}
|
|
1556
1655
|
const type = left.unitType;
|
|
@@ -1566,13 +1665,13 @@ var PairingService = class {
|
|
|
1566
1665
|
parsePairKey(value) {
|
|
1567
1666
|
const parts = value.split("|");
|
|
1568
1667
|
if (parts.length !== 3) {
|
|
1569
|
-
|
|
1668
|
+
log7("Invalid pair key format: %s", value);
|
|
1570
1669
|
return null;
|
|
1571
1670
|
}
|
|
1572
1671
|
const [typeRaw, leftRaw, rightRaw] = parts;
|
|
1573
1672
|
const type = this.stringToUnitType(typeRaw);
|
|
1574
1673
|
if (!type) {
|
|
1575
|
-
|
|
1674
|
+
log7("Unknown unit type in pair key: %s", typeRaw);
|
|
1576
1675
|
return null;
|
|
1577
1676
|
}
|
|
1578
1677
|
const [left, right] = [leftRaw, rightRaw].sort();
|
|
@@ -1706,9 +1805,10 @@ var DryScan = class {
|
|
|
1706
1805
|
console.log("[DryScan] Checking for file changes...");
|
|
1707
1806
|
const start = Date.now();
|
|
1708
1807
|
await this.ensureDatabase();
|
|
1709
|
-
await this.services.updater.updateIndex();
|
|
1808
|
+
const dirtyPaths = await this.services.updater.updateIndex();
|
|
1710
1809
|
const duration = Date.now() - start;
|
|
1711
1810
|
console.log(`[DryScan] Index update complete. Took ${duration}ms.`);
|
|
1811
|
+
return dirtyPaths;
|
|
1712
1812
|
}
|
|
1713
1813
|
/**
|
|
1714
1814
|
* Runs duplicate detection and returns a normalized report payload ready for persistence or display.
|
|
@@ -1737,12 +1837,12 @@ var DryScan = class {
|
|
|
1737
1837
|
await this.ensureDatabase();
|
|
1738
1838
|
console.log("[DryScan] Updating index...");
|
|
1739
1839
|
const updateStart = Date.now();
|
|
1740
|
-
await this.updateIndex();
|
|
1840
|
+
const dirtyPaths = await this.updateIndex();
|
|
1741
1841
|
const updateDuration = Date.now() - updateStart;
|
|
1742
1842
|
console.log(`[DryScan] Index update took ${updateDuration}ms.`);
|
|
1743
1843
|
console.log("[DryScan] Detecting duplicates...");
|
|
1744
1844
|
const dupStart = Date.now();
|
|
1745
|
-
const result = await this.services.duplicate.findDuplicates(config);
|
|
1845
|
+
const result = await this.services.duplicate.findDuplicates(config, dirtyPaths);
|
|
1746
1846
|
const dupDuration = Date.now() - dupStart;
|
|
1747
1847
|
console.log(`[DryScan] Duplicate detection took ${dupDuration}ms.`);
|
|
1748
1848
|
return result;
|