@goshenkata/dryscan-core 1.2.5 → 1.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +1 -1
- package/dist/index.js +246 -184
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/src/DryScan.ts +5 -4
- package/src/config/dryconfig.ts +1 -1
- package/src/extractors/java.ts +22 -7
- package/src/services/DuplicateService.ts +133 -184
- package/src/services/DuplicationCache.ts +107 -1
- package/src/services/UpdateService.ts +5 -2
package/dist/index.d.ts
CHANGED
|
@@ -221,7 +221,7 @@ declare class DryScan {
|
|
|
221
221
|
* 6. Recompute embeddings for affected units
|
|
222
222
|
* 7. Update file tracking metadata
|
|
223
223
|
*/
|
|
224
|
-
updateIndex(): Promise<
|
|
224
|
+
updateIndex(): Promise<string[]>;
|
|
225
225
|
/**
|
|
226
226
|
* Runs duplicate detection and returns a normalized report payload ready for persistence or display.
|
|
227
227
|
*/
|
package/dist/index.js
CHANGED
|
@@ -61,7 +61,7 @@ var DEFAULT_CONFIG = {
|
|
|
61
61
|
excludedPairs: [],
|
|
62
62
|
minLines: 3,
|
|
63
63
|
minBlockLines: 5,
|
|
64
|
-
threshold: 0.
|
|
64
|
+
threshold: 0.8,
|
|
65
65
|
embeddingSource: "http://localhost:11434",
|
|
66
66
|
contextLength: 2048
|
|
67
67
|
};
|
|
@@ -244,7 +244,8 @@ var JavaExtractor = class {
|
|
|
244
244
|
const fnUnit = this.buildFunctionUnit(node, source, fileRelPath, currentClass);
|
|
245
245
|
const fnLength = fnUnit.endLine - fnUnit.startLine;
|
|
246
246
|
const bodyNode = this.getFunctionBody(node);
|
|
247
|
-
const
|
|
247
|
+
const fnArity = this.getNodeArity(node);
|
|
248
|
+
const skipFunction = this.shouldSkip("function" /* FUNCTION */, fnUnit.name, fnLength, fnArity);
|
|
248
249
|
if (skipFunction) {
|
|
249
250
|
return;
|
|
250
251
|
}
|
|
@@ -311,22 +312,34 @@ var JavaExtractor = class {
|
|
|
311
312
|
const normalized = this.normalizeCode(unit.code);
|
|
312
313
|
return crypto.createHash(BLOCK_HASH_ALGO).update(normalized).digest("hex");
|
|
313
314
|
}
|
|
314
|
-
shouldSkip(unitType, name, lineCount) {
|
|
315
|
+
shouldSkip(unitType, name, lineCount, arity) {
|
|
315
316
|
if (!this.config) {
|
|
316
317
|
throw new Error("Config not loaded before skip evaluation");
|
|
317
318
|
}
|
|
318
319
|
const config = this.config;
|
|
319
320
|
const minLines = unitType === "block" /* BLOCK */ ? Math.max(indexConfig.blockMinLines, config.minBlockLines ?? 0) : config.minLines;
|
|
320
321
|
const belowMin = minLines > 0 && lineCount < minLines;
|
|
321
|
-
const trivial = unitType === "function" /* FUNCTION */ && this.isTrivialFunction(name);
|
|
322
|
+
const trivial = unitType === "function" /* FUNCTION */ && this.isTrivialFunction(name, arity ?? 0);
|
|
322
323
|
return belowMin || trivial;
|
|
323
324
|
}
|
|
324
|
-
|
|
325
|
+
/**
|
|
326
|
+
* A function is trivial if it follows a simple accessor pattern:
|
|
327
|
+
* - getters/isers: name matches get[A-Z] or is[A-Z] with exactly 0 parameters
|
|
328
|
+
* - setters: name matches set[A-Z] with at most 1 parameter
|
|
329
|
+
* Methods like getUserById(Long id) have arity > 0 and are NOT trivial.
|
|
330
|
+
*/
|
|
331
|
+
isTrivialFunction(fullName, arity) {
|
|
325
332
|
const simpleName = fullName.split(".").pop() || fullName;
|
|
326
|
-
const isGetter = /^(get|is)[A-Z]/.test(simpleName);
|
|
327
|
-
const isSetter = /^set[A-Z]/.test(simpleName);
|
|
333
|
+
const isGetter = /^(get|is)[A-Z]/.test(simpleName) && arity === 0;
|
|
334
|
+
const isSetter = /^set[A-Z]/.test(simpleName) && arity <= 1;
|
|
328
335
|
return isGetter || isSetter;
|
|
329
336
|
}
|
|
337
|
+
/** Counts the formal parameters of a method or constructor node. */
|
|
338
|
+
getNodeArity(node) {
|
|
339
|
+
const params = node.childForFieldName?.("parameters");
|
|
340
|
+
if (!params) return 0;
|
|
341
|
+
return params.namedChildren.filter((c) => c.type === "formal_parameter" || c.type === "spread_parameter").length;
|
|
342
|
+
}
|
|
330
343
|
isDtoClass(node, source, className) {
|
|
331
344
|
const classBody = node.children.find((child) => child.type === "class_body");
|
|
332
345
|
if (!classBody) return false;
|
|
@@ -344,7 +357,8 @@ var JavaExtractor = class {
|
|
|
344
357
|
if (child.type === "method_declaration" || child.type === "constructor_declaration") {
|
|
345
358
|
const simpleName = this.getSimpleFunctionName(child, source);
|
|
346
359
|
const fullName = `${className}.${simpleName}`;
|
|
347
|
-
|
|
360
|
+
const arity = this.getNodeArity(child);
|
|
361
|
+
if (!this.isTrivialFunction(fullName, arity)) {
|
|
348
362
|
return false;
|
|
349
363
|
}
|
|
350
364
|
continue;
|
|
@@ -1021,7 +1035,7 @@ var RepositoryInitializer = class {
|
|
|
1021
1035
|
};
|
|
1022
1036
|
|
|
1023
1037
|
// src/services/UpdateService.ts
|
|
1024
|
-
import
|
|
1038
|
+
import debug5 from "debug";
|
|
1025
1039
|
|
|
1026
1040
|
// src/DryScanUpdater.ts
|
|
1027
1041
|
import path4 from "path";
|
|
@@ -1141,11 +1155,20 @@ async function performIncrementalUpdate(repoPath, extractor, db) {
|
|
|
1141
1155
|
}
|
|
1142
1156
|
|
|
1143
1157
|
// src/services/DuplicationCache.ts
|
|
1158
|
+
import debug4 from "debug";
|
|
1159
|
+
import { cosineSimilarity } from "@langchain/core/utils/math";
|
|
1160
|
+
var log4 = debug4("DryScan:DuplicationCache");
|
|
1144
1161
|
var DuplicationCache = class _DuplicationCache {
|
|
1145
1162
|
static instance = null;
|
|
1146
1163
|
comparisons = /* @__PURE__ */ new Map();
|
|
1147
1164
|
fileIndex = /* @__PURE__ */ new Map();
|
|
1148
1165
|
initialized = false;
|
|
1166
|
+
/** Per-run similarity matrix from a single batched library call (reset each run). */
|
|
1167
|
+
embSimMatrix = [];
|
|
1168
|
+
/** Maps unit ID to its row/column index in embSimMatrix. */
|
|
1169
|
+
embSimIndex = /* @__PURE__ */ new Map();
|
|
1170
|
+
/** Per-run memoization of parent unit similarity scores (reset each run). */
|
|
1171
|
+
parentSimCache = /* @__PURE__ */ new Map();
|
|
1149
1172
|
static getInstance() {
|
|
1150
1173
|
if (!_DuplicationCache.instance) {
|
|
1151
1174
|
_DuplicationCache.instance = new _DuplicationCache();
|
|
@@ -1208,6 +1231,84 @@ var DuplicationCache = class _DuplicationCache {
|
|
|
1208
1231
|
this.comparisons.clear();
|
|
1209
1232
|
this.fileIndex.clear();
|
|
1210
1233
|
this.initialized = false;
|
|
1234
|
+
this.embSimMatrix = [];
|
|
1235
|
+
this.embSimIndex.clear();
|
|
1236
|
+
this.clearRunCaches();
|
|
1237
|
+
}
|
|
1238
|
+
/**
|
|
1239
|
+
* Resets per-run memoization (parent similarities).
|
|
1240
|
+
* The embedding matrix is intentionally preserved so incremental runs can
|
|
1241
|
+
* reuse clean×clean values across calls.
|
|
1242
|
+
*/
|
|
1243
|
+
clearRunCaches() {
|
|
1244
|
+
this.parentSimCache.clear();
|
|
1245
|
+
}
|
|
1246
|
+
/**
|
|
1247
|
+
* Builds or incrementally updates the embedding similarity matrix.
|
|
1248
|
+
*
|
|
1249
|
+
* Full rebuild (default): replaces the entire matrix — O(n²).
|
|
1250
|
+
* Incremental (dirtyPaths provided + prior matrix exists): copies clean×clean
|
|
1251
|
+
* cells from the old matrix and recomputes only dirty rows via one batched
|
|
1252
|
+
* cosineSimilarity call — O(d·n) where d = number of dirty units.
|
|
1253
|
+
*/
|
|
1254
|
+
buildEmbSimCache(units, dirtyPaths) {
|
|
1255
|
+
const embedded = units.filter((u) => Array.isArray(u.embedding) && u.embedding.length > 0);
|
|
1256
|
+
if (embedded.length < 2) {
|
|
1257
|
+
this.embSimMatrix = [];
|
|
1258
|
+
this.embSimIndex.clear();
|
|
1259
|
+
return;
|
|
1260
|
+
}
|
|
1261
|
+
const embeddings = embedded.map((u) => u.embedding);
|
|
1262
|
+
const newIndex = new Map(embedded.map((u, i) => [u.id, i]));
|
|
1263
|
+
const dirtySet = dirtyPaths ? new Set(dirtyPaths) : null;
|
|
1264
|
+
const hasPriorMatrix = this.embSimMatrix.length > 0;
|
|
1265
|
+
if (!dirtySet || !hasPriorMatrix) {
|
|
1266
|
+
this.embSimIndex = newIndex;
|
|
1267
|
+
this.embSimMatrix = cosineSimilarity(embeddings, embeddings);
|
|
1268
|
+
log4("Built full embedding similarity matrix: %d units", embedded.length);
|
|
1269
|
+
return;
|
|
1270
|
+
}
|
|
1271
|
+
const dirtyIds = new Set(embedded.filter((u) => dirtySet.has(u.filePath)).map((u) => u.id));
|
|
1272
|
+
if (dirtyIds.size === 0) {
|
|
1273
|
+
log4("Matrix reused: no dirty units detected");
|
|
1274
|
+
return;
|
|
1275
|
+
}
|
|
1276
|
+
const n = embedded.length;
|
|
1277
|
+
const newMatrix = Array.from({ length: n }, () => new Array(n).fill(0));
|
|
1278
|
+
for (let i = 0; i < n; i++) {
|
|
1279
|
+
for (let j = 0; j < n; j++) {
|
|
1280
|
+
if (dirtyIds.has(embedded[i].id) || dirtyIds.has(embedded[j].id)) continue;
|
|
1281
|
+
const oi = this.embSimIndex.get(embedded[i].id);
|
|
1282
|
+
const oj = this.embSimIndex.get(embedded[j].id);
|
|
1283
|
+
if (oi !== void 0 && oj !== void 0) newMatrix[i][j] = this.embSimMatrix[oi][oj];
|
|
1284
|
+
}
|
|
1285
|
+
}
|
|
1286
|
+
const dirtyIndices = embedded.reduce((acc, u, i) => dirtyIds.has(u.id) ? [...acc, i] : acc, []);
|
|
1287
|
+
const dirtyRows = cosineSimilarity(dirtyIndices.map((i) => embeddings[i]), embeddings);
|
|
1288
|
+
dirtyIndices.forEach((rowIdx, di) => {
|
|
1289
|
+
for (let j = 0; j < n; j++) {
|
|
1290
|
+
newMatrix[rowIdx][j] = dirtyRows[di][j];
|
|
1291
|
+
newMatrix[j][rowIdx] = dirtyRows[di][j];
|
|
1292
|
+
}
|
|
1293
|
+
});
|
|
1294
|
+
this.embSimIndex = newIndex;
|
|
1295
|
+
this.embSimMatrix = newMatrix;
|
|
1296
|
+
log4("Incremental matrix update: %d dirty unit(s) out of %d total", dirtyIds.size, n);
|
|
1297
|
+
}
|
|
1298
|
+
/** Returns the pre-computed cosine similarity for a pair of unit IDs, if available. */
|
|
1299
|
+
getEmbSim(id1, id2) {
|
|
1300
|
+
const i = this.embSimIndex.get(id1);
|
|
1301
|
+
const j = this.embSimIndex.get(id2);
|
|
1302
|
+
if (i === void 0 || j === void 0) return void 0;
|
|
1303
|
+
return this.embSimMatrix[i][j];
|
|
1304
|
+
}
|
|
1305
|
+
/** Returns the memoized parent similarity for the given stable key, if available. */
|
|
1306
|
+
getParentSim(key) {
|
|
1307
|
+
return this.parentSimCache.get(key);
|
|
1308
|
+
}
|
|
1309
|
+
/** Stores a memoized parent similarity for the given stable key. */
|
|
1310
|
+
setParentSim(key, sim) {
|
|
1311
|
+
this.parentSimCache.set(key, sim);
|
|
1211
1312
|
}
|
|
1212
1313
|
addKeyForFile(filePath, key) {
|
|
1213
1314
|
const current = this.fileIndex.get(filePath) ?? /* @__PURE__ */ new Set();
|
|
@@ -1224,145 +1325,106 @@ var DuplicationCache = class _DuplicationCache {
|
|
|
1224
1325
|
};
|
|
1225
1326
|
|
|
1226
1327
|
// src/services/UpdateService.ts
|
|
1227
|
-
var
|
|
1328
|
+
var log5 = debug5("DryScan:UpdateService");
|
|
1228
1329
|
var UpdateService = class {
|
|
1229
1330
|
constructor(deps, exclusionService) {
|
|
1230
1331
|
this.deps = deps;
|
|
1231
1332
|
this.exclusionService = exclusionService;
|
|
1232
1333
|
}
|
|
1334
|
+
/** Returns the list of file paths that were modified or deleted (dirty). */
|
|
1233
1335
|
async updateIndex() {
|
|
1234
1336
|
const extractor = this.deps.extractor;
|
|
1235
1337
|
const cache = DuplicationCache.getInstance();
|
|
1236
1338
|
try {
|
|
1237
1339
|
const changeSet = await performIncrementalUpdate(this.deps.repoPath, extractor, this.deps.db);
|
|
1238
1340
|
await this.exclusionService.cleanupExcludedFiles();
|
|
1239
|
-
|
|
1341
|
+
const dirtyPaths = [...changeSet.changed, ...changeSet.deleted, ...changeSet.added];
|
|
1342
|
+
await cache.invalidate(dirtyPaths);
|
|
1343
|
+
return dirtyPaths;
|
|
1240
1344
|
} catch (err) {
|
|
1241
|
-
|
|
1345
|
+
log5("Error during index update:", err);
|
|
1242
1346
|
throw err;
|
|
1243
1347
|
}
|
|
1244
1348
|
}
|
|
1245
1349
|
};
|
|
1246
1350
|
|
|
1247
1351
|
// src/services/DuplicateService.ts
|
|
1248
|
-
import
|
|
1352
|
+
import debug6 from "debug";
|
|
1249
1353
|
import shortUuid from "short-uuid";
|
|
1250
|
-
|
|
1251
|
-
var log5 = debug5("DryScan:DuplicateService");
|
|
1354
|
+
var log6 = debug6("DryScan:DuplicateService");
|
|
1252
1355
|
var DuplicateService = class {
|
|
1253
1356
|
constructor(deps) {
|
|
1254
1357
|
this.deps = deps;
|
|
1255
1358
|
}
|
|
1256
1359
|
config;
|
|
1257
1360
|
cache = DuplicationCache.getInstance();
|
|
1258
|
-
|
|
1259
|
-
|
|
1361
|
+
/**
|
|
1362
|
+
* @param dirtyPaths - File paths changed since last run. When provided, only
|
|
1363
|
+
* dirty×all similarities are recomputed; clean×clean values are reused from
|
|
1364
|
+
* the existing matrix. Pass undefined (or omit) for a full rebuild.
|
|
1365
|
+
*/
|
|
1366
|
+
async findDuplicates(config, dirtyPaths) {
|
|
1260
1367
|
this.config = config;
|
|
1261
1368
|
const t0 = performance.now();
|
|
1262
1369
|
const allUnits = await this.deps.db.getAllUnits();
|
|
1263
|
-
|
|
1370
|
+
log6("Starting duplicate analysis on %d units", allUnits.length);
|
|
1264
1371
|
if (allUnits.length < 2) {
|
|
1265
|
-
|
|
1266
|
-
const score2 = this.computeDuplicationScore([], allUnits);
|
|
1267
|
-
return { duplicates: [], score: score2 };
|
|
1372
|
+
return { duplicates: [], score: this.computeDuplicationScore([], allUnits) };
|
|
1268
1373
|
}
|
|
1269
1374
|
const thresholds = this.resolveThresholds(config.threshold);
|
|
1270
|
-
|
|
1271
|
-
const
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
this.
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
return { duplicates: filteredDuplicates, score };
|
|
1375
|
+
const duplicates = this.computeDuplicates(allUnits, thresholds, dirtyPaths);
|
|
1376
|
+
const filtered = duplicates.filter((g) => !this.isGroupExcluded(g));
|
|
1377
|
+
log6("Found %d duplicate groups (%d excluded)", filtered.length, duplicates.length - filtered.length);
|
|
1378
|
+
this.cache.update(filtered).catch((err) => log6("Cache update failed: %O", err));
|
|
1379
|
+
const score = this.computeDuplicationScore(filtered, allUnits);
|
|
1380
|
+
log6("findDuplicates completed in %dms", (performance.now() - t0).toFixed(2));
|
|
1381
|
+
return { duplicates: filtered, score };
|
|
1278
1382
|
}
|
|
1279
1383
|
resolveThresholds(functionThreshold) {
|
|
1280
|
-
const
|
|
1281
|
-
const clamp = (
|
|
1282
|
-
const
|
|
1283
|
-
const blockOffset = defaults.block - defaults.function;
|
|
1284
|
-
const classOffset = defaults.class - defaults.function;
|
|
1285
|
-
const functionThresholdValue = clamp(base);
|
|
1384
|
+
const d = indexConfig.thresholds;
|
|
1385
|
+
const clamp = (v) => Math.min(1, Math.max(0, v));
|
|
1386
|
+
const fn = clamp(functionThreshold ?? d.function);
|
|
1286
1387
|
return {
|
|
1287
|
-
function:
|
|
1288
|
-
block: clamp(
|
|
1289
|
-
class: clamp(
|
|
1388
|
+
function: fn,
|
|
1389
|
+
block: clamp(fn + d.block - d.function),
|
|
1390
|
+
class: clamp(fn + d.class - d.function)
|
|
1290
1391
|
};
|
|
1291
1392
|
}
|
|
1292
|
-
computeDuplicates(units, thresholds) {
|
|
1393
|
+
computeDuplicates(units, thresholds, dirtyPaths) {
|
|
1394
|
+
this.cache.clearRunCaches();
|
|
1395
|
+
this.cache.buildEmbSimCache(units, dirtyPaths);
|
|
1293
1396
|
const duplicates = [];
|
|
1294
|
-
const byType = /* @__PURE__ */ new Map();
|
|
1295
|
-
for (const unit of units) {
|
|
1296
|
-
const list = byType.get(unit.unitType) ?? [];
|
|
1297
|
-
list.push(unit);
|
|
1298
|
-
byType.set(unit.unitType, list);
|
|
1299
|
-
}
|
|
1300
1397
|
const t0 = performance.now();
|
|
1301
|
-
for (const [type, typedUnits] of
|
|
1398
|
+
for (const [type, typedUnits] of this.groupByType(units)) {
|
|
1302
1399
|
const threshold = this.getThreshold(type, thresholds);
|
|
1303
|
-
|
|
1304
|
-
const typeStart = performance.now();
|
|
1400
|
+
log6("Comparing %d %s units (threshold=%.3f)", typedUnits.length, type, threshold);
|
|
1305
1401
|
for (let i = 0; i < typedUnits.length; i++) {
|
|
1306
1402
|
for (let j = i + 1; j < typedUnits.length; j++) {
|
|
1307
|
-
const left = typedUnits[i];
|
|
1308
|
-
|
|
1309
|
-
if (this.shouldSkipComparison(left, right)) {
|
|
1310
|
-
log5("Skipping nested block comparison: '%s' and '%s'", left.name, right.name);
|
|
1311
|
-
continue;
|
|
1312
|
-
}
|
|
1403
|
+
const left = typedUnits[i], right = typedUnits[j];
|
|
1404
|
+
if (this.shouldSkipComparison(left, right)) continue;
|
|
1313
1405
|
const cached = this.cache.get(left.id, right.id, left.filePath, right.filePath);
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
|
|
1325
|
-
|
|
1326
|
-
|
|
1327
|
-
if (similarity >= threshold) {
|
|
1328
|
-
const exclusionString = this.deps.pairing.pairKeyForUnits(left, right);
|
|
1329
|
-
if (!exclusionString) continue;
|
|
1330
|
-
log5("Duplicate found: '%s' <-> '%s' (similarity=%d)", left.name, right.name, similarity);
|
|
1331
|
-
duplicates.push({
|
|
1332
|
-
id: `${left.id}::${right.id}`,
|
|
1333
|
-
similarity,
|
|
1334
|
-
shortId: shortUuid.generate(),
|
|
1335
|
-
exclusionString,
|
|
1336
|
-
left: {
|
|
1337
|
-
id: left.id,
|
|
1338
|
-
name: left.name,
|
|
1339
|
-
filePath: left.filePath,
|
|
1340
|
-
startLine: left.startLine,
|
|
1341
|
-
endLine: left.endLine,
|
|
1342
|
-
code: left.code,
|
|
1343
|
-
unitType: left.unitType
|
|
1344
|
-
},
|
|
1345
|
-
right: {
|
|
1346
|
-
id: right.id,
|
|
1347
|
-
name: right.name,
|
|
1348
|
-
filePath: right.filePath,
|
|
1349
|
-
startLine: right.startLine,
|
|
1350
|
-
endLine: right.endLine,
|
|
1351
|
-
code: right.code,
|
|
1352
|
-
unitType: right.unitType
|
|
1353
|
-
}
|
|
1354
|
-
});
|
|
1355
|
-
}
|
|
1406
|
+
const hasEmbeddings = left.embedding?.length && right.embedding?.length;
|
|
1407
|
+
const similarity = cached ?? (hasEmbeddings ? this.computeWeightedSimilarity(left, right, threshold) : 0);
|
|
1408
|
+
if (similarity < threshold) continue;
|
|
1409
|
+
const exclusionString = this.deps.pairing.pairKeyForUnits(left, right);
|
|
1410
|
+
if (!exclusionString) continue;
|
|
1411
|
+
duplicates.push({
|
|
1412
|
+
id: `${left.id}::${right.id}`,
|
|
1413
|
+
similarity,
|
|
1414
|
+
shortId: shortUuid.generate(),
|
|
1415
|
+
exclusionString,
|
|
1416
|
+
left: this.toMember(left),
|
|
1417
|
+
right: this.toMember(right)
|
|
1418
|
+
});
|
|
1356
1419
|
}
|
|
1357
1420
|
}
|
|
1358
|
-
log5("Type '%s' comparisons completed in %dms", type, (performance.now() - typeStart).toFixed(2));
|
|
1359
1421
|
}
|
|
1360
|
-
|
|
1422
|
+
log6("computeDuplicates: %d duplicates in %dms", duplicates.length, (performance.now() - t0).toFixed(2));
|
|
1361
1423
|
return duplicates.sort((a, b) => b.similarity - a.similarity);
|
|
1362
1424
|
}
|
|
1363
1425
|
isGroupExcluded(group) {
|
|
1364
1426
|
const config = this.config;
|
|
1365
|
-
if (!config
|
|
1427
|
+
if (!config?.excludedPairs?.length) return false;
|
|
1366
1428
|
const key = this.deps.pairing.pairKeyForUnits(group.left, group.right);
|
|
1367
1429
|
if (!key) return false;
|
|
1368
1430
|
const actual = this.deps.pairing.parsePairKey(key);
|
|
@@ -1377,109 +1439,108 @@ var DuplicateService = class {
|
|
|
1377
1439
|
if (type === "block" /* BLOCK */) return thresholds.block;
|
|
1378
1440
|
return thresholds.function;
|
|
1379
1441
|
}
|
|
1380
|
-
computeWeightedSimilarity(left, right) {
|
|
1381
|
-
const
|
|
1442
|
+
computeWeightedSimilarity(left, right, threshold) {
|
|
1443
|
+
const selfSim = this.similarity(left, right);
|
|
1382
1444
|
if (left.unitType === "class" /* CLASS */) {
|
|
1383
|
-
return
|
|
1445
|
+
return selfSim * indexConfig.weights.class.self;
|
|
1384
1446
|
}
|
|
1385
1447
|
if (left.unitType === "function" /* FUNCTION */) {
|
|
1386
|
-
const
|
|
1387
|
-
const
|
|
1388
|
-
const
|
|
1389
|
-
|
|
1390
|
-
return (
|
|
1448
|
+
const w2 = indexConfig.weights.function;
|
|
1449
|
+
const hasPC2 = this.bothHaveParent(left, right, "class" /* CLASS */);
|
|
1450
|
+
const total2 = w2.self + (hasPC2 ? w2.parentClass : 0);
|
|
1451
|
+
if ((w2.self * selfSim + (hasPC2 ? w2.parentClass : 0)) / total2 < threshold) return 0;
|
|
1452
|
+
return (w2.self * selfSim + (hasPC2 ? w2.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total2;
|
|
1391
1453
|
}
|
|
1392
|
-
const
|
|
1393
|
-
const
|
|
1394
|
-
const
|
|
1395
|
-
const
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
const
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
const leftHasEmbedding = this.hasVector(left);
|
|
1408
|
-
const rightHasEmbedding = this.hasVector(right);
|
|
1409
|
-
if (leftHasEmbedding && rightHasEmbedding) {
|
|
1410
|
-
return cosineSimilarity([left.embedding], [right.embedding])[0][0];
|
|
1454
|
+
const w = indexConfig.weights.block;
|
|
1455
|
+
const hasPF = this.bothHaveParent(left, right, "function" /* FUNCTION */);
|
|
1456
|
+
const hasPC = this.bothHaveParent(left, right, "class" /* CLASS */);
|
|
1457
|
+
const total = w.self + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0);
|
|
1458
|
+
if ((w.self * selfSim + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0)) / total < threshold) return 0;
|
|
1459
|
+
return (w.self * selfSim + (hasPF ? w.parentFunction * this.parentSimilarity(left, right, "function" /* FUNCTION */) : 0) + (hasPC ? w.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total;
|
|
1460
|
+
}
|
|
1461
|
+
/** Groups all units by type for the comparison loop. Units without embeddings are included
|
|
1462
|
+
* so that cache hits can still be returned for pairs whose embeddings were cleared. */
|
|
1463
|
+
groupByType(units) {
|
|
1464
|
+
const byType = /* @__PURE__ */ new Map();
|
|
1465
|
+
for (const unit of units) {
|
|
1466
|
+
const list = byType.get(unit.unitType) ?? [];
|
|
1467
|
+
list.push(unit);
|
|
1468
|
+
byType.set(unit.unitType, list);
|
|
1411
1469
|
}
|
|
1412
|
-
return
|
|
1470
|
+
return byType;
|
|
1471
|
+
}
|
|
1472
|
+
toMember(unit) {
|
|
1473
|
+
return {
|
|
1474
|
+
id: unit.id,
|
|
1475
|
+
name: unit.name,
|
|
1476
|
+
filePath: unit.filePath,
|
|
1477
|
+
startLine: unit.startLine,
|
|
1478
|
+
endLine: unit.endLine,
|
|
1479
|
+
code: unit.code,
|
|
1480
|
+
unitType: unit.unitType
|
|
1481
|
+
};
|
|
1482
|
+
}
|
|
1483
|
+
bothHaveParent(left, right, type) {
|
|
1484
|
+
return !!this.findParent(left, type) && !!this.findParent(right, type);
|
|
1485
|
+
}
|
|
1486
|
+
parentSimilarity(left, right, type) {
|
|
1487
|
+
const lp = this.findParent(left, type), rp = this.findParent(right, type);
|
|
1488
|
+
if (!lp || !rp) return 0;
|
|
1489
|
+
const key = lp.id < rp.id ? `${lp.id}::${rp.id}` : `${rp.id}::${lp.id}`;
|
|
1490
|
+
const cached = this.cache.getParentSim(key);
|
|
1491
|
+
if (cached !== void 0) return cached;
|
|
1492
|
+
const sim = this.similarity(lp, rp);
|
|
1493
|
+
this.cache.setParentSim(key, sim);
|
|
1494
|
+
return sim;
|
|
1495
|
+
}
|
|
1496
|
+
/** Resolves similarity via the pre-computed embedding matrix, falling back to best child match. */
|
|
1497
|
+
similarity(left, right) {
|
|
1498
|
+
return this.cache.getEmbSim(left.id, right.id) ?? this.childSimilarity(left, right);
|
|
1413
1499
|
}
|
|
1414
1500
|
childSimilarity(left, right) {
|
|
1415
|
-
const
|
|
1416
|
-
|
|
1417
|
-
if (leftChildren.length === 0 || rightChildren.length === 0) return 0;
|
|
1501
|
+
const lc = left.children ?? [], rc = right.children ?? [];
|
|
1502
|
+
if (!lc.length || !rc.length) return 0;
|
|
1418
1503
|
let best = 0;
|
|
1419
|
-
for (const
|
|
1420
|
-
for (const
|
|
1421
|
-
if (
|
|
1422
|
-
const sim = this.
|
|
1504
|
+
for (const l of lc) {
|
|
1505
|
+
for (const r of rc) {
|
|
1506
|
+
if (l.unitType !== r.unitType) continue;
|
|
1507
|
+
const sim = this.similarity(l, r);
|
|
1423
1508
|
if (sim > best) best = sim;
|
|
1424
1509
|
}
|
|
1425
1510
|
}
|
|
1426
1511
|
return best;
|
|
1427
1512
|
}
|
|
1428
|
-
hasVector(unit) {
|
|
1429
|
-
return Array.isArray(unit.embedding) && unit.embedding.length > 0;
|
|
1430
|
-
}
|
|
1431
1513
|
shouldSkipComparison(left, right) {
|
|
1432
|
-
if (left.unitType !== "block" /* BLOCK */ || right.unitType !== "block" /* BLOCK */)
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
}
|
|
1442
|
-
findParentOfType(unit, targetType) {
|
|
1443
|
-
let current = unit.parent;
|
|
1444
|
-
while (current) {
|
|
1445
|
-
if (current.unitType === targetType) return current;
|
|
1446
|
-
current = current.parent;
|
|
1514
|
+
if (left.unitType !== "block" /* BLOCK */ || right.unitType !== "block" /* BLOCK */) return false;
|
|
1515
|
+
if (left.filePath !== right.filePath) return false;
|
|
1516
|
+
return left.startLine <= right.startLine && left.endLine >= right.endLine || right.startLine <= left.startLine && right.endLine >= left.endLine;
|
|
1517
|
+
}
|
|
1518
|
+
findParent(unit, type) {
|
|
1519
|
+
let p = unit.parent;
|
|
1520
|
+
while (p) {
|
|
1521
|
+
if (p.unitType === type) return p;
|
|
1522
|
+
p = p.parent;
|
|
1447
1523
|
}
|
|
1448
1524
|
return null;
|
|
1449
1525
|
}
|
|
1450
1526
|
computeDuplicationScore(duplicates, allUnits) {
|
|
1451
|
-
const totalLines =
|
|
1452
|
-
if (totalLines
|
|
1453
|
-
return {
|
|
1454
|
-
score: 0,
|
|
1455
|
-
grade: "Excellent",
|
|
1456
|
-
totalLines,
|
|
1457
|
-
duplicateLines: 0,
|
|
1458
|
-
duplicateGroups: 0
|
|
1459
|
-
};
|
|
1527
|
+
const totalLines = allUnits.reduce((sum, u) => sum + u.endLine - u.startLine + 1, 0);
|
|
1528
|
+
if (!totalLines || !duplicates.length) {
|
|
1529
|
+
return { score: 0, grade: "Excellent", totalLines, duplicateLines: 0, duplicateGroups: 0 };
|
|
1460
1530
|
}
|
|
1461
|
-
const
|
|
1462
|
-
const
|
|
1463
|
-
|
|
1464
|
-
const avgLines = (leftLines + rightLines) / 2;
|
|
1465
|
-
return sum + group.similarity * avgLines;
|
|
1531
|
+
const duplicateLines = duplicates.reduce((sum, g) => {
|
|
1532
|
+
const avg = (g.left.endLine - g.left.startLine + 1 + (g.right.endLine - g.right.startLine + 1)) / 2;
|
|
1533
|
+
return sum + g.similarity * avg;
|
|
1466
1534
|
}, 0);
|
|
1467
|
-
const score =
|
|
1468
|
-
const grade = this.getScoreGrade(score);
|
|
1535
|
+
const score = duplicateLines / totalLines * 100;
|
|
1469
1536
|
return {
|
|
1470
1537
|
score,
|
|
1471
|
-
grade,
|
|
1538
|
+
grade: this.getScoreGrade(score),
|
|
1472
1539
|
totalLines,
|
|
1473
|
-
duplicateLines: Math.round(
|
|
1540
|
+
duplicateLines: Math.round(duplicateLines),
|
|
1474
1541
|
duplicateGroups: duplicates.length
|
|
1475
1542
|
};
|
|
1476
1543
|
}
|
|
1477
|
-
calculateTotalLines(units) {
|
|
1478
|
-
return units.reduce((sum, unit) => {
|
|
1479
|
-
const lines = unit.endLine - unit.startLine + 1;
|
|
1480
|
-
return sum + lines;
|
|
1481
|
-
}, 0);
|
|
1482
|
-
}
|
|
1483
1544
|
getScoreGrade(score) {
|
|
1484
1545
|
if (score < 5) return "Excellent";
|
|
1485
1546
|
if (score < 15) return "Good";
|
|
@@ -1575,9 +1636,9 @@ var ExclusionService = class {
|
|
|
1575
1636
|
|
|
1576
1637
|
// src/services/PairingService.ts
|
|
1577
1638
|
import crypto3 from "crypto";
|
|
1578
|
-
import
|
|
1639
|
+
import debug7 from "debug";
|
|
1579
1640
|
import { minimatch as minimatch2 } from "minimatch";
|
|
1580
|
-
var
|
|
1641
|
+
var log7 = debug7("DryScan:pairs");
|
|
1581
1642
|
var PairingService = class {
|
|
1582
1643
|
constructor(indexUnitExtractor) {
|
|
1583
1644
|
this.indexUnitExtractor = indexUnitExtractor;
|
|
@@ -1588,7 +1649,7 @@ var PairingService = class {
|
|
|
1588
1649
|
*/
|
|
1589
1650
|
pairKeyForUnits(left, right) {
|
|
1590
1651
|
if (left.unitType !== right.unitType) {
|
|
1591
|
-
|
|
1652
|
+
log7("Skipping pair with mismatched types: %s vs %s", left.unitType, right.unitType);
|
|
1592
1653
|
return null;
|
|
1593
1654
|
}
|
|
1594
1655
|
const type = left.unitType;
|
|
@@ -1604,13 +1665,13 @@ var PairingService = class {
|
|
|
1604
1665
|
parsePairKey(value) {
|
|
1605
1666
|
const parts = value.split("|");
|
|
1606
1667
|
if (parts.length !== 3) {
|
|
1607
|
-
|
|
1668
|
+
log7("Invalid pair key format: %s", value);
|
|
1608
1669
|
return null;
|
|
1609
1670
|
}
|
|
1610
1671
|
const [typeRaw, leftRaw, rightRaw] = parts;
|
|
1611
1672
|
const type = this.stringToUnitType(typeRaw);
|
|
1612
1673
|
if (!type) {
|
|
1613
|
-
|
|
1674
|
+
log7("Unknown unit type in pair key: %s", typeRaw);
|
|
1614
1675
|
return null;
|
|
1615
1676
|
}
|
|
1616
1677
|
const [left, right] = [leftRaw, rightRaw].sort();
|
|
@@ -1744,9 +1805,10 @@ var DryScan = class {
|
|
|
1744
1805
|
console.log("[DryScan] Checking for file changes...");
|
|
1745
1806
|
const start = Date.now();
|
|
1746
1807
|
await this.ensureDatabase();
|
|
1747
|
-
await this.services.updater.updateIndex();
|
|
1808
|
+
const dirtyPaths = await this.services.updater.updateIndex();
|
|
1748
1809
|
const duration = Date.now() - start;
|
|
1749
1810
|
console.log(`[DryScan] Index update complete. Took ${duration}ms.`);
|
|
1811
|
+
return dirtyPaths;
|
|
1750
1812
|
}
|
|
1751
1813
|
/**
|
|
1752
1814
|
* Runs duplicate detection and returns a normalized report payload ready for persistence or display.
|
|
@@ -1775,12 +1837,12 @@ var DryScan = class {
|
|
|
1775
1837
|
await this.ensureDatabase();
|
|
1776
1838
|
console.log("[DryScan] Updating index...");
|
|
1777
1839
|
const updateStart = Date.now();
|
|
1778
|
-
await this.updateIndex();
|
|
1840
|
+
const dirtyPaths = await this.updateIndex();
|
|
1779
1841
|
const updateDuration = Date.now() - updateStart;
|
|
1780
1842
|
console.log(`[DryScan] Index update took ${updateDuration}ms.`);
|
|
1781
1843
|
console.log("[DryScan] Detecting duplicates...");
|
|
1782
1844
|
const dupStart = Date.now();
|
|
1783
|
-
const result = await this.services.duplicate.findDuplicates(config);
|
|
1845
|
+
const result = await this.services.duplicate.findDuplicates(config, dirtyPaths);
|
|
1784
1846
|
const dupDuration = Date.now() - dupStart;
|
|
1785
1847
|
console.log(`[DryScan] Duplicate detection took ${dupDuration}ms.`);
|
|
1786
1848
|
return result;
|