@goshenkata/dryscan-core 1.2.8 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-EUXUH3YW.js +15 -0
- package/dist/chunk-EUXUH3YW.js.map +1 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +306 -386
- package/dist/index.js.map +1 -1
- package/dist/services/ParallelSimilarity.js +37 -3
- package/dist/services/ParallelSimilarity.js.map +1 -1
- package/package.json +4 -1
- package/dist/chunk-ZZC4V5LV.js +0 -52
- package/dist/chunk-ZZC4V5LV.js.map +0 -1
- package/src/DryScan.ts +0 -166
- package/src/DryScanUpdater.ts +0 -236
- package/src/Gitignore.ts +0 -71
- package/src/IndexUnitExtractor.ts +0 -208
- package/src/config/configStore.ts +0 -55
- package/src/config/dryconfig.ts +0 -115
- package/src/config/indexConfig.ts +0 -13
- package/src/const.ts +0 -5
- package/src/db/DryScanDatabase.ts +0 -133
- package/src/db/entities/FileEntity.ts +0 -29
- package/src/db/entities/IndexUnitEntity.ts +0 -50
- package/src/extractors/LanguageExtractor.ts +0 -9
- package/src/extractors/java.ts +0 -376
- package/src/index.ts +0 -9
- package/src/services/DuplicateService.ts +0 -257
- package/src/services/DuplicationCache.ts +0 -210
- package/src/services/EmbeddingService.ts +0 -81
- package/src/services/ExclusionService.ts +0 -102
- package/src/services/PairingService.ts +0 -145
- package/src/services/ParallelSimilarity.ts +0 -59
- package/src/services/RepositoryInitializer.ts +0 -93
- package/src/services/UpdateService.ts +0 -31
- package/src/services/cosineSimilarityWorker.ts +0 -20
- package/src/services/types.ts +0 -10
- package/src/types/glob-gitignore.d.ts +0 -7
- package/src/types/short-uuid.d.ts +0 -7
- package/src/types/tree-sitter-langs.d.ts +0 -4
- package/src/types.ts +0 -76
- package/tsup.config.ts +0 -15
package/dist/index.js
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
|
-
__decorateClass
|
|
3
|
-
|
|
4
|
-
} from "./chunk-ZZC4V5LV.js";
|
|
2
|
+
__decorateClass
|
|
3
|
+
} from "./chunk-EUXUH3YW.js";
|
|
5
4
|
|
|
6
5
|
// src/DryScan.ts
|
|
7
6
|
import upath6 from "upath";
|
|
@@ -10,6 +9,7 @@ import fs7 from "fs/promises";
|
|
|
10
9
|
// src/const.ts
|
|
11
10
|
var DRYSCAN_DIR = ".dry";
|
|
12
11
|
var INDEX_DB = "index.db";
|
|
12
|
+
var REPORTS_DIR = "reports";
|
|
13
13
|
var FILE_CHECKSUM_ALGO = "md5";
|
|
14
14
|
var BLOCK_HASH_ALGO = "sha1";
|
|
15
15
|
|
|
@@ -1029,7 +1029,7 @@ var RepositoryInitializer = class {
|
|
|
1029
1029
|
};
|
|
1030
1030
|
|
|
1031
1031
|
// src/services/UpdateService.ts
|
|
1032
|
-
import debug5 from "debug";
|
|
1032
|
+
import debug4 from "debug";
|
|
1033
1033
|
|
|
1034
1034
|
// src/DryScanUpdater.ts
|
|
1035
1035
|
import path4 from "path";
|
|
@@ -1148,177 +1148,8 @@ async function performIncrementalUpdate(repoPath, extractor, db) {
|
|
|
1148
1148
|
return changeSet;
|
|
1149
1149
|
}
|
|
1150
1150
|
|
|
1151
|
-
// src/services/DuplicationCache.ts
|
|
1152
|
-
import debug4 from "debug";
|
|
1153
|
-
var log4 = debug4("DryScan:DuplicationCache");
|
|
1154
|
-
var DuplicationCache = class _DuplicationCache {
|
|
1155
|
-
static instance = null;
|
|
1156
|
-
comparisons = /* @__PURE__ */ new Map();
|
|
1157
|
-
fileIndex = /* @__PURE__ */ new Map();
|
|
1158
|
-
initialized = false;
|
|
1159
|
-
/** Per-run similarity matrix from a single batched library call (reset each run). */
|
|
1160
|
-
embSimMatrix = [];
|
|
1161
|
-
/** Maps unit ID to its row/column index in embSimMatrix. */
|
|
1162
|
-
embSimIndex = /* @__PURE__ */ new Map();
|
|
1163
|
-
/** Per-run memoization of parent unit similarity scores (reset each run). */
|
|
1164
|
-
parentSimCache = /* @__PURE__ */ new Map();
|
|
1165
|
-
static getInstance() {
|
|
1166
|
-
if (!_DuplicationCache.instance) {
|
|
1167
|
-
_DuplicationCache.instance = new _DuplicationCache();
|
|
1168
|
-
}
|
|
1169
|
-
return _DuplicationCache.instance;
|
|
1170
|
-
}
|
|
1171
|
-
/**
|
|
1172
|
-
* Updates the cache with fresh duplicate groups. Not awaited by callers to avoid blocking.
|
|
1173
|
-
*/
|
|
1174
|
-
async update(groups) {
|
|
1175
|
-
if (!groups) return;
|
|
1176
|
-
for (const group of groups) {
|
|
1177
|
-
const key = this.makeKey(group.left.id, group.right.id);
|
|
1178
|
-
this.comparisons.set(key, group.similarity);
|
|
1179
|
-
this.addKeyForFile(group.left.filePath, key);
|
|
1180
|
-
this.addKeyForFile(group.right.filePath, key);
|
|
1181
|
-
}
|
|
1182
|
-
this.initialized = this.initialized || groups.length > 0;
|
|
1183
|
-
}
|
|
1184
|
-
/**
|
|
1185
|
-
* Retrieves a cached similarity if present and valid for both file paths.
|
|
1186
|
-
* Returns null when the cache has not been initialized or when the pair is missing.
|
|
1187
|
-
*/
|
|
1188
|
-
get(leftId, rightId, leftFilePath, rightFilePath) {
|
|
1189
|
-
if (!this.initialized) return null;
|
|
1190
|
-
const key = this.makeKey(leftId, rightId);
|
|
1191
|
-
if (!this.fileHasKey(leftFilePath, key) || !this.fileHasKey(rightFilePath, key)) {
|
|
1192
|
-
return null;
|
|
1193
|
-
}
|
|
1194
|
-
const value = this.comparisons.get(key);
|
|
1195
|
-
return typeof value === "number" ? value : null;
|
|
1196
|
-
}
|
|
1197
|
-
/**
|
|
1198
|
-
* Invalidates all cached comparisons involving the provided file paths.
|
|
1199
|
-
*/
|
|
1200
|
-
async invalidate(paths) {
|
|
1201
|
-
if (!this.initialized || !paths || paths.length === 0) return;
|
|
1202
|
-
const unique = new Set(paths);
|
|
1203
|
-
for (const filePath of unique) {
|
|
1204
|
-
const keys = this.fileIndex.get(filePath);
|
|
1205
|
-
if (!keys) continue;
|
|
1206
|
-
for (const key of keys) {
|
|
1207
|
-
this.comparisons.delete(key);
|
|
1208
|
-
for (const [otherPath, otherKeys] of this.fileIndex.entries()) {
|
|
1209
|
-
if (otherKeys.delete(key) && otherKeys.size === 0) {
|
|
1210
|
-
this.fileIndex.delete(otherPath);
|
|
1211
|
-
}
|
|
1212
|
-
}
|
|
1213
|
-
}
|
|
1214
|
-
this.fileIndex.delete(filePath);
|
|
1215
|
-
}
|
|
1216
|
-
if (this.comparisons.size === 0) {
|
|
1217
|
-
this.initialized = false;
|
|
1218
|
-
}
|
|
1219
|
-
}
|
|
1220
|
-
/**
|
|
1221
|
-
* Clears all cached data. Intended for test setup.
|
|
1222
|
-
*/
|
|
1223
|
-
clear() {
|
|
1224
|
-
this.comparisons.clear();
|
|
1225
|
-
this.fileIndex.clear();
|
|
1226
|
-
this.initialized = false;
|
|
1227
|
-
this.embSimMatrix = [];
|
|
1228
|
-
this.embSimIndex.clear();
|
|
1229
|
-
this.clearRunCaches();
|
|
1230
|
-
}
|
|
1231
|
-
/**
|
|
1232
|
-
* Resets per-run memoization (parent similarities).
|
|
1233
|
-
* The embedding matrix is intentionally preserved so incremental runs can
|
|
1234
|
-
* reuse clean×clean values across calls.
|
|
1235
|
-
*/
|
|
1236
|
-
clearRunCaches() {
|
|
1237
|
-
this.parentSimCache.clear();
|
|
1238
|
-
}
|
|
1239
|
-
/**
|
|
1240
|
-
* Builds or incrementally updates the embedding similarity matrix.
|
|
1241
|
-
*
|
|
1242
|
-
* Full rebuild (default): replaces the entire matrix — O(n²).
|
|
1243
|
-
* Incremental (dirtyPaths provided + prior matrix exists): copies clean×clean
|
|
1244
|
-
* cells from the old matrix and recomputes only dirty rows via one batched
|
|
1245
|
-
* cosineSimilarity call — O(d·n) where d = number of dirty units.
|
|
1246
|
-
*/
|
|
1247
|
-
async buildEmbSimCache(units, dirtyPaths) {
|
|
1248
|
-
const embedded = units.filter((u) => Array.isArray(u.embedding) && u.embedding.length > 0);
|
|
1249
|
-
if (embedded.length < 2) {
|
|
1250
|
-
this.embSimMatrix = [];
|
|
1251
|
-
this.embSimIndex.clear();
|
|
1252
|
-
return;
|
|
1253
|
-
}
|
|
1254
|
-
const embeddings = embedded.map((u) => u.embedding);
|
|
1255
|
-
const newIndex = new Map(embedded.map((u, i) => [u.id, i]));
|
|
1256
|
-
const dirtySet = dirtyPaths ? new Set(dirtyPaths) : null;
|
|
1257
|
-
const hasPriorMatrix = this.embSimMatrix.length > 0;
|
|
1258
|
-
if (!dirtySet || !hasPriorMatrix) {
|
|
1259
|
-
this.embSimIndex = newIndex;
|
|
1260
|
-
this.embSimMatrix = await parallelCosineSimilarity(embeddings, embeddings);
|
|
1261
|
-
log4("Built full embedding similarity matrix: %d units", embedded.length);
|
|
1262
|
-
return;
|
|
1263
|
-
}
|
|
1264
|
-
const dirtyIds = new Set(embedded.filter((u) => dirtySet.has(u.filePath)).map((u) => u.id));
|
|
1265
|
-
if (dirtyIds.size === 0) {
|
|
1266
|
-
log4("Matrix reused: no dirty units detected");
|
|
1267
|
-
return;
|
|
1268
|
-
}
|
|
1269
|
-
const n = embedded.length;
|
|
1270
|
-
const newMatrix = Array.from({ length: n }, () => new Array(n).fill(0));
|
|
1271
|
-
for (let i = 0; i < n; i++) {
|
|
1272
|
-
for (let j = 0; j < n; j++) {
|
|
1273
|
-
if (dirtyIds.has(embedded[i].id) || dirtyIds.has(embedded[j].id)) continue;
|
|
1274
|
-
const oi = this.embSimIndex.get(embedded[i].id);
|
|
1275
|
-
const oj = this.embSimIndex.get(embedded[j].id);
|
|
1276
|
-
if (oi !== void 0 && oj !== void 0) newMatrix[i][j] = this.embSimMatrix[oi][oj];
|
|
1277
|
-
}
|
|
1278
|
-
}
|
|
1279
|
-
const dirtyIndices = embedded.reduce((acc, u, i) => dirtyIds.has(u.id) ? [...acc, i] : acc, []);
|
|
1280
|
-
const dirtyRows = await parallelCosineSimilarity(dirtyIndices.map((i) => embeddings[i]), embeddings);
|
|
1281
|
-
dirtyIndices.forEach((rowIdx, di) => {
|
|
1282
|
-
for (let j = 0; j < n; j++) {
|
|
1283
|
-
newMatrix[rowIdx][j] = dirtyRows[di][j];
|
|
1284
|
-
newMatrix[j][rowIdx] = dirtyRows[di][j];
|
|
1285
|
-
}
|
|
1286
|
-
});
|
|
1287
|
-
this.embSimIndex = newIndex;
|
|
1288
|
-
this.embSimMatrix = newMatrix;
|
|
1289
|
-
log4("Incremental matrix update: %d dirty unit(s) out of %d total", dirtyIds.size, n);
|
|
1290
|
-
}
|
|
1291
|
-
/** Returns the pre-computed cosine similarity for a pair of unit IDs, if available. */
|
|
1292
|
-
getEmbSim(id1, id2) {
|
|
1293
|
-
const i = this.embSimIndex.get(id1);
|
|
1294
|
-
const j = this.embSimIndex.get(id2);
|
|
1295
|
-
if (i === void 0 || j === void 0) return void 0;
|
|
1296
|
-
return this.embSimMatrix[i][j];
|
|
1297
|
-
}
|
|
1298
|
-
/** Returns the memoized parent similarity for the given stable key, if available. */
|
|
1299
|
-
getParentSim(key) {
|
|
1300
|
-
return this.parentSimCache.get(key);
|
|
1301
|
-
}
|
|
1302
|
-
/** Stores a memoized parent similarity for the given stable key. */
|
|
1303
|
-
setParentSim(key, sim) {
|
|
1304
|
-
this.parentSimCache.set(key, sim);
|
|
1305
|
-
}
|
|
1306
|
-
addKeyForFile(filePath, key) {
|
|
1307
|
-
const current = this.fileIndex.get(filePath) ?? /* @__PURE__ */ new Set();
|
|
1308
|
-
current.add(key);
|
|
1309
|
-
this.fileIndex.set(filePath, current);
|
|
1310
|
-
}
|
|
1311
|
-
fileHasKey(filePath, key) {
|
|
1312
|
-
const keys = this.fileIndex.get(filePath);
|
|
1313
|
-
return keys ? keys.has(key) : false;
|
|
1314
|
-
}
|
|
1315
|
-
makeKey(leftId, rightId) {
|
|
1316
|
-
return [leftId, rightId].sort().join("::");
|
|
1317
|
-
}
|
|
1318
|
-
};
|
|
1319
|
-
|
|
1320
1151
|
// src/services/UpdateService.ts
|
|
1321
|
-
var log5 = debug5("DryScan:UpdateService");
|
|
1152
|
+
var log4 = debug4("DryScan:UpdateService");
|
|
1322
1153
|
var UpdateService = class {
|
|
1323
1154
|
constructor(deps, exclusionService) {
|
|
1324
1155
|
this.deps = deps;
|
|
@@ -1327,222 +1158,18 @@ var UpdateService = class {
|
|
|
1327
1158
|
/** Returns the list of file paths that were modified or deleted (dirty). */
|
|
1328
1159
|
async updateIndex() {
|
|
1329
1160
|
const extractor = this.deps.extractor;
|
|
1330
|
-
const cache = DuplicationCache.getInstance();
|
|
1331
1161
|
try {
|
|
1332
1162
|
const changeSet = await performIncrementalUpdate(this.deps.repoPath, extractor, this.deps.db);
|
|
1333
1163
|
await this.exclusionService.cleanupExcludedFiles();
|
|
1334
1164
|
const dirtyPaths = [...changeSet.changed, ...changeSet.deleted, ...changeSet.added];
|
|
1335
|
-
await cache.invalidate(dirtyPaths);
|
|
1336
1165
|
return dirtyPaths;
|
|
1337
1166
|
} catch (err) {
|
|
1338
|
-
log5("Error during index update:", err);
|
1167
|
+
log4("Error during index update:", err);
|
|
1339
1168
|
throw err;
|
|
1340
1169
|
}
|
|
1341
1170
|
}
|
|
1342
1171
|
};
|
|
1343
1172
|
|
|
1344
|
-
// src/services/DuplicateService.ts
|
|
1345
|
-
import debug6 from "debug";
|
|
1346
|
-
import shortUuid from "short-uuid";
|
|
1347
|
-
var log6 = debug6("DryScan:DuplicateService");
|
|
1348
|
-
var DuplicateService = class {
|
|
1349
|
-
constructor(deps) {
|
|
1350
|
-
this.deps = deps;
|
|
1351
|
-
}
|
|
1352
|
-
config;
|
|
1353
|
-
cache = DuplicationCache.getInstance();
|
|
1354
|
-
/**
|
|
1355
|
-
* @param dirtyPaths - File paths changed since last run. When provided, only
|
|
1356
|
-
* dirty×all similarities are recomputed; clean×clean values are reused from
|
|
1357
|
-
* the existing matrix. Pass undefined (or omit) for a full rebuild.
|
|
1358
|
-
*/
|
|
1359
|
-
async findDuplicates(config, dirtyPaths) {
|
|
1360
|
-
this.config = config;
|
|
1361
|
-
const t0 = performance.now();
|
|
1362
|
-
const allUnits = await this.deps.db.getAllUnits();
|
|
1363
|
-
log6("Starting duplicate analysis on %d units", allUnits.length);
|
|
1364
|
-
if (allUnits.length < 2) {
|
|
1365
|
-
return { duplicates: [], score: this.computeDuplicationScore([], allUnits) };
|
|
1366
|
-
}
|
|
1367
|
-
const thresholds = this.resolveThresholds(config.threshold);
|
|
1368
|
-
const duplicates = await this.computeDuplicates(allUnits, thresholds, dirtyPaths);
|
|
1369
|
-
const filtered = duplicates.filter((g) => !this.isGroupExcluded(g));
|
|
1370
|
-
log6("Found %d duplicate groups (%d excluded)", filtered.length, duplicates.length - filtered.length);
|
|
1371
|
-
this.cache.update(filtered).catch((err) => log6("Cache update failed: %O", err));
|
|
1372
|
-
const score = this.computeDuplicationScore(filtered, allUnits);
|
|
1373
|
-
log6("findDuplicates completed in %dms", (performance.now() - t0).toFixed(2));
|
|
1374
|
-
return { duplicates: filtered, score };
|
|
1375
|
-
}
|
|
1376
|
-
resolveThresholds(functionThreshold) {
|
|
1377
|
-
const d = indexConfig.thresholds;
|
|
1378
|
-
const clamp = (v) => Math.min(1, Math.max(0, v));
|
|
1379
|
-
const fn = clamp(functionThreshold ?? d.function);
|
|
1380
|
-
return {
|
|
1381
|
-
function: fn,
|
|
1382
|
-
block: clamp(fn + d.block - d.function),
|
|
1383
|
-
class: clamp(fn + d.class - d.function)
|
|
1384
|
-
};
|
|
1385
|
-
}
|
|
1386
|
-
async computeDuplicates(units, thresholds, dirtyPaths) {
|
|
1387
|
-
this.cache.clearRunCaches();
|
|
1388
|
-
await this.cache.buildEmbSimCache(units, dirtyPaths);
|
|
1389
|
-
const duplicates = [];
|
|
1390
|
-
const t0 = performance.now();
|
|
1391
|
-
for (const [type, typedUnits] of this.groupByType(units)) {
|
|
1392
|
-
const threshold = this.getThreshold(type, thresholds);
|
|
1393
|
-
log6("Comparing %d %s units (threshold=%.3f)", typedUnits.length, type, threshold);
|
|
1394
|
-
for (let i = 0; i < typedUnits.length; i++) {
|
|
1395
|
-
for (let j = i + 1; j < typedUnits.length; j++) {
|
|
1396
|
-
const left = typedUnits[i], right = typedUnits[j];
|
|
1397
|
-
if (this.shouldSkipComparison(left, right)) continue;
|
|
1398
|
-
const cached = this.cache.get(left.id, right.id, left.filePath, right.filePath);
|
|
1399
|
-
const hasEmbeddings = left.embedding?.length && right.embedding?.length;
|
|
1400
|
-
const similarity = cached ?? (hasEmbeddings ? this.computeWeightedSimilarity(left, right, threshold) : 0);
|
|
1401
|
-
if (similarity < threshold) continue;
|
|
1402
|
-
const exclusionString = this.deps.pairing.pairKeyForUnits(left, right);
|
|
1403
|
-
if (!exclusionString) continue;
|
|
1404
|
-
duplicates.push({
|
|
1405
|
-
id: `${left.id}::${right.id}`,
|
|
1406
|
-
similarity,
|
|
1407
|
-
shortId: shortUuid.generate(),
|
|
1408
|
-
exclusionString,
|
|
1409
|
-
left: this.toMember(left),
|
|
1410
|
-
right: this.toMember(right)
|
|
1411
|
-
});
|
|
1412
|
-
}
|
|
1413
|
-
}
|
|
1414
|
-
}
|
|
1415
|
-
log6("computeDuplicates: %d duplicates in %dms", duplicates.length, (performance.now() - t0).toFixed(2));
|
|
1416
|
-
return duplicates.sort((a, b) => b.similarity - a.similarity);
|
|
1417
|
-
}
|
|
1418
|
-
isGroupExcluded(group) {
|
|
1419
|
-
const config = this.config;
|
|
1420
|
-
if (!config?.excludedPairs?.length) return false;
|
|
1421
|
-
const key = this.deps.pairing.pairKeyForUnits(group.left, group.right);
|
|
1422
|
-
if (!key) return false;
|
|
1423
|
-
const actual = this.deps.pairing.parsePairKey(key);
|
|
1424
|
-
if (!actual) return false;
|
|
1425
|
-
return config.excludedPairs.some((entry) => {
|
|
1426
|
-
const parsed = this.deps.pairing.parsePairKey(entry);
|
|
1427
|
-
return parsed ? this.deps.pairing.pairKeyMatches(actual, parsed) : false;
|
|
1428
|
-
});
|
|
1429
|
-
}
|
|
1430
|
-
getThreshold(type, thresholds) {
|
|
1431
|
-
if (type === "class" /* CLASS */) return thresholds.class;
|
|
1432
|
-
if (type === "block" /* BLOCK */) return thresholds.block;
|
|
1433
|
-
return thresholds.function;
|
|
1434
|
-
}
|
|
1435
|
-
computeWeightedSimilarity(left, right, threshold) {
|
|
1436
|
-
const selfSim = this.similarity(left, right);
|
|
1437
|
-
if (left.unitType === "class" /* CLASS */) {
|
|
1438
|
-
return selfSim * indexConfig.weights.class.self;
|
|
1439
|
-
}
|
|
1440
|
-
if (left.unitType === "function" /* FUNCTION */) {
|
|
1441
|
-
const w2 = indexConfig.weights.function;
|
|
1442
|
-
const hasPC2 = this.bothHaveParent(left, right, "class" /* CLASS */);
|
|
1443
|
-
const total2 = w2.self + (hasPC2 ? w2.parentClass : 0);
|
|
1444
|
-
if ((w2.self * selfSim + (hasPC2 ? w2.parentClass : 0)) / total2 < threshold) return 0;
|
|
1445
|
-
return (w2.self * selfSim + (hasPC2 ? w2.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total2;
|
|
1446
|
-
}
|
|
1447
|
-
const w = indexConfig.weights.block;
|
|
1448
|
-
const hasPF = this.bothHaveParent(left, right, "function" /* FUNCTION */);
|
|
1449
|
-
const hasPC = this.bothHaveParent(left, right, "class" /* CLASS */);
|
|
1450
|
-
const total = w.self + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0);
|
|
1451
|
-
if ((w.self * selfSim + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0)) / total < threshold) return 0;
|
|
1452
|
-
return (w.self * selfSim + (hasPF ? w.parentFunction * this.parentSimilarity(left, right, "function" /* FUNCTION */) : 0) + (hasPC ? w.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total;
|
|
1453
|
-
}
|
|
1454
|
-
/** Groups all units by type for the comparison loop. Units without embeddings are included
|
|
1455
|
-
* so that cache hits can still be returned for pairs whose embeddings were cleared. */
|
|
1456
|
-
groupByType(units) {
|
|
1457
|
-
const byType = /* @__PURE__ */ new Map();
|
|
1458
|
-
for (const unit of units) {
|
|
1459
|
-
const list = byType.get(unit.unitType) ?? [];
|
|
1460
|
-
list.push(unit);
|
|
1461
|
-
byType.set(unit.unitType, list);
|
|
1462
|
-
}
|
|
1463
|
-
return byType;
|
|
1464
|
-
}
|
|
1465
|
-
toMember(unit) {
|
|
1466
|
-
return {
|
|
1467
|
-
id: unit.id,
|
|
1468
|
-
name: unit.name,
|
|
1469
|
-
filePath: unit.filePath,
|
|
1470
|
-
startLine: unit.startLine,
|
|
1471
|
-
endLine: unit.endLine,
|
|
1472
|
-
code: unit.code,
|
|
1473
|
-
unitType: unit.unitType
|
|
1474
|
-
};
|
|
1475
|
-
}
|
|
1476
|
-
bothHaveParent(left, right, type) {
|
|
1477
|
-
return !!this.findParent(left, type) && !!this.findParent(right, type);
|
|
1478
|
-
}
|
|
1479
|
-
parentSimilarity(left, right, type) {
|
|
1480
|
-
const lp = this.findParent(left, type), rp = this.findParent(right, type);
|
|
1481
|
-
if (!lp || !rp) return 0;
|
|
1482
|
-
const key = lp.id < rp.id ? `${lp.id}::${rp.id}` : `${rp.id}::${lp.id}`;
|
|
1483
|
-
const cached = this.cache.getParentSim(key);
|
|
1484
|
-
if (cached !== void 0) return cached;
|
|
1485
|
-
const sim = this.similarity(lp, rp);
|
|
1486
|
-
this.cache.setParentSim(key, sim);
|
|
1487
|
-
return sim;
|
|
1488
|
-
}
|
|
1489
|
-
/** Resolves similarity via the pre-computed embedding matrix, falling back to best child match. */
|
|
1490
|
-
similarity(left, right) {
|
|
1491
|
-
return this.cache.getEmbSim(left.id, right.id) ?? this.childSimilarity(left, right);
|
|
1492
|
-
}
|
|
1493
|
-
childSimilarity(left, right) {
|
|
1494
|
-
const lc = left.children ?? [], rc = right.children ?? [];
|
|
1495
|
-
if (!lc.length || !rc.length) return 0;
|
|
1496
|
-
let best = 0;
|
|
1497
|
-
for (const l of lc) {
|
|
1498
|
-
for (const r of rc) {
|
|
1499
|
-
if (l.unitType !== r.unitType) continue;
|
|
1500
|
-
const sim = this.similarity(l, r);
|
|
1501
|
-
if (sim > best) best = sim;
|
|
1502
|
-
}
|
|
1503
|
-
}
|
|
1504
|
-
return best;
|
|
1505
|
-
}
|
|
1506
|
-
shouldSkipComparison(left, right) {
|
|
1507
|
-
if (left.unitType !== "block" /* BLOCK */ || right.unitType !== "block" /* BLOCK */) return false;
|
|
1508
|
-
if (left.filePath !== right.filePath) return false;
|
|
1509
|
-
return left.startLine <= right.startLine && left.endLine >= right.endLine || right.startLine <= left.startLine && right.endLine >= left.endLine;
|
|
1510
|
-
}
|
|
1511
|
-
findParent(unit, type) {
|
|
1512
|
-
let p = unit.parent;
|
|
1513
|
-
while (p) {
|
|
1514
|
-
if (p.unitType === type) return p;
|
|
1515
|
-
p = p.parent;
|
|
1516
|
-
}
|
|
1517
|
-
return null;
|
|
1518
|
-
}
|
|
1519
|
-
computeDuplicationScore(duplicates, allUnits) {
|
|
1520
|
-
const totalLines = allUnits.reduce((sum, u) => sum + u.endLine - u.startLine + 1, 0);
|
|
1521
|
-
if (!totalLines || !duplicates.length) {
|
|
1522
|
-
return { score: 0, grade: "Excellent", totalLines, duplicateLines: 0, duplicateGroups: 0 };
|
|
1523
|
-
}
|
|
1524
|
-
const duplicateLines = duplicates.reduce((sum, g) => {
|
|
1525
|
-
const avg = (g.left.endLine - g.left.startLine + 1 + (g.right.endLine - g.right.startLine + 1)) / 2;
|
|
1526
|
-
return sum + g.similarity * avg;
|
|
1527
|
-
}, 0);
|
|
1528
|
-
const score = duplicateLines / totalLines * 100;
|
|
1529
|
-
return {
|
|
1530
|
-
score,
|
|
1531
|
-
grade: this.getScoreGrade(score),
|
|
1532
|
-
totalLines,
|
|
1533
|
-
duplicateLines: Math.round(duplicateLines),
|
|
1534
|
-
duplicateGroups: duplicates.length
|
|
1535
|
-
};
|
|
1536
|
-
}
|
|
1537
|
-
getScoreGrade(score) {
|
|
1538
|
-
if (score < 5) return "Excellent";
|
|
1539
|
-
if (score < 15) return "Good";
|
|
1540
|
-
if (score < 30) return "Fair";
|
|
1541
|
-
if (score < 50) return "Poor";
|
|
1542
|
-
return "Critical";
|
|
1543
|
-
}
|
|
1544
|
-
};
|
|
1545
|
-
|
|
1546
1173
|
// src/services/ExclusionService.ts
|
|
1547
1174
|
import { minimatch } from "minimatch";
|
|
1548
1175
|
var ExclusionService = class {
|
|
@@ -1629,9 +1256,9 @@ var ExclusionService = class {
|
|
|
1629
1256
|
|
|
1630
1257
|
// src/services/PairingService.ts
|
|
1631
1258
|
import crypto3 from "crypto";
|
|
1632
|
-
import debug7 from "debug";
|
|
1259
|
+
import debug5 from "debug";
|
|
1633
1260
|
import { minimatch as minimatch2 } from "minimatch";
|
|
1634
|
-
var log7 = debug7("DryScan:pairs");
|
|
1261
|
+
var log5 = debug5("DryScan:pairs");
|
|
1635
1262
|
var PairingService = class {
|
|
1636
1263
|
constructor(indexUnitExtractor) {
|
|
1637
1264
|
this.indexUnitExtractor = indexUnitExtractor;
|
|
@@ -1642,7 +1269,7 @@ var PairingService = class {
|
|
|
1642
1269
|
*/
|
|
1643
1270
|
pairKeyForUnits(left, right) {
|
|
1644
1271
|
if (left.unitType !== right.unitType) {
|
|
1645
|
-
log7("Skipping pair with mismatched types: %s vs %s", left.unitType, right.unitType);
|
1272
|
+
log5("Skipping pair with mismatched types: %s vs %s", left.unitType, right.unitType);
|
|
1646
1273
|
return null;
|
|
1647
1274
|
}
|
|
1648
1275
|
const type = left.unitType;
|
|
@@ -1658,13 +1285,13 @@ var PairingService = class {
|
|
|
1658
1285
|
parsePairKey(value) {
|
|
1659
1286
|
const parts = value.split("|");
|
|
1660
1287
|
if (parts.length !== 3) {
|
|
1661
|
-
log7("Invalid pair key format: %s", value);
|
1288
|
+
log5("Invalid pair key format: %s", value);
|
|
1662
1289
|
return null;
|
|
1663
1290
|
}
|
|
1664
1291
|
const [typeRaw, leftRaw, rightRaw] = parts;
|
|
1665
1292
|
const type = this.stringToUnitType(typeRaw);
|
|
1666
1293
|
if (!type) {
|
|
1667
|
-
log7("Unknown unit type in pair key: %s", typeRaw);
|
1294
|
+
log5("Unknown unit type in pair key: %s", typeRaw);
|
|
1668
1295
|
return null;
|
|
1669
1296
|
}
|
|
1670
1297
|
const [left, right] = [leftRaw, rightRaw].sort();
|
|
@@ -1736,6 +1363,259 @@ var PairingService = class {
|
|
|
1736
1363
|
|
|
1737
1364
|
// src/DryScan.ts
|
|
1738
1365
|
import { existsSync } from "fs";
|
|
1366
|
+
|
|
1367
|
+
// src/services/DuplicateService.ts
|
|
1368
|
+
import debug6 from "debug";
|
|
1369
|
+
import shortUuid from "short-uuid";
|
|
1370
|
+
import { cosineSimilarity } from "@langchain/core/utils/math";
|
|
1371
|
+
var log6 = debug6("DryScan:DuplicateService");
|
|
1372
|
+
var DuplicateService = class {
|
|
1373
|
+
constructor(deps) {
|
|
1374
|
+
this.deps = deps;
|
|
1375
|
+
}
|
|
1376
|
+
config;
|
|
1377
|
+
similarityCache = /* @__PURE__ */ new Map();
|
|
1378
|
+
parentSimCache = /* @__PURE__ */ new Map();
|
|
1379
|
+
async findDuplicates(config, dirtyPaths = [], previousReport) {
|
|
1380
|
+
this.config = config;
|
|
1381
|
+
this.similarityCache = /* @__PURE__ */ new Map();
|
|
1382
|
+
this.parentSimCache = /* @__PURE__ */ new Map();
|
|
1383
|
+
const t0 = performance.now();
|
|
1384
|
+
const allUnits = await this.deps.db.getAllUnits();
|
|
1385
|
+
log6("Starting duplicate analysis on %d units", allUnits.length);
|
|
1386
|
+
if (allUnits.length < 2) {
|
|
1387
|
+
return { duplicates: [], score: this.computeDuplicationScore([], allUnits) };
|
|
1388
|
+
}
|
|
1389
|
+
const thresholds = this.resolveThresholds(config.threshold);
|
|
1390
|
+
const dirtySet = new Set(dirtyPaths);
|
|
1391
|
+
const canReuseFromReport = Boolean(previousReport && previousReport.threshold === config.threshold);
|
|
1392
|
+
const reusableClean = canReuseFromReport ? this.reuseCleanPairsFromPreviousReport(previousReport, allUnits, dirtySet) : [];
|
|
1393
|
+
const recomputed = this.computeDuplicates(
|
|
1394
|
+
allUnits,
|
|
1395
|
+
thresholds,
|
|
1396
|
+
canReuseFromReport ? dirtySet : null
|
|
1397
|
+
);
|
|
1398
|
+
const merged = this.mergeDuplicates(reusableClean, recomputed);
|
|
1399
|
+
const filtered = merged.filter((g) => !this.isGroupExcluded(g));
|
|
1400
|
+
log6(
|
|
1401
|
+
"Found %d duplicate groups (%d excluded, %d reused)",
|
|
1402
|
+
filtered.length,
|
|
1403
|
+
merged.length - filtered.length,
|
|
1404
|
+
reusableClean.length
|
|
1405
|
+
);
|
|
1406
|
+
const score = this.computeDuplicationScore(filtered, allUnits);
|
|
1407
|
+
log6("findDuplicates completed in %dms", (performance.now() - t0).toFixed(2));
|
|
1408
|
+
return { duplicates: filtered, score };
|
|
1409
|
+
}
|
|
1410
|
+
resolveThresholds(functionThreshold) {
|
|
1411
|
+
const d = indexConfig.thresholds;
|
|
1412
|
+
const clamp = (v) => Math.min(1, Math.max(0, v));
|
|
1413
|
+
const fn = clamp(functionThreshold ?? d.function);
|
|
1414
|
+
return {
|
|
1415
|
+
function: fn,
|
|
1416
|
+
block: clamp(fn + d.block - d.function),
|
|
1417
|
+
class: clamp(fn + d.class - d.function)
|
|
1418
|
+
};
|
|
1419
|
+
}
|
|
1420
|
+
computeDuplicates(units, thresholds, dirtySet) {
|
|
1421
|
+
if (dirtySet && dirtySet.size === 0) {
|
|
1422
|
+
log6("Skipping recomputation: no dirty files and previous report threshold matches");
|
|
1423
|
+
return [];
|
|
1424
|
+
}
|
|
1425
|
+
const duplicates = [];
|
|
1426
|
+
const t0 = performance.now();
|
|
1427
|
+
for (const [type, typedUnits] of this.groupByType(units)) {
|
|
1428
|
+
const threshold = this.getThreshold(type, thresholds);
|
|
1429
|
+
log6("Comparing %d %s units (threshold=%.3f)", typedUnits.length, type, threshold);
|
|
1430
|
+
for (let i = 0; i < typedUnits.length; i++) {
|
|
1431
|
+
for (let j = i + 1; j < typedUnits.length; j++) {
|
|
1432
|
+
const left = typedUnits[i];
|
|
1433
|
+
const right = typedUnits[j];
|
|
1434
|
+
if (this.shouldSkipComparison(left, right)) continue;
|
|
1435
|
+
if (dirtySet && !dirtySet.has(left.filePath) && !dirtySet.has(right.filePath)) {
|
|
1436
|
+
continue;
|
|
1437
|
+
}
|
|
1438
|
+
const hasEmbeddings = left.embedding?.length && right.embedding?.length;
|
|
1439
|
+
const similarity = hasEmbeddings ? this.computeWeightedSimilarity(left, right, threshold) : 0;
|
|
1440
|
+
if (similarity < threshold) continue;
|
|
1441
|
+
const exclusionString = this.deps.pairing.pairKeyForUnits(left, right);
|
|
1442
|
+
if (!exclusionString) continue;
|
|
1443
|
+
duplicates.push({
|
|
1444
|
+
id: `${left.id}::${right.id}`,
|
|
1445
|
+
similarity,
|
|
1446
|
+
shortId: shortUuid.generate(),
|
|
1447
|
+
exclusionString,
|
|
1448
|
+
left: this.toMember(left),
|
|
1449
|
+
right: this.toMember(right)
|
|
1450
|
+
});
|
|
1451
|
+
}
|
|
1452
|
+
}
|
|
1453
|
+
}
|
|
1454
|
+
log6("computeDuplicates: %d duplicates in %dms", duplicates.length, (performance.now() - t0).toFixed(2));
|
|
1455
|
+
return duplicates.sort((a, b) => b.similarity - a.similarity);
|
|
1456
|
+
}
|
|
1457
|
+
reuseCleanPairsFromPreviousReport(report, units, dirtySet) {
|
|
1458
|
+
const unitIds = new Set(units.map((u) => u.id));
|
|
1459
|
+
const reusable = report.duplicates.filter((group) => {
|
|
1460
|
+
const leftDirty = dirtySet.has(group.left.filePath);
|
|
1461
|
+
const rightDirty = dirtySet.has(group.right.filePath);
|
|
1462
|
+
if (leftDirty || rightDirty) return false;
|
|
1463
|
+
return unitIds.has(group.left.id) && unitIds.has(group.right.id);
|
|
1464
|
+
});
|
|
1465
|
+
log6("Reused %d clean-clean duplicate groups from previous report", reusable.length);
|
|
1466
|
+
return reusable;
|
|
1467
|
+
}
|
|
1468
|
+
mergeDuplicates(reused, recomputed) {
|
|
1469
|
+
const merged = /* @__PURE__ */ new Map();
|
|
1470
|
+
for (const group of reused) {
|
|
1471
|
+
merged.set(this.groupKey(group), group);
|
|
1472
|
+
}
|
|
1473
|
+
for (const group of recomputed) {
|
|
1474
|
+
merged.set(this.groupKey(group), group);
|
|
1475
|
+
}
|
|
1476
|
+
return Array.from(merged.values()).sort((a, b) => b.similarity - a.similarity);
|
|
1477
|
+
}
|
|
1478
|
+
groupKey(group) {
|
|
1479
|
+
return [group.left.id, group.right.id].sort().join("::");
|
|
1480
|
+
}
|
|
1481
|
+
isGroupExcluded(group) {
|
|
1482
|
+
const config = this.config;
|
|
1483
|
+
if (!config?.excludedPairs?.length) return false;
|
|
1484
|
+
const key = this.deps.pairing.pairKeyForUnits(group.left, group.right);
|
|
1485
|
+
if (!key) return false;
|
|
1486
|
+
const actual = this.deps.pairing.parsePairKey(key);
|
|
1487
|
+
if (!actual) return false;
|
|
1488
|
+
return config.excludedPairs.some((entry) => {
|
|
1489
|
+
const parsed = this.deps.pairing.parsePairKey(entry);
|
|
1490
|
+
return parsed ? this.deps.pairing.pairKeyMatches(actual, parsed) : false;
|
|
1491
|
+
});
|
|
1492
|
+
}
|
|
1493
|
+
getThreshold(type, thresholds) {
|
|
1494
|
+
if (type === "class" /* CLASS */) return thresholds.class;
|
|
1495
|
+
if (type === "block" /* BLOCK */) return thresholds.block;
|
|
1496
|
+
return thresholds.function;
|
|
1497
|
+
}
|
|
1498
|
+
computeWeightedSimilarity(left, right, threshold) {
|
|
1499
|
+
const selfSim = this.similarity(left, right);
|
|
1500
|
+
if (left.unitType === "class" /* CLASS */) {
|
|
1501
|
+
return selfSim * indexConfig.weights.class.self;
|
|
1502
|
+
}
|
|
1503
|
+
if (left.unitType === "function" /* FUNCTION */) {
|
|
1504
|
+
const w2 = indexConfig.weights.function;
|
|
1505
|
+
const hasPC2 = this.bothHaveParent(left, right, "class" /* CLASS */);
|
|
1506
|
+
const total2 = w2.self + (hasPC2 ? w2.parentClass : 0);
|
|
1507
|
+
if ((w2.self * selfSim + (hasPC2 ? w2.parentClass : 0)) / total2 < threshold) return 0;
|
|
1508
|
+
return (w2.self * selfSim + (hasPC2 ? w2.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total2;
|
|
1509
|
+
}
|
|
1510
|
+
const w = indexConfig.weights.block;
|
|
1511
|
+
const hasPF = this.bothHaveParent(left, right, "function" /* FUNCTION */);
|
|
1512
|
+
const hasPC = this.bothHaveParent(left, right, "class" /* CLASS */);
|
|
1513
|
+
const total = w.self + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0);
|
|
1514
|
+
if ((w.self * selfSim + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0)) / total < threshold) return 0;
|
|
1515
|
+
return (w.self * selfSim + (hasPF ? w.parentFunction * this.parentSimilarity(left, right, "function" /* FUNCTION */) : 0) + (hasPC ? w.parentClass * this.parentSimilarity(left, right, "class" /* CLASS */) : 0)) / total;
|
|
1516
|
+
}
|
|
1517
|
+
groupByType(units) {
|
|
1518
|
+
const byType = /* @__PURE__ */ new Map();
|
|
1519
|
+
for (const unit of units) {
|
|
1520
|
+
const list = byType.get(unit.unitType) ?? [];
|
|
1521
|
+
list.push(unit);
|
|
1522
|
+
byType.set(unit.unitType, list);
|
|
1523
|
+
}
|
|
1524
|
+
return byType;
|
|
1525
|
+
}
|
|
1526
|
+
toMember(unit) {
|
|
1527
|
+
return {
|
|
1528
|
+
id: unit.id,
|
|
1529
|
+
name: unit.name,
|
|
1530
|
+
filePath: unit.filePath,
|
|
1531
|
+
startLine: unit.startLine,
|
|
1532
|
+
endLine: unit.endLine,
|
|
1533
|
+
code: unit.code,
|
|
1534
|
+
unitType: unit.unitType
|
|
1535
|
+
};
|
|
1536
|
+
}
|
|
1537
|
+
bothHaveParent(left, right, type) {
|
|
1538
|
+
return !!this.findParent(left, type) && !!this.findParent(right, type);
|
|
1539
|
+
}
|
|
1540
|
+
parentSimilarity(left, right, type) {
|
|
1541
|
+
const lp = this.findParent(left, type);
|
|
1542
|
+
const rp = this.findParent(right, type);
|
|
1543
|
+
if (!lp || !rp) return 0;
|
|
1544
|
+
const key = lp.id < rp.id ? `${lp.id}::${rp.id}` : `${rp.id}::${lp.id}`;
|
|
1545
|
+
const cached = this.parentSimCache.get(key);
|
|
1546
|
+
if (cached !== void 0) return cached;
|
|
1547
|
+
const sim = this.similarity(lp, rp);
|
|
1548
|
+
this.parentSimCache.set(key, sim);
|
|
1549
|
+
return sim;
|
|
1550
|
+
}
|
|
1551
|
+
similarity(left, right) {
|
|
1552
|
+
const key = left.id < right.id ? `${left.id}::${right.id}` : `${right.id}::${left.id}`;
|
|
1553
|
+
const cached = this.similarityCache.get(key);
|
|
1554
|
+
if (cached !== void 0) return cached;
|
|
1555
|
+
let value = 0;
|
|
1556
|
+
if (left.embedding?.length && right.embedding?.length) {
|
|
1557
|
+
value = cosineSimilarity([left.embedding], [right.embedding])[0][0] ?? 0;
|
|
1558
|
+
} else {
|
|
1559
|
+
value = this.childSimilarity(left, right);
|
|
1560
|
+
}
|
|
1561
|
+
this.similarityCache.set(key, value);
|
|
1562
|
+
return value;
|
|
1563
|
+
}
|
|
1564
|
+
childSimilarity(left, right) {
|
|
1565
|
+
const lc = left.children ?? [];
|
|
1566
|
+
const rc = right.children ?? [];
|
|
1567
|
+
if (!lc.length || !rc.length) return 0;
|
|
1568
|
+
let best = 0;
|
|
1569
|
+
for (const l of lc) {
|
|
1570
|
+
for (const r of rc) {
|
|
1571
|
+
if (l.unitType !== r.unitType) continue;
|
|
1572
|
+
const sim = this.similarity(l, r);
|
|
1573
|
+
if (sim > best) best = sim;
|
|
1574
|
+
}
|
|
1575
|
+
}
|
|
1576
|
+
return best;
|
|
1577
|
+
}
|
|
1578
|
+
shouldSkipComparison(left, right) {
|
|
1579
|
+
if (left.unitType !== "block" /* BLOCK */ || right.unitType !== "block" /* BLOCK */) return false;
|
|
1580
|
+
if (left.filePath !== right.filePath) return false;
|
|
1581
|
+
return left.startLine <= right.startLine && left.endLine >= right.endLine || right.startLine <= left.startLine && right.endLine >= left.endLine;
|
|
1582
|
+
}
|
|
1583
|
+
findParent(unit, type) {
|
|
1584
|
+
let p = unit.parent;
|
|
1585
|
+
while (p) {
|
|
1586
|
+
if (p.unitType === type) return p;
|
|
1587
|
+
p = p.parent;
|
|
1588
|
+
}
|
|
1589
|
+
return null;
|
|
1590
|
+
}
|
|
1591
|
+
computeDuplicationScore(duplicates, allUnits) {
|
|
1592
|
+
const totalLines = allUnits.reduce((sum, u) => sum + u.endLine - u.startLine + 1, 0);
|
|
1593
|
+
if (!totalLines || !duplicates.length) {
|
|
1594
|
+
return { score: 0, grade: "Excellent", totalLines, duplicateLines: 0, duplicateGroups: 0 };
|
|
1595
|
+
}
|
|
1596
|
+
const duplicateLines = duplicates.reduce((sum, g) => {
|
|
1597
|
+
const avg = (g.left.endLine - g.left.startLine + 1 + (g.right.endLine - g.right.startLine + 1)) / 2;
|
|
1598
|
+
return sum + g.similarity * avg;
|
|
1599
|
+
}, 0);
|
|
1600
|
+
const score = duplicateLines / totalLines * 100;
|
|
1601
|
+
return {
|
|
1602
|
+
score,
|
|
1603
|
+
grade: this.getScoreGrade(score),
|
|
1604
|
+
totalLines,
|
|
1605
|
+
duplicateLines: Math.round(duplicateLines),
|
|
1606
|
+
duplicateGroups: duplicates.length
|
|
1607
|
+
};
|
|
1608
|
+
}
|
|
1609
|
+
getScoreGrade(score) {
|
|
1610
|
+
if (score < 5) return "Excellent";
|
|
1611
|
+
if (score < 15) return "Good";
|
|
1612
|
+
if (score < 30) return "Fair";
|
|
1613
|
+
if (score < 50) return "Poor";
|
|
1614
|
+
return "Critical";
|
|
1615
|
+
}
|
|
1616
|
+
};
|
|
1617
|
+
|
|
1618
|
+
// src/DryScan.ts
|
|
1739
1619
|
var DryScan = class {
|
|
1740
1620
|
repoPath;
|
|
1741
1621
|
extractor;
|
|
@@ -1809,7 +1689,7 @@ var DryScan = class {
|
|
|
1809
1689
|
async buildDuplicateReport() {
|
|
1810
1690
|
const config = await this.loadConfig();
|
|
1811
1691
|
const analysis = await this.findDuplicates(config);
|
|
1812
|
-
|
|
1692
|
+
const report = {
|
|
1813
1693
|
version: 1,
|
|
1814
1694
|
generatedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1815
1695
|
threshold: config.threshold,
|
|
@@ -1817,6 +1697,8 @@ var DryScan = class {
|
|
|
1817
1697
|
score: analysis.score,
|
|
1818
1698
|
duplicates: analysis.duplicates
|
|
1819
1699
|
};
|
|
1700
|
+
await this.saveReport(report);
|
|
1701
|
+
return report;
|
|
1820
1702
|
}
|
|
1821
1703
|
/**
|
|
1822
1704
|
* Finds duplicate code blocks using cosine similarity on embeddings.
|
|
@@ -1833,9 +1715,13 @@ var DryScan = class {
|
|
|
1833
1715
|
const dirtyPaths = await this.updateIndex();
|
|
1834
1716
|
const updateDuration = Date.now() - updateStart;
|
|
1835
1717
|
console.log(`[DryScan] Index update took ${updateDuration}ms.`);
|
|
1718
|
+
const previousReport = await this.loadLatestReport();
|
|
1719
|
+
if (previousReport?.threshold === config.threshold) {
|
|
1720
|
+
console.log("[DryScan] Reusing clean-clean duplicates from latest report (threshold unchanged).");
|
|
1721
|
+
}
|
|
1836
1722
|
console.log("[DryScan] Detecting duplicates...");
|
|
1837
1723
|
const dupStart = Date.now();
|
|
1838
|
-
const result = await this.services.duplicate.findDuplicates(config, dirtyPaths);
|
|
1724
|
+
const result = await this.services.duplicate.findDuplicates(config, dirtyPaths, previousReport);
|
|
1839
1725
|
const dupDuration = Date.now() - dupStart;
|
|
1840
1726
|
console.log(`[DryScan] Duplicate detection took ${dupDuration}ms.`);
|
|
1841
1727
|
return result;
|
|
@@ -1857,6 +1743,40 @@ var DryScan = class {
|
|
|
1857
1743
|
async loadConfig() {
|
|
1858
1744
|
return configStore.get(this.repoPath);
|
|
1859
1745
|
}
|
|
1746
|
+
async saveReport(report) {
|
|
1747
|
+
const reportDir = upath6.join(this.repoPath, DRYSCAN_DIR, REPORTS_DIR);
|
|
1748
|
+
await fs7.mkdir(reportDir, { recursive: true });
|
|
1749
|
+
const safeTimestamp = report.generatedAt.replace(/[:.]/g, "-");
|
|
1750
|
+
const reportPath = upath6.join(reportDir, `dupes-${safeTimestamp}.json`);
|
|
1751
|
+
await fs7.writeFile(reportPath, JSON.stringify(report, null, 2), "utf8");
|
|
1752
|
+
}
|
|
1753
|
+
async loadLatestReport() {
|
|
1754
|
+
const reportDir = upath6.join(this.repoPath, DRYSCAN_DIR, REPORTS_DIR);
|
|
1755
|
+
let entries;
|
|
1756
|
+
try {
|
|
1757
|
+
entries = await fs7.readdir(reportDir);
|
|
1758
|
+
} catch (err) {
|
|
1759
|
+
if (err?.code === "ENOENT") return null;
|
|
1760
|
+
throw err;
|
|
1761
|
+
}
|
|
1762
|
+
const jsonReports = entries.filter((name) => name.endsWith(".json"));
|
|
1763
|
+
if (jsonReports.length === 0) return null;
|
|
1764
|
+
const withStats = await Promise.all(
|
|
1765
|
+
jsonReports.map(async (name) => {
|
|
1766
|
+
const fullPath = upath6.join(reportDir, name);
|
|
1767
|
+
const stat = await fs7.stat(fullPath);
|
|
1768
|
+
return { fullPath, mtimeMs: stat.mtimeMs };
|
|
1769
|
+
})
|
|
1770
|
+
);
|
|
1771
|
+
withStats.sort((a, b) => b.mtimeMs - a.mtimeMs);
|
|
1772
|
+
const latest = withStats[0];
|
|
1773
|
+
const raw = await fs7.readFile(latest.fullPath, "utf8");
|
|
1774
|
+
const parsed = JSON.parse(raw);
|
|
1775
|
+
if (!parsed || !Array.isArray(parsed.duplicates) || typeof parsed.threshold !== "number") {
|
|
1776
|
+
return null;
|
|
1777
|
+
}
|
|
1778
|
+
return parsed;
|
|
1779
|
+
}
|
|
1860
1780
|
};
|
|
1861
1781
|
export {
|
|
1862
1782
|
DryScan,
|