@goshenkata/dryscan-core 1.2.5 → 1.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +1 -1
- package/dist/index.js +246 -184
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/src/DryScan.ts +5 -4
- package/src/config/dryconfig.ts +1 -1
- package/src/extractors/java.ts +22 -7
- package/src/services/DuplicateService.ts +133 -184
- package/src/services/DuplicationCache.ts +107 -1
- package/src/services/UpdateService.ts +5 -2
package/package.json
CHANGED
package/src/DryScan.ts
CHANGED
|
@@ -89,14 +89,15 @@ export class DryScan {
|
|
|
89
89
|
* 6. Recompute embeddings for affected units
|
|
90
90
|
* 7. Update file tracking metadata
|
|
91
91
|
*/
|
|
92
|
-
async updateIndex(): Promise<
|
|
92
|
+
async updateIndex(): Promise<string[]> {
|
|
93
93
|
console.log(`[DryScan] Updating index at ${this.repoPath}...`);
|
|
94
94
|
console.log("[DryScan] Checking for file changes...");
|
|
95
95
|
const start = Date.now();
|
|
96
96
|
await this.ensureDatabase();
|
|
97
|
-
await this.services.updater.updateIndex();
|
|
97
|
+
const dirtyPaths = await this.services.updater.updateIndex();
|
|
98
98
|
const duration = Date.now() - start;
|
|
99
99
|
console.log(`[DryScan] Index update complete. Took ${duration}ms.`);
|
|
100
|
+
return dirtyPaths;
|
|
100
101
|
}
|
|
101
102
|
|
|
102
103
|
|
|
@@ -129,13 +130,13 @@ export class DryScan {
|
|
|
129
130
|
|
|
130
131
|
console.log("[DryScan] Updating index...");
|
|
131
132
|
const updateStart = Date.now();
|
|
132
|
-
await this.updateIndex();
|
|
133
|
+
const dirtyPaths = await this.updateIndex();
|
|
133
134
|
const updateDuration = Date.now() - updateStart;
|
|
134
135
|
console.log(`[DryScan] Index update took ${updateDuration}ms.`);
|
|
135
136
|
|
|
136
137
|
console.log("[DryScan] Detecting duplicates...");
|
|
137
138
|
const dupStart = Date.now();
|
|
138
|
-
const result = await this.services.duplicate.findDuplicates(config);
|
|
139
|
+
const result = await this.services.duplicate.findDuplicates(config, dirtyPaths);
|
|
139
140
|
const dupDuration = Date.now() - dupStart;
|
|
140
141
|
console.log(`[DryScan] Duplicate detection took ${dupDuration}ms.`);
|
|
141
142
|
|
package/src/config/dryconfig.ts
CHANGED
package/src/extractors/java.ts
CHANGED
|
@@ -72,7 +72,8 @@ export class JavaExtractor implements LanguageExtractor {
|
|
|
72
72
|
const fnUnit = this.buildFunctionUnit(node, source, fileRelPath, currentClass);
|
|
73
73
|
const fnLength = fnUnit.endLine - fnUnit.startLine;
|
|
74
74
|
const bodyNode = this.getFunctionBody(node);
|
|
75
|
-
const
|
|
75
|
+
const fnArity = this.getNodeArity(node);
|
|
76
|
+
const skipFunction = this.shouldSkip(IndexUnitType.FUNCTION, fnUnit.name, fnLength, fnArity);
|
|
76
77
|
|
|
77
78
|
if (skipFunction) {
|
|
78
79
|
return;
|
|
@@ -158,7 +159,7 @@ export class JavaExtractor implements LanguageExtractor {
|
|
|
158
159
|
return crypto.createHash(BLOCK_HASH_ALGO).update(normalized).digest("hex");
|
|
159
160
|
}
|
|
160
161
|
|
|
161
|
-
private shouldSkip(unitType: IndexUnitType, name: string, lineCount: number): boolean {
|
|
162
|
+
private shouldSkip(unitType: IndexUnitType, name: string, lineCount: number, arity?: number): boolean {
|
|
162
163
|
if (!this.config) {
|
|
163
164
|
throw new Error("Config not loaded before skip evaluation");
|
|
164
165
|
}
|
|
@@ -167,17 +168,30 @@ export class JavaExtractor implements LanguageExtractor {
|
|
|
167
168
|
? Math.max(indexConfig.blockMinLines, config.minBlockLines ?? 0)
|
|
168
169
|
: config.minLines;
|
|
169
170
|
const belowMin = minLines > 0 && lineCount < minLines;
|
|
170
|
-
const trivial = unitType === IndexUnitType.FUNCTION && this.isTrivialFunction(name);
|
|
171
|
+
const trivial = unitType === IndexUnitType.FUNCTION && this.isTrivialFunction(name, arity ?? 0);
|
|
171
172
|
return belowMin || trivial;
|
|
172
173
|
}
|
|
173
174
|
|
|
174
|
-
|
|
175
|
+
/**
|
|
176
|
+
* A function is trivial if it follows a simple accessor pattern:
|
|
177
|
+
* - getters/isers: name matches get[A-Z] or is[A-Z] with exactly 0 parameters
|
|
178
|
+
* - setters: name matches set[A-Z] with at most 1 parameter
|
|
179
|
+
* Methods like getUserById(Long id) have arity > 0 and are NOT trivial.
|
|
180
|
+
*/
|
|
181
|
+
private isTrivialFunction(fullName: string, arity: number): boolean {
|
|
175
182
|
const simpleName = fullName.split(".").pop() || fullName;
|
|
176
|
-
const isGetter = /^(get|is)[A-Z]/.test(simpleName);
|
|
177
|
-
const isSetter = /^set[A-Z]/.test(simpleName);
|
|
183
|
+
const isGetter = /^(get|is)[A-Z]/.test(simpleName) && arity === 0;
|
|
184
|
+
const isSetter = /^set[A-Z]/.test(simpleName) && arity <= 1;
|
|
178
185
|
return isGetter || isSetter;
|
|
179
186
|
}
|
|
180
187
|
|
|
188
|
+
/** Counts the formal parameters of a method or constructor node. */
|
|
189
|
+
private getNodeArity(node: Parser.SyntaxNode): number {
|
|
190
|
+
const params = node.childForFieldName?.("parameters");
|
|
191
|
+
if (!params) return 0;
|
|
192
|
+
return params.namedChildren.filter(c => c.type === "formal_parameter" || c.type === "spread_parameter").length;
|
|
193
|
+
}
|
|
194
|
+
|
|
181
195
|
private isDtoClass(node: Parser.SyntaxNode, source: string, className: string): boolean {
|
|
182
196
|
const classBody = node.children.find((child) => child.type === "class_body");
|
|
183
197
|
if (!classBody) return false;
|
|
@@ -200,7 +214,8 @@ export class JavaExtractor implements LanguageExtractor {
|
|
|
200
214
|
if (child.type === "method_declaration" || child.type === "constructor_declaration") {
|
|
201
215
|
const simpleName = this.getSimpleFunctionName(child, source);
|
|
202
216
|
const fullName = `${className}.${simpleName}`;
|
|
203
|
-
|
|
217
|
+
const arity = this.getNodeArity(child);
|
|
218
|
+
if (!this.isTrivialFunction(fullName, arity)) {
|
|
204
219
|
return false;
|
|
205
220
|
}
|
|
206
221
|
continue;
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import debug from "debug";
|
|
2
2
|
import shortUuid from "short-uuid";
|
|
3
|
-
import { cosineSimilarity } from "@langchain/core/utils/math";
|
|
4
3
|
import { DryScanServiceDeps } from "./types";
|
|
5
4
|
import { DuplicateAnalysisResult, DuplicateGroup, DuplicationScore, IndexUnit, IndexUnitType } from "../types";
|
|
6
5
|
import { indexConfig } from "../config/indexConfig";
|
|
@@ -15,137 +14,93 @@ export class DuplicateService {
|
|
|
15
14
|
|
|
16
15
|
constructor(private readonly deps: DryScanServiceDeps) {}
|
|
17
16
|
|
|
18
|
-
|
|
19
|
-
|
|
17
|
+
/**
|
|
18
|
+
* @param dirtyPaths - File paths changed since last run. When provided, only
|
|
19
|
+
* dirty×all similarities are recomputed; clean×clean values are reused from
|
|
20
|
+
* the existing matrix. Pass undefined (or omit) for a full rebuild.
|
|
21
|
+
*/
|
|
22
|
+
async findDuplicates(config: DryConfig, dirtyPaths?: string[]): Promise<DuplicateAnalysisResult> {
|
|
20
23
|
this.config = config;
|
|
21
24
|
const t0 = performance.now();
|
|
22
25
|
const allUnits = await this.deps.db.getAllUnits();
|
|
23
26
|
log("Starting duplicate analysis on %d units", allUnits.length);
|
|
27
|
+
|
|
24
28
|
if (allUnits.length < 2) {
|
|
25
|
-
|
|
26
|
-
const score = this.computeDuplicationScore([], allUnits);
|
|
27
|
-
return { duplicates: [], score };
|
|
29
|
+
return { duplicates: [], score: this.computeDuplicationScore([], allUnits) };
|
|
28
30
|
}
|
|
29
31
|
|
|
30
32
|
const thresholds = this.resolveThresholds(config.threshold);
|
|
31
|
-
|
|
32
|
-
const
|
|
33
|
-
|
|
34
|
-
log("Found %d duplicate groups (%d excluded)", filteredDuplicates.length, duplicates.length - filteredDuplicates.length);
|
|
33
|
+
const duplicates = this.computeDuplicates(allUnits, thresholds, dirtyPaths);
|
|
34
|
+
const filtered = duplicates.filter((g) => !this.isGroupExcluded(g));
|
|
35
|
+
log("Found %d duplicate groups (%d excluded)", filtered.length, duplicates.length - filtered.length);
|
|
35
36
|
|
|
36
|
-
|
|
37
|
-
this.cache.update(filteredDuplicates).catch((err) => log("Cache update failed: %O", err));
|
|
37
|
+
this.cache.update(filtered).catch((err) => log("Cache update failed: %O", err));
|
|
38
38
|
|
|
39
|
-
const score = this.computeDuplicationScore(
|
|
39
|
+
const score = this.computeDuplicationScore(filtered, allUnits);
|
|
40
40
|
log("findDuplicates completed in %dms", (performance.now() - t0).toFixed(2));
|
|
41
|
-
return { duplicates:
|
|
41
|
+
return { duplicates: filtered, score };
|
|
42
42
|
}
|
|
43
43
|
|
|
44
44
|
private resolveThresholds(functionThreshold?: number): { function: number; block: number; class: number } {
|
|
45
|
-
const
|
|
46
|
-
const clamp = (
|
|
47
|
-
|
|
48
|
-
const base = functionThreshold ?? defaults.function;
|
|
49
|
-
const blockOffset = defaults.block - defaults.function;
|
|
50
|
-
const classOffset = defaults.class - defaults.function;
|
|
51
|
-
|
|
52
|
-
const functionThresholdValue = clamp(base);
|
|
45
|
+
const d = indexConfig.thresholds;
|
|
46
|
+
const clamp = (v: number) => Math.min(1, Math.max(0, v));
|
|
47
|
+
const fn = clamp(functionThreshold ?? d.function);
|
|
53
48
|
return {
|
|
54
|
-
function:
|
|
55
|
-
block: clamp(
|
|
56
|
-
class: clamp(
|
|
49
|
+
function: fn,
|
|
50
|
+
block: clamp(fn + d.block - d.function),
|
|
51
|
+
class: clamp(fn + d.class - d.function),
|
|
57
52
|
};
|
|
58
53
|
}
|
|
59
54
|
|
|
60
55
|
private computeDuplicates(
|
|
61
56
|
units: IndexUnit[],
|
|
62
|
-
thresholds: { function: number; block: number; class: number }
|
|
57
|
+
thresholds: { function: number; block: number; class: number },
|
|
58
|
+
dirtyPaths?: string[]
|
|
63
59
|
): DuplicateGroup[] {
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
for (const unit of units) {
|
|
68
|
-
const list = byType.get(unit.unitType) ?? [];
|
|
69
|
-
list.push(unit);
|
|
70
|
-
byType.set(unit.unitType, list);
|
|
71
|
-
}
|
|
60
|
+
this.cache.clearRunCaches();
|
|
61
|
+
this.cache.buildEmbSimCache(units, dirtyPaths);
|
|
72
62
|
|
|
63
|
+
const duplicates: DuplicateGroup[] = [];
|
|
73
64
|
const t0 = performance.now();
|
|
74
65
|
|
|
75
|
-
for (const [type, typedUnits] of
|
|
66
|
+
for (const [type, typedUnits] of this.groupByType(units)) {
|
|
76
67
|
const threshold = this.getThreshold(type, thresholds);
|
|
77
|
-
log("Comparing %d
|
|
78
|
-
const typeStart = performance.now();
|
|
68
|
+
log("Comparing %d %s units (threshold=%.3f)", typedUnits.length, type, threshold);
|
|
79
69
|
|
|
80
70
|
for (let i = 0; i < typedUnits.length; i++) {
|
|
81
71
|
for (let j = i + 1; j < typedUnits.length; j++) {
|
|
82
|
-
const left = typedUnits[i];
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
if (this.shouldSkipComparison(left, right)) {
|
|
86
|
-
log("Skipping nested block comparison: '%s' and '%s'", left.name, right.name);
|
|
87
|
-
continue;
|
|
88
|
-
}
|
|
72
|
+
const left = typedUnits[i], right = typedUnits[j];
|
|
73
|
+
if (this.shouldSkipComparison(left, right)) continue;
|
|
89
74
|
|
|
75
|
+
// Always check the cache first — this allows pairs whose embeddings
|
|
76
|
+
// have since been cleared to still be reported using a prior score.
|
|
90
77
|
const cached = this.cache.get(left.id, right.id, left.filePath, right.filePath);
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
if (
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
if (similarity >= threshold) {
|
|
108
|
-
const exclusionString = this.deps.pairing.pairKeyForUnits(left, right);
|
|
109
|
-
if (!exclusionString) continue;
|
|
110
|
-
|
|
111
|
-
log("Duplicate found: '%s' <-> '%s' (similarity=%d)", left.name, right.name, similarity);
|
|
112
|
-
duplicates.push({
|
|
113
|
-
id: `${left.id}::${right.id}`,
|
|
114
|
-
similarity,
|
|
115
|
-
shortId: shortUuid.generate(),
|
|
116
|
-
exclusionString,
|
|
117
|
-
left: {
|
|
118
|
-
id: left.id,
|
|
119
|
-
name: left.name,
|
|
120
|
-
filePath: left.filePath,
|
|
121
|
-
startLine: left.startLine,
|
|
122
|
-
endLine: left.endLine,
|
|
123
|
-
code: left.code,
|
|
124
|
-
unitType: left.unitType,
|
|
125
|
-
},
|
|
126
|
-
right: {
|
|
127
|
-
id: right.id,
|
|
128
|
-
name: right.name,
|
|
129
|
-
filePath: right.filePath,
|
|
130
|
-
startLine: right.startLine,
|
|
131
|
-
endLine: right.endLine,
|
|
132
|
-
code: right.code,
|
|
133
|
-
unitType: right.unitType,
|
|
134
|
-
},
|
|
135
|
-
});
|
|
136
|
-
}
|
|
78
|
+
const hasEmbeddings = left.embedding?.length && right.embedding?.length;
|
|
79
|
+
const similarity = cached ?? (hasEmbeddings ? this.computeWeightedSimilarity(left, right, threshold) : 0);
|
|
80
|
+
if (similarity < threshold) continue;
|
|
81
|
+
|
|
82
|
+
const exclusionString = this.deps.pairing.pairKeyForUnits(left, right);
|
|
83
|
+
if (!exclusionString) continue;
|
|
84
|
+
|
|
85
|
+
duplicates.push({
|
|
86
|
+
id: `${left.id}::${right.id}`,
|
|
87
|
+
similarity,
|
|
88
|
+
shortId: shortUuid.generate(),
|
|
89
|
+
exclusionString,
|
|
90
|
+
left: this.toMember(left),
|
|
91
|
+
right: this.toMember(right),
|
|
92
|
+
});
|
|
137
93
|
}
|
|
138
94
|
}
|
|
139
|
-
log("Type '%s' comparisons completed in %dms", type, (performance.now() - typeStart).toFixed(2));
|
|
140
95
|
}
|
|
141
96
|
|
|
142
|
-
log("computeDuplicates
|
|
97
|
+
log("computeDuplicates: %d duplicates in %dms", duplicates.length, (performance.now() - t0).toFixed(2));
|
|
143
98
|
return duplicates.sort((a, b) => b.similarity - a.similarity);
|
|
144
99
|
}
|
|
145
100
|
|
|
146
101
|
private isGroupExcluded(group: DuplicateGroup): boolean {
|
|
147
102
|
const config = this.config;
|
|
148
|
-
if (!config
|
|
103
|
+
if (!config?.excludedPairs?.length) return false;
|
|
149
104
|
const key = this.deps.pairing.pairKeyForUnits(group.left, group.right);
|
|
150
105
|
if (!key) return false;
|
|
151
106
|
const actual = this.deps.pairing.parsePairKey(key);
|
|
@@ -162,142 +117,136 @@ export class DuplicateService {
|
|
|
162
117
|
return thresholds.function;
|
|
163
118
|
}
|
|
164
119
|
|
|
165
|
-
private computeWeightedSimilarity(left: IndexUnit, right: IndexUnit): number {
|
|
166
|
-
const
|
|
120
|
+
private computeWeightedSimilarity(left: IndexUnit, right: IndexUnit, threshold: number): number {
|
|
121
|
+
const selfSim = this.similarity(left, right);
|
|
167
122
|
|
|
123
|
+
//CLASS
|
|
168
124
|
if (left.unitType === IndexUnitType.CLASS) {
|
|
169
|
-
return
|
|
125
|
+
return selfSim * indexConfig.weights.class.self;
|
|
170
126
|
}
|
|
171
127
|
|
|
128
|
+
// FUNCTION
|
|
172
129
|
if (left.unitType === IndexUnitType.FUNCTION) {
|
|
173
|
-
const
|
|
174
|
-
const
|
|
175
|
-
const
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
return ((weights.self * selfSimilarity) + (hasParentClass ? (weights.parentClass * parentClassSimilarity) : 0)) / totalWeight;
|
|
130
|
+
const w = indexConfig.weights.function;
|
|
131
|
+
const hasPC = this.bothHaveParent(left, right, IndexUnitType.CLASS);
|
|
132
|
+
const total = w.self + (hasPC ? w.parentClass : 0);
|
|
133
|
+
// Early exit: even with perfect parent similarity, can't reach threshold.
|
|
134
|
+
if ((w.self * selfSim + (hasPC ? w.parentClass : 0)) / total < threshold) return 0;
|
|
135
|
+
return (w.self * selfSim + (hasPC ? w.parentClass * this.parentSimilarity(left, right, IndexUnitType.CLASS) : 0)) / total;
|
|
180
136
|
}
|
|
181
137
|
|
|
182
|
-
|
|
183
|
-
const
|
|
184
|
-
const
|
|
185
|
-
const
|
|
186
|
-
const
|
|
138
|
+
// BLOCK
|
|
139
|
+
const w = indexConfig.weights.block;
|
|
140
|
+
const hasPF = this.bothHaveParent(left, right, IndexUnitType.FUNCTION);
|
|
141
|
+
const hasPC = this.bothHaveParent(left, right, IndexUnitType.CLASS);
|
|
142
|
+
const total = w.self + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0);
|
|
143
|
+
if ((w.self * selfSim + (hasPF ? w.parentFunction : 0) + (hasPC ? w.parentClass : 0)) / total < threshold) return 0;
|
|
144
|
+
return (
|
|
145
|
+
w.self * selfSim +
|
|
146
|
+
(hasPF ? w.parentFunction * this.parentSimilarity(left, right, IndexUnitType.FUNCTION) : 0) +
|
|
147
|
+
(hasPC ? w.parentClass * this.parentSimilarity(left, right, IndexUnitType.CLASS) : 0)
|
|
148
|
+
) / total;
|
|
149
|
+
}
|
|
187
150
|
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
151
|
+
/** Groups all units by type for the comparison loop. Units without embeddings are included
|
|
152
|
+
* so that cache hits can still be returned for pairs whose embeddings were cleared. */
|
|
153
|
+
private groupByType(units: IndexUnit[]): Map<IndexUnitType, IndexUnit[]> {
|
|
154
|
+
const byType = new Map<IndexUnitType, IndexUnit[]>();
|
|
155
|
+
for (const unit of units) {
|
|
156
|
+
const list = byType.get(unit.unitType) ?? [];
|
|
157
|
+
list.push(unit);
|
|
158
|
+
byType.set(unit.unitType, list);
|
|
159
|
+
}
|
|
160
|
+
return byType;
|
|
161
|
+
}
|
|
193
162
|
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
163
|
+
private toMember(unit: IndexUnit): DuplicateGroup["left"] {
|
|
164
|
+
return {
|
|
165
|
+
id: unit.id,
|
|
166
|
+
name: unit.name,
|
|
167
|
+
filePath: unit.filePath,
|
|
168
|
+
startLine: unit.startLine,
|
|
169
|
+
endLine: unit.endLine,
|
|
170
|
+
code: unit.code,
|
|
171
|
+
unitType: unit.unitType,
|
|
172
|
+
};
|
|
199
173
|
}
|
|
200
174
|
|
|
201
|
-
private
|
|
202
|
-
|
|
203
|
-
const rightParent = this.findParentOfType(right, targetType);
|
|
204
|
-
if (!leftParent || !rightParent) return 0;
|
|
205
|
-
return this.similarityWithFallback(leftParent, rightParent);
|
|
175
|
+
private bothHaveParent(left: IndexUnit, right: IndexUnit, type: IndexUnitType): boolean {
|
|
176
|
+
return !!this.findParent(left, type) && !!this.findParent(right, type);
|
|
206
177
|
}
|
|
207
178
|
|
|
208
|
-
private
|
|
209
|
-
const
|
|
210
|
-
|
|
179
|
+
private parentSimilarity(left: IndexUnit, right: IndexUnit, type: IndexUnitType): number {
|
|
180
|
+
const lp = this.findParent(left, type), rp = this.findParent(right, type);
|
|
181
|
+
if (!lp || !rp) return 0;
|
|
211
182
|
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
183
|
+
const key = lp.id < rp.id ? `${lp.id}::${rp.id}` : `${rp.id}::${lp.id}`;
|
|
184
|
+
const cached = this.cache.getParentSim(key);
|
|
185
|
+
if (cached !== undefined) return cached;
|
|
215
186
|
|
|
216
|
-
|
|
187
|
+
const sim = this.similarity(lp, rp);
|
|
188
|
+
this.cache.setParentSim(key, sim);
|
|
189
|
+
return sim;
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
/** Resolves similarity via the pre-computed embedding matrix, falling back to best child match. */
|
|
193
|
+
private similarity(left: IndexUnit, right: IndexUnit): number {
|
|
194
|
+
return this.cache.getEmbSim(left.id, right.id) ?? this.childSimilarity(left, right);
|
|
217
195
|
}
|
|
218
196
|
|
|
219
197
|
private childSimilarity(left: IndexUnit, right: IndexUnit): number {
|
|
220
|
-
const
|
|
221
|
-
|
|
222
|
-
if (leftChildren.length === 0 || rightChildren.length === 0) return 0;
|
|
198
|
+
const lc = left.children ?? [], rc = right.children ?? [];
|
|
199
|
+
if (!lc.length || !rc.length) return 0;
|
|
223
200
|
|
|
224
201
|
let best = 0;
|
|
225
|
-
for (const
|
|
226
|
-
for (const
|
|
227
|
-
if (
|
|
228
|
-
const sim = this.
|
|
202
|
+
for (const l of lc) {
|
|
203
|
+
for (const r of rc) {
|
|
204
|
+
if (l.unitType !== r.unitType) continue;
|
|
205
|
+
const sim = this.similarity(l, r);
|
|
229
206
|
if (sim > best) best = sim;
|
|
230
207
|
}
|
|
231
208
|
}
|
|
232
209
|
return best;
|
|
233
210
|
}
|
|
234
211
|
|
|
235
|
-
private hasVector(unit: IndexUnit): boolean {
|
|
236
|
-
return Array.isArray(unit.embedding) && unit.embedding.length > 0;
|
|
237
|
-
}
|
|
238
|
-
|
|
239
212
|
private shouldSkipComparison(left: IndexUnit, right: IndexUnit): boolean {
|
|
240
|
-
if (left.unitType !== IndexUnitType.BLOCK || right.unitType !== IndexUnitType.BLOCK)
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
if (left.filePath !== right.filePath) {
|
|
245
|
-
return false;
|
|
246
|
-
}
|
|
247
|
-
|
|
248
|
-
const leftContainsRight = left.startLine <= right.startLine && left.endLine >= right.endLine;
|
|
249
|
-
const rightContainsLeft = right.startLine <= left.startLine && right.endLine >= left.endLine;
|
|
250
|
-
return leftContainsRight || rightContainsLeft;
|
|
213
|
+
if (left.unitType !== IndexUnitType.BLOCK || right.unitType !== IndexUnitType.BLOCK) return false;
|
|
214
|
+
if (left.filePath !== right.filePath) return false;
|
|
215
|
+
return (left.startLine <= right.startLine && left.endLine >= right.endLine)
|
|
216
|
+
|| (right.startLine <= left.startLine && right.endLine >= left.endLine);
|
|
251
217
|
}
|
|
252
218
|
|
|
253
|
-
private
|
|
254
|
-
let
|
|
255
|
-
while (
|
|
256
|
-
if (
|
|
257
|
-
|
|
219
|
+
private findParent(unit: IndexUnit, type: IndexUnitType): IndexUnit | null {
|
|
220
|
+
let p = unit.parent;
|
|
221
|
+
while (p) {
|
|
222
|
+
if (p.unitType === type) return p;
|
|
223
|
+
p = p.parent;
|
|
258
224
|
}
|
|
259
225
|
return null;
|
|
260
226
|
}
|
|
261
227
|
|
|
262
228
|
private computeDuplicationScore(duplicates: DuplicateGroup[], allUnits: IndexUnit[]): DuplicationScore {
|
|
263
|
-
const totalLines =
|
|
264
|
-
|
|
265
|
-
if (totalLines
|
|
266
|
-
return {
|
|
267
|
-
score: 0,
|
|
268
|
-
grade: "Excellent",
|
|
269
|
-
totalLines,
|
|
270
|
-
duplicateLines: 0,
|
|
271
|
-
duplicateGroups: 0,
|
|
272
|
-
};
|
|
229
|
+
const totalLines = allUnits.reduce((sum, u) => sum + u.endLine - u.startLine + 1, 0);
|
|
230
|
+
|
|
231
|
+
if (!totalLines || !duplicates.length) {
|
|
232
|
+
return { score: 0, grade: "Excellent", totalLines, duplicateLines: 0, duplicateGroups: 0 };
|
|
273
233
|
}
|
|
274
234
|
|
|
275
|
-
const
|
|
276
|
-
const
|
|
277
|
-
|
|
278
|
-
const avgLines = (leftLines + rightLines) / 2;
|
|
279
|
-
return sum + group.similarity * avgLines;
|
|
235
|
+
const duplicateLines = duplicates.reduce((sum, g) => {
|
|
236
|
+
const avg = ((g.left.endLine - g.left.startLine + 1) + (g.right.endLine - g.right.startLine + 1)) / 2;
|
|
237
|
+
return sum + g.similarity * avg;
|
|
280
238
|
}, 0);
|
|
281
239
|
|
|
282
|
-
const score = (
|
|
283
|
-
const grade = this.getScoreGrade(score);
|
|
284
|
-
|
|
240
|
+
const score = (duplicateLines / totalLines) * 100;
|
|
285
241
|
return {
|
|
286
242
|
score,
|
|
287
|
-
grade,
|
|
243
|
+
grade: this.getScoreGrade(score),
|
|
288
244
|
totalLines,
|
|
289
|
-
duplicateLines: Math.round(
|
|
245
|
+
duplicateLines: Math.round(duplicateLines),
|
|
290
246
|
duplicateGroups: duplicates.length,
|
|
291
247
|
};
|
|
292
248
|
}
|
|
293
249
|
|
|
294
|
-
private calculateTotalLines(units: IndexUnit[]): number {
|
|
295
|
-
return units.reduce((sum, unit) => {
|
|
296
|
-
const lines = unit.endLine - unit.startLine + 1;
|
|
297
|
-
return sum + lines;
|
|
298
|
-
}, 0);
|
|
299
|
-
}
|
|
300
|
-
|
|
301
250
|
private getScoreGrade(score: number): DuplicationScore["grade"] {
|
|
302
251
|
if (score < 5) return "Excellent";
|
|
303
252
|
if (score < 15) return "Good";
|