@rigour-labs/core 5.0.1 → 5.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -1
- package/dist/gates/agent-team.d.ts +0 -1
- package/dist/gates/agent-team.js +0 -1
- package/dist/gates/checkpoint.d.ts +0 -2
- package/dist/gates/checkpoint.js +0 -2
- package/dist/gates/context-window-artifacts.d.ts +6 -2
- package/dist/gates/context-window-artifacts.js +107 -31
- package/dist/gates/deep-analysis.d.ts +2 -0
- package/dist/gates/deep-analysis.js +41 -11
- package/dist/gates/dependency.d.ts +0 -2
- package/dist/gates/dependency.js +23 -5
- package/dist/gates/deprecated-apis.d.ts +0 -2
- package/dist/gates/deprecated-apis.js +33 -20
- package/dist/gates/duplication-drift/index.d.ts +61 -0
- package/dist/gates/duplication-drift/index.js +240 -0
- package/dist/gates/duplication-drift/similarity.d.ts +68 -0
- package/dist/gates/duplication-drift/similarity.js +177 -0
- package/dist/gates/duplication-drift/tokenizer.d.ts +55 -0
- package/dist/gates/duplication-drift/tokenizer.js +195 -0
- package/dist/gates/frontend-secret-exposure.d.ts +0 -3
- package/dist/gates/frontend-secret-exposure.js +1 -114
- package/dist/gates/frontend-secret-patterns.d.ts +33 -0
- package/dist/gates/frontend-secret-patterns.js +119 -0
- package/dist/gates/{hallucinated-imports.d.ts → hallucinated-imports/index.d.ts} +2 -29
- package/dist/gates/hallucinated-imports/index.js +174 -0
- package/dist/gates/hallucinated-imports/js-resolver.d.ts +45 -0
- package/dist/gates/hallucinated-imports/js-resolver.js +320 -0
- package/dist/gates/hallucinated-imports/manifest-discovery.d.ts +28 -0
- package/dist/gates/hallucinated-imports/manifest-discovery.js +114 -0
- package/dist/gates/hallucinated-imports/python-resolver.d.ts +24 -0
- package/dist/gates/hallucinated-imports/python-resolver.js +306 -0
- package/dist/gates/hallucinated-imports-lang.d.ts +2 -2
- package/dist/gates/hallucinated-imports-lang.js +269 -34
- package/dist/gates/hallucinated-imports.test.js +1 -2
- package/dist/gates/inconsistent-error-handling.d.ts +0 -5
- package/dist/gates/inconsistent-error-handling.js +15 -144
- package/dist/gates/language-adapters/csharp-adapter.d.ts +16 -0
- package/dist/gates/language-adapters/csharp-adapter.js +211 -0
- package/dist/gates/language-adapters/go-adapter.d.ts +26 -0
- package/dist/gates/language-adapters/go-adapter.js +195 -0
- package/dist/gates/language-adapters/index.d.ts +15 -0
- package/dist/gates/language-adapters/index.js +16 -0
- package/dist/gates/language-adapters/java-adapter.d.ts +16 -0
- package/dist/gates/language-adapters/java-adapter.js +237 -0
- package/dist/gates/language-adapters/js-adapter.d.ts +26 -0
- package/dist/gates/language-adapters/js-adapter.js +279 -0
- package/dist/gates/language-adapters/python-adapter.d.ts +25 -0
- package/dist/gates/language-adapters/python-adapter.js +183 -0
- package/dist/gates/language-adapters/registry.d.ts +26 -0
- package/dist/gates/language-adapters/registry.js +65 -0
- package/dist/gates/language-adapters/ruby-adapter.d.ts +25 -0
- package/dist/gates/language-adapters/ruby-adapter.js +217 -0
- package/dist/gates/language-adapters/rust-adapter.d.ts +27 -0
- package/dist/gates/language-adapters/rust-adapter.js +235 -0
- package/dist/gates/language-adapters/types.d.ts +60 -0
- package/dist/gates/language-adapters/types.js +22 -0
- package/dist/gates/logic-drift-extractors.d.ts +15 -0
- package/dist/gates/logic-drift-extractors.js +34 -0
- package/dist/gates/logic-drift.d.ts +0 -30
- package/dist/gates/logic-drift.js +39 -129
- package/dist/gates/phantom-apis.d.ts +0 -2
- package/dist/gates/phantom-apis.js +49 -20
- package/dist/gates/promise-safety.d.ts +0 -1
- package/dist/gates/promise-safety.js +14 -2
- package/dist/gates/runner.js +51 -22
- package/dist/gates/security-patterns-data.d.ts +14 -0
- package/dist/gates/security-patterns-data.js +235 -0
- package/dist/gates/security-patterns.d.ts +17 -3
- package/dist/gates/security-patterns.js +80 -211
- package/dist/gates/side-effect-analysis/categorizer.d.ts +32 -0
- package/dist/gates/side-effect-analysis/categorizer.js +83 -0
- package/dist/gates/{side-effect-analysis.d.ts → side-effect-analysis/index.d.ts} +3 -5
- package/dist/gates/{side-effect-analysis.js → side-effect-analysis/index.js} +33 -45
- package/dist/gates/side-effect-analysis/scope-tracker.d.ts +37 -0
- package/dist/gates/side-effect-analysis/scope-tracker.js +40 -0
- package/dist/gates/side-effect-helpers/index.d.ts +4 -0
- package/dist/gates/side-effect-helpers/index.js +4 -0
- package/dist/gates/side-effect-helpers/pattern-detection.d.ts +123 -0
- package/dist/gates/{side-effect-helpers.js → side-effect-helpers/pattern-detection.js} +22 -468
- package/dist/gates/side-effect-helpers/resource-tracking.d.ts +80 -0
- package/dist/gates/side-effect-helpers/resource-tracking.js +281 -0
- package/dist/gates/side-effect-helpers/scope-analysis.d.ts +21 -0
- package/dist/gates/side-effect-helpers/scope-analysis.js +146 -0
- package/dist/gates/side-effect-helpers/types.d.ts +38 -0
- package/dist/gates/side-effect-helpers/types.js +41 -0
- package/dist/gates/side-effect-rules.d.ts +0 -1
- package/dist/gates/side-effect-rules.js +0 -1
- package/dist/gates/style-drift-rules.d.ts +86 -0
- package/dist/gates/style-drift-rules.js +103 -0
- package/dist/gates/style-drift.d.ts +7 -16
- package/dist/gates/style-drift.js +101 -119
- package/dist/gates/test-quality-matchers.d.ts +53 -0
- package/dist/gates/test-quality-matchers.js +86 -0
- package/dist/gates/test-quality.d.ts +0 -3
- package/dist/gates/test-quality.js +47 -44
- package/dist/hooks/checker.d.ts +0 -1
- package/dist/hooks/checker.js +0 -2
- package/dist/hooks/dlp-templates.d.ts +0 -1
- package/dist/hooks/dlp-templates.js +0 -4
- package/dist/hooks/index.d.ts +0 -2
- package/dist/hooks/index.js +0 -2
- package/dist/hooks/input-validator.d.ts +0 -1
- package/dist/hooks/input-validator.js +0 -1
- package/dist/hooks/input-validator.test.js +0 -1
- package/dist/hooks/standalone-checker.d.ts +0 -1
- package/dist/hooks/standalone-checker.js +0 -1
- package/dist/hooks/standalone-dlp-checker.d.ts +0 -1
- package/dist/hooks/standalone-dlp-checker.js +0 -1
- package/dist/hooks/templates.d.ts +0 -1
- package/dist/hooks/templates.js +0 -1
- package/dist/hooks/types.d.ts +0 -1
- package/dist/hooks/types.js +0 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.js +1 -1
- package/dist/services/adaptive-thresholds.d.ts +0 -2
- package/dist/services/adaptive-thresholds.js +0 -2
- package/dist/services/filesystem-cache.d.ts +0 -1
- package/dist/services/filesystem-cache.js +0 -1
- package/dist/services/score-history.d.ts +0 -1
- package/dist/services/score-history.js +0 -1
- package/dist/services/temporal-drift.d.ts +1 -2
- package/dist/services/temporal-drift.js +7 -8
- package/dist/storage/db.d.ts +23 -7
- package/dist/storage/db.js +116 -55
- package/dist/storage/findings.d.ts +4 -3
- package/dist/storage/findings.js +13 -20
- package/dist/storage/local-memory.d.ts +4 -4
- package/dist/storage/local-memory.js +20 -22
- package/dist/storage/patterns.d.ts +5 -5
- package/dist/storage/patterns.js +20 -26
- package/dist/storage/scans.d.ts +6 -6
- package/dist/storage/scans.js +12 -21
- package/dist/types/index.d.ts +1 -0
- package/dist/utils/scanner.js +1 -1
- package/package.json +7 -8
- package/dist/gates/duplication-drift.d.ts +0 -128
- package/dist/gates/duplication-drift.js +0 -585
- package/dist/gates/hallucinated-imports.js +0 -641
- package/dist/gates/side-effect-helpers.d.ts +0 -260
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Duplication Drift Gate (v2)
|
|
3
|
+
*
|
|
4
|
+
* Detects when AI generates near-identical functions across files because
|
|
5
|
+
* it doesn't remember what it already wrote. This is an AI-specific failure
|
|
6
|
+
* mode — humans reuse via copy-paste (same file), AI re-invents (cross-file).
|
|
7
|
+
*
|
|
8
|
+
* v2 upgrades:
|
|
9
|
+
* - tree-sitter AST node type sequences replace hand-rolled regex tokenizer
|
|
10
|
+
* - Jaccard similarity on AST node multisets (structural, not textual)
|
|
11
|
+
* - Catches duplicates even when every variable name is different
|
|
12
|
+
* - MD5 kept as fast-path for exact matches, Jaccard runs on remaining pairs
|
|
13
|
+
*
|
|
14
|
+
* Detection strategy (three-pass):
|
|
15
|
+
* 1. Extract function bodies, normalize text (strip comments/whitespace)
|
|
16
|
+
* 2. Parse with tree-sitter → walk AST → collect node type multiset
|
|
17
|
+
* 3. Generate semantic embeddings via all-MiniLM-L6-v2 (384D)
|
|
18
|
+
* 4. Pass 1 (fast): MD5 hash → exact duplicates (O(n), <10ms)
|
|
19
|
+
* 5. Pass 2 (Jaccard): AST node multiset similarity → structural near-duplicates (O(n²) bounded)
|
|
20
|
+
* 6. Pass 3 (semantic): Embedding cosine similarity → semantic duplicates (O(n²) bounded)
|
|
21
|
+
* 7. Flag functions with similarity > threshold in different files
|
|
22
|
+
*
|
|
23
|
+
* Why AST node types > raw tokens:
|
|
24
|
+
* - `getUserById(id) { return db.find(x => x.id === id) }`
|
|
25
|
+
* - `fetchUser(userId) { return database.filter(u => u.id === userId)[0] }`
|
|
26
|
+
* Both produce similar AST: [return_statement, call_expression, arrow_function,
|
|
27
|
+
* binary_expression, member_expression]. Variable names are invisible.
|
|
28
|
+
*/
|
|
29
|
+
import path from 'path';
|
|
30
|
+
import { Gate } from '../base.js';
|
|
31
|
+
import { FileScanner } from '../../utils/scanner.js';
|
|
32
|
+
import { Logger } from '../../utils/logger.js';
|
|
33
|
+
import { languageAdapters } from '../language-adapters/index.js';
|
|
34
|
+
import { generateEmbedding } from '../../pattern-index/embeddings.js';
|
|
35
|
+
import { initTreeSitter, GRAMMAR_PATHS, languageCache, getParser, getGrammarDir, textTokenize, collectASTNodeTypes, findFunctionNodeAtLine, normalizeBody, extractFunctionBody, } from './tokenizer.js';
|
|
36
|
+
import { hashBody, findDuplicateGroups, buildEmbeddingText, calculateSimilarity, } from './similarity.js';
|
|
37
|
+
export class DuplicationDriftGate extends Gate {
    /** Resolved configuration — all defaults applied in the constructor. */
    config;
    /** Shared tree-sitter Parser instance; stays null when tree-sitter is unavailable. */
    parser = null;
    /**
     * @param {object} [config] Optional overrides; every field has a default so
     *   the gate runs with zero configuration.
     */
    constructor(config = {}) {
        super('duplication-drift', 'AI Duplication Drift Detection');
        this.config = {
            enabled: config.enabled ?? true,
            // Pass 2 (AST Jaccard) cutoff
            similarity_threshold: config.similarity_threshold ?? 0.75,
            // Pass 3 (embedding cosine) cutoff — stricter than Pass 2
            semantic_threshold: config.semantic_threshold ?? 0.85,
            semantic_enabled: config.semantic_enabled ?? true,
            // Function bodies shorter than this are ignored
            min_body_lines: config.min_body_lines ?? 5,
            // 'nameA:nameB' pairs (case-insensitive, sorted) a human has signed off on
            approved_duplications: config.approved_duplications ?? [],
        };
    }
    get provenance() { return 'ai-drift'; }
    /**
     * Scan the workspace, extract function signatures, and report groups of
     * near-identical functions that live in different files.
     * @param context Gate context providing cwd, glob patterns and ignore list.
     * @returns Array of gate failures (empty when disabled or nothing found).
     */
    async run(context) {
        if (!this.config.enabled)
            return [];
        // Try to init tree-sitter (non-blocking, falls back gracefully to text tokens)
        const hasTreeSitter = await initTreeSitter();
        if (hasTreeSitter && !this.parser) {
            const Parser = getParser();
            this.parser = new Parser();
        }
        const failures = [];
        const functions = [];
        const scanPatterns = context.patterns || ['**/*.{ts,js,tsx,jsx,py,go,rs}'];
        const files = await FileScanner.findFiles({
            cwd: context.cwd,
            patterns: scanPatterns,
            ignore: [...(context.ignore || []), '**/node_modules/**', '**/dist/**', '**/*.test.*', '**/*.spec.*'],
        });
        Logger.info(`Duplication Drift: Scanning ${files.length} files (tree-sitter: ${hasTreeSitter ? 'ON' : 'fallback'})`);
        // Resolve the dynamic import once, not once per scanned file.
        const { readFile } = await import('fs-extra');
        for (const file of files) {
            try {
                const content = await readFile(path.join(context.cwd, file), 'utf-8');
                const ext = path.extname(file);
                const adapter = languageAdapters.getAdapter(file);
                if (adapter?.id === 'js') {
                    this.extractJSFunctions(content, file, functions);
                }
                else if (adapter?.id === 'python') {
                    this.extractPyFunctions(content, file, functions);
                }
                // Upgrade text-token multisets to AST node multisets when a grammar exists
                if (hasTreeSitter && GRAMMAR_PATHS[ext]) {
                    await this.enrichWithASTTokens(content, ext, file, functions);
                }
            }
            catch (e) {
                // Unreadable/unscannable file — skip it, but leave a trace
                // instead of swallowing the error silently.
                Logger.debug(`Duplication Drift: skipping ${file}: ${e}`);
            }
        }
        // Pass 3 prep: Generate semantic embeddings for all extracted functions
        // (embedding generation is lazy — only runs when semantic_enabled is true)
        if (this.config.semantic_enabled && functions.length > 0) {
            const allIndices = functions.map((_, i) => i);
            await this.enrichWithEmbeddings(functions, allIndices);
        }
        const duplicateGroups = findDuplicateGroups(functions, this.config.similarity_threshold, this.config.semantic_threshold, this.config.semantic_enabled);
        // Build approved pairs set for fast lookup
        const approvedSet = new Set((this.config.approved_duplications || []).map(s => s.toLowerCase()));
        for (const group of duplicateGroups) {
            // Check if this pair is human-approved (sorted names → order-independent key)
            const names = group.map(f => f.name).sort();
            const pairKey = names.join(':').toLowerCase();
            if (approvedSet.has(pairKey))
                continue;
            const files = group.map(f => f.file);
            const locations = group.map(f => `${f.file}:${f.line} (${f.name})`).join(', ');
            const { similarity, method } = calculateSimilarity(group[0], group[1] || group[0]);
            const pct = (similarity * 100).toFixed(0);
            failures.push(this.createFailure(`AI Duplication Drift: Function '${group[0].name}' has ${group.length} near-identical copies (${pct}% similar via ${method})`, [...new Set(files)], `Found duplicate implementations at: ${locations}. Extract to a shared module and import.`, 'Duplication Drift', group[0].line, undefined, 'high'));
        }
        return failures;
    }
    // ─── tree-sitter AST Tokenization ───────────────────────────────
    /**
     * Parse the file with tree-sitter, find function nodes that match
     * our extracted functions (by line number), and replace their token
     * multisets with AST node type sequences.
     * No-op when the parser or grammar is unavailable; parse failures keep
     * the existing text tokens.
     */
    async enrichWithASTTokens(content, ext, file, functions) {
        if (!this.parser)
            return;
        const grammarRelPath = GRAMMAR_PATHS[ext];
        if (!grammarRelPath)
            return;
        try {
            // Load language (cached per extension)
            if (!languageCache.has(ext)) {
                const grammarDir = getGrammarDir();
                const grammarPath = path.resolve(grammarDir, grammarRelPath);
                const Parser = getParser();
                const lang = await Parser.Language.load(grammarPath);
                languageCache.set(ext, lang);
            }
            const lang = languageCache.get(ext);
            this.parser.setLanguage(lang);
            const tree = this.parser.parse(content);
            // Find functions that belong to this file
            const fileFunctions = functions.filter(f => f.file === file);
            for (const fn of fileFunctions) {
                // Find the AST node at this function's line
                const node = findFunctionNodeAtLine(tree.rootNode, fn.line);
                if (node) {
                    fn.astTokens = collectASTNodeTypes(node);
                }
            }
        }
        catch (e) {
            // tree-sitter parse failed for this file — keep text tokens
            Logger.debug(`tree-sitter parse failed for ${file}: ${e}`);
        }
    }
    // ─── Function Extraction ────────────────────────────────────────
    /**
     * Regex-based extraction of JS/TS function declarations, arrow-function
     * assignments, and class methods. Bodies shorter than min_body_lines are
     * skipped. Appends FunctionSignature entries to `functions`.
     */
    extractJSFunctions(content, file, functions) {
        const lines = content.split('\n');
        const patterns = [
            // function foo(...) / export async function foo(...)
            /^(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\(([^)]*)\)/,
            // const foo = (...) => / const foo = async x =>
            /^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|(\w+))\s*=>/,
            // indented class method: foo(...) {
            /^\s+(?:async\s+)?(\w+)\s*\(([^)]*)\)\s*\{/,
        ];
        for (let i = 0; i < lines.length; i++) {
            const line = lines[i];
            for (const pattern of patterns) {
                const match = line.match(pattern);
                if (match) {
                    const name = match[1];
                    const params = match[2] || '';
                    const body = extractFunctionBody(lines, i);
                    if (body.length >= this.config.min_body_lines) {
                        const normalized = normalizeBody(body.join('\n'));
                        functions.push({
                            name,
                            file,
                            line: i + 1,
                            paramCount: params ? params.split(',').length : 0,
                            bodyHash: hashBody(normalized),
                            bodyLength: body.length,
                            normalized,
                            // Start with text tokens, enrichWithASTTokens() upgrades if tree-sitter available
                            astTokens: textTokenize(normalized),
                        });
                    }
                    break;
                }
            }
        }
    }
    /**
     * Indentation-based extraction of Python `def`/`async def` bodies.
     * A body line belongs to the function while it is blank or indented
     * deeper than the `def` line. Appends FunctionSignature entries.
     */
    extractPyFunctions(content, file, functions) {
        const lines = content.split('\n');
        for (let i = 0; i < lines.length; i++) {
            const match = lines[i].match(/^(?:\s*)(?:async\s+)?def\s+(\w+)\s*\(([^)]*)\)/);
            if (match) {
                const name = match[1];
                const params = match[2] || '';
                const indent = lines[i].match(/^(\s*)/)?.[1]?.length || 0;
                const body = [];
                for (let j = i + 1; j < lines.length; j++) {
                    const lineIndent = lines[j].match(/^(\s*)/)?.[1]?.length || 0;
                    if (lines[j].trim() === '' || lineIndent > indent) {
                        body.push(lines[j]);
                    }
                    else {
                        break;
                    }
                }
                if (body.length >= this.config.min_body_lines) {
                    const normalized = normalizeBody(body.join('\n'));
                    functions.push({
                        name,
                        file,
                        line: i + 1,
                        paramCount: params ? params.split(',').length : 0,
                        bodyHash: hashBody(normalized),
                        bodyLength: body.length,
                        normalized,
                        astTokens: textTokenize(normalized),
                    });
                }
            }
        }
    }
    // ─── Semantic Embedding ─────────────────────────────────────────
    /**
     * Enrich functions with semantic embeddings for Pass 3.
     * Uses generateEmbedding() from pattern-index/embeddings.ts.
     * Failures are logged and skipped — the function simply stays
     * ineligible for Pass 3.
     */
    async enrichWithEmbeddings(functions, indices) {
        Logger.info(`Semantic Pass 3: Generating embeddings for ${indices.length} functions`);
        for (const idx of indices) {
            const fn = functions[idx];
            try {
                const text = buildEmbeddingText(fn);
                fn.embedding = await generateEmbedding(text);
            }
            catch {
                // Embedding failed — skip this function for Pass 3
                Logger.debug(`Embedding generation failed for ${fn.file}:${fn.name}`);
            }
        }
    }
}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
 * Duplication Drift Similarity Detection
 *
 * Handles all comparison and similarity detection logic:
 * - Fingerprinting/hashing
 * - Jaccard similarity on multisets
 * - Clone detection algorithms
 * - Threshold calculations
 */
/**
 * Represents a detected function with its signatures and comparison data.
 */
export interface FunctionSignature {
    /** Function (or method) name as it appears in source. */
    name: string;
    /** Path of the file the function was extracted from. */
    file: string;
    /** 1-based line number of the function definition. */
    line: number;
    /** Number of declared parameters (comma-split count). */
    paramCount: number;
    /** MD5 hash of the normalized body — Pass 1 exact-match key. */
    bodyHash: string;
    /** Body length in lines; used to window/prune O(n²) pair comparisons. */
    bodyLength: number;
    /** Comment/whitespace-normalized body text. */
    normalized: string;
    /** AST node type multiset for Jaccard comparison */
    astTokens: Map<string, number>;
    /** Semantic embedding vector (384D) for cosine similarity */
    embedding?: number[];
}
/**
 * Hash a normalized function body using MD5.
 * (Content fingerprint only — not a security use of MD5.)
 */
export declare function hashBody(text: string): string;
/**
 * Jaccard similarity on multisets.
 * intersection = sum of min(countA, countB) for each key
 * union = sum of max(countA, countB) for each key
 * Returns a value in [0, 1]; 0 when both multisets are empty.
 */
export declare function jaccardSimilarity(a: Map<string, number>, b: Map<string, number>): number;
/**
 * Generate semantic embedding text for a function.
 * Combines function name, parameter names, and first 200 tokens of body.
 * This captures INTENT regardless of implementation differences.
 *
 * Example:
 *   getUserById(id) { return db.users.find(x => x.id === id) }
 *   → "getUserById id return db users find x id id"
 *
 *   fetchUserRecord(userId) { return database.users.filter(u => u.id === userId)[0] }
 *   → "fetchUserRecord userId return database users filter u id userId 0"
 *
 * These produce similar embeddings (~0.91 cosine) despite different AST.
 */
export declare function buildEmbeddingText(fn: FunctionSignature): string;
/**
 * Calculate similarity between two functions using all available methods.
 * Returns { similarity, method } where method indicates which technique was used
 * ('exact-hash' | 'semantic-embedding' | 'ast-jaccard').
 */
export declare function calculateSimilarity(fn1: FunctionSignature, fn2: FunctionSignature): {
    similarity: number;
    method: string;
};
/**
 * Three-pass duplicate detection:
 * Pass 1 (fast): MD5 hash → exact duplicates (O(n))
 * Pass 2 (Jaccard): AST node multiset similarity → near-duplicates (O(n²) bounded)
 * Pass 3 (semantic): Embedding cosine similarity → semantic duplicates (O(n²) bounded)
 *
 * Pass 3 catches what AST Jaccard misses: same intent, different implementation.
 * Example: .find() vs .filter()[0] — different AST nodes, same semantic meaning.
 * Each returned group contains functions spanning at least two distinct files.
 */
export declare function findDuplicateGroups(functions: FunctionSignature[], similarityThreshold: number, semanticThreshold: number, semanticEnabled: boolean): FunctionSignature[][];
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Duplication Drift Similarity Detection
|
|
3
|
+
*
|
|
4
|
+
* Handles all comparison and similarity detection logic:
|
|
5
|
+
* - Fingerprinting/hashing
|
|
6
|
+
* - Jaccard similarity on multisets
|
|
7
|
+
* - Clone detection algorithms
|
|
8
|
+
* - Threshold calculations
|
|
9
|
+
*/
|
|
10
|
+
import crypto from 'crypto';
|
|
11
|
+
import { cosineSimilarity } from '../../pattern-index/embeddings.js';
|
|
12
|
+
/**
 * Fingerprint a normalized function body.
 * MD5 is used purely as a fast content hash (not for security): two bodies
 * whose normalized text is identical yield the same hex digest.
 */
export function hashBody(text) {
    const hasher = crypto.createHash('md5');
    hasher.update(text);
    return hasher.digest('hex');
}
|
|
18
|
+
/**
|
|
19
|
+
* Jaccard similarity on multisets.
|
|
20
|
+
* intersection = sum of min(countA, countB) for each key
|
|
21
|
+
* union = sum of max(countA, countB) for each key
|
|
22
|
+
*/
|
|
23
|
+
export function jaccardSimilarity(a, b) {
|
|
24
|
+
const allKeys = new Set([...a.keys(), ...b.keys()]);
|
|
25
|
+
let intersection = 0;
|
|
26
|
+
let union = 0;
|
|
27
|
+
for (const key of allKeys) {
|
|
28
|
+
const countA = a.get(key) || 0;
|
|
29
|
+
const countB = b.get(key) || 0;
|
|
30
|
+
intersection += Math.min(countA, countB);
|
|
31
|
+
union += Math.max(countA, countB);
|
|
32
|
+
}
|
|
33
|
+
return union === 0 ? 0 : intersection / union;
|
|
34
|
+
}
|
|
35
|
+
/**
 * Build the text fed to the embedding model for a function.
 *
 * Concatenates the function name with the first 200 identifier-like tokens of
 * the normalized body — this captures intent rather than exact implementation.
 *
 * Example:
 *   getUserById(id) { return db.users.find(x => x.id === id) }
 *   → "getUserById id return db users find x id id"
 *   fetchUserRecord(userId) { return database.users.filter(u => u.id === userId)[0] }
 *   → "fetchUserRecord userId return database users filter u id userId 0"
 * These produce similar embeddings (~0.91 cosine) despite different AST.
 */
export function buildEmbeddingText(fn) {
    // Identifier-shaped words only; numbers and operators are dropped.
    const IDENTIFIER = /\b[a-zA-Z_]\w*\b/g;
    const words = fn.normalized.match(IDENTIFIER) ?? [];
    // Cap at 200 tokens so huge bodies don't dominate the embedding input.
    const head = words.slice(0, 200);
    return [fn.name, head.join(' ')].join(' ');
}
|
|
55
|
+
/**
|
|
56
|
+
* Calculate similarity between two functions using all available methods.
|
|
57
|
+
* Returns { similarity, method } where method indicates which technique was used.
|
|
58
|
+
*/
|
|
59
|
+
export function calculateSimilarity(fn1, fn2) {
|
|
60
|
+
// Exact hash match (Pass 1)
|
|
61
|
+
if (fn1.bodyHash === fn2.bodyHash) {
|
|
62
|
+
return { similarity: 1.0, method: 'exact-hash' };
|
|
63
|
+
}
|
|
64
|
+
// Semantic embedding comparison (if available)
|
|
65
|
+
if (fn1.embedding && fn2.embedding) {
|
|
66
|
+
const jaccardSim = jaccardSimilarity(fn1.astTokens, fn2.astTokens);
|
|
67
|
+
const cosineSim = cosineSimilarity(fn1.embedding, fn2.embedding);
|
|
68
|
+
if (cosineSim > jaccardSim) {
|
|
69
|
+
return { similarity: cosineSim, method: 'semantic-embedding' };
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
// AST Jaccard similarity (Pass 2)
|
|
73
|
+
const jaccardSim = jaccardSimilarity(fn1.astTokens, fn2.astTokens);
|
|
74
|
+
return { similarity: jaccardSim, method: 'ast-jaccard' };
|
|
75
|
+
}
|
|
76
|
+
/**
 * Three-pass duplicate detection:
 * Pass 1 (fast): MD5 hash → exact duplicates (O(n))
 * Pass 2 (Jaccard): AST node multiset similarity → near-duplicates (O(n²) bounded)
 * Pass 3 (semantic): Embedding cosine similarity → semantic duplicates (O(n²) bounded)
 *
 * Pass 3 catches what AST Jaccard misses: same intent, different implementation.
 * Example: .find() vs .filter()[0] — different AST nodes, same semantic meaning.
 *
 * Each pass "claims" the indices it groups so later passes never re-report the
 * same function. Every returned group spans at least two distinct files.
 */
export function findDuplicateGroups(functions, similarityThreshold, semanticThreshold, semanticEnabled) {
    const duplicates = [];
    const claimedIndices = new Set();
    // Pass 1: Exact hash match — bucket indices by bodyHash.
    const hashGroups = new Map();
    for (let i = 0; i < functions.length; i++) {
        const existing = hashGroups.get(functions[i].bodyHash) || [];
        existing.push(i);
        hashGroups.set(functions[i].bodyHash, existing);
    }
    for (const indices of hashGroups.values()) {
        if (indices.length < 2)
            continue;
        const group = indices.map(i => functions[i]);
        const uniqueFiles = new Set(group.map(f => f.file));
        // Only cross-file duplication counts; same-file exact copies are
        // left unclaimed (and remain same-file-filtered in later passes).
        if (uniqueFiles.size >= 2) {
            duplicates.push(group);
            indices.forEach(i => claimedIndices.add(i));
        }
    }
    // Pass 2: Jaccard on AST tokens for remaining functions
    const remaining = functions
        .map((fn, i) => ({ fn, idx: i }))
        .filter(({ idx }) => !claimedIndices.has(idx));
    // Sort by body length so the inner loop can stop early (see break below).
    remaining.sort((a, b) => a.fn.bodyLength - b.fn.bodyLength);
    const jaccardClaimed = new Set();
    for (let i = 0; i < remaining.length; i++) {
        if (jaccardClaimed.has(remaining[i].idx))
            continue;
        const group = [remaining[i].fn];
        const baseLen = remaining[i].fn.bodyLength;
        for (let j = i + 1; j < remaining.length; j++) {
            if (jaccardClaimed.has(remaining[j].idx))
                continue;
            // Sorted by length, so once a candidate is >1.5x the base length
            // every later candidate is too — safe to break, bounding the O(n²).
            if (remaining[j].fn.bodyLength > baseLen * 1.5)
                break;
            // Same-file similarity is ordinary copy-paste, not AI drift.
            if (remaining[j].fn.file === remaining[i].fn.file)
                continue;
            const sim = jaccardSimilarity(remaining[i].fn.astTokens, remaining[j].fn.astTokens);
            if (sim >= similarityThreshold) {
                group.push(remaining[j].fn);
                jaccardClaimed.add(remaining[j].idx);
            }
        }
        if (group.length >= 2) {
            const uniqueFiles = new Set(group.map(f => f.file));
            if (uniqueFiles.size >= 2) {
                duplicates.push(group);
                // Claim the base function too, so it can't seed another group.
                jaccardClaimed.add(remaining[i].idx);
            }
        }
    }
    // Mark all Pass 1 + Pass 2 claimed indices
    for (const idx of jaccardClaimed)
        claimedIndices.add(idx);
    // Pass 3: Semantic embedding cosine similarity for still-unclaimed functions
    if (semanticEnabled) {
        const semanticRemaining = functions
            .map((fn, i) => ({ fn, idx: i }))
            .filter(({ idx }) => !claimedIndices.has(idx))
            // Only functions whose embedding generation succeeded are eligible.
            .filter(({ fn }) => fn.embedding && fn.embedding.length > 0);
        semanticRemaining.sort((a, b) => a.fn.bodyLength - b.fn.bodyLength);
        const semanticClaimed = new Set();
        for (let i = 0; i < semanticRemaining.length; i++) {
            if (semanticClaimed.has(semanticRemaining[i].idx))
                continue;
            const group = [semanticRemaining[i].fn];
            const baseLen = semanticRemaining[i].fn.bodyLength;
            for (let j = i + 1; j < semanticRemaining.length; j++) {
                if (semanticClaimed.has(semanticRemaining[j].idx))
                    continue;
                // Body length must be within 2x range (semantic allows more variance)
                if (semanticRemaining[j].fn.bodyLength > baseLen * 2.0)
                    break;
                if (semanticRemaining[j].fn.file === semanticRemaining[i].fn.file)
                    continue;
                const sim = cosineSimilarity(semanticRemaining[i].fn.embedding, semanticRemaining[j].fn.embedding);
                if (sim >= semanticThreshold) {
                    group.push(semanticRemaining[j].fn);
                    semanticClaimed.add(semanticRemaining[j].idx);
                }
            }
            if (group.length >= 2) {
                const uniqueFiles = new Set(group.map(f => f.file));
                if (uniqueFiles.size >= 2) {
                    duplicates.push(group);
                    semanticClaimed.add(semanticRemaining[i].idx);
                }
            }
        }
    }
    return duplicates;
}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
 * Duplication Drift Tokenizer
 *
 * Handles language-aware tokenization of code functions.
 * Supports tree-sitter AST-based tokenization with fallback to text-based.
 */
/**
 * Probe for the optional tree-sitter dependency.
 * Resolves true when tree-sitter loaded successfully; false means callers
 * must keep using the text-based fallback tokenizer.
 */
export declare function initTreeSitter(): Promise<boolean>;
/** File extension (e.g. '.ts') → grammar file path, resolved against getGrammarDir(). */
export declare const GRAMMAR_PATHS: Record<string, string>;
/** Loaded tree-sitter Language objects, cached per file extension. */
export declare const languageCache: Map<string, any>;
/**
 * AST node types that are structural even as leaf nodes.
 * These carry semantic meaning without children.
 */
export declare function isStructuralLeaf(type: string): boolean;
/**
 * Fallback tokenizer when tree-sitter is not available.
 * Uses normalized text → keyword/operator multiset.
 */
export declare function textTokenize(normalized: string): Map<string, number>;
/**
 * Walk an AST subtree and collect node types as a multiset.
 *
 * This is the core insight: two functions with different variable names
 * but the same control flow produce the same node type multiset.
 *
 * Example:
 * `function a(x) { if (x > 0) return x * 2; return 0; }`
 * `function b(val) { if (val > 0) return val * 2; return 0; }`
 *
 * Both produce: {if_statement: 1, binary_expression: 2, return_statement: 2, ...}
 * → Jaccard similarity = 1.0
 */
export declare function collectASTNodeTypes(node: any): Map<string, number>;
/**
 * Walk the AST tree to find a function/method node at a given line.
 * Returns the matching node, or a falsy value when none is found.
 */
export declare function findFunctionNodeAtLine(rootNode: any, targetLine: number): any;
/**
 * Normalize function body text for tokenization.
 * Removes comments, normalizes whitespace, etc.
 */
export declare function normalizeBody(body: string): string;
/**
 * Extract function body from lines starting at startIndex.
 * Handles brace matching.
 */
export declare function extractFunctionBody(lines: string[], startIndex: number): string[];
/**
 * Get Parser instance (may be null if tree-sitter not available).
 */
export declare function getParser(): any;
/**
 * Get the __dirname path for grammar loading.
 */
export declare function getGrammarDir(): string;
|