@rigour-labs/core 4.3.6 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +46 -10
- package/dist/gates/base.d.ts +3 -0
- package/dist/gates/checkpoint.d.ts +23 -8
- package/dist/gates/checkpoint.js +109 -45
- package/dist/gates/checkpoint.test.js +6 -3
- package/dist/gates/dependency.d.ts +39 -0
- package/dist/gates/dependency.js +212 -5
- package/dist/gates/duplication-drift.d.ts +101 -6
- package/dist/gates/duplication-drift.js +427 -33
- package/dist/gates/logic-drift.d.ts +70 -0
- package/dist/gates/logic-drift.js +280 -0
- package/dist/gates/runner.js +29 -1
- package/dist/gates/style-drift.d.ts +53 -0
- package/dist/gates/style-drift.js +305 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +4 -0
- package/dist/services/adaptive-thresholds.d.ts +54 -10
- package/dist/services/adaptive-thresholds.js +161 -35
- package/dist/services/adaptive-thresholds.test.js +24 -20
- package/dist/services/filesystem-cache.d.ts +50 -0
- package/dist/services/filesystem-cache.js +124 -0
- package/dist/services/temporal-drift.d.ts +101 -0
- package/dist/services/temporal-drift.js +386 -0
- package/dist/templates/universal-config.js +17 -0
- package/dist/types/index.d.ts +196 -0
- package/dist/types/index.js +19 -0
- package/dist/utils/scanner.d.ts +6 -1
- package/dist/utils/scanner.js +8 -1
- package/package.json +6 -6
|
@@ -1,45 +1,108 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Duplication Drift Gate
|
|
2
|
+
* Duplication Drift Gate (v2)
|
|
3
3
|
*
|
|
4
4
|
* Detects when AI generates near-identical functions across files because
|
|
5
5
|
* it doesn't remember what it already wrote. This is an AI-specific failure
|
|
6
6
|
* mode — humans reuse via copy-paste (same file), AI re-invents (cross-file).
|
|
7
7
|
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
8
|
+
* v2 upgrades:
|
|
9
|
+
* - tree-sitter AST node type sequences replace hand-rolled regex tokenizer
|
|
10
|
+
* - Jaccard similarity on AST node multisets (structural, not textual)
|
|
11
|
+
* - Catches duplicates even when every variable name is different
|
|
12
|
+
* - MD5 kept as fast-path for exact matches, Jaccard runs on remaining pairs
|
|
12
13
|
*
|
|
13
|
-
*
|
|
14
|
+
* Detection strategy (three-pass):
|
|
15
|
+
* 1. Extract function bodies, normalize text (strip comments/whitespace)
|
|
16
|
+
* 2. Parse with tree-sitter → walk AST → collect node type multiset
|
|
17
|
+
* 3. Generate semantic embeddings via all-MiniLM-L6-v2 (384D)
|
|
18
|
+
* 4. Pass 1 (fast): MD5 hash → exact duplicates (O(n), <10ms)
|
|
19
|
+
* 5. Pass 2 (Jaccard): AST node multiset similarity → structural near-duplicates (O(n²) bounded)
|
|
20
|
+
* 6. Pass 3 (semantic): Embedding cosine similarity → semantic duplicates (O(n²) bounded)
|
|
21
|
+
* 7. Flag functions with similarity > threshold in different files
|
|
22
|
+
*
|
|
23
|
+
* Why AST node types > raw tokens:
|
|
24
|
+
* - `getUserById(id) { return db.find(x => x.id === id) }`
|
|
25
|
+
* - `fetchUser(userId) { return database.filter(u => u.id === userId)[0] }`
|
|
26
|
+
* Both produce similar AST: [return_statement, call_expression, arrow_function,
|
|
27
|
+
* binary_expression, member_expression]. Variable names are invisible.
|
|
28
|
+
*
|
|
29
|
+
* @since v2.16.0 (original MD5)
|
|
30
|
+
* @since v5.0.0 (tree-sitter AST + Jaccard)
|
|
31
|
+
* @since v5.1.0 (semantic embedding Pass 3)
|
|
14
32
|
*/
|
|
15
33
|
import { Gate } from './base.js';
|
|
16
34
|
import { FileScanner } from '../utils/scanner.js';
|
|
17
35
|
import { Logger } from '../utils/logger.js';
|
|
36
|
+
import { generateEmbedding, cosineSimilarity } from '../pattern-index/embeddings.js';
|
|
18
37
|
import crypto from 'crypto';
|
|
19
38
|
import path from 'path';
|
|
39
|
+
import { fileURLToPath } from 'url';
|
|
40
|
+
// tree-sitter is optional — graceful fallback to text tokenization
|
|
41
|
+
let Parser = null;
|
|
42
|
+
let treeSitterReady = false;
|
|
43
|
+
let treeSitterFailed = false;
|
|
44
|
+
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
45
|
+
/**
 * Load and initialise web-tree-sitter at most once per process.
 *
 * The outcome is remembered in the module-level flags `treeSitterReady` /
 * `treeSitterFailed`, so later calls return immediately without retrying
 * the dynamic import.
 *
 * @returns {Promise<boolean>} true when `Parser` is initialised and usable,
 *   false when loading or initialisation failed (callers keep text tokens).
 */
async function initTreeSitter() {
    // A previous call already settled the outcome — report it without retrying.
    if (treeSitterReady || treeSitterFailed) {
        return treeSitterReady;
    }
    try {
        const loaded = await import('web-tree-sitter');
        // Use the default export when present, otherwise the module namespace itself.
        Parser = loaded.default || loaded;
        await Parser.init();
        treeSitterReady = true;
        return true;
    }
    catch {
        // Any failure is treated as "tree-sitter not installed"; it is not rethrown.
        treeSitterFailed = true;
        Logger.debug('tree-sitter not available, falling back to text tokenization');
        return false;
    }
}
|
|
63
|
+
const GRAMMAR_PATHS = {
|
|
64
|
+
'.ts': '../../vendor/grammars/tree-sitter-typescript.wasm',
|
|
65
|
+
'.tsx': '../../vendor/grammars/tree-sitter-tsx.wasm',
|
|
66
|
+
'.js': '../../vendor/grammars/tree-sitter-javascript.wasm',
|
|
67
|
+
'.jsx': '../../vendor/grammars/tree-sitter-javascript.wasm',
|
|
68
|
+
'.py': '../../vendor/grammars/tree-sitter-python.wasm',
|
|
69
|
+
'.go': '../../vendor/grammars/tree-sitter-go.wasm',
|
|
70
|
+
'.rs': '../../vendor/grammars/tree-sitter-rust.wasm',
|
|
71
|
+
};
|
|
72
|
+
// Cache loaded languages
|
|
73
|
+
const languageCache = new Map();
|
|
20
74
|
export class DuplicationDriftGate extends Gate {
|
|
21
75
|
config;
|
|
76
|
+
parser = null;
|
|
22
77
|
constructor(config = {}) {
|
|
23
78
|
super('duplication-drift', 'AI Duplication Drift Detection');
|
|
24
79
|
this.config = {
|
|
25
80
|
enabled: config.enabled ?? true,
|
|
26
|
-
similarity_threshold: config.similarity_threshold ?? 0.
|
|
81
|
+
similarity_threshold: config.similarity_threshold ?? 0.75,
|
|
82
|
+
semantic_threshold: config.semantic_threshold ?? 0.85,
|
|
83
|
+
semantic_enabled: config.semantic_enabled ?? true,
|
|
27
84
|
min_body_lines: config.min_body_lines ?? 5,
|
|
85
|
+
approved_duplications: config.approved_duplications ?? [],
|
|
28
86
|
};
|
|
29
87
|
}
|
|
30
88
|
get provenance() { return 'ai-drift'; }
|
|
31
89
|
async run(context) {
|
|
32
90
|
if (!this.config.enabled)
|
|
33
91
|
return [];
|
|
92
|
+
// Try to init tree-sitter (non-blocking, falls back gracefully)
|
|
93
|
+
const hasTreeSitter = await initTreeSitter();
|
|
94
|
+
if (hasTreeSitter && !this.parser) {
|
|
95
|
+
this.parser = new Parser();
|
|
96
|
+
}
|
|
34
97
|
const failures = [];
|
|
35
98
|
const functions = [];
|
|
36
|
-
const scanPatterns = context.patterns || ['**/*.{ts,js,tsx,jsx,py}'];
|
|
99
|
+
const scanPatterns = context.patterns || ['**/*.{ts,js,tsx,jsx,py,go,rs}'];
|
|
37
100
|
const files = await FileScanner.findFiles({
|
|
38
101
|
cwd: context.cwd,
|
|
39
102
|
patterns: scanPatterns,
|
|
40
103
|
ignore: [...(context.ignore || []), '**/node_modules/**', '**/dist/**', '**/*.test.*', '**/*.spec.*'],
|
|
41
104
|
});
|
|
42
|
-
Logger.info(`Duplication Drift: Scanning ${files.length} files`);
|
|
105
|
+
Logger.info(`Duplication Drift: Scanning ${files.length} files (tree-sitter: ${hasTreeSitter ? 'ON' : 'fallback'})`);
|
|
43
106
|
for (const file of files) {
|
|
44
107
|
try {
|
|
45
108
|
const { readFile } = await import('fs-extra');
|
|
@@ -51,27 +114,212 @@ export class DuplicationDriftGate extends Gate {
|
|
|
51
114
|
else if (ext === '.py') {
|
|
52
115
|
this.extractPyFunctions(content, file, functions);
|
|
53
116
|
}
|
|
117
|
+
// Generate AST tokens using tree-sitter if available
|
|
118
|
+
if (hasTreeSitter && GRAMMAR_PATHS[ext]) {
|
|
119
|
+
await this.enrichWithASTTokens(content, ext, file, functions);
|
|
120
|
+
}
|
|
54
121
|
}
|
|
55
122
|
catch (e) { }
|
|
56
123
|
}
|
|
57
|
-
//
|
|
124
|
+
// Pass 3 prep: Generate semantic embeddings for all extracted functions
|
|
125
|
+
// (only runs when semantic_enabled is true; embeddings are then generated up front for every extracted function)
|
|
126
|
+
if (this.config.semantic_enabled && functions.length > 0) {
|
|
127
|
+
const allIndices = functions.map((_, i) => i);
|
|
128
|
+
await this.enrichWithEmbeddings(functions, allIndices);
|
|
129
|
+
}
|
|
58
130
|
const duplicateGroups = this.findDuplicateGroups(functions);
|
|
131
|
+
// Build approved pairs set for fast lookup
|
|
132
|
+
const approvedSet = new Set((this.config.approved_duplications || []).map(s => s.toLowerCase()));
|
|
59
133
|
for (const group of duplicateGroups) {
|
|
134
|
+
// Check if this pair is human-approved
|
|
135
|
+
const names = group.map(f => f.name).sort();
|
|
136
|
+
const pairKey = names.join(':').toLowerCase();
|
|
137
|
+
if (approvedSet.has(pairKey))
|
|
138
|
+
continue;
|
|
60
139
|
const files = group.map(f => f.file);
|
|
61
140
|
const locations = group.map(f => `${f.file}:${f.line} (${f.name})`).join(', ');
|
|
62
|
-
|
|
141
|
+
// Determine similarity % and method used
|
|
142
|
+
let similarity;
|
|
143
|
+
let method;
|
|
144
|
+
if (group[0].bodyHash === group[1]?.bodyHash) {
|
|
145
|
+
similarity = 1.0;
|
|
146
|
+
method = 'exact-hash';
|
|
147
|
+
}
|
|
148
|
+
else if (group[0].embedding && group[1]?.embedding) {
|
|
149
|
+
const jaccardSim = this.jaccardSimilarity(group[0].astTokens, group[1].astTokens);
|
|
150
|
+
const cosineSim = cosineSimilarity(group[0].embedding, group[1].embedding);
|
|
151
|
+
if (cosineSim > jaccardSim) {
|
|
152
|
+
similarity = cosineSim;
|
|
153
|
+
method = 'semantic-embedding';
|
|
154
|
+
}
|
|
155
|
+
else {
|
|
156
|
+
similarity = jaccardSim;
|
|
157
|
+
method = 'ast-jaccard';
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
else {
|
|
161
|
+
similarity = group.length > 1
|
|
162
|
+
? this.jaccardSimilarity(group[0].astTokens, group[1].astTokens)
|
|
163
|
+
: 1.0;
|
|
164
|
+
method = 'ast-jaccard';
|
|
165
|
+
}
|
|
166
|
+
const pct = (similarity * 100).toFixed(0);
|
|
167
|
+
failures.push(this.createFailure(`AI Duplication Drift: Function '${group[0].name}' has ${group.length} near-identical copies (${pct}% similar via ${method})`, [...new Set(files)], `Found duplicate implementations at: ${locations}. Extract to a shared module and import.`, 'Duplication Drift', group[0].line, undefined, 'high'));
|
|
63
168
|
}
|
|
64
169
|
return failures;
|
|
65
170
|
}
|
|
171
|
+
// ─── tree-sitter AST Tokenization ───────────────────────────────
|
|
172
|
+
/**
|
|
173
|
+
* Parse the file with tree-sitter, find function nodes that match
|
|
174
|
+
* our extracted functions (by line number), and replace their token
|
|
175
|
+
* multisets with AST node type sequences.
|
|
176
|
+
*
|
|
177
|
+
* AST node types are language-agnostic structural tokens:
|
|
178
|
+
* - if_statement, for_statement, return_statement
|
|
179
|
+
* - call_expression, member_expression, binary_expression
|
|
180
|
+
* - arrow_function, function_declaration
|
|
181
|
+
*
|
|
182
|
+
* Variable names, string literals, comments — all invisible.
|
|
183
|
+
* Only STRUCTURE matters.
|
|
184
|
+
*/
|
|
185
|
+
async enrichWithASTTokens(content, ext, file, functions) {
|
|
186
|
+
if (!this.parser)
|
|
187
|
+
return;
|
|
188
|
+
const grammarRelPath = GRAMMAR_PATHS[ext];
|
|
189
|
+
if (!grammarRelPath)
|
|
190
|
+
return;
|
|
191
|
+
try {
|
|
192
|
+
// Load language (cached)
|
|
193
|
+
if (!languageCache.has(ext)) {
|
|
194
|
+
const grammarPath = path.resolve(__dirname, grammarRelPath);
|
|
195
|
+
const lang = await Parser.Language.load(grammarPath);
|
|
196
|
+
languageCache.set(ext, lang);
|
|
197
|
+
}
|
|
198
|
+
const lang = languageCache.get(ext);
|
|
199
|
+
this.parser.setLanguage(lang);
|
|
200
|
+
const tree = this.parser.parse(content);
|
|
201
|
+
// Find functions that belong to this file
|
|
202
|
+
const fileFunctions = functions.filter(f => f.file === file);
|
|
203
|
+
for (const fn of fileFunctions) {
|
|
204
|
+
// Find the AST node at this function's line
|
|
205
|
+
const node = this.findFunctionNodeAtLine(tree.rootNode, fn.line);
|
|
206
|
+
if (node) {
|
|
207
|
+
fn.astTokens = this.collectASTNodeTypes(node);
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
catch (e) {
|
|
212
|
+
// tree-sitter parse failed for this file — keep text tokens
|
|
213
|
+
Logger.debug(`tree-sitter parse failed for ${file}: ${e}`);
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
/**
|
|
217
|
+
* Walk the AST tree to find a function/method node at a given line.
|
|
218
|
+
*/
|
|
219
|
+
findFunctionNodeAtLine(rootNode, targetLine) {
|
|
220
|
+
const functionTypes = new Set([
|
|
221
|
+
'function_declaration', 'method_definition', 'arrow_function',
|
|
222
|
+
'function_definition', // Python
|
|
223
|
+
'function_item', // Rust
|
|
224
|
+
'method_declaration', // Java/C#
|
|
225
|
+
'lexical_declaration', // const x = () => {}
|
|
226
|
+
]);
|
|
227
|
+
let bestMatch = null;
|
|
228
|
+
const walk = (node) => {
|
|
229
|
+
// tree-sitter lines are 0-indexed, our lines are 1-indexed
|
|
230
|
+
if (functionTypes.has(node.type) && node.startPosition.row + 1 === targetLine) {
|
|
231
|
+
bestMatch = node;
|
|
232
|
+
return;
|
|
233
|
+
}
|
|
234
|
+
for (let i = 0; i < node.childCount; i++) {
|
|
235
|
+
walk(node.child(i));
|
|
236
|
+
if (bestMatch)
|
|
237
|
+
return;
|
|
238
|
+
}
|
|
239
|
+
};
|
|
240
|
+
walk(rootNode);
|
|
241
|
+
return bestMatch;
|
|
242
|
+
}
|
|
243
|
+
/**
|
|
244
|
+
* Walk an AST subtree and collect node types as a multiset.
|
|
245
|
+
*
|
|
246
|
+
* This is the core insight: two functions with different variable names
|
|
247
|
+
* but the same control flow produce the same node type multiset.
|
|
248
|
+
*
|
|
249
|
+
* Example:
|
|
250
|
+
* `function a(x) { if (x > 0) return x * 2; return 0; }`
|
|
251
|
+
* `function b(val) { if (val > 0) return val * 2; return 0; }`
|
|
252
|
+
*
|
|
253
|
+
* Both produce: {if_statement: 1, binary_expression: 2, return_statement: 2, ...}
|
|
254
|
+
* → Jaccard similarity = 1.0
|
|
255
|
+
*/
|
|
256
|
+
collectASTNodeTypes(node) {
|
|
257
|
+
const types = new Map();
|
|
258
|
+
const walk = (n) => {
|
|
259
|
+
// Skip leaf nodes that are just identifiers/literals (noise)
|
|
260
|
+
// Keep structural node types only
|
|
261
|
+
if (n.childCount > 0 || isStructuralLeaf(n.type)) {
|
|
262
|
+
types.set(n.type, (types.get(n.type) || 0) + 1);
|
|
263
|
+
}
|
|
264
|
+
for (let i = 0; i < n.childCount; i++) {
|
|
265
|
+
walk(n.child(i));
|
|
266
|
+
}
|
|
267
|
+
};
|
|
268
|
+
walk(node);
|
|
269
|
+
return types;
|
|
270
|
+
}
|
|
271
|
+
// ─── Fallback Text Tokenization ─────────────────────────────────
|
|
272
|
+
/**
|
|
273
|
+
* Fallback tokenizer when tree-sitter is not available.
|
|
274
|
+
* Uses normalized text → keyword/operator multiset.
|
|
275
|
+
*/
|
|
276
|
+
textTokenize(normalized) {
|
|
277
|
+
const tokens = new Map();
|
|
278
|
+
const structural = normalized.match(/\b(if|else|for|while|return|const|let|var|function|class|import|export|async|await|try|catch|throw|new|switch|case|break|continue|yield|def|self)\b|[{}()\[\];,.:=<>!&|+\-*/%?]+/g) || [];
|
|
279
|
+
for (const token of structural) {
|
|
280
|
+
tokens.set(token, (tokens.get(token) || 0) + 1);
|
|
281
|
+
}
|
|
282
|
+
// Normalize all identifiers to a count (variable names don't matter)
|
|
283
|
+
const keywords = new Set([
|
|
284
|
+
'if', 'else', 'for', 'while', 'return', 'const', 'let', 'var',
|
|
285
|
+
'function', 'class', 'import', 'export', 'async', 'await',
|
|
286
|
+
'try', 'catch', 'throw', 'new', 'switch', 'case', 'break',
|
|
287
|
+
'continue', 'yield', 'def', 'self', 'true', 'false', 'null', 'undefined',
|
|
288
|
+
]);
|
|
289
|
+
const identifiers = normalized.match(/\b[a-zA-Z_]\w*\b/g) || [];
|
|
290
|
+
let idCount = 0;
|
|
291
|
+
for (const id of identifiers) {
|
|
292
|
+
if (!keywords.has(id))
|
|
293
|
+
idCount++;
|
|
294
|
+
}
|
|
295
|
+
if (idCount > 0)
|
|
296
|
+
tokens.set('_ID_', idCount);
|
|
297
|
+
return tokens;
|
|
298
|
+
}
|
|
299
|
+
// ─── Jaccard Similarity ─────────────────────────────────────────
|
|
300
|
+
/**
|
|
301
|
+
* Jaccard similarity on multisets.
|
|
302
|
+
* intersection = sum of min(countA, countB) for each key
|
|
303
|
+
* union = sum of max(countA, countB) for each key
|
|
304
|
+
*/
|
|
305
|
+
jaccardSimilarity(a, b) {
|
|
306
|
+
const allKeys = new Set([...a.keys(), ...b.keys()]);
|
|
307
|
+
let intersection = 0;
|
|
308
|
+
let union = 0;
|
|
309
|
+
for (const key of allKeys) {
|
|
310
|
+
const countA = a.get(key) || 0;
|
|
311
|
+
const countB = b.get(key) || 0;
|
|
312
|
+
intersection += Math.min(countA, countB);
|
|
313
|
+
union += Math.max(countA, countB);
|
|
314
|
+
}
|
|
315
|
+
return union === 0 ? 0 : intersection / union;
|
|
316
|
+
}
|
|
317
|
+
// ─── Function Extraction ────────────────────────────────────────
|
|
66
318
|
extractJSFunctions(content, file, functions) {
|
|
67
319
|
const lines = content.split('\n');
|
|
68
|
-
// Match function declarations, arrow functions, and method definitions
|
|
69
320
|
const patterns = [
|
|
70
|
-
// function name(...) {
|
|
71
321
|
/^(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\(([^)]*)\)/,
|
|
72
|
-
// const name = (...) => {
|
|
73
322
|
/^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|(\w+))\s*=>/,
|
|
74
|
-
// name(...) { — class method
|
|
75
323
|
/^\s+(?:async\s+)?(\w+)\s*\(([^)]*)\)\s*\{/,
|
|
76
324
|
];
|
|
77
325
|
for (let i = 0; i < lines.length; i++) {
|
|
@@ -92,6 +340,8 @@ export class DuplicationDriftGate extends Gate {
|
|
|
92
340
|
bodyHash: this.hash(normalized),
|
|
93
341
|
bodyLength: body.length,
|
|
94
342
|
normalized,
|
|
343
|
+
// Start with text tokens, enrichWithASTTokens() upgrades if tree-sitter available
|
|
344
|
+
astTokens: this.textTokenize(normalized),
|
|
95
345
|
});
|
|
96
346
|
}
|
|
97
347
|
break;
|
|
@@ -107,7 +357,6 @@ export class DuplicationDriftGate extends Gate {
|
|
|
107
357
|
const name = match[1];
|
|
108
358
|
const params = match[2] || '';
|
|
109
359
|
const indent = lines[i].match(/^(\s*)/)?.[1]?.length || 0;
|
|
110
|
-
// Extract body by indentation
|
|
111
360
|
const body = [];
|
|
112
361
|
for (let j = i + 1; j < lines.length; j++) {
|
|
113
362
|
const lineIndent = lines[j].match(/^(\s*)/)?.[1]?.length || 0;
|
|
@@ -128,6 +377,7 @@ export class DuplicationDriftGate extends Gate {
|
|
|
128
377
|
bodyHash: this.hash(normalized),
|
|
129
378
|
bodyLength: body.length,
|
|
130
379
|
normalized,
|
|
380
|
+
astTokens: this.textTokenize(normalized),
|
|
131
381
|
});
|
|
132
382
|
}
|
|
133
383
|
}
|
|
@@ -156,36 +406,180 @@ export class DuplicationDriftGate extends Gate {
|
|
|
156
406
|
}
|
|
157
407
|
normalizeBody(body) {
|
|
158
408
|
return body
|
|
159
|
-
.replace(/\/\/.*/g, '')
|
|
160
|
-
.replace(/\/\*[\s\S]*?\*\//g, '')
|
|
161
|
-
.replace(/#.*/g, '')
|
|
162
|
-
.replace(/`[^`]*`/g, '"STR"')
|
|
163
|
-
.replace(/\basync\s+/g, '')
|
|
164
|
-
.replace(/\s+/g, ' ')
|
|
165
|
-
.replace(/['"]/g, '"')
|
|
409
|
+
.replace(/\/\/.*/g, '')
|
|
410
|
+
.replace(/\/\*[\s\S]*?\*\//g, '')
|
|
411
|
+
.replace(/#.*/g, '')
|
|
412
|
+
.replace(/`[^`]*`/g, '"STR"')
|
|
413
|
+
.replace(/\basync\s+/g, '')
|
|
414
|
+
.replace(/\s+/g, ' ')
|
|
415
|
+
.replace(/['"]/g, '"')
|
|
166
416
|
.trim();
|
|
167
417
|
}
|
|
168
418
|
hash(text) {
|
|
169
419
|
return crypto.createHash('md5').update(text).digest('hex');
|
|
170
420
|
}
|
|
421
|
+
// ─── Semantic Embedding ─────────────────────────────────────────
|
|
422
|
+
/**
|
|
423
|
+
* Generate semantic embedding text for a function.
|
|
424
|
+
* Combines function name, parameter names, and first 200 tokens of body.
|
|
425
|
+
* This captures INTENT regardless of implementation differences.
|
|
426
|
+
*
|
|
427
|
+
* Example:
|
|
428
|
+
* getUserById(id) { return db.users.find(x => x.id === id) }
|
|
429
|
+
* → "getUserById id return db users find x id id"
|
|
430
|
+
*
|
|
431
|
+
* fetchUserRecord(userId) { return database.users.filter(u => u.id === userId)[0] }
|
|
432
|
+
* → "fetchUserRecord userId return database users filter u id userId 0"
|
|
433
|
+
*
|
|
434
|
+
* These produce similar embeddings (~0.91 cosine) despite different AST.
|
|
435
|
+
*/
|
|
436
|
+
buildEmbeddingText(fn) {
|
|
437
|
+
// Extract identifiers from normalized body (first 200 tokens)
|
|
438
|
+
const bodyTokens = fn.normalized.match(/\b[a-zA-Z_]\w*\b/g) || [];
|
|
439
|
+
const first200 = bodyTokens.slice(0, 200).join(' ');
|
|
440
|
+
return `${fn.name} ${first200}`;
|
|
441
|
+
}
|
|
442
|
+
/**
|
|
443
|
+
* Enrich functions with semantic embeddings for Pass 3.
|
|
444
|
+
* Only called for functions not already claimed by Pass 1/2.
|
|
445
|
+
* Uses generateEmbedding() from pattern-index/embeddings.ts.
|
|
446
|
+
*/
|
|
447
|
+
async enrichWithEmbeddings(functions, indices) {
|
|
448
|
+
Logger.info(`Semantic Pass 3: Generating embeddings for ${indices.length} functions`);
|
|
449
|
+
for (const idx of indices) {
|
|
450
|
+
const fn = functions[idx];
|
|
451
|
+
try {
|
|
452
|
+
const text = this.buildEmbeddingText(fn);
|
|
453
|
+
fn.embedding = await generateEmbedding(text);
|
|
454
|
+
}
|
|
455
|
+
catch {
|
|
456
|
+
// Embedding failed — skip this function for Pass 3
|
|
457
|
+
Logger.debug(`Embedding generation failed for ${fn.file}:${fn.name}`);
|
|
458
|
+
}
|
|
459
|
+
}
|
|
460
|
+
}
|
|
461
|
+
// ─── Duplicate Finding (three-pass) ──────────────────────────────
|
|
462
|
+
/**
|
|
463
|
+
* Three-pass duplicate detection:
|
|
464
|
+
* Pass 1 (fast): MD5 hash → exact duplicates (O(n))
|
|
465
|
+
* Pass 2 (Jaccard): AST node multiset similarity → near-duplicates (O(n²) bounded)
|
|
466
|
+
* Pass 3 (semantic): Embedding cosine similarity → semantic duplicates (O(n²) bounded)
|
|
467
|
+
*
|
|
468
|
+
* Pass 3 catches what AST Jaccard misses: same intent, different implementation.
|
|
469
|
+
* Example: .find() vs .filter()[0] — different AST nodes, same semantic meaning.
|
|
470
|
+
*/
|
|
171
471
|
findDuplicateGroups(functions) {
|
|
172
|
-
const groups = new Map();
|
|
173
|
-
// Group by body hash (exact duplicates across files)
|
|
174
|
-
for (const fn of functions) {
|
|
175
|
-
const existing = groups.get(fn.bodyHash) || [];
|
|
176
|
-
existing.push(fn);
|
|
177
|
-
groups.set(fn.bodyHash, existing);
|
|
178
|
-
}
|
|
179
|
-
// Filter: only groups with functions from DIFFERENT files, 2+ members
|
|
180
472
|
const duplicates = [];
|
|
181
|
-
|
|
182
|
-
|
|
473
|
+
const claimedIndices = new Set();
|
|
474
|
+
// Pass 1: Exact hash match
|
|
475
|
+
const hashGroups = new Map();
|
|
476
|
+
for (let i = 0; i < functions.length; i++) {
|
|
477
|
+
const existing = hashGroups.get(functions[i].bodyHash) || [];
|
|
478
|
+
existing.push(i);
|
|
479
|
+
hashGroups.set(functions[i].bodyHash, existing);
|
|
480
|
+
}
|
|
481
|
+
for (const indices of hashGroups.values()) {
|
|
482
|
+
if (indices.length < 2)
|
|
183
483
|
continue;
|
|
484
|
+
const group = indices.map(i => functions[i]);
|
|
184
485
|
const uniqueFiles = new Set(group.map(f => f.file));
|
|
185
486
|
if (uniqueFiles.size >= 2) {
|
|
186
487
|
duplicates.push(group);
|
|
488
|
+
indices.forEach(i => claimedIndices.add(i));
|
|
489
|
+
}
|
|
490
|
+
}
|
|
491
|
+
// Pass 2: Jaccard on AST tokens for remaining functions
|
|
492
|
+
const remaining = functions
|
|
493
|
+
.map((fn, i) => ({ fn, idx: i }))
|
|
494
|
+
.filter(({ idx }) => !claimedIndices.has(idx));
|
|
495
|
+
remaining.sort((a, b) => a.fn.bodyLength - b.fn.bodyLength);
|
|
496
|
+
const jaccardClaimed = new Set();
|
|
497
|
+
for (let i = 0; i < remaining.length; i++) {
|
|
498
|
+
if (jaccardClaimed.has(remaining[i].idx))
|
|
499
|
+
continue;
|
|
500
|
+
const group = [remaining[i].fn];
|
|
501
|
+
const baseLen = remaining[i].fn.bodyLength;
|
|
502
|
+
for (let j = i + 1; j < remaining.length; j++) {
|
|
503
|
+
if (jaccardClaimed.has(remaining[j].idx))
|
|
504
|
+
continue;
|
|
505
|
+
if (remaining[j].fn.bodyLength > baseLen * 1.5)
|
|
506
|
+
break;
|
|
507
|
+
if (remaining[j].fn.file === remaining[i].fn.file)
|
|
508
|
+
continue;
|
|
509
|
+
const sim = this.jaccardSimilarity(remaining[i].fn.astTokens, remaining[j].fn.astTokens);
|
|
510
|
+
if (sim >= this.config.similarity_threshold) {
|
|
511
|
+
group.push(remaining[j].fn);
|
|
512
|
+
jaccardClaimed.add(remaining[j].idx);
|
|
513
|
+
}
|
|
514
|
+
}
|
|
515
|
+
if (group.length >= 2) {
|
|
516
|
+
const uniqueFiles = new Set(group.map(f => f.file));
|
|
517
|
+
if (uniqueFiles.size >= 2) {
|
|
518
|
+
duplicates.push(group);
|
|
519
|
+
jaccardClaimed.add(remaining[i].idx);
|
|
520
|
+
}
|
|
521
|
+
}
|
|
522
|
+
}
|
|
523
|
+
// Mark all Pass 1 + Pass 2 claimed indices
|
|
524
|
+
for (const idx of jaccardClaimed)
|
|
525
|
+
claimedIndices.add(idx);
|
|
526
|
+
// Pass 3: Semantic embedding cosine similarity for still-unclaimed functions
|
|
527
|
+
if (this.config.semantic_enabled) {
|
|
528
|
+
const semanticRemaining = functions
|
|
529
|
+
.map((fn, i) => ({ fn, idx: i }))
|
|
530
|
+
.filter(({ idx }) => !claimedIndices.has(idx))
|
|
531
|
+
.filter(({ fn }) => fn.embedding && fn.embedding.length > 0);
|
|
532
|
+
semanticRemaining.sort((a, b) => a.fn.bodyLength - b.fn.bodyLength);
|
|
533
|
+
const semanticClaimed = new Set();
|
|
534
|
+
for (let i = 0; i < semanticRemaining.length; i++) {
|
|
535
|
+
if (semanticClaimed.has(semanticRemaining[i].idx))
|
|
536
|
+
continue;
|
|
537
|
+
const group = [semanticRemaining[i].fn];
|
|
538
|
+
const baseLen = semanticRemaining[i].fn.bodyLength;
|
|
539
|
+
for (let j = i + 1; j < semanticRemaining.length; j++) {
|
|
540
|
+
if (semanticClaimed.has(semanticRemaining[j].idx))
|
|
541
|
+
continue;
|
|
542
|
+
// Body length must be within 2x range (semantic allows more variance)
|
|
543
|
+
if (semanticRemaining[j].fn.bodyLength > baseLen * 2.0)
|
|
544
|
+
break;
|
|
545
|
+
if (semanticRemaining[j].fn.file === semanticRemaining[i].fn.file)
|
|
546
|
+
continue;
|
|
547
|
+
const sim = cosineSimilarity(semanticRemaining[i].fn.embedding, semanticRemaining[j].fn.embedding);
|
|
548
|
+
if (sim >= this.config.semantic_threshold) {
|
|
549
|
+
group.push(semanticRemaining[j].fn);
|
|
550
|
+
semanticClaimed.add(semanticRemaining[j].idx);
|
|
551
|
+
}
|
|
552
|
+
}
|
|
553
|
+
if (group.length >= 2) {
|
|
554
|
+
const uniqueFiles = new Set(group.map(f => f.file));
|
|
555
|
+
if (uniqueFiles.size >= 2) {
|
|
556
|
+
duplicates.push(group);
|
|
557
|
+
semanticClaimed.add(semanticRemaining[i].idx);
|
|
558
|
+
}
|
|
559
|
+
}
|
|
560
|
+
}
|
|
561
|
+
if (semanticClaimed.size > 0) {
|
|
562
|
+
Logger.info(`Semantic Pass 3: Found ${semanticClaimed.size} additional semantic duplicates`);
|
|
187
563
|
}
|
|
188
564
|
}
|
|
189
565
|
return duplicates;
|
|
190
566
|
}
|
|
191
567
|
}
|
|
568
|
+
/**
 * AST node types that are kept even when the node has no children.
 * Built once at module load instead of on every isStructuralLeaf() call,
 * which runs once per visited AST node.
 */
const STRUCTURAL_LEAF_TYPES = new Set([
    'return', 'break', 'continue', 'yield', 'throw',
    'true', 'false', 'null', 'undefined', 'none',
    'self', 'this', 'super',
    'string', 'number', 'template_string',
    // Operators
    '=', '==', '===', '!=', '!==', '<', '>', '<=', '>=',
    '+', '-', '*', '/', '%', '**',
    '&&', '||', '!', '??',
    '=>', '...', '?', ':',
]);
/**
 * Tell whether a leaf AST node type carries structural meaning.
 *
 * @param type tree-sitter node type (case-sensitive).
 * @returns {boolean} true when the type is in the structural leaf list.
 */
function isStructuralLeaf(type) {
    return STRUCTURAL_LEAF_TYPES.has(type);
}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Logic Drift Foundation Gate
|
|
3
|
+
*
|
|
4
|
+
* Detects when AI subtly changes business logic in functions:
|
|
5
|
+
* - Comparison operator mutations: >= became > (off-by-one)
|
|
6
|
+
* - Return statement additions/removals
|
|
7
|
+
* - Branch count changes (new if/else added or removed)
|
|
8
|
+
* - Call sequence changes (function calls reordered)
|
|
9
|
+
*
|
|
10
|
+
* This is the HARDEST drift to catch because:
|
|
11
|
+
* - Code still compiles
|
|
12
|
+
* - Tests might still pass (if they don't cover edge cases)
|
|
13
|
+
* - The change looks intentional ("AI refactored the function")
|
|
14
|
+
*
|
|
15
|
+
* Strategy: Collect baselines for critical functions, then detect
|
|
16
|
+
* mutations between scans. This foundation enables future LLM-powered
|
|
17
|
+
* deeper analysis (feeding baselines into DriftBench training).
|
|
18
|
+
*
|
|
19
|
+
* @since v5.1.0
|
|
20
|
+
*/
|
|
21
|
+
import { Gate, GateContext } from './base.js';
|
|
22
|
+
import { Failure, Provenance } from '../types/index.js';
|
|
23
|
+
/**
 * Options accepted by the LogicDriftGate constructor. Every field is optional.
 */
export interface LogicDriftConfig {
|
|
24
|
+
/** NOTE(review): presumably switches the whole gate on or off — confirm in logic-drift.js. */
enabled?: boolean;
|
|
25
|
+
/** NOTE(review): presumably where function baselines are kept between scans — confirm. */
baseline_path?: string;
|
|
26
|
+
/** NOTE(review): presumably toggles comparison-operator mutation checks — confirm. */
track_operators?: boolean;
|
|
27
|
+
/** NOTE(review): presumably toggles branch-count change checks — confirm. */
track_branches?: boolean;
|
|
28
|
+
/** NOTE(review): presumably toggles return-statement change checks — confirm. */
track_returns?: boolean;
|
|
29
|
+
}
|
|
30
|
+
/**
 * Gate that looks for subtle business-logic changes in functions:
 * comparison operator mutations, return statement additions/removals,
 * branch count changes and call sequence changes. Only the declaration is
 * visible here; the implementation lives in logic-drift.js.
 */
export declare class LogicDriftGate extends Gate {
|
|
31
|
+
/** NOTE(review): presumably the resolved LogicDriftConfig — confirm in logic-drift.js. */
private config;
|
|
32
|
+
/** @param config Optional gate options (see LogicDriftConfig). */
constructor(config?: LogicDriftConfig);
|
|
33
|
+
/** Provenance value reported by this gate. */
protected get provenance(): Provenance;
|
|
34
|
+
/** Runs the gate for the given context and resolves to the failures found. */
run(context: GateContext): Promise<Failure[]>;
|
|
35
|
+
/** NOTE(review): by name, collects per-function baselines — signature not visible here, confirm. */
private extractFunctionBaselines;
|
|
36
|
+
/** NOTE(review): by name, extracts a function body from source text — confirm. */
private extractBody;
|
|
37
|
+
/**
|
|
38
|
+
* Extract all comparison operators from function body in order.
|
|
39
|
+
* These are the most critical mutations: >= to > causes off-by-one.
|
|
40
|
+
*/
|
|
41
|
+
private extractComparisonOps;
|
|
42
|
+
/** NOTE(review): by name, counts branches in a function body — confirm. */
private countBranches;
|
|
43
|
+
/** NOTE(review): by name, counts return statements in a function body — confirm. */
private countReturns;
|
|
44
|
+
/**
|
|
45
|
+
* Extract ordered sequence of function calls.
|
|
46
|
+
* Useful for detecting when AI reorders operations.
|
|
47
|
+
*/
|
|
48
|
+
private extractCallSequence;
|
|
49
|
+
/**
|
|
50
|
+
* Detect specific operator mutations between two ordered operator lists.
|
|
51
|
+
* Only reports CHANGED operators, not added/removed ones (those are
|
|
52
|
+
* covered by branch count changes).
|
|
53
|
+
*
|
|
54
|
+
* Example:
|
|
55
|
+
* prev: ['>=', '===', '!==']
|
|
56
|
+
* curr: ['>', '===', '!==']
|
|
57
|
+
* → [{from: '>=', to: '>'}]
|
|
58
|
+
*/
|
|
59
|
+
private detectOperatorMutations;
|
|
60
|
+
/**
|
|
61
|
+
* Classify whether an operator change is "dangerous" (likely unintentional).
|
|
62
|
+
*
|
|
63
|
+
* Dangerous mutations:
|
|
64
|
+
* - >= to > (boundary change, off-by-one)
|
|
65
|
+
* - <= to < (boundary change)
|
|
66
|
+
* - === to == (type coercion change)
|
|
67
|
+
* - !== to != (type coercion change)
|
|
68
|
+
*/
|
|
69
|
+
private isDangerousMutation;
|
|
70
|
+
}
|