@rigour-labs/core 4.3.5 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,45 +1,108 @@
1
1
  /**
2
- * Duplication Drift Gate
2
+ * Duplication Drift Gate (v2)
3
3
  *
4
4
  * Detects when AI generates near-identical functions across files because
5
5
  * it doesn't remember what it already wrote. This is an AI-specific failure
6
6
  * mode — humans reuse via copy-paste (same file), AI re-invents (cross-file).
7
7
  *
8
- * Detection strategy:
9
- * 1. Extract all function bodies (normalized: strip whitespace, comments)
10
- * 2. Compare function signatures + body hashes across files
11
- * 3. Flag functions with >80% similarity in different files
8
+ * v2 upgrades:
9
+ * - tree-sitter AST node type sequences replace hand-rolled regex tokenizer
10
+ * - Jaccard similarity on AST node multisets (structural, not textual)
11
+ * - Catches duplicates even when every variable name is different
12
+ * - MD5 kept as fast-path for exact matches, Jaccard runs on remaining pairs
12
13
  *
13
- * @since v2.16.0
14
+ * Detection strategy (three-pass):
15
+ * 1. Extract function bodies, normalize text (strip comments/whitespace)
16
+ * 2. Parse with tree-sitter → walk AST → collect node type multiset
17
+ * 3. Generate semantic embeddings via all-MiniLM-L6-v2 (384D)
18
+ * 4. Pass 1 (fast): MD5 hash → exact duplicates (O(n), <10ms)
19
+ * 5. Pass 2 (Jaccard): AST node multiset similarity → structural near-duplicates (O(n²) bounded)
20
+ * 6. Pass 3 (semantic): Embedding cosine similarity → semantic duplicates (O(n²) bounded)
21
+ * 7. Flag functions with similarity > threshold in different files
22
+ *
23
+ * Why AST node types > raw tokens:
24
+ * - `getUserById(id) { return db.find(x => x.id === id) }`
25
+ * - `fetchUser(userId) { return database.filter(u => u.id === userId)[0] }`
26
+ * Both produce similar AST: [return_statement, call_expression, arrow_function,
27
+ * binary_expression, member_expression]. Variable names are invisible.
28
+ *
29
+ * @since v2.16.0 (original MD5)
30
+ * @since v5.0.0 (tree-sitter AST + Jaccard)
31
+ * @since v5.1.0 (semantic embedding Pass 3)
14
32
  */
15
33
  import { Gate } from './base.js';
16
34
  import { FileScanner } from '../utils/scanner.js';
17
35
  import { Logger } from '../utils/logger.js';
36
+ import { generateEmbedding, cosineSimilarity } from '../pattern-index/embeddings.js';
18
37
  import crypto from 'crypto';
19
38
  import path from 'path';
39
+ import { fileURLToPath } from 'url';
40
+ // tree-sitter is optional — graceful fallback to text tokenization
41
+ let Parser = null;
42
+ let treeSitterReady = false;
43
+ let treeSitterFailed = false;
44
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
45
+ async function initTreeSitter() {
46
+ if (treeSitterReady)
47
+ return true;
48
+ if (treeSitterFailed)
49
+ return false;
50
+ try {
51
+ const mod = await import('web-tree-sitter');
52
+ Parser = mod.default || mod;
53
+ await Parser.init();
54
+ treeSitterReady = true;
55
+ return true;
56
+ }
57
+ catch {
58
+ treeSitterFailed = true;
59
+ Logger.debug('tree-sitter not available, falling back to text tokenization');
60
+ return false;
61
+ }
62
+ }
63
+ const GRAMMAR_PATHS = {
64
+ '.ts': '../../vendor/grammars/tree-sitter-typescript.wasm',
65
+ '.tsx': '../../vendor/grammars/tree-sitter-tsx.wasm',
66
+ '.js': '../../vendor/grammars/tree-sitter-javascript.wasm',
67
+ '.jsx': '../../vendor/grammars/tree-sitter-javascript.wasm',
68
+ '.py': '../../vendor/grammars/tree-sitter-python.wasm',
69
+ '.go': '../../vendor/grammars/tree-sitter-go.wasm',
70
+ '.rs': '../../vendor/grammars/tree-sitter-rust.wasm',
71
+ };
72
+ // Cache loaded languages
73
+ const languageCache = new Map();
20
74
  export class DuplicationDriftGate extends Gate {
21
75
  config;
76
+ parser = null;
22
77
  constructor(config = {}) {
23
78
  super('duplication-drift', 'AI Duplication Drift Detection');
24
79
  this.config = {
25
80
  enabled: config.enabled ?? true,
26
- similarity_threshold: config.similarity_threshold ?? 0.8,
81
+ similarity_threshold: config.similarity_threshold ?? 0.75,
82
+ semantic_threshold: config.semantic_threshold ?? 0.85,
83
+ semantic_enabled: config.semantic_enabled ?? true,
27
84
  min_body_lines: config.min_body_lines ?? 5,
85
+ approved_duplications: config.approved_duplications ?? [],
28
86
  };
29
87
  }
30
88
  get provenance() { return 'ai-drift'; }
31
89
  async run(context) {
32
90
  if (!this.config.enabled)
33
91
  return [];
92
+ // Try to init tree-sitter (non-blocking, falls back gracefully)
93
+ const hasTreeSitter = await initTreeSitter();
94
+ if (hasTreeSitter && !this.parser) {
95
+ this.parser = new Parser();
96
+ }
34
97
  const failures = [];
35
98
  const functions = [];
36
- const scanPatterns = context.patterns || ['**/*.{ts,js,tsx,jsx,py}'];
99
+ const scanPatterns = context.patterns || ['**/*.{ts,js,tsx,jsx,py,go,rs}'];
37
100
  const files = await FileScanner.findFiles({
38
101
  cwd: context.cwd,
39
102
  patterns: scanPatterns,
40
103
  ignore: [...(context.ignore || []), '**/node_modules/**', '**/dist/**', '**/*.test.*', '**/*.spec.*'],
41
104
  });
42
- Logger.info(`Duplication Drift: Scanning ${files.length} files`);
105
+ Logger.info(`Duplication Drift: Scanning ${files.length} files (tree-sitter: ${hasTreeSitter ? 'ON' : 'fallback'})`);
43
106
  for (const file of files) {
44
107
  try {
45
108
  const { readFile } = await import('fs-extra');
@@ -51,27 +114,212 @@ export class DuplicationDriftGate extends Gate {
51
114
  else if (ext === '.py') {
52
115
  this.extractPyFunctions(content, file, functions);
53
116
  }
117
+ // Generate AST tokens using tree-sitter if available
118
+ if (hasTreeSitter && GRAMMAR_PATHS[ext]) {
119
+ await this.enrichWithASTTokens(content, ext, file, functions);
120
+ }
54
121
  }
55
122
  catch (e) { }
56
123
  }
57
- // Compare all function pairs across different files
124
+ // Pass 3 prep: Generate semantic embeddings for all extracted functions
125
+ // (embedding generation runs eagerly here for every extracted function when semantic_enabled is true)
126
+ if (this.config.semantic_enabled && functions.length > 0) {
127
+ const allIndices = functions.map((_, i) => i);
128
+ await this.enrichWithEmbeddings(functions, allIndices);
129
+ }
58
130
  const duplicateGroups = this.findDuplicateGroups(functions);
131
+ // Build approved pairs set for fast lookup
132
+ const approvedSet = new Set((this.config.approved_duplications || []).map(s => s.toLowerCase()));
59
133
  for (const group of duplicateGroups) {
134
+ // Check if this pair is human-approved
135
+ const names = group.map(f => f.name).sort();
136
+ const pairKey = names.join(':').toLowerCase();
137
+ if (approvedSet.has(pairKey))
138
+ continue;
60
139
  const files = group.map(f => f.file);
61
140
  const locations = group.map(f => `${f.file}:${f.line} (${f.name})`).join(', ');
62
- failures.push(this.createFailure(`AI Duplication Drift: Function '${group[0].name}' has ${group.length} near-identical copies across files`, [...new Set(files)], `Found duplicate implementations at: ${locations}. Extract to a shared module and import.`, 'Duplication Drift', group[0].line, undefined, 'high'));
141
+ // Determine similarity % and method used
142
+ let similarity;
143
+ let method;
144
+ if (group[0].bodyHash === group[1]?.bodyHash) {
145
+ similarity = 1.0;
146
+ method = 'exact-hash';
147
+ }
148
+ else if (group[0].embedding && group[1]?.embedding) {
149
+ const jaccardSim = this.jaccardSimilarity(group[0].astTokens, group[1].astTokens);
150
+ const cosineSim = cosineSimilarity(group[0].embedding, group[1].embedding);
151
+ if (cosineSim > jaccardSim) {
152
+ similarity = cosineSim;
153
+ method = 'semantic-embedding';
154
+ }
155
+ else {
156
+ similarity = jaccardSim;
157
+ method = 'ast-jaccard';
158
+ }
159
+ }
160
+ else {
161
+ similarity = group.length > 1
162
+ ? this.jaccardSimilarity(group[0].astTokens, group[1].astTokens)
163
+ : 1.0;
164
+ method = 'ast-jaccard';
165
+ }
166
+ const pct = (similarity * 100).toFixed(0);
167
+ failures.push(this.createFailure(`AI Duplication Drift: Function '${group[0].name}' has ${group.length} near-identical copies (${pct}% similar via ${method})`, [...new Set(files)], `Found duplicate implementations at: ${locations}. Extract to a shared module and import.`, 'Duplication Drift', group[0].line, undefined, 'high'));
63
168
  }
64
169
  return failures;
65
170
  }
171
+ // ─── tree-sitter AST Tokenization ───────────────────────────────
172
+ /**
173
+ * Parse the file with tree-sitter, find function nodes that match
174
+ * our extracted functions (by line number), and replace their token
175
+ * multisets with AST node type sequences.
176
+ *
177
+ * AST node types are language-agnostic structural tokens:
178
+ * - if_statement, for_statement, return_statement
179
+ * - call_expression, member_expression, binary_expression
180
+ * - arrow_function, function_declaration
181
+ *
182
+ * Variable names, string literals, comments — all invisible.
183
+ * Only STRUCTURE matters.
184
+ */
185
+ async enrichWithASTTokens(content, ext, file, functions) {
186
+ if (!this.parser)
187
+ return;
188
+ const grammarRelPath = GRAMMAR_PATHS[ext];
189
+ if (!grammarRelPath)
190
+ return;
191
+ try {
192
+ // Load language (cached)
193
+ if (!languageCache.has(ext)) {
194
+ const grammarPath = path.resolve(__dirname, grammarRelPath);
195
+ const lang = await Parser.Language.load(grammarPath);
196
+ languageCache.set(ext, lang);
197
+ }
198
+ const lang = languageCache.get(ext);
199
+ this.parser.setLanguage(lang);
200
+ const tree = this.parser.parse(content);
201
+ // Find functions that belong to this file
202
+ const fileFunctions = functions.filter(f => f.file === file);
203
+ for (const fn of fileFunctions) {
204
+ // Find the AST node at this function's line
205
+ const node = this.findFunctionNodeAtLine(tree.rootNode, fn.line);
206
+ if (node) {
207
+ fn.astTokens = this.collectASTNodeTypes(node);
208
+ }
209
+ }
210
+ }
211
+ catch (e) {
212
+ // tree-sitter parse failed for this file — keep text tokens
213
+ Logger.debug(`tree-sitter parse failed for ${file}: ${e}`);
214
+ }
215
+ }
216
+ /**
217
+ * Walk the AST tree to find a function/method node at a given line.
218
+ */
219
+ findFunctionNodeAtLine(rootNode, targetLine) {
220
+ const functionTypes = new Set([
221
+ 'function_declaration', 'method_definition', 'arrow_function',
222
+ 'function_definition', // Python
223
+ 'function_item', // Rust
224
+ 'method_declaration', // Java/C#
225
+ 'lexical_declaration', // const x = () => {}
226
+ ]);
227
+ let bestMatch = null;
228
+ const walk = (node) => {
229
+ // tree-sitter lines are 0-indexed, our lines are 1-indexed
230
+ if (functionTypes.has(node.type) && node.startPosition.row + 1 === targetLine) {
231
+ bestMatch = node;
232
+ return;
233
+ }
234
+ for (let i = 0; i < node.childCount; i++) {
235
+ walk(node.child(i));
236
+ if (bestMatch)
237
+ return;
238
+ }
239
+ };
240
+ walk(rootNode);
241
+ return bestMatch;
242
+ }
243
+ /**
244
+ * Walk an AST subtree and collect node types as a multiset.
245
+ *
246
+ * This is the core insight: two functions with different variable names
247
+ * but the same control flow produce the same node type multiset.
248
+ *
249
+ * Example:
250
+ * `function a(x) { if (x > 0) return x * 2; return 0; }`
251
+ * `function b(val) { if (val > 0) return val * 2; return 0; }`
252
+ *
253
+ * Both produce: {if_statement: 1, binary_expression: 2, return_statement: 2, ...}
254
+ * → Jaccard similarity = 1.0
255
+ */
256
+ collectASTNodeTypes(node) {
257
+ const types = new Map();
258
+ const walk = (n) => {
259
+ // Skip leaf nodes that are just identifiers/literals (noise)
260
+ // Keep structural node types only
261
+ if (n.childCount > 0 || isStructuralLeaf(n.type)) {
262
+ types.set(n.type, (types.get(n.type) || 0) + 1);
263
+ }
264
+ for (let i = 0; i < n.childCount; i++) {
265
+ walk(n.child(i));
266
+ }
267
+ };
268
+ walk(node);
269
+ return types;
270
+ }
271
+ // ─── Fallback Text Tokenization ─────────────────────────────────
272
+ /**
273
+ * Fallback tokenizer when tree-sitter is not available.
274
+ * Uses normalized text → keyword/operator multiset.
275
+ */
276
+ textTokenize(normalized) {
277
+ const tokens = new Map();
278
+ const structural = normalized.match(/\b(if|else|for|while|return|const|let|var|function|class|import|export|async|await|try|catch|throw|new|switch|case|break|continue|yield|def|self)\b|[{}()\[\];,.:=<>!&|+\-*/%?]+/g) || [];
279
+ for (const token of structural) {
280
+ tokens.set(token, (tokens.get(token) || 0) + 1);
281
+ }
282
+ // Normalize all identifiers to a count (variable names don't matter)
283
+ const keywords = new Set([
284
+ 'if', 'else', 'for', 'while', 'return', 'const', 'let', 'var',
285
+ 'function', 'class', 'import', 'export', 'async', 'await',
286
+ 'try', 'catch', 'throw', 'new', 'switch', 'case', 'break',
287
+ 'continue', 'yield', 'def', 'self', 'true', 'false', 'null', 'undefined',
288
+ ]);
289
+ const identifiers = normalized.match(/\b[a-zA-Z_]\w*\b/g) || [];
290
+ let idCount = 0;
291
+ for (const id of identifiers) {
292
+ if (!keywords.has(id))
293
+ idCount++;
294
+ }
295
+ if (idCount > 0)
296
+ tokens.set('_ID_', idCount);
297
+ return tokens;
298
+ }
299
+ // ─── Jaccard Similarity ─────────────────────────────────────────
300
+ /**
301
+ * Jaccard similarity on multisets.
302
+ * intersection = sum of min(countA, countB) for each key
303
+ * union = sum of max(countA, countB) for each key
304
+ */
305
+ jaccardSimilarity(a, b) {
306
+ const allKeys = new Set([...a.keys(), ...b.keys()]);
307
+ let intersection = 0;
308
+ let union = 0;
309
+ for (const key of allKeys) {
310
+ const countA = a.get(key) || 0;
311
+ const countB = b.get(key) || 0;
312
+ intersection += Math.min(countA, countB);
313
+ union += Math.max(countA, countB);
314
+ }
315
+ return union === 0 ? 0 : intersection / union;
316
+ }
317
+ // ─── Function Extraction ────────────────────────────────────────
66
318
  extractJSFunctions(content, file, functions) {
67
319
  const lines = content.split('\n');
68
- // Match function declarations, arrow functions, and method definitions
69
320
  const patterns = [
70
- // function name(...) {
71
321
  /^(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\(([^)]*)\)/,
72
- // const name = (...) => {
73
322
  /^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|(\w+))\s*=>/,
74
- // name(...) { — class method
75
323
  /^\s+(?:async\s+)?(\w+)\s*\(([^)]*)\)\s*\{/,
76
324
  ];
77
325
  for (let i = 0; i < lines.length; i++) {
@@ -92,6 +340,8 @@ export class DuplicationDriftGate extends Gate {
92
340
  bodyHash: this.hash(normalized),
93
341
  bodyLength: body.length,
94
342
  normalized,
343
+ // Start with text tokens, enrichWithASTTokens() upgrades if tree-sitter available
344
+ astTokens: this.textTokenize(normalized),
95
345
  });
96
346
  }
97
347
  break;
@@ -107,7 +357,6 @@ export class DuplicationDriftGate extends Gate {
107
357
  const name = match[1];
108
358
  const params = match[2] || '';
109
359
  const indent = lines[i].match(/^(\s*)/)?.[1]?.length || 0;
110
- // Extract body by indentation
111
360
  const body = [];
112
361
  for (let j = i + 1; j < lines.length; j++) {
113
362
  const lineIndent = lines[j].match(/^(\s*)/)?.[1]?.length || 0;
@@ -128,6 +377,7 @@ export class DuplicationDriftGate extends Gate {
128
377
  bodyHash: this.hash(normalized),
129
378
  bodyLength: body.length,
130
379
  normalized,
380
+ astTokens: this.textTokenize(normalized),
131
381
  });
132
382
  }
133
383
  }
@@ -156,36 +406,180 @@ export class DuplicationDriftGate extends Gate {
156
406
  }
157
407
  normalizeBody(body) {
158
408
  return body
159
- .replace(/\/\/.*/g, '') // strip single-line comments
160
- .replace(/\/\*[\s\S]*?\*\//g, '') // strip multi-line comments
161
- .replace(/#.*/g, '') // strip Python comments
162
- .replace(/`[^`]*`/g, '"STR"') // normalize template literals to placeholder
163
- .replace(/\basync\s+/g, '') // normalize async modifier
164
- .replace(/\s+/g, ' ') // collapse whitespace
165
- .replace(/['"]/g, '"') // normalize single/double quotes (NOT backticks)
409
+ .replace(/\/\/.*/g, '')
410
+ .replace(/\/\*[\s\S]*?\*\//g, '')
411
+ .replace(/#.*/g, '')
412
+ .replace(/`[^`]*`/g, '"STR"')
413
+ .replace(/\basync\s+/g, '')
414
+ .replace(/\s+/g, ' ')
415
+ .replace(/['"]/g, '"')
166
416
  .trim();
167
417
  }
168
418
  hash(text) {
169
419
  return crypto.createHash('md5').update(text).digest('hex');
170
420
  }
421
+ // ─── Semantic Embedding ─────────────────────────────────────────
422
+ /**
423
+ * Generate semantic embedding text for a function.
424
+ * Combines function name, parameter names, and first 200 tokens of body.
425
+ * This captures INTENT regardless of implementation differences.
426
+ *
427
+ * Example:
428
+ * getUserById(id) { return db.users.find(x => x.id === id) }
429
+ * → "getUserById return db users find x x id id"
430
+ *
431
+ * fetchUserRecord(userId) { return database.users.filter(u => u.id === userId)[0] }
432
+ * → "fetchUserRecord return database users filter u u id userId"
433
+ *
434
+ * These produce similar embeddings (~0.91 cosine) despite different AST.
435
+ */
436
+ buildEmbeddingText(fn) {
437
+ // Extract identifiers from normalized body (first 200 tokens)
438
+ const bodyTokens = fn.normalized.match(/\b[a-zA-Z_]\w*\b/g) || [];
439
+ const first200 = bodyTokens.slice(0, 200).join(' ');
440
+ return `${fn.name} ${first200}`;
441
+ }
442
+ /**
443
+ * Enrich functions with semantic embeddings for Pass 3.
444
+ * Called from run() for every extracted function before duplicate grouping begins.
445
+ * Uses generateEmbedding() from pattern-index/embeddings.ts.
446
+ */
447
+ async enrichWithEmbeddings(functions, indices) {
448
+ Logger.info(`Semantic Pass 3: Generating embeddings for ${indices.length} functions`);
449
+ for (const idx of indices) {
450
+ const fn = functions[idx];
451
+ try {
452
+ const text = this.buildEmbeddingText(fn);
453
+ fn.embedding = await generateEmbedding(text);
454
+ }
455
+ catch {
456
+ // Embedding failed — skip this function for Pass 3
457
+ Logger.debug(`Embedding generation failed for ${fn.file}:${fn.name}`);
458
+ }
459
+ }
460
+ }
461
+ // ─── Duplicate Finding (three-pass) ──────────────────────────────
462
+ /**
463
+ * Three-pass duplicate detection:
464
+ * Pass 1 (fast): MD5 hash → exact duplicates (O(n))
465
+ * Pass 2 (Jaccard): AST node multiset similarity → near-duplicates (O(n²) bounded)
466
+ * Pass 3 (semantic): Embedding cosine similarity → semantic duplicates (O(n²) bounded)
467
+ *
468
+ * Pass 3 catches what AST Jaccard misses: same intent, different implementation.
469
+ * Example: .find() vs .filter()[0] — different AST nodes, same semantic meaning.
470
+ */
171
471
  findDuplicateGroups(functions) {
172
- const groups = new Map();
173
- // Group by body hash (exact duplicates across files)
174
- for (const fn of functions) {
175
- const existing = groups.get(fn.bodyHash) || [];
176
- existing.push(fn);
177
- groups.set(fn.bodyHash, existing);
178
- }
179
- // Filter: only groups with functions from DIFFERENT files, 2+ members
180
472
  const duplicates = [];
181
- for (const group of groups.values()) {
182
- if (group.length < 2)
473
+ const claimedIndices = new Set();
474
+ // Pass 1: Exact hash match
475
+ const hashGroups = new Map();
476
+ for (let i = 0; i < functions.length; i++) {
477
+ const existing = hashGroups.get(functions[i].bodyHash) || [];
478
+ existing.push(i);
479
+ hashGroups.set(functions[i].bodyHash, existing);
480
+ }
481
+ for (const indices of hashGroups.values()) {
482
+ if (indices.length < 2)
183
483
  continue;
484
+ const group = indices.map(i => functions[i]);
184
485
  const uniqueFiles = new Set(group.map(f => f.file));
185
486
  if (uniqueFiles.size >= 2) {
186
487
  duplicates.push(group);
488
+ indices.forEach(i => claimedIndices.add(i));
489
+ }
490
+ }
491
+ // Pass 2: Jaccard on AST tokens for remaining functions
492
+ const remaining = functions
493
+ .map((fn, i) => ({ fn, idx: i }))
494
+ .filter(({ idx }) => !claimedIndices.has(idx));
495
+ remaining.sort((a, b) => a.fn.bodyLength - b.fn.bodyLength);
496
+ const jaccardClaimed = new Set();
497
+ for (let i = 0; i < remaining.length; i++) {
498
+ if (jaccardClaimed.has(remaining[i].idx))
499
+ continue;
500
+ const group = [remaining[i].fn];
501
+ const baseLen = remaining[i].fn.bodyLength;
502
+ for (let j = i + 1; j < remaining.length; j++) {
503
+ if (jaccardClaimed.has(remaining[j].idx))
504
+ continue;
505
+ if (remaining[j].fn.bodyLength > baseLen * 1.5)
506
+ break;
507
+ if (remaining[j].fn.file === remaining[i].fn.file)
508
+ continue;
509
+ const sim = this.jaccardSimilarity(remaining[i].fn.astTokens, remaining[j].fn.astTokens);
510
+ if (sim >= this.config.similarity_threshold) {
511
+ group.push(remaining[j].fn);
512
+ jaccardClaimed.add(remaining[j].idx);
513
+ }
514
+ }
515
+ if (group.length >= 2) {
516
+ const uniqueFiles = new Set(group.map(f => f.file));
517
+ if (uniqueFiles.size >= 2) {
518
+ duplicates.push(group);
519
+ jaccardClaimed.add(remaining[i].idx);
520
+ }
521
+ }
522
+ }
523
+ // Mark all Pass 1 + Pass 2 claimed indices
524
+ for (const idx of jaccardClaimed)
525
+ claimedIndices.add(idx);
526
+ // Pass 3: Semantic embedding cosine similarity for still-unclaimed functions
527
+ if (this.config.semantic_enabled) {
528
+ const semanticRemaining = functions
529
+ .map((fn, i) => ({ fn, idx: i }))
530
+ .filter(({ idx }) => !claimedIndices.has(idx))
531
+ .filter(({ fn }) => fn.embedding && fn.embedding.length > 0);
532
+ semanticRemaining.sort((a, b) => a.fn.bodyLength - b.fn.bodyLength);
533
+ const semanticClaimed = new Set();
534
+ for (let i = 0; i < semanticRemaining.length; i++) {
535
+ if (semanticClaimed.has(semanticRemaining[i].idx))
536
+ continue;
537
+ const group = [semanticRemaining[i].fn];
538
+ const baseLen = semanticRemaining[i].fn.bodyLength;
539
+ for (let j = i + 1; j < semanticRemaining.length; j++) {
540
+ if (semanticClaimed.has(semanticRemaining[j].idx))
541
+ continue;
542
+ // Body length must be within 2x range (semantic allows more variance)
543
+ if (semanticRemaining[j].fn.bodyLength > baseLen * 2.0)
544
+ break;
545
+ if (semanticRemaining[j].fn.file === semanticRemaining[i].fn.file)
546
+ continue;
547
+ const sim = cosineSimilarity(semanticRemaining[i].fn.embedding, semanticRemaining[j].fn.embedding);
548
+ if (sim >= this.config.semantic_threshold) {
549
+ group.push(semanticRemaining[j].fn);
550
+ semanticClaimed.add(semanticRemaining[j].idx);
551
+ }
552
+ }
553
+ if (group.length >= 2) {
554
+ const uniqueFiles = new Set(group.map(f => f.file));
555
+ if (uniqueFiles.size >= 2) {
556
+ duplicates.push(group);
557
+ semanticClaimed.add(semanticRemaining[i].idx);
558
+ }
559
+ }
560
+ }
561
+ if (semanticClaimed.size > 0) {
562
+ Logger.info(`Semantic Pass 3: Found ${semanticClaimed.size} additional semantic duplicates`);
187
563
  }
188
564
  }
189
565
  return duplicates;
190
566
  }
191
567
  }
568
+ /**
569
+ * AST node types that are structural even as leaf nodes.
570
+ * These carry semantic meaning without children.
571
+ */
572
+ function isStructuralLeaf(type) {
573
+ const structural = new Set([
574
+ 'return', 'break', 'continue', 'yield', 'throw',
575
+ 'true', 'false', 'null', 'undefined', 'none',
576
+ 'self', 'this', 'super',
577
+ 'string', 'number', 'template_string',
578
+ // Operators
579
+ '=', '==', '===', '!=', '!==', '<', '>', '<=', '>=',
580
+ '+', '-', '*', '/', '%', '**',
581
+ '&&', '||', '!', '??',
582
+ '=>', '...', '?', ':',
583
+ ]);
584
+ return structural.has(type);
585
+ }
@@ -0,0 +1,70 @@
1
+ /**
2
+ * Logic Drift Foundation Gate
3
+ *
4
+ * Detects when AI subtly changes business logic in functions:
5
+ * - Comparison operator mutations: >= became > (off-by-one)
6
+ * - Return statement additions/removals
7
+ * - Branch count changes (new if/else added or removed)
8
+ * - Call sequence changes (function calls reordered)
9
+ *
10
+ * This is the HARDEST drift to catch because:
11
+ * - Code still compiles
12
+ * - Tests might still pass (if they don't cover edge cases)
13
+ * - The change looks intentional ("AI refactored the function")
14
+ *
15
+ * Strategy: Collect baselines for critical functions, then detect
16
+ * mutations between scans. This foundation enables future LLM-powered
17
+ * deeper analysis (feeding baselines into DriftBench training).
18
+ *
19
+ * @since v5.1.0
20
+ */
21
+ import { Gate, GateContext } from './base.js';
22
+ import { Failure, Provenance } from '../types/index.js';
23
+ export interface LogicDriftConfig {
24
+ enabled?: boolean;
25
+ baseline_path?: string;
26
+ track_operators?: boolean;
27
+ track_branches?: boolean;
28
+ track_returns?: boolean;
29
+ }
30
+ export declare class LogicDriftGate extends Gate {
31
+ private config;
32
+ constructor(config?: LogicDriftConfig);
33
+ protected get provenance(): Provenance;
34
+ run(context: GateContext): Promise<Failure[]>;
35
+ private extractFunctionBaselines;
36
+ private extractBody;
37
+ /**
38
+ * Extract all comparison operators from function body in order.
39
+ * These are the most critical mutations: >= to > causes off-by-one.
40
+ */
41
+ private extractComparisonOps;
42
+ private countBranches;
43
+ private countReturns;
44
+ /**
45
+ * Extract ordered sequence of function calls.
46
+ * Useful for detecting when AI reorders operations.
47
+ */
48
+ private extractCallSequence;
49
+ /**
50
+ * Detect specific operator mutations between two ordered operator lists.
51
+ * Only reports CHANGED operators, not added/removed ones (those are
52
+ * covered by branch count changes).
53
+ *
54
+ * Example:
55
+ * prev: ['>=', '===', '!==']
56
+ * curr: ['>', '===', '!==']
57
+ * → [{from: '>=', to: '>'}]
58
+ */
59
+ private detectOperatorMutations;
60
+ /**
61
+ * Classify whether an operator change is "dangerous" (likely unintentional).
62
+ *
63
+ * Dangerous mutations:
64
+ * - >= to > (boundary change, off-by-one)
65
+ * - <= to < (boundary change)
66
+ * - === to == (type coercion change)
67
+ * - !== to != (type coercion change)
68
+ */
69
+ private isDangerousMutation;
70
+ }