npm - @rigour-labs/core - Versions diffs - 4.3.6 → 5.0.0 - Mend

@rigour-labs/core 4.3.6 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

package/README.md +46 -10
package/dist/gates/base.d.ts +3 -0
package/dist/gates/checkpoint.d.ts +23 -8
package/dist/gates/checkpoint.js +109 -45
package/dist/gates/checkpoint.test.js +6 -3
package/dist/gates/dependency.d.ts +39 -0
package/dist/gates/dependency.js +212 -5
package/dist/gates/duplication-drift.d.ts +101 -6
package/dist/gates/duplication-drift.js +427 -33
package/dist/gates/logic-drift.d.ts +70 -0
package/dist/gates/logic-drift.js +280 -0
package/dist/gates/runner.js +29 -1
package/dist/gates/style-drift.d.ts +53 -0
package/dist/gates/style-drift.js +305 -0
package/dist/index.d.ts +4 -0
package/dist/index.js +4 -0
package/dist/services/adaptive-thresholds.d.ts +54 -10
package/dist/services/adaptive-thresholds.js +161 -35
package/dist/services/adaptive-thresholds.test.js +24 -20
package/dist/services/filesystem-cache.d.ts +50 -0
package/dist/services/filesystem-cache.js +124 -0
package/dist/services/temporal-drift.d.ts +101 -0
package/dist/services/temporal-drift.js +386 -0
package/dist/templates/universal-config.js +17 -0
package/dist/types/index.d.ts +196 -0
package/dist/types/index.js +19 -0
package/dist/utils/scanner.d.ts +6 -1
package/dist/utils/scanner.js +8 -1
package/package.json +6 -6

package/dist/gates/duplication-drift.js CHANGED Viewed

@@ -1,45 +1,108 @@
 /**
- * Duplication Drift Gate
+ * Duplication Drift Gate (v2)
  *
  * Detects when AI generates near-identical functions across files because
  * it doesn't remember what it already wrote. This is an AI-specific failure
  * mode — humans reuse via copy-paste (same file), AI re-invents (cross-file).
  *
- * Detection strategy:
- * 1. Extract all function bodies (normalized: strip whitespace, comments)
- * 2. Compare function signatures + body hashes across files
- * 3. Flag functions with >80% similarity in different files
+ * v2 upgrades:
+ * - tree-sitter AST node type sequences replace hand-rolled regex tokenizer
+ * - Jaccard similarity on AST node multisets (structural, not textual)
+ * - Catches duplicates even when every variable name is different
+ * - MD5 kept as fast-path for exact matches, Jaccard runs on remaining pairs
  *
- * @since v2.16.0
+ * Detection strategy (three preparation steps, three comparison passes, then reporting):
+ * 1. Extract function bodies, normalize text (strip comments/whitespace)
+ * 2. Parse with tree-sitter → walk AST → collect node type multiset
+ * 3. Generate semantic embeddings via all-MiniLM-L6-v2 (384D)
+ * 4. Pass 1 (fast):     MD5 hash → exact duplicates (O(n), <10ms)
+ * 5. Pass 2 (Jaccard):  AST node multiset similarity → structural near-duplicates (O(n²) bounded)
+ * 6. Pass 3 (semantic):  Embedding cosine similarity → semantic duplicates (O(n²) bounded)
+ * 7. Flag functions with similarity > threshold in different files
+ *
+ * Why AST node types > raw tokens:
+ * - `getUserById(id) { return db.find(x => x.id === id) }`
+ * - `fetchUser(userId) { return database.filter(u => u.id === userId)[0] }`
+ * Both produce similar AST: [return_statement, call_expression, arrow_function,
+ *   binary_expression, member_expression]. Variable names are invisible.
+ *
+ * @since v2.16.0 (original MD5)
+ * @since v5.0.0  (tree-sitter AST + Jaccard)
+ * @since v5.1.0  (semantic embedding Pass 3)
  */
 import { Gate } from './base.js';
 import { FileScanner } from '../utils/scanner.js';
 import { Logger } from '../utils/logger.js';
+import { generateEmbedding, cosineSimilarity } from '../pattern-index/embeddings.js';
 import crypto from 'crypto';
 import path from 'path';
+import { fileURLToPath } from 'url';
+// tree-sitter is optional — graceful fallback to text tokenization
+let Parser = null; // web-tree-sitter export; assigned by initTreeSitter() after the dynamic import
+let treeSitterReady = false; // set once Parser.init() has resolved; later initTreeSitter() calls return true at once
+let treeSitterFailed = false; // set when import/init threw; later initTreeSitter() calls return false without retrying
+const __dirname = path.dirname(fileURLToPath(import.meta.url)); // directory of this module; base for resolving GRAMMAR_PATHS entries
+async function initTreeSitter() {
+    if (treeSitterReady)
+        return true;
+    if (treeSitterFailed)
+        return false;
+    try {
+        const mod = await import('web-tree-sitter');
+        Parser = mod.default || mod;
+        await Parser.init();
+        treeSitterReady = true;
+        return true;
+    }
+    catch {
+        treeSitterFailed = true;
+        Logger.debug('tree-sitter not available, falling back to text tokenization');
+        return false;
+    }
+}
+const GRAMMAR_PATHS = { // file extension → grammar .wasm path; resolved against __dirname when the grammar is first loaded
+    '.ts': '../../vendor/grammars/tree-sitter-typescript.wasm',
+    '.tsx': '../../vendor/grammars/tree-sitter-tsx.wasm',
+    '.js': '../../vendor/grammars/tree-sitter-javascript.wasm',
+    '.jsx': '../../vendor/grammars/tree-sitter-javascript.wasm', // .jsx shares the JavaScript grammar file
+    '.py': '../../vendor/grammars/tree-sitter-python.wasm',
+    '.go': '../../vendor/grammars/tree-sitter-go.wasm',
+    '.rs': '../../vendor/grammars/tree-sitter-rust.wasm',
+};
+// Cache loaded languages
+const languageCache = new Map(); // keyed by file extension; filled on first use in enrichWithASTTokens()
 export class DuplicationDriftGate extends Gate {
     config; // resolved gate settings; defaults are applied in the constructor below
+    parser = null; // tree-sitter parser instance; created in run() once initTreeSitter() succeeds
     constructor(config = {}) {
         super('duplication-drift', 'AI Duplication Drift Detection');
         this.config = {
             enabled: config.enabled ?? true,
-            similarity_threshold: config.similarity_threshold ?? 0.8,
+            similarity_threshold: config.similarity_threshold ?? 0.75, // minimum Jaccard score for two functions to be grouped in Pass 2
+            semantic_threshold: config.semantic_threshold ?? 0.85, // minimum embedding cosine similarity for Pass 3 grouping
+            semantic_enabled: config.semantic_enabled ?? true, // gates both embedding generation in run() and Pass 3
             min_body_lines: config.min_body_lines ?? 5, // presumably the minimum body size for extraction — usage is outside the visible hunks, verify
+            approved_duplications: config.approved_duplications ?? [], // entries are lower-cased and compared with a group's sorted function names joined by ':'
         };
     }
     get provenance() { return 'ai-drift'; } // constant provenance label returned for this gate
     async run(context) {
         if (!this.config.enabled)
             return [];
+        // Try to init tree-sitter (non-blocking, falls back gracefully)
+        const hasTreeSitter = await initTreeSitter();
+        if (hasTreeSitter && !this.parser) {
+            this.parser = new Parser();
+        }
         const failures = [];
         const functions = [];
-        const scanPatterns = context.patterns || ['**/*.{ts,js,tsx,jsx,py}'];
+        const scanPatterns = context.patterns || ['**/*.{ts,js,tsx,jsx,py,go,rs}'];
         const files = await FileScanner.findFiles({
             cwd: context.cwd,
             patterns: scanPatterns,
             ignore: [...(context.ignore || []), '**/node_modules/**', '**/dist/**', '**/*.test.*', '**/*.spec.*'],
         });
-        Logger.info(`Duplication Drift: Scanning ${files.length} files`);
+        Logger.info(`Duplication Drift: Scanning ${files.length} files (tree-sitter: ${hasTreeSitter ? 'ON' : 'fallback'})`);
         for (const file of files) {
             try {
                 const { readFile } = await import('fs-extra');
@@ -51,27 +114,212 @@ export class DuplicationDriftGate extends Gate {
                 else if (ext === '.py') {
                     this.extractPyFunctions(content, file, functions);
                 }
+                // Generate AST tokens using tree-sitter if available
+                if (hasTreeSitter && GRAMMAR_PATHS[ext]) {
+                    await this.enrichWithASTTokens(content, ext, file, functions);
+                }
             }
             catch (e) { }
         }
-        // Compare all function pairs across different files
+        // Pass 3 prep: Generate semantic embeddings for all extracted functions
+        // (embedding generation is lazy — only runs when semantic_enabled is true)
+        if (this.config.semantic_enabled && functions.length > 0) {
+            const allIndices = functions.map((_, i) => i);
+            await this.enrichWithEmbeddings(functions, allIndices);
+        }
         const duplicateGroups = this.findDuplicateGroups(functions);
+        // Build approved pairs set for fast lookup
+        const approvedSet = new Set((this.config.approved_duplications || []).map(s => s.toLowerCase()));
         for (const group of duplicateGroups) {
+            // Check if this pair is human-approved
+            const names = group.map(f => f.name).sort();
+            const pairKey = names.join(':').toLowerCase();
+            if (approvedSet.has(pairKey))
+                continue;
             const files = group.map(f => f.file);
             const locations = group.map(f => `${f.file}:${f.line} (${f.name})`).join(', ');
-            failures.push(this.createFailure(`AI Duplication Drift: Function '${group[0].name}' has ${group.length} near-identical copies across files`, [...new Set(files)], `Found duplicate implementations at: ${locations}. Extract to a shared module and import.`, 'Duplication Drift', group[0].line, undefined, 'high'));
+            // Determine similarity % and method used
+            let similarity;
+            let method;
+            if (group[0].bodyHash === group[1]?.bodyHash) {
+                similarity = 1.0;
+                method = 'exact-hash';
+            }
+            else if (group[0].embedding && group[1]?.embedding) {
+                const jaccardSim = this.jaccardSimilarity(group[0].astTokens, group[1].astTokens);
+                const cosineSim = cosineSimilarity(group[0].embedding, group[1].embedding);
+                if (cosineSim > jaccardSim) {
+                    similarity = cosineSim;
+                    method = 'semantic-embedding';
+                }
+                else {
+                    similarity = jaccardSim;
+                    method = 'ast-jaccard';
+                }
+            }
+            else {
+                similarity = group.length > 1
+                    ? this.jaccardSimilarity(group[0].astTokens, group[1].astTokens)
+                    : 1.0;
+                method = 'ast-jaccard';
+            }
+            const pct = (similarity * 100).toFixed(0);
+            failures.push(this.createFailure(`AI Duplication Drift: Function '${group[0].name}' has ${group.length} near-identical copies (${pct}% similar via ${method})`, [...new Set(files)], `Found duplicate implementations at: ${locations}. Extract to a shared module and import.`, 'Duplication Drift', group[0].line, undefined, 'high'));
         }
         return failures;
     }
+    // ─── tree-sitter AST Tokenization ───────────────────────────────
+    /**
+     * Parse the file with tree-sitter, find function nodes that match
+     * our extracted functions (by line number), and replace their token
+     * multisets with AST node type sequences.
+     *
+     * AST node types are language-agnostic structural tokens:
+     * - if_statement, for_statement, return_statement
+     * - call_expression, member_expression, binary_expression
+     * - arrow_function, function_declaration
+     *
+     * Variable names, string literals, comments — all invisible.
+     * Only STRUCTURE matters.
+     */
+    async enrichWithASTTokens(content, ext, file, functions) {
+        if (!this.parser)
+            return;
+        const grammarRelPath = GRAMMAR_PATHS[ext];
+        if (!grammarRelPath)
+            return;
+        try {
+            // Load language (cached)
+            if (!languageCache.has(ext)) {
+                const grammarPath = path.resolve(__dirname, grammarRelPath);
+                const lang = await Parser.Language.load(grammarPath);
+                languageCache.set(ext, lang);
+            }
+            const lang = languageCache.get(ext);
+            this.parser.setLanguage(lang);
+            const tree = this.parser.parse(content);
+            // Find functions that belong to this file
+            const fileFunctions = functions.filter(f => f.file === file);
+            for (const fn of fileFunctions) {
+                // Find the AST node at this function's line
+                const node = this.findFunctionNodeAtLine(tree.rootNode, fn.line);
+                if (node) {
+                    fn.astTokens = this.collectASTNodeTypes(node);
+                }
+            }
+        }
+        catch (e) {
+            // tree-sitter parse failed for this file — keep text tokens
+            Logger.debug(`tree-sitter parse failed for ${file}: ${e}`);
+        }
+    }
+    /**
+     * Walk the AST tree to find a function/method node at a given line.
+     */
+    findFunctionNodeAtLine(rootNode, targetLine) {
+        const functionTypes = new Set([
+            'function_declaration', 'method_definition', 'arrow_function',
+            'function_definition', // Python
+            'function_item', // Rust
+            'method_declaration', // Java/C#
+            'lexical_declaration', // const x = () => {}
+        ]);
+        let bestMatch = null;
+        const walk = (node) => {
+            // tree-sitter lines are 0-indexed, our lines are 1-indexed
+            if (functionTypes.has(node.type) && node.startPosition.row + 1 === targetLine) {
+                bestMatch = node;
+                return;
+            }
+            for (let i = 0; i < node.childCount; i++) {
+                walk(node.child(i));
+                if (bestMatch)
+                    return;
+            }
+        };
+        walk(rootNode);
+        return bestMatch;
+    }
+    /**
+     * Walk an AST subtree and collect node types as a multiset.
+     *
+     * This is the core insight: two functions with different variable names
+     * but the same control flow produce the same node type multiset.
+     *
+     * Example:
+     * `function a(x) { if (x > 0) return x * 2; return 0; }`
+     * `function b(val) { if (val > 0) return val * 2; return 0; }`
+     *
+     * Both produce: {if_statement: 1, binary_expression: 2, return_statement: 2, ...}
+     * → Jaccard similarity = 1.0
+     */
+    collectASTNodeTypes(node) {
+        const types = new Map();
+        const walk = (n) => {
+            // Skip leaf nodes that are just identifiers/literals (noise)
+            // Keep structural node types only
+            if (n.childCount > 0 || isStructuralLeaf(n.type)) {
+                types.set(n.type, (types.get(n.type) || 0) + 1);
+            }
+            for (let i = 0; i < n.childCount; i++) {
+                walk(n.child(i));
+            }
+        };
+        walk(node);
+        return types;
+    }
+    // ─── Fallback Text Tokenization ─────────────────────────────────
+    /**
+     * Fallback tokenizer when tree-sitter is not available.
+     * Uses normalized text → keyword/operator multiset.
+     */
+    textTokenize(normalized) {
+        const tokens = new Map();
+        const structural = normalized.match(/\b(if|else|for|while|return|const|let|var|function|class|import|export|async|await|try|catch|throw|new|switch|case|break|continue|yield|def|self)\b|[{}()\[\];,.:=<>!&|+\-*/%?]+/g) || [];
+        for (const token of structural) {
+            tokens.set(token, (tokens.get(token) || 0) + 1);
+        }
+        // Normalize all identifiers to a count (variable names don't matter)
+        const keywords = new Set([
+            'if', 'else', 'for', 'while', 'return', 'const', 'let', 'var',
+            'function', 'class', 'import', 'export', 'async', 'await',
+            'try', 'catch', 'throw', 'new', 'switch', 'case', 'break',
+            'continue', 'yield', 'def', 'self', 'true', 'false', 'null', 'undefined',
+        ]);
+        const identifiers = normalized.match(/\b[a-zA-Z_]\w*\b/g) || [];
+        let idCount = 0;
+        for (const id of identifiers) {
+            if (!keywords.has(id))
+                idCount++;
+        }
+        if (idCount > 0)
+            tokens.set('_ID_', idCount);
+        return tokens;
+    }
+    // ─── Jaccard Similarity ─────────────────────────────────────────
+    /**
+     * Jaccard similarity on multisets.
+     * intersection = sum of min(countA, countB) for each key
+     * union = sum of max(countA, countB) for each key
+     */
+    jaccardSimilarity(a, b) {
+        const allKeys = new Set([...a.keys(), ...b.keys()]);
+        let intersection = 0;
+        let union = 0;
+        for (const key of allKeys) {
+            const countA = a.get(key) || 0;
+            const countB = b.get(key) || 0;
+            intersection += Math.min(countA, countB);
+            union += Math.max(countA, countB);
+        }
+        return union === 0 ? 0 : intersection / union;
+    }
+    // ─── Function Extraction ────────────────────────────────────────
     extractJSFunctions(content, file, functions) {
         const lines = content.split('\n');
-        // Match function declarations, arrow functions, and method definitions
         const patterns = [
-            // function name(...) {
             /^(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\(([^)]*)\)/,
-            // const name = (...) => {
             /^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|(\w+))\s*=>/,
-            // name(...) { — class method
             /^\s+(?:async\s+)?(\w+)\s*\(([^)]*)\)\s*\{/,
         ];
         for (let i = 0; i < lines.length; i++) {
@@ -92,6 +340,8 @@ export class DuplicationDriftGate extends Gate {
                             bodyHash: this.hash(normalized),
                             bodyLength: body.length,
                             normalized,
+                            // Start with text tokens, enrichWithASTTokens() upgrades if tree-sitter available
+                            astTokens: this.textTokenize(normalized),
                         });
                     }
                     break;
@@ -107,7 +357,6 @@ export class DuplicationDriftGate extends Gate {
                 const name = match[1];
                 const params = match[2] || '';
                 const indent = lines[i].match(/^(\s*)/)?.[1]?.length || 0;
-                // Extract body by indentation
                 const body = [];
                 for (let j = i + 1; j < lines.length; j++) {
                     const lineIndent = lines[j].match(/^(\s*)/)?.[1]?.length || 0;
@@ -128,6 +377,7 @@ export class DuplicationDriftGate extends Gate {
                         bodyHash: this.hash(normalized),
                         bodyLength: body.length,
                         normalized,
+                        astTokens: this.textTokenize(normalized),
                     });
                 }
             }
@@ -156,36 +406,180 @@ export class DuplicationDriftGate extends Gate {
     }
     normalizeBody(body) { // canonical text for hashing/tokenizing: drops //, /* */ and # comments, turns backtick literals into "STR", removes `async `, collapses whitespace, unifies quotes to "
         return body
-            .replace(/\/\/.*/g, '') // strip single-line comments
-            .replace(/\/\*[\s\S]*?\*\//g, '') // strip multi-line comments
-            .replace(/#.*/g, '') // strip Python comments
-            .replace(/`[^`]*`/g, '"STR"') // normalize template literals to placeholder
-            .replace(/\basync\s+/g, '') // normalize async modifier
-            .replace(/\s+/g, ' ') // collapse whitespace
-            .replace(/['"]/g, '"') // normalize single/double quotes (NOT backticks)
+            .replace(/\/\/.*/g, '')
+            .replace(/\/\*[\s\S]*?\*\//g, '')
+            .replace(/#.*/g, '')
+            .replace(/`[^`]*`/g, '"STR"')
+            .replace(/\basync\s+/g, '')
+            .replace(/\s+/g, ' ')
+            .replace(/['"]/g, '"')
             .trim(); // NOTE(review): the patterns are not string-aware, so `//` or `#` inside a string literal also truncates that line
     }
     hash(text) { // MD5 hex digest of `text`; callers store it as bodyHash, the exact-match fingerprint of a normalized body
         return crypto.createHash('md5').update(text).digest('hex');
     }
+    // ─── Semantic Embedding ─────────────────────────────────────────
+    /**
+     * Generate semantic embedding text for a function.
+     * Combines function name, parameter names, and first 200 tokens of body.
+     * This captures INTENT regardless of implementation differences.
+     *
+     * Example:
+     * getUserById(id) { return db.users.find(x => x.id === id) }
+     * → "getUserById id return db users find x id id"
+     *
+     * fetchUserRecord(userId) { return database.users.filter(u => u.id === userId)[0] }
+     * → "fetchUserRecord userId return database users filter u id userId 0"
+     *
+     * These produce similar embeddings (~0.91 cosine) despite different AST.
+     */
+    buildEmbeddingText(fn) {
+        // Extract identifiers from normalized body (first 200 tokens)
+        const bodyTokens = fn.normalized.match(/\b[a-zA-Z_]\w*\b/g) || [];
+        const first200 = bodyTokens.slice(0, 200).join(' ');
+        return `${fn.name} ${first200}`;
+    }
+    /**
+     * Enrich functions with semantic embeddings for Pass 3.
+     * Only called for functions not already claimed by Pass 1/2.
+     * Uses generateEmbedding() from pattern-index/embeddings.ts.
+     */
+    async enrichWithEmbeddings(functions, indices) {
+        Logger.info(`Semantic Pass 3: Generating embeddings for ${indices.length} functions`);
+        for (const idx of indices) {
+            const fn = functions[idx];
+            try {
+                const text = this.buildEmbeddingText(fn);
+                fn.embedding = await generateEmbedding(text);
+            }
+            catch {
+                // Embedding failed — skip this function for Pass 3
+                Logger.debug(`Embedding generation failed for ${fn.file}:${fn.name}`);
+            }
+        }
+    }
+    // ─── Duplicate Finding (three-pass) ──────────────────────────────
+    /**
+     * Three-pass duplicate detection:
+     * Pass 1 (fast):     MD5 hash → exact duplicates (O(n))
+     * Pass 2 (Jaccard):  AST node multiset similarity → near-duplicates (O(n²) bounded)
+     * Pass 3 (semantic):  Embedding cosine similarity → semantic duplicates (O(n²) bounded)
+     *
+     * Pass 3 catches what AST Jaccard misses: same intent, different implementation.
+     * Example: .find() vs .filter()[0] — different AST nodes, same semantic meaning.
+     *
+     * Grouping in Pass 2 and Pass 3 is greedy: candidates are sorted by
+     * bodyLength, each still-unclaimed function acts as a seed, and later
+     * candidates are compared with the seed only (never with each other).
+     * Candidates from the seed's own file are skipped, and the inner scan stops
+     * at the first candidate longer than 1.5x (Pass 2) or 2x (Pass 3) the seed.
+     * A function joins at most one group; a later pass only sees functions
+     * that no earlier pass claimed.
+     *
+     * @param functions Extracted function records; file, bodyHash, bodyLength,
+     *                  astTokens and (for Pass 3) embedding are read.
+     * @returns Array of groups; each group is an array of two or more function
+     *          records drawn from at least two different files.
+     */
     findDuplicateGroups(functions) {
-        const groups = new Map();
-        // Group by body hash (exact duplicates across files)
-        for (const fn of functions) {
-            const existing = groups.get(fn.bodyHash) || [];
-            existing.push(fn);
-            groups.set(fn.bodyHash, existing);
-        }
-        // Filter: only groups with functions from DIFFERENT files, 2+ members
         const duplicates = [];
-        for (const group of groups.values()) {
-            if (group.length < 2)
+        const claimedIndices = new Set(); // indices into `functions` that already belong to a reported group
+        // Pass 1: Exact hash match
+        const hashGroups = new Map();
+        for (let i = 0; i < functions.length; i++) {
+            const existing = hashGroups.get(functions[i].bodyHash) || [];
+            existing.push(i);
+            hashGroups.set(functions[i].bodyHash, existing);
+        }
+        for (const indices of hashGroups.values()) {
+            if (indices.length < 2)
                 continue;
+            const group = indices.map(i => functions[i]);
             const uniqueFiles = new Set(group.map(f => f.file));
             if (uniqueFiles.size >= 2) {
                 duplicates.push(group);
+                indices.forEach(i => claimedIndices.add(i));
+            }
+        }
+        // Pass 2: Jaccard on AST tokens for remaining functions
+        const remaining = functions
+            .map((fn, i) => ({ fn, idx: i }))
+            .filter(({ idx }) => !claimedIndices.has(idx));
+        remaining.sort((a, b) => a.fn.bodyLength - b.fn.bodyLength); // ascending length lets the inner loop stop early
+        const jaccardClaimed = new Set();
+        for (let i = 0; i < remaining.length; i++) {
+            if (jaccardClaimed.has(remaining[i].idx))
+                continue;
+            const group = [remaining[i].fn];
+            const baseLen = remaining[i].fn.bodyLength;
+            for (let j = i + 1; j < remaining.length; j++) {
+                if (jaccardClaimed.has(remaining[j].idx))
+                    continue;
+                if (remaining[j].fn.bodyLength > baseLen * 1.5) // list is sorted, so every later candidate is too long as well
+                    break;
+                if (remaining[j].fn.file === remaining[i].fn.file)
+                    continue;
+                const sim = this.jaccardSimilarity(remaining[i].fn.astTokens, remaining[j].fn.astTokens);
+                if (sim >= this.config.similarity_threshold) {
+                    group.push(remaining[j].fn);
+                    jaccardClaimed.add(remaining[j].idx);
+                }
+            }
+            if (group.length >= 2) {
+                const uniqueFiles = new Set(group.map(f => f.file));
+                if (uniqueFiles.size >= 2) {
+                    duplicates.push(group);
+                    jaccardClaimed.add(remaining[i].idx); // the seed is claimed only once its group is reported
+                }
+            }
+        }
+        // Mark all Pass 1 + Pass 2 claimed indices
+        for (const idx of jaccardClaimed)
+            claimedIndices.add(idx);
+        // Pass 3: Semantic embedding cosine similarity for still-unclaimed functions
+        if (this.config.semantic_enabled) {
+            const semanticRemaining = functions
+                .map((fn, i) => ({ fn, idx: i }))
+                .filter(({ idx }) => !claimedIndices.has(idx))
+                .filter(({ fn }) => fn.embedding && fn.embedding.length > 0); // functions whose embedding is missing or empty are left out
+            semanticRemaining.sort((a, b) => a.fn.bodyLength - b.fn.bodyLength);
+            const semanticClaimed = new Set();
+            for (let i = 0; i < semanticRemaining.length; i++) {
+                if (semanticClaimed.has(semanticRemaining[i].idx))
+                    continue;
+                const group = [semanticRemaining[i].fn];
+                const baseLen = semanticRemaining[i].fn.bodyLength;
+                for (let j = i + 1; j < semanticRemaining.length; j++) {
+                    if (semanticClaimed.has(semanticRemaining[j].idx))
+                        continue;
+                    // Body length must be within 2x range (semantic allows more variance)
+                    if (semanticRemaining[j].fn.bodyLength > baseLen * 2.0)
+                        break;
+                    if (semanticRemaining[j].fn.file === semanticRemaining[i].fn.file)
+                        continue;
+                    const sim = cosineSimilarity(semanticRemaining[i].fn.embedding, semanticRemaining[j].fn.embedding);
+                    if (sim >= this.config.semantic_threshold) {
+                        group.push(semanticRemaining[j].fn);
+                        semanticClaimed.add(semanticRemaining[j].idx);
+                    }
+                }
+                if (group.length >= 2) {
+                    const uniqueFiles = new Set(group.map(f => f.file));
+                    if (uniqueFiles.size >= 2) {
+                        duplicates.push(group);
+                        semanticClaimed.add(semanticRemaining[i].idx);
+                    }
+                }
+            }
+            if (semanticClaimed.size > 0) {
+                Logger.info(`Semantic Pass 3: Found ${semanticClaimed.size} additional semantic duplicates`);
             }
         }
         return duplicates;
     }
 }
+/**
+ * AST node types that are structural even as leaf nodes.
+ * These carry semantic meaning without children.
+ */
+function isStructuralLeaf(type) {
+    const structural = new Set([
+        'return', 'break', 'continue', 'yield', 'throw',
+        'true', 'false', 'null', 'undefined', 'none',
+        'self', 'this', 'super',
+        'string', 'number', 'template_string',
+        // Operators
+        '=', '==', '===', '!=', '!==', '<', '>', '<=', '>=',
+        '+', '-', '*', '/', '%', '**',
+        '&&', '||', '!', '??',
+        '=>', '...', '?', ':',
+    ]);
+    return structural.has(type);
+}

package/dist/gates/logic-drift.d.ts ADDED Viewed

@@ -0,0 +1,70 @@
+/**
+ * Logic Drift Foundation Gate
+ *
+ * Detects when AI subtly changes business logic in functions:
+ * - Comparison operator mutations: >= became > (off-by-one)
+ * - Return statement additions/removals
+ * - Branch count changes (new if/else added or removed)
+ * - Call sequence changes (function calls reordered)
+ *
+ * This is the HARDEST drift to catch because:
+ * - Code still compiles
+ * - Tests might still pass (if they don't cover edge cases)
+ * - The change looks intentional ("AI refactored the function")
+ *
+ * Strategy: Collect baselines for critical functions, then detect
+ * mutations between scans. This foundation enables future LLM-powered
+ * deeper analysis (feeding baselines into DriftBench training).
+ *
+ * @since v5.1.0
+ */
+import { Gate, GateContext } from './base.js';
+import { Failure, Provenance } from '../types/index.js';
+export interface LogicDriftConfig {
+    enabled?: boolean; // optional; the default is not visible in this declaration file
+    baseline_path?: string; // presumably where function baselines are kept between scans — confirm in logic-drift.js
+    track_operators?: boolean; // presumably toggles comparison-operator mutation checks — confirm in logic-drift.js
+    track_branches?: boolean; // presumably toggles branch-count change checks — confirm in logic-drift.js
+    track_returns?: boolean; // presumably toggles return-statement change checks — confirm in logic-drift.js
+}
+export declare class LogicDriftGate extends Gate {
+    private config; // type not exposed in this declaration; presumably the resolved LogicDriftConfig — verify
+    constructor(config?: LogicDriftConfig); // config is optional; defaults are applied in the implementation (not visible here)
+    protected get provenance(): Provenance;
+    run(context: GateContext): Promise<Failure[]>; // gate entry point; resolves to the list of failures found
+    private extractFunctionBaselines; // NOTE(review): signature erased in the .d.ts; see logic-drift.js for parameters and return shape
+    private extractBody;
+    /**
+     * Extract all comparison operators from function body in order.
+     * These are the most critical mutations: >= to > causes off-by-one.
+     */
+    private extractComparisonOps;
+    private countBranches;
+    private countReturns;
+    /**
+     * Extract ordered sequence of function calls.
+     * Useful for detecting when AI reorders operations.
+     */
+    private extractCallSequence;
+    /**
+     * Detect specific operator mutations between two ordered operator lists.
+     * Only reports CHANGED operators, not added/removed ones (those are
+     * covered by branch count changes).
+     *
+     * Example:
+     * prev: ['>=', '===', '!==']
+     * curr: ['>',  '===', '!==']
+     * → [{from: '>=', to: '>'}]
+     */
+    private detectOperatorMutations;
+    /**
+     * Classify whether an operator change is "dangerous" (likely unintentional).
+     *
+     * Dangerous mutations:
+     * - >= to > (boundary change, off-by-one)
+     * - <= to < (boundary change)
+     * - === to == (type coercion change)
+     * - !== to != (type coercion change)
+     */
+    private isDangerousMutation;
+}