@rigour-labs/core 5.0.1 → 5.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -1
- package/dist/gates/agent-team.d.ts +0 -1
- package/dist/gates/agent-team.js +0 -1
- package/dist/gates/checkpoint.d.ts +0 -2
- package/dist/gates/checkpoint.js +0 -2
- package/dist/gates/context-window-artifacts.d.ts +6 -2
- package/dist/gates/context-window-artifacts.js +107 -31
- package/dist/gates/deep-analysis.d.ts +2 -0
- package/dist/gates/deep-analysis.js +41 -11
- package/dist/gates/dependency.d.ts +0 -2
- package/dist/gates/dependency.js +23 -5
- package/dist/gates/deprecated-apis.d.ts +0 -2
- package/dist/gates/deprecated-apis.js +33 -20
- package/dist/gates/duplication-drift/index.d.ts +61 -0
- package/dist/gates/duplication-drift/index.js +240 -0
- package/dist/gates/duplication-drift/similarity.d.ts +68 -0
- package/dist/gates/duplication-drift/similarity.js +177 -0
- package/dist/gates/duplication-drift/tokenizer.d.ts +55 -0
- package/dist/gates/duplication-drift/tokenizer.js +195 -0
- package/dist/gates/frontend-secret-exposure.d.ts +0 -3
- package/dist/gates/frontend-secret-exposure.js +1 -114
- package/dist/gates/frontend-secret-patterns.d.ts +33 -0
- package/dist/gates/frontend-secret-patterns.js +119 -0
- package/dist/gates/{hallucinated-imports.d.ts → hallucinated-imports/index.d.ts} +2 -29
- package/dist/gates/hallucinated-imports/index.js +174 -0
- package/dist/gates/hallucinated-imports/js-resolver.d.ts +45 -0
- package/dist/gates/hallucinated-imports/js-resolver.js +320 -0
- package/dist/gates/hallucinated-imports/manifest-discovery.d.ts +28 -0
- package/dist/gates/hallucinated-imports/manifest-discovery.js +114 -0
- package/dist/gates/hallucinated-imports/python-resolver.d.ts +24 -0
- package/dist/gates/hallucinated-imports/python-resolver.js +306 -0
- package/dist/gates/hallucinated-imports-lang.d.ts +2 -2
- package/dist/gates/hallucinated-imports-lang.js +269 -34
- package/dist/gates/hallucinated-imports.test.js +1 -2
- package/dist/gates/inconsistent-error-handling.d.ts +0 -5
- package/dist/gates/inconsistent-error-handling.js +15 -144
- package/dist/gates/language-adapters/csharp-adapter.d.ts +16 -0
- package/dist/gates/language-adapters/csharp-adapter.js +211 -0
- package/dist/gates/language-adapters/go-adapter.d.ts +26 -0
- package/dist/gates/language-adapters/go-adapter.js +195 -0
- package/dist/gates/language-adapters/index.d.ts +15 -0
- package/dist/gates/language-adapters/index.js +16 -0
- package/dist/gates/language-adapters/java-adapter.d.ts +16 -0
- package/dist/gates/language-adapters/java-adapter.js +237 -0
- package/dist/gates/language-adapters/js-adapter.d.ts +26 -0
- package/dist/gates/language-adapters/js-adapter.js +279 -0
- package/dist/gates/language-adapters/python-adapter.d.ts +25 -0
- package/dist/gates/language-adapters/python-adapter.js +183 -0
- package/dist/gates/language-adapters/registry.d.ts +26 -0
- package/dist/gates/language-adapters/registry.js +65 -0
- package/dist/gates/language-adapters/ruby-adapter.d.ts +25 -0
- package/dist/gates/language-adapters/ruby-adapter.js +217 -0
- package/dist/gates/language-adapters/rust-adapter.d.ts +27 -0
- package/dist/gates/language-adapters/rust-adapter.js +235 -0
- package/dist/gates/language-adapters/types.d.ts +60 -0
- package/dist/gates/language-adapters/types.js +22 -0
- package/dist/gates/logic-drift-extractors.d.ts +15 -0
- package/dist/gates/logic-drift-extractors.js +34 -0
- package/dist/gates/logic-drift.d.ts +0 -30
- package/dist/gates/logic-drift.js +39 -129
- package/dist/gates/phantom-apis.d.ts +0 -2
- package/dist/gates/phantom-apis.js +49 -20
- package/dist/gates/promise-safety.d.ts +0 -1
- package/dist/gates/promise-safety.js +14 -2
- package/dist/gates/runner.js +52 -23
- package/dist/gates/runner.test.js +1 -1
- package/dist/gates/security-patterns-data.d.ts +14 -0
- package/dist/gates/security-patterns-data.js +235 -0
- package/dist/gates/security-patterns.d.ts +17 -3
- package/dist/gates/security-patterns.js +80 -211
- package/dist/gates/side-effect-analysis/categorizer.d.ts +32 -0
- package/dist/gates/side-effect-analysis/categorizer.js +83 -0
- package/dist/gates/{side-effect-analysis.d.ts → side-effect-analysis/index.d.ts} +3 -5
- package/dist/gates/{side-effect-analysis.js → side-effect-analysis/index.js} +33 -45
- package/dist/gates/side-effect-analysis/scope-tracker.d.ts +37 -0
- package/dist/gates/side-effect-analysis/scope-tracker.js +40 -0
- package/dist/gates/side-effect-helpers/index.d.ts +4 -0
- package/dist/gates/side-effect-helpers/index.js +4 -0
- package/dist/gates/side-effect-helpers/pattern-detection.d.ts +123 -0
- package/dist/gates/{side-effect-helpers.js → side-effect-helpers/pattern-detection.js} +22 -468
- package/dist/gates/side-effect-helpers/resource-tracking.d.ts +80 -0
- package/dist/gates/side-effect-helpers/resource-tracking.js +281 -0
- package/dist/gates/side-effect-helpers/scope-analysis.d.ts +21 -0
- package/dist/gates/side-effect-helpers/scope-analysis.js +146 -0
- package/dist/gates/side-effect-helpers/types.d.ts +38 -0
- package/dist/gates/side-effect-helpers/types.js +41 -0
- package/dist/gates/side-effect-rules.d.ts +0 -1
- package/dist/gates/side-effect-rules.js +0 -1
- package/dist/gates/style-drift-rules.d.ts +86 -0
- package/dist/gates/style-drift-rules.js +103 -0
- package/dist/gates/style-drift.d.ts +7 -16
- package/dist/gates/style-drift.js +101 -119
- package/dist/gates/test-quality-matchers.d.ts +53 -0
- package/dist/gates/test-quality-matchers.js +86 -0
- package/dist/gates/test-quality.d.ts +0 -3
- package/dist/gates/test-quality.js +47 -44
- package/dist/hooks/checker.d.ts +0 -1
- package/dist/hooks/checker.js +0 -2
- package/dist/hooks/dlp-templates.d.ts +0 -1
- package/dist/hooks/dlp-templates.js +0 -4
- package/dist/hooks/index.d.ts +0 -2
- package/dist/hooks/index.js +0 -2
- package/dist/hooks/input-validator.d.ts +0 -1
- package/dist/hooks/input-validator.js +0 -1
- package/dist/hooks/input-validator.test.js +0 -1
- package/dist/hooks/standalone-checker.d.ts +0 -1
- package/dist/hooks/standalone-checker.js +0 -1
- package/dist/hooks/standalone-dlp-checker.d.ts +0 -1
- package/dist/hooks/standalone-dlp-checker.js +0 -1
- package/dist/hooks/templates.d.ts +0 -1
- package/dist/hooks/templates.js +0 -1
- package/dist/hooks/types.d.ts +0 -1
- package/dist/hooks/types.js +0 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.js +1 -1
- package/dist/inference/index.js +1 -1
- package/dist/services/adaptive-thresholds.d.ts +0 -2
- package/dist/services/adaptive-thresholds.js +0 -2
- package/dist/services/filesystem-cache.d.ts +0 -1
- package/dist/services/filesystem-cache.js +0 -1
- package/dist/services/score-history.d.ts +0 -1
- package/dist/services/score-history.js +0 -1
- package/dist/services/temporal-drift.d.ts +1 -2
- package/dist/services/temporal-drift.js +7 -8
- package/dist/storage/db.d.ts +23 -7
- package/dist/storage/db.js +116 -55
- package/dist/storage/findings.d.ts +4 -3
- package/dist/storage/findings.js +13 -20
- package/dist/storage/local-memory.d.ts +4 -4
- package/dist/storage/local-memory.js +20 -22
- package/dist/storage/patterns.d.ts +5 -5
- package/dist/storage/patterns.js +20 -26
- package/dist/storage/scans.d.ts +6 -6
- package/dist/storage/scans.js +12 -21
- package/dist/types/index.d.ts +1 -0
- package/dist/utils/scanner.js +1 -1
- package/package.json +7 -8
- package/dist/gates/duplication-drift.d.ts +0 -128
- package/dist/gates/duplication-drift.js +0 -585
- package/dist/gates/hallucinated-imports.js +0 -641
- package/dist/gates/side-effect-helpers.d.ts +0 -260
|
@@ -1,585 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Duplication Drift Gate (v2)
|
|
3
|
-
*
|
|
4
|
-
* Detects when AI generates near-identical functions across files because
|
|
5
|
-
* it doesn't remember what it already wrote. This is an AI-specific failure
|
|
6
|
-
* mode — humans reuse via copy-paste (same file), AI re-invents (cross-file).
|
|
7
|
-
*
|
|
8
|
-
* v2 upgrades:
|
|
9
|
-
* - tree-sitter AST node type sequences replace hand-rolled regex tokenizer
|
|
10
|
-
* - Jaccard similarity on AST node multisets (structural, not textual)
|
|
11
|
-
* - Catches duplicates even when every variable name is different
|
|
12
|
-
* - MD5 kept as fast-path for exact matches, Jaccard runs on remaining pairs
|
|
13
|
-
*
|
|
14
|
-
* Detection strategy (three-pass):
|
|
15
|
-
* 1. Extract function bodies, normalize text (strip comments/whitespace)
|
|
16
|
-
* 2. Parse with tree-sitter → walk AST → collect node type multiset
|
|
17
|
-
* 3. Generate semantic embeddings via all-MiniLM-L6-v2 (384D)
|
|
18
|
-
* 4. Pass 1 (fast): MD5 hash → exact duplicates (O(n), <10ms)
|
|
19
|
-
* 5. Pass 2 (Jaccard): AST node multiset similarity → structural near-duplicates (O(n²) bounded)
|
|
20
|
-
* 6. Pass 3 (semantic): Embedding cosine similarity → semantic duplicates (O(n²) bounded)
|
|
21
|
-
* 7. Flag functions with similarity > threshold in different files
|
|
22
|
-
*
|
|
23
|
-
* Why AST node types > raw tokens:
|
|
24
|
-
* - `getUserById(id) { return db.find(x => x.id === id) }`
|
|
25
|
-
* - `fetchUser(userId) { return database.filter(u => u.id === userId)[0] }`
|
|
26
|
-
* Both produce similar AST: [return_statement, call_expression, arrow_function,
|
|
27
|
-
* binary_expression, member_expression]. Variable names are invisible.
|
|
28
|
-
*
|
|
29
|
-
* @since v2.16.0 (original MD5)
|
|
30
|
-
* @since v5.0.0 (tree-sitter AST + Jaccard)
|
|
31
|
-
* @since v5.1.0 (semantic embedding Pass 3)
|
|
32
|
-
*/
|
|
33
|
-
import { Gate } from './base.js';
|
|
34
|
-
import { FileScanner } from '../utils/scanner.js';
|
|
35
|
-
import { Logger } from '../utils/logger.js';
|
|
36
|
-
import { generateEmbedding, cosineSimilarity } from '../pattern-index/embeddings.js';
|
|
37
|
-
import crypto from 'crypto';
|
|
38
|
-
import path from 'path';
|
|
39
|
-
import { fileURLToPath } from 'url';
|
|
40
|
-
// tree-sitter is optional — graceful fallback to text tokenization.
// These module-level flags are owned by initTreeSitter(): a failed load is
// remembered so we never retry the dynamic import on every gate run.
let Parser = null;          // web-tree-sitter module/class, set once initTreeSitter() succeeds
let treeSitterReady = false;  // true after Parser.init() completed successfully
let treeSitterFailed = false; // true after a load/init failure — short-circuits future attempts
// ESM has no built-in __dirname; reconstruct it for resolving vendored grammar WASM paths.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
45
|
-
/**
 * Lazily load and initialize web-tree-sitter, exactly once per process.
 *
 * Returns true when the parser runtime is usable, false otherwise. Both the
 * success and the failure outcome are cached in module state, so repeated
 * calls are cheap and a missing optional dependency is only probed once.
 */
async function initTreeSitter() {
    // Already resolved one way or the other — answer from the cached state.
    if (treeSitterReady || treeSitterFailed) {
        return treeSitterReady;
    }
    try {
        const mod = await import('web-tree-sitter');
        Parser = mod.default || mod;
        await Parser.init();
        treeSitterReady = true;
    }
    catch {
        treeSitterFailed = true;
        Logger.debug('tree-sitter not available, falling back to text tokenization');
    }
    return treeSitterReady;
}
|
|
63
|
-
// Map of file extension → vendored tree-sitter grammar WASM, resolved relative
// to this module (see __dirname above). Extensions absent from this map keep
// the text-tokenization fallback. Note .jsx reuses the JavaScript grammar.
const GRAMMAR_PATHS = {
    '.ts': '../../vendor/grammars/tree-sitter-typescript.wasm',
    '.tsx': '../../vendor/grammars/tree-sitter-tsx.wasm',
    '.js': '../../vendor/grammars/tree-sitter-javascript.wasm',
    '.jsx': '../../vendor/grammars/tree-sitter-javascript.wasm',
    '.py': '../../vendor/grammars/tree-sitter-python.wasm',
    '.go': '../../vendor/grammars/tree-sitter-go.wasm',
    '.rs': '../../vendor/grammars/tree-sitter-rust.wasm',
};
// Cache loaded languages (keyed by extension) so each grammar WASM is loaded once.
const languageCache = new Map();
|
|
74
|
-
/**
 * Detects near-identical function implementations re-generated across
 * different files. Three-pass pipeline (each pass claims its matches so
 * later passes only see the remainder):
 *   Pass 1: MD5 of normalized body  → exact duplicates
 *   Pass 2: Jaccard on AST node-type multisets → structural near-duplicates
 *   Pass 3: embedding cosine similarity → semantic duplicates
 * See the file header comment for the full design rationale.
 */
export class DuplicationDriftGate extends Gate {
    // Resolved configuration — every field is populated with a default in the constructor.
    config;
    // Lazily-created web-tree-sitter parser; stays null when tree-sitter is unavailable.
    parser = null;
    /**
     * @param {object} [config] - partial overrides; unset fields use the defaults below.
     */
    constructor(config = {}) {
        super('duplication-drift', 'AI Duplication Drift Detection');
        this.config = {
            enabled: config.enabled ?? true,
            similarity_threshold: config.similarity_threshold ?? 0.75, // Pass 2 (Jaccard) cutoff
            semantic_threshold: config.semantic_threshold ?? 0.85,     // Pass 3 (cosine) cutoff
            semantic_enabled: config.semantic_enabled ?? true,
            min_body_lines: config.min_body_lines ?? 5,                // skip trivial functions
            approved_duplications: config.approved_duplications ?? [], // 'nameA:nameB' pairs to suppress
        };
    }
    get provenance() { return 'ai-drift'; }
    /**
     * Gate entry point: scan matching files, extract candidate functions,
     * run three-pass duplicate detection, and emit one failure per
     * duplicate group that is not human-approved.
     */
    async run(context) {
        if (!this.config.enabled)
            return [];
        // Try to init tree-sitter (non-blocking, falls back gracefully)
        const hasTreeSitter = await initTreeSitter();
        if (hasTreeSitter && !this.parser) {
            this.parser = new Parser();
        }
        const failures = [];
        const functions = [];
        const scanPatterns = context.patterns || ['**/*.{ts,js,tsx,jsx,py,go,rs}'];
        const files = await FileScanner.findFiles({
            cwd: context.cwd,
            patterns: scanPatterns,
            ignore: [...(context.ignore || []), '**/node_modules/**', '**/dist/**', '**/*.test.*', '**/*.spec.*'],
        });
        Logger.info(`Duplication Drift: Scanning ${files.length} files (tree-sitter: ${hasTreeSitter ? 'ON' : 'fallback'})`);
        for (const file of files) {
            try {
                const { readFile } = await import('fs-extra');
                const content = await readFile(path.join(context.cwd, file), 'utf-8');
                const ext = path.extname(file);
                if (['.ts', '.js', '.tsx', '.jsx'].includes(ext)) {
                    this.extractJSFunctions(content, file, functions);
                }
                else if (ext === '.py') {
                    this.extractPyFunctions(content, file, functions);
                }
                // Generate AST tokens using tree-sitter if available
                if (hasTreeSitter && GRAMMAR_PATHS[ext]) {
                    await this.enrichWithASTTokens(content, ext, file, functions);
                }
            }
            // NOTE(review): unreadable files are skipped silently — consider a debug log here
            catch (e) { }
        }
        // Pass 3 prep: Generate semantic embeddings for all extracted functions
        // (embedding generation is lazy — only runs when semantic_enabled is true)
        if (this.config.semantic_enabled && functions.length > 0) {
            const allIndices = functions.map((_, i) => i);
            await this.enrichWithEmbeddings(functions, allIndices);
        }
        const duplicateGroups = this.findDuplicateGroups(functions);
        // Build approved pairs set for fast lookup
        const approvedSet = new Set((this.config.approved_duplications || []).map(s => s.toLowerCase()));
        for (const group of duplicateGroups) {
            // Check if this pair is human-approved
            const names = group.map(f => f.name).sort();
            const pairKey = names.join(':').toLowerCase();
            if (approvedSet.has(pairKey))
                continue;
            const files = group.map(f => f.file);
            const locations = group.map(f => `${f.file}:${f.line} (${f.name})`).join(', ');
            // Determine similarity % and method used (for the failure message only —
            // grouping itself already happened in findDuplicateGroups).
            let similarity;
            let method;
            if (group[0].bodyHash === group[1]?.bodyHash) {
                similarity = 1.0;
                method = 'exact-hash';
            }
            else if (group[0].embedding && group[1]?.embedding) {
                // Report whichever signal is stronger for this pair.
                const jaccardSim = this.jaccardSimilarity(group[0].astTokens, group[1].astTokens);
                const cosineSim = cosineSimilarity(group[0].embedding, group[1].embedding);
                if (cosineSim > jaccardSim) {
                    similarity = cosineSim;
                    method = 'semantic-embedding';
                }
                else {
                    similarity = jaccardSim;
                    method = 'ast-jaccard';
                }
            }
            else {
                similarity = group.length > 1
                    ? this.jaccardSimilarity(group[0].astTokens, group[1].astTokens)
                    : 1.0;
                method = 'ast-jaccard';
            }
            const pct = (similarity * 100).toFixed(0);
            failures.push(this.createFailure(`AI Duplication Drift: Function '${group[0].name}' has ${group.length} near-identical copies (${pct}% similar via ${method})`, [...new Set(files)], `Found duplicate implementations at: ${locations}. Extract to a shared module and import.`, 'Duplication Drift', group[0].line, undefined, 'high'));
        }
        return failures;
    }
    // ─── tree-sitter AST Tokenization ───────────────────────────────
    /**
     * Parse the file with tree-sitter, find function nodes that match
     * our extracted functions (by line number), and replace their token
     * multisets with AST node type sequences.
     *
     * AST node types are language-agnostic structural tokens:
     * - if_statement, for_statement, return_statement
     * - call_expression, member_expression, binary_expression
     * - arrow_function, function_declaration
     *
     * Variable names, string literals, comments — all invisible.
     * Only STRUCTURE matters.
     */
    async enrichWithASTTokens(content, ext, file, functions) {
        if (!this.parser)
            return;
        const grammarRelPath = GRAMMAR_PATHS[ext];
        if (!grammarRelPath)
            return;
        try {
            // Load language (cached)
            if (!languageCache.has(ext)) {
                const grammarPath = path.resolve(__dirname, grammarRelPath);
                const lang = await Parser.Language.load(grammarPath);
                languageCache.set(ext, lang);
            }
            const lang = languageCache.get(ext);
            this.parser.setLanguage(lang);
            const tree = this.parser.parse(content);
            // Find functions that belong to this file
            const fileFunctions = functions.filter(f => f.file === file);
            for (const fn of fileFunctions) {
                // Find the AST node at this function's line
                const node = this.findFunctionNodeAtLine(tree.rootNode, fn.line);
                if (node) {
                    // Upgrade from text tokens to structural AST tokens (in place).
                    fn.astTokens = this.collectASTNodeTypes(node);
                }
            }
        }
        catch (e) {
            // tree-sitter parse failed for this file — keep text tokens
            Logger.debug(`tree-sitter parse failed for ${file}: ${e}`);
        }
    }
    /**
     * Walk the AST tree to find a function/method node at a given line.
     * Returns the first matching node in document order, or null.
     */
    findFunctionNodeAtLine(rootNode, targetLine) {
        const functionTypes = new Set([
            'function_declaration', 'method_definition', 'arrow_function',
            'function_definition', // Python
            'function_item', // Rust
            'method_declaration', // Java/C#
            'lexical_declaration', // const x = () => {}
        ]);
        let bestMatch = null;
        const walk = (node) => {
            // tree-sitter lines are 0-indexed, our lines are 1-indexed
            if (functionTypes.has(node.type) && node.startPosition.row + 1 === targetLine) {
                bestMatch = node;
                return;
            }
            for (let i = 0; i < node.childCount; i++) {
                walk(node.child(i));
                if (bestMatch)
                    return;
            }
        };
        walk(rootNode);
        return bestMatch;
    }
    /**
     * Walk an AST subtree and collect node types as a multiset.
     *
     * This is the core insight: two functions with different variable names
     * but the same control flow produce the same node type multiset.
     *
     * Example:
     * `function a(x) { if (x > 0) return x * 2; return 0; }`
     * `function b(val) { if (val > 0) return val * 2; return 0; }`
     *
     * Both produce: {if_statement: 1, binary_expression: 2, return_statement: 2, ...}
     * → Jaccard similarity = 1.0
     *
     * @returns {Map<string, number>} node type → occurrence count
     */
    collectASTNodeTypes(node) {
        const types = new Map();
        const walk = (n) => {
            // Skip leaf nodes that are just identifiers/literals (noise)
            // Keep structural node types only
            if (n.childCount > 0 || isStructuralLeaf(n.type)) {
                types.set(n.type, (types.get(n.type) || 0) + 1);
            }
            for (let i = 0; i < n.childCount; i++) {
                walk(n.child(i));
            }
        };
        walk(node);
        return types;
    }
    // ─── Fallback Text Tokenization ─────────────────────────────────
    /**
     * Fallback tokenizer when tree-sitter is not available.
     * Uses normalized text → keyword/operator multiset.
     * All non-keyword identifiers collapse into a single '_ID_' count so
     * variable naming differences do not affect similarity.
     *
     * @returns {Map<string, number>} token → occurrence count
     */
    textTokenize(normalized) {
        const tokens = new Map();
        const structural = normalized.match(/\b(if|else|for|while|return|const|let|var|function|class|import|export|async|await|try|catch|throw|new|switch|case|break|continue|yield|def|self)\b|[{}()\[\];,.:=<>!&|+\-*/%?]+/g) || [];
        for (const token of structural) {
            tokens.set(token, (tokens.get(token) || 0) + 1);
        }
        // Normalize all identifiers to a count (variable names don't matter)
        const keywords = new Set([
            'if', 'else', 'for', 'while', 'return', 'const', 'let', 'var',
            'function', 'class', 'import', 'export', 'async', 'await',
            'try', 'catch', 'throw', 'new', 'switch', 'case', 'break',
            'continue', 'yield', 'def', 'self', 'true', 'false', 'null', 'undefined',
        ]);
        const identifiers = normalized.match(/\b[a-zA-Z_]\w*\b/g) || [];
        let idCount = 0;
        for (const id of identifiers) {
            if (!keywords.has(id))
                idCount++;
        }
        if (idCount > 0)
            tokens.set('_ID_', idCount);
        return tokens;
    }
    // ─── Jaccard Similarity ─────────────────────────────────────────
    /**
     * Jaccard similarity on multisets.
     * intersection = sum of min(countA, countB) for each key
     * union = sum of max(countA, countB) for each key
     *
     * @param {Map<string, number>} a
     * @param {Map<string, number>} b
     * @returns {number} similarity in [0, 1]; 0 when both multisets are empty
     */
    jaccardSimilarity(a, b) {
        const allKeys = new Set([...a.keys(), ...b.keys()]);
        let intersection = 0;
        let union = 0;
        for (const key of allKeys) {
            const countA = a.get(key) || 0;
            const countB = b.get(key) || 0;
            intersection += Math.min(countA, countB);
            union += Math.max(countA, countB);
        }
        return union === 0 ? 0 : intersection / union;
    }
    // ─── Function Extraction ────────────────────────────────────────
    /**
     * Regex-based extraction of JS/TS function candidates (declarations,
     * arrow-function consts, and class/object methods). Each hit with a body
     * of at least min_body_lines is pushed onto `functions` with its
     * normalized body, MD5 hash, and fallback text tokens.
     */
    extractJSFunctions(content, file, functions) {
        const lines = content.split('\n');
        const patterns = [
            /^(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\(([^)]*)\)/,
            /^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|(\w+))\s*=>/,
            /^\s+(?:async\s+)?(\w+)\s*\(([^)]*)\)\s*\{/,
        ];
        for (let i = 0; i < lines.length; i++) {
            const line = lines[i];
            for (const pattern of patterns) {
                const match = line.match(pattern);
                if (match) {
                    const name = match[1];
                    const params = match[2] || '';
                    const body = this.extractFunctionBody(lines, i);
                    if (body.length >= this.config.min_body_lines) {
                        const normalized = this.normalizeBody(body.join('\n'));
                        functions.push({
                            name,
                            file,
                            line: i + 1,
                            paramCount: params ? params.split(',').length : 0,
                            bodyHash: this.hash(normalized),
                            bodyLength: body.length,
                            normalized,
                            // Start with text tokens, enrichWithASTTokens() upgrades if tree-sitter available
                            astTokens: this.textTokenize(normalized),
                        });
                    }
                    break;
                }
            }
        }
    }
    /**
     * Extraction of Python `def` candidates. The body is delimited by
     * indentation: lines more indented than the `def` (or blank) belong to it.
     */
    extractPyFunctions(content, file, functions) {
        const lines = content.split('\n');
        for (let i = 0; i < lines.length; i++) {
            const match = lines[i].match(/^(?:\s*)(?:async\s+)?def\s+(\w+)\s*\(([^)]*)\)/);
            if (match) {
                const name = match[1];
                const params = match[2] || '';
                const indent = lines[i].match(/^(\s*)/)?.[1]?.length || 0;
                const body = [];
                for (let j = i + 1; j < lines.length; j++) {
                    const lineIndent = lines[j].match(/^(\s*)/)?.[1]?.length || 0;
                    if (lines[j].trim() === '' || lineIndent > indent) {
                        body.push(lines[j]);
                    }
                    else {
                        break;
                    }
                }
                if (body.length >= this.config.min_body_lines) {
                    const normalized = this.normalizeBody(body.join('\n'));
                    functions.push({
                        name,
                        file,
                        line: i + 1,
                        paramCount: params ? params.split(',').length : 0,
                        bodyHash: this.hash(normalized),
                        bodyLength: body.length,
                        normalized,
                        astTokens: this.textTokenize(normalized),
                    });
                }
            }
        }
    }
    /**
     * Collect a brace-delimited function body starting at startIndex.
     * NOTE(review): brace counting ignores braces inside strings/comments —
     * this is a heuristic, acceptable because tree-sitter is the primary path.
     */
    extractFunctionBody(lines, startIndex) {
        let braceDepth = 0;
        let started = false;
        const body = [];
        for (let i = startIndex; i < lines.length; i++) {
            const line = lines[i];
            for (const ch of line) {
                if (ch === '{') {
                    braceDepth++;
                    started = true;
                }
                if (ch === '}')
                    braceDepth--;
            }
            if (started)
                body.push(line);
            if (started && braceDepth === 0)
                break;
        }
        return body;
    }
    /**
     * Normalize a body for hashing/tokenizing: strip //, block, and # comments,
     * collapse template strings and whitespace, unify quotes, drop `async`.
     */
    normalizeBody(body) {
        return body
            .replace(/\/\/.*/g, '')
            .replace(/\/\*[\s\S]*?\*\//g, '')
            .replace(/#.*/g, '')
            .replace(/`[^`]*`/g, '"STR"')
            .replace(/\basync\s+/g, '')
            .replace(/\s+/g, ' ')
            .replace(/['"]/g, '"')
            .trim();
    }
    // MD5 is used for fast exact-match fingerprinting only — not for security.
    hash(text) {
        return crypto.createHash('md5').update(text).digest('hex');
    }
    // ─── Semantic Embedding ─────────────────────────────────────────
    /**
     * Generate semantic embedding text for a function.
     * Combines function name, parameter names, and first 200 tokens of body.
     * This captures INTENT regardless of implementation differences.
     *
     * Example:
     * getUserById(id) { return db.users.find(x => x.id === id) }
     * → "getUserById id return db users find x id id"
     *
     * fetchUserRecord(userId) { return database.users.filter(u => u.id === userId)[0] }
     * → "fetchUserRecord userId return database users filter u id userId 0"
     *
     * These produce similar embeddings (~0.91 cosine) despite different AST.
     */
    buildEmbeddingText(fn) {
        // Extract identifiers from normalized body (first 200 tokens)
        const bodyTokens = fn.normalized.match(/\b[a-zA-Z_]\w*\b/g) || [];
        const first200 = bodyTokens.slice(0, 200).join(' ');
        return `${fn.name} ${first200}`;
    }
    /**
     * Enrich functions with semantic embeddings for Pass 3.
     * Only called for functions not already claimed by Pass 1/2.
     * Uses generateEmbedding() from pattern-index/embeddings.ts.
     */
    async enrichWithEmbeddings(functions, indices) {
        Logger.info(`Semantic Pass 3: Generating embeddings for ${indices.length} functions`);
        for (const idx of indices) {
            const fn = functions[idx];
            try {
                const text = this.buildEmbeddingText(fn);
                fn.embedding = await generateEmbedding(text);
            }
            catch {
                // Embedding failed — skip this function for Pass 3
                Logger.debug(`Embedding generation failed for ${fn.file}:${fn.name}`);
            }
        }
    }
    // ─── Duplicate Finding (three-pass) ──────────────────────────────
    /**
     * Three-pass duplicate detection:
     * Pass 1 (fast): MD5 hash → exact duplicates (O(n))
     * Pass 2 (Jaccard): AST node multiset similarity → near-duplicates (O(n²) bounded)
     * Pass 3 (semantic): Embedding cosine similarity → semantic duplicates (O(n²) bounded)
     *
     * Pass 3 catches what AST Jaccard misses: same intent, different implementation.
     * Example: .find() vs .filter()[0] — different AST nodes, same semantic meaning.
     *
     * @returns {Array<Array<object>>} groups of ≥2 functions spanning ≥2 files
     */
    findDuplicateGroups(functions) {
        const duplicates = [];
        const claimedIndices = new Set();
        // Pass 1: Exact hash match
        const hashGroups = new Map();
        for (let i = 0; i < functions.length; i++) {
            const existing = hashGroups.get(functions[i].bodyHash) || [];
            existing.push(i);
            hashGroups.set(functions[i].bodyHash, existing);
        }
        for (const indices of hashGroups.values()) {
            if (indices.length < 2)
                continue;
            const group = indices.map(i => functions[i]);
            const uniqueFiles = new Set(group.map(f => f.file));
            // Same-file copies are ignored — only cross-file duplication is AI drift.
            if (uniqueFiles.size >= 2) {
                duplicates.push(group);
                indices.forEach(i => claimedIndices.add(i));
            }
        }
        // Pass 2: Jaccard on AST tokens for remaining functions
        const remaining = functions
            .map((fn, i) => ({ fn, idx: i }))
            .filter(({ idx }) => !claimedIndices.has(idx));
        // Sorting by body length lets the inner loop break early via the 1.5x bound.
        remaining.sort((a, b) => a.fn.bodyLength - b.fn.bodyLength);
        const jaccardClaimed = new Set();
        for (let i = 0; i < remaining.length; i++) {
            if (jaccardClaimed.has(remaining[i].idx))
                continue;
            const group = [remaining[i].fn];
            const baseLen = remaining[i].fn.bodyLength;
            for (let j = i + 1; j < remaining.length; j++) {
                if (jaccardClaimed.has(remaining[j].idx))
                    continue;
                if (remaining[j].fn.bodyLength > baseLen * 1.5)
                    break;
                if (remaining[j].fn.file === remaining[i].fn.file)
                    continue;
                const sim = this.jaccardSimilarity(remaining[i].fn.astTokens, remaining[j].fn.astTokens);
                if (sim >= this.config.similarity_threshold) {
                    group.push(remaining[j].fn);
                    jaccardClaimed.add(remaining[j].idx);
                }
            }
            if (group.length >= 2) {
                const uniqueFiles = new Set(group.map(f => f.file));
                if (uniqueFiles.size >= 2) {
                    duplicates.push(group);
                    jaccardClaimed.add(remaining[i].idx);
                }
            }
        }
        // Mark all Pass 1 + Pass 2 claimed indices
        for (const idx of jaccardClaimed)
            claimedIndices.add(idx);
        // Pass 3: Semantic embedding cosine similarity for still-unclaimed functions
        if (this.config.semantic_enabled) {
            const semanticRemaining = functions
                .map((fn, i) => ({ fn, idx: i }))
                .filter(({ idx }) => !claimedIndices.has(idx))
                .filter(({ fn }) => fn.embedding && fn.embedding.length > 0);
            semanticRemaining.sort((a, b) => a.fn.bodyLength - b.fn.bodyLength);
            const semanticClaimed = new Set();
            for (let i = 0; i < semanticRemaining.length; i++) {
                if (semanticClaimed.has(semanticRemaining[i].idx))
                    continue;
                const group = [semanticRemaining[i].fn];
                const baseLen = semanticRemaining[i].fn.bodyLength;
                for (let j = i + 1; j < semanticRemaining.length; j++) {
                    if (semanticClaimed.has(semanticRemaining[j].idx))
                        continue;
                    // Body length must be within 2x range (semantic allows more variance)
                    if (semanticRemaining[j].fn.bodyLength > baseLen * 2.0)
                        break;
                    if (semanticRemaining[j].fn.file === semanticRemaining[i].fn.file)
                        continue;
                    const sim = cosineSimilarity(semanticRemaining[i].fn.embedding, semanticRemaining[j].fn.embedding);
                    if (sim >= this.config.semantic_threshold) {
                        group.push(semanticRemaining[j].fn);
                        semanticClaimed.add(semanticRemaining[j].idx);
                    }
                }
                if (group.length >= 2) {
                    const uniqueFiles = new Set(group.map(f => f.file));
                    if (uniqueFiles.size >= 2) {
                        duplicates.push(group);
                        semanticClaimed.add(semanticRemaining[i].idx);
                    }
                }
            }
            if (semanticClaimed.size > 0) {
                Logger.info(`Semantic Pass 3: Found ${semanticClaimed.size} additional semantic duplicates`);
            }
        }
        return duplicates;
    }
}
|
|
568
|
-
/**
 * AST node types that are structural even as leaf nodes.
 * These carry semantic meaning without children (keywords, literal kinds,
 * operators), so collectASTNodeTypes() counts them while ignoring plain
 * identifier leaves.
 *
 * Hoisted to module scope: isStructuralLeaf() runs once per AST node during
 * tree walks, and rebuilding the Set on every call was avoidable allocation
 * in that hot path.
 */
const STRUCTURAL_LEAF_TYPES = new Set([
    'return', 'break', 'continue', 'yield', 'throw',
    'true', 'false', 'null', 'undefined', 'none',
    'self', 'this', 'super',
    'string', 'number', 'template_string',
    // Operators
    '=', '==', '===', '!=', '!==', '<', '>', '<=', '>=',
    '+', '-', '*', '/', '%', '**',
    '&&', '||', '!', '??',
    '=>', '...', '?', ':',
]);
/**
 * @param {string} type - tree-sitter node type name
 * @returns {boolean} true when the leaf node type should be kept in the multiset
 */
function isStructuralLeaf(type) {
    return STRUCTURAL_LEAF_TYPES.has(type);
}
|