@rigour-labs/core 5.0.1 → 5.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. package/README.md +9 -1
  2. package/dist/gates/agent-team.d.ts +0 -1
  3. package/dist/gates/agent-team.js +0 -1
  4. package/dist/gates/checkpoint.d.ts +0 -2
  5. package/dist/gates/checkpoint.js +0 -2
  6. package/dist/gates/context-window-artifacts.d.ts +6 -2
  7. package/dist/gates/context-window-artifacts.js +107 -31
  8. package/dist/gates/deep-analysis.d.ts +2 -0
  9. package/dist/gates/deep-analysis.js +41 -11
  10. package/dist/gates/dependency.d.ts +0 -2
  11. package/dist/gates/dependency.js +23 -5
  12. package/dist/gates/deprecated-apis.d.ts +0 -2
  13. package/dist/gates/deprecated-apis.js +33 -20
  14. package/dist/gates/duplication-drift/index.d.ts +61 -0
  15. package/dist/gates/duplication-drift/index.js +240 -0
  16. package/dist/gates/duplication-drift/similarity.d.ts +68 -0
  17. package/dist/gates/duplication-drift/similarity.js +177 -0
  18. package/dist/gates/duplication-drift/tokenizer.d.ts +55 -0
  19. package/dist/gates/duplication-drift/tokenizer.js +195 -0
  20. package/dist/gates/frontend-secret-exposure.d.ts +0 -3
  21. package/dist/gates/frontend-secret-exposure.js +1 -114
  22. package/dist/gates/frontend-secret-patterns.d.ts +33 -0
  23. package/dist/gates/frontend-secret-patterns.js +119 -0
  24. package/dist/gates/{hallucinated-imports.d.ts → hallucinated-imports/index.d.ts} +2 -29
  25. package/dist/gates/hallucinated-imports/index.js +174 -0
  26. package/dist/gates/hallucinated-imports/js-resolver.d.ts +45 -0
  27. package/dist/gates/hallucinated-imports/js-resolver.js +320 -0
  28. package/dist/gates/hallucinated-imports/manifest-discovery.d.ts +28 -0
  29. package/dist/gates/hallucinated-imports/manifest-discovery.js +114 -0
  30. package/dist/gates/hallucinated-imports/python-resolver.d.ts +24 -0
  31. package/dist/gates/hallucinated-imports/python-resolver.js +306 -0
  32. package/dist/gates/hallucinated-imports-lang.d.ts +2 -2
  33. package/dist/gates/hallucinated-imports-lang.js +269 -34
  34. package/dist/gates/hallucinated-imports.test.js +1 -2
  35. package/dist/gates/inconsistent-error-handling.d.ts +0 -5
  36. package/dist/gates/inconsistent-error-handling.js +15 -144
  37. package/dist/gates/language-adapters/csharp-adapter.d.ts +16 -0
  38. package/dist/gates/language-adapters/csharp-adapter.js +211 -0
  39. package/dist/gates/language-adapters/go-adapter.d.ts +26 -0
  40. package/dist/gates/language-adapters/go-adapter.js +195 -0
  41. package/dist/gates/language-adapters/index.d.ts +15 -0
  42. package/dist/gates/language-adapters/index.js +16 -0
  43. package/dist/gates/language-adapters/java-adapter.d.ts +16 -0
  44. package/dist/gates/language-adapters/java-adapter.js +237 -0
  45. package/dist/gates/language-adapters/js-adapter.d.ts +26 -0
  46. package/dist/gates/language-adapters/js-adapter.js +279 -0
  47. package/dist/gates/language-adapters/python-adapter.d.ts +25 -0
  48. package/dist/gates/language-adapters/python-adapter.js +183 -0
  49. package/dist/gates/language-adapters/registry.d.ts +26 -0
  50. package/dist/gates/language-adapters/registry.js +65 -0
  51. package/dist/gates/language-adapters/ruby-adapter.d.ts +25 -0
  52. package/dist/gates/language-adapters/ruby-adapter.js +217 -0
  53. package/dist/gates/language-adapters/rust-adapter.d.ts +27 -0
  54. package/dist/gates/language-adapters/rust-adapter.js +235 -0
  55. package/dist/gates/language-adapters/types.d.ts +60 -0
  56. package/dist/gates/language-adapters/types.js +22 -0
  57. package/dist/gates/logic-drift-extractors.d.ts +15 -0
  58. package/dist/gates/logic-drift-extractors.js +34 -0
  59. package/dist/gates/logic-drift.d.ts +0 -30
  60. package/dist/gates/logic-drift.js +39 -129
  61. package/dist/gates/phantom-apis.d.ts +0 -2
  62. package/dist/gates/phantom-apis.js +49 -20
  63. package/dist/gates/promise-safety.d.ts +0 -1
  64. package/dist/gates/promise-safety.js +14 -2
  65. package/dist/gates/runner.js +51 -22
  66. package/dist/gates/security-patterns-data.d.ts +14 -0
  67. package/dist/gates/security-patterns-data.js +235 -0
  68. package/dist/gates/security-patterns.d.ts +17 -3
  69. package/dist/gates/security-patterns.js +80 -211
  70. package/dist/gates/side-effect-analysis/categorizer.d.ts +32 -0
  71. package/dist/gates/side-effect-analysis/categorizer.js +83 -0
  72. package/dist/gates/{side-effect-analysis.d.ts → side-effect-analysis/index.d.ts} +3 -5
  73. package/dist/gates/{side-effect-analysis.js → side-effect-analysis/index.js} +33 -45
  74. package/dist/gates/side-effect-analysis/scope-tracker.d.ts +37 -0
  75. package/dist/gates/side-effect-analysis/scope-tracker.js +40 -0
  76. package/dist/gates/side-effect-helpers/index.d.ts +4 -0
  77. package/dist/gates/side-effect-helpers/index.js +4 -0
  78. package/dist/gates/side-effect-helpers/pattern-detection.d.ts +123 -0
  79. package/dist/gates/{side-effect-helpers.js → side-effect-helpers/pattern-detection.js} +22 -468
  80. package/dist/gates/side-effect-helpers/resource-tracking.d.ts +80 -0
  81. package/dist/gates/side-effect-helpers/resource-tracking.js +281 -0
  82. package/dist/gates/side-effect-helpers/scope-analysis.d.ts +21 -0
  83. package/dist/gates/side-effect-helpers/scope-analysis.js +146 -0
  84. package/dist/gates/side-effect-helpers/types.d.ts +38 -0
  85. package/dist/gates/side-effect-helpers/types.js +41 -0
  86. package/dist/gates/side-effect-rules.d.ts +0 -1
  87. package/dist/gates/side-effect-rules.js +0 -1
  88. package/dist/gates/style-drift-rules.d.ts +86 -0
  89. package/dist/gates/style-drift-rules.js +103 -0
  90. package/dist/gates/style-drift.d.ts +7 -16
  91. package/dist/gates/style-drift.js +101 -119
  92. package/dist/gates/test-quality-matchers.d.ts +53 -0
  93. package/dist/gates/test-quality-matchers.js +86 -0
  94. package/dist/gates/test-quality.d.ts +0 -3
  95. package/dist/gates/test-quality.js +47 -44
  96. package/dist/hooks/checker.d.ts +0 -1
  97. package/dist/hooks/checker.js +0 -2
  98. package/dist/hooks/dlp-templates.d.ts +0 -1
  99. package/dist/hooks/dlp-templates.js +0 -4
  100. package/dist/hooks/index.d.ts +0 -2
  101. package/dist/hooks/index.js +0 -2
  102. package/dist/hooks/input-validator.d.ts +0 -1
  103. package/dist/hooks/input-validator.js +0 -1
  104. package/dist/hooks/input-validator.test.js +0 -1
  105. package/dist/hooks/standalone-checker.d.ts +0 -1
  106. package/dist/hooks/standalone-checker.js +0 -1
  107. package/dist/hooks/standalone-dlp-checker.d.ts +0 -1
  108. package/dist/hooks/standalone-dlp-checker.js +0 -1
  109. package/dist/hooks/templates.d.ts +0 -1
  110. package/dist/hooks/templates.js +0 -1
  111. package/dist/hooks/types.d.ts +0 -1
  112. package/dist/hooks/types.js +0 -1
  113. package/dist/index.d.ts +1 -1
  114. package/dist/index.js +1 -1
  115. package/dist/services/adaptive-thresholds.d.ts +0 -2
  116. package/dist/services/adaptive-thresholds.js +0 -2
  117. package/dist/services/filesystem-cache.d.ts +0 -1
  118. package/dist/services/filesystem-cache.js +0 -1
  119. package/dist/services/score-history.d.ts +0 -1
  120. package/dist/services/score-history.js +0 -1
  121. package/dist/services/temporal-drift.d.ts +1 -2
  122. package/dist/services/temporal-drift.js +7 -8
  123. package/dist/storage/db.d.ts +23 -7
  124. package/dist/storage/db.js +116 -55
  125. package/dist/storage/findings.d.ts +4 -3
  126. package/dist/storage/findings.js +13 -20
  127. package/dist/storage/local-memory.d.ts +4 -4
  128. package/dist/storage/local-memory.js +20 -22
  129. package/dist/storage/patterns.d.ts +5 -5
  130. package/dist/storage/patterns.js +20 -26
  131. package/dist/storage/scans.d.ts +6 -6
  132. package/dist/storage/scans.js +12 -21
  133. package/dist/types/index.d.ts +1 -0
  134. package/dist/utils/scanner.js +1 -1
  135. package/package.json +7 -8
  136. package/dist/gates/duplication-drift.d.ts +0 -128
  137. package/dist/gates/duplication-drift.js +0 -585
  138. package/dist/gates/hallucinated-imports.js +0 -641
  139. package/dist/gates/side-effect-helpers.d.ts +0 -260
@@ -1,585 +0,0 @@
1
- /**
2
- * Duplication Drift Gate (v2)
3
- *
4
- * Detects when AI generates near-identical functions across files because
5
- * it doesn't remember what it already wrote. This is an AI-specific failure
6
- * mode — humans reuse via copy-paste (same file), AI re-invents (cross-file).
7
- *
8
- * v2 upgrades:
9
- * - tree-sitter AST node type sequences replace hand-rolled regex tokenizer
10
- * - Jaccard similarity on AST node multisets (structural, not textual)
11
- * - Catches duplicates even when every variable name is different
12
- * - MD5 kept as fast-path for exact matches, Jaccard runs on remaining pairs
13
- *
14
- * Detection strategy (three-pass):
15
- * 1. Extract function bodies, normalize text (strip comments/whitespace)
16
- * 2. Parse with tree-sitter → walk AST → collect node type multiset
17
- * 3. Generate semantic embeddings via all-MiniLM-L6-v2 (384D)
18
- * 4. Pass 1 (fast): MD5 hash → exact duplicates (O(n), <10ms)
19
- * 5. Pass 2 (Jaccard): AST node multiset similarity → structural near-duplicates (O(n²) bounded)
20
- * 6. Pass 3 (semantic): Embedding cosine similarity → semantic duplicates (O(n²) bounded)
21
- * 7. Flag functions with similarity > threshold in different files
22
- *
23
- * Why AST node types > raw tokens:
24
- * - `getUserById(id) { return db.find(x => x.id === id) }`
25
- * - `fetchUser(userId) { return database.filter(u => u.id === userId)[0] }`
26
- * Both produce similar AST: [return_statement, call_expression, arrow_function,
27
- * binary_expression, member_expression]. Variable names are invisible.
28
- *
29
- * @since v2.16.0 (original MD5)
30
- * @since v5.0.0 (tree-sitter AST + Jaccard)
31
- * @since v5.1.0 (semantic embedding Pass 3)
32
- */
33
- import { Gate } from './base.js';
34
- import { FileScanner } from '../utils/scanner.js';
35
- import { Logger } from '../utils/logger.js';
36
- import { generateEmbedding, cosineSimilarity } from '../pattern-index/embeddings.js';
37
- import crypto from 'crypto';
38
- import path from 'path';
39
- import { fileURLToPath } from 'url';
// tree-sitter is optional — graceful fallback to text tokenization
// `Parser` stays null until web-tree-sitter has been imported AND initialised.
let Parser = null;
// Memoised outcome of the one-time initialisation attempt.
let treeSitterReady = false;
let treeSitterFailed = false;
const __dirname = path.dirname(fileURLToPath(import.meta.url));
/**
 * Lazily import and initialise `web-tree-sitter`.
 *
 * The outcome is memoised in module state: once initialisation has succeeded
 * or failed, later calls return the cached answer without retrying.
 *
 * @returns {Promise<boolean>} `true` when `Parser` is ready to use, `false`
 *   when callers must fall back to text tokenization.
 */
async function initTreeSitter() {
    if (treeSitterReady) {
        return true;
    }
    if (treeSitterFailed) {
        return false;
    }
    try {
        const mod = await import('web-tree-sitter');
        const loaded = mod.default || mod;
        await loaded.init();
        // Publish the module only after init() succeeded, so `Parser` is never
        // left pointing at a half-initialised runtime.
        Parser = loaded;
        treeSitterReady = true;
        return true;
    }
    catch (err) {
        treeSitterFailed = true;
        // Keep the cause in the log: a missing package and a failed WASM init
        // need different remedies.
        Logger.debug(`tree-sitter not available, falling back to text tokenization: ${err}`);
        return false;
    }
}
// File extension → vendored tree-sitter grammar (.wasm), relative to this module.
// `.js` and `.jsx` intentionally share the JavaScript grammar.
const GRAMMAR_PATHS = Object.fromEntries([
    ['.ts', '../../vendor/grammars/tree-sitter-typescript.wasm'],
    ['.tsx', '../../vendor/grammars/tree-sitter-tsx.wasm'],
    ['.js', '../../vendor/grammars/tree-sitter-javascript.wasm'],
    ['.jsx', '../../vendor/grammars/tree-sitter-javascript.wasm'],
    ['.py', '../../vendor/grammars/tree-sitter-python.wasm'],
    ['.go', '../../vendor/grammars/tree-sitter-go.wasm'],
    ['.rs', '../../vendor/grammars/tree-sitter-rust.wasm'],
]);
// Loaded grammars, keyed by file extension, so each one is loaded at most once.
const languageCache = new Map();
/** Extensions handled by the JavaScript/TypeScript function extractor. */
const JS_EXTENSIONS = new Set(['.ts', '.js', '.tsx', '.jsx']);
/**
 * Line patterns that start a JS/TS function. Group 1 is the name; the
 * parameter text is in group 2 (or group 3 for a bare single arrow parameter).
 */
const JS_FUNCTION_PATTERNS = [
    // function foo(a, b)
    /^(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\(([^)]*)\)/,
    // const foo = (a, b) =>   |   const foo = a =>
    /^(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\(([^)]*)\)|(\w+))\s*=>/,
    // indented method shorthand:   foo(a, b) {
    /^\s+(?:async\s+)?(\w+)\s*\(([^)]*)\)\s*\{/,
];
/**
 * Words that look like `name(...) {` to the method-shorthand pattern but are
 * control flow (or an anonymous function), not a function name.
 */
const NON_FUNCTION_KEYWORDS = new Set(['if', 'for', 'while', 'switch', 'catch', 'with', 'function']);
/** Words excluded from the identifier count of the fallback text tokenizer. */
const TEXT_KEYWORDS = new Set([
    'if', 'else', 'for', 'while', 'return', 'const', 'let', 'var',
    'function', 'class', 'import', 'export', 'async', 'await',
    'try', 'catch', 'throw', 'new', 'switch', 'case', 'break',
    'continue', 'yield', 'def', 'self', 'true', 'false', 'null', 'undefined',
]);
/** tree-sitter node types treated as the start of a function. */
const AST_FUNCTION_NODE_TYPES = new Set([
    'function_declaration', 'method_definition', 'arrow_function',
    'function_definition', // Python
    'function_item', // Rust
    'method_declaration', // Java/C#
    'lexical_declaration', // const x = () => {}
]);
/**
 * Count the comma-separated entries of a parameter list, ignoring empty
 * entries (blank list, trailing comma).
 */
function countParams(params) {
    return params.split(',').filter((part) => part.trim() !== '').length;
}
/**
 * Gate that reports functions which are (near-)duplicated across different
 * files.
 *
 * Functions are extracted line-by-line from JS/TS and Python sources, then
 * grouped in three passes: exact body hash, Jaccard similarity of token
 * multisets (tree-sitter node types when available, text tokens otherwise),
 * and cosine similarity of embeddings.
 *
 * Extracted function records have the shape
 * `{ name, file, line, paramCount, bodyHash, bodyLength, normalized, astTokens, embedding? }`.
 */
export class DuplicationDriftGate extends Gate {
    config;
    parser = null;
    /**
     * @param {object} [config] Partial configuration; every missing key gets a default.
     * @param {boolean} [config.enabled=true]
     * @param {number} [config.similarity_threshold=0.75] Minimum Jaccard similarity (Pass 2).
     * @param {number} [config.semantic_threshold=0.85] Minimum cosine similarity (Pass 3).
     * @param {boolean} [config.semantic_enabled=true]
     * @param {number} [config.min_body_lines=5] Shorter function bodies are ignored.
     * @param {string[]} [config.approved_duplications=[]] Approved groups, written as the
     *   sorted function names joined with ':' (compared case-insensitively).
     */
    constructor(config = {}) {
        super('duplication-drift', 'AI Duplication Drift Detection');
        this.config = {
            enabled: config.enabled ?? true,
            similarity_threshold: config.similarity_threshold ?? 0.75,
            semantic_threshold: config.semantic_threshold ?? 0.85,
            semantic_enabled: config.semantic_enabled ?? true,
            min_body_lines: config.min_body_lines ?? 5,
            approved_duplications: config.approved_duplications ?? [],
        };
    }
    get provenance() { return 'ai-drift'; }
    /**
     * Scan the project and build one failure per group of duplicated functions.
     *
     * @param {object} context Reads `cwd`, and optionally `patterns` and `ignore`.
     * @returns {Promise<Array>} Failures built with `createFailure`; empty when the
     *   gate is disabled or nothing is duplicated.
     */
    async run(context) {
        if (!this.config.enabled)
            return [];
        // Try to init tree-sitter (falls back to text tokens when unavailable)
        const hasTreeSitter = await initTreeSitter();
        if (hasTreeSitter && !this.parser) {
            this.parser = new Parser();
        }
        const failures = [];
        const functions = [];
        const scanPatterns = context.patterns || ['**/*.{ts,js,tsx,jsx,py,go,rs}'];
        const files = await FileScanner.findFiles({
            cwd: context.cwd,
            patterns: scanPatterns,
            ignore: [...(context.ignore || []), '**/node_modules/**', '**/dist/**', '**/*.test.*', '**/*.spec.*'],
        });
        Logger.info(`Duplication Drift: Scanning ${files.length} files (tree-sitter: ${hasTreeSitter ? 'ON' : 'fallback'})`);
        // Resolved on first use inside the try block so a loader failure stays
        // best-effort, exactly like any other per-file error.
        let readFile = null;
        for (const file of files) {
            try {
                readFile ??= (await import('fs-extra')).readFile;
                const content = await readFile(path.join(context.cwd, file), 'utf-8');
                const ext = path.extname(file);
                // Everything pushed from here on belongs to this file.
                const firstNew = functions.length;
                if (JS_EXTENSIONS.has(ext)) {
                    this.extractJSFunctions(content, file, functions);
                }
                else if (ext === '.py') {
                    this.extractPyFunctions(content, file, functions);
                }
                // Upgrade text tokens to AST tokens using tree-sitter if available.
                // Only this file's records are passed, so the full list is not
                // rescanned once per file.
                if (hasTreeSitter && GRAMMAR_PATHS[ext]) {
                    await this.enrichWithASTTokens(content, ext, file, functions.slice(firstNew));
                }
            }
            catch (e) {
                // One unreadable file must not abort the scan, but leave a trace.
                Logger.debug(`Duplication Drift: skipping ${file}: ${e}`);
            }
        }
        // Pass 3 prep: embeddings are generated for every extracted function
        // before grouping, and only when semantic_enabled is true.
        if (this.config.semantic_enabled && functions.length > 0) {
            const allIndices = functions.map((_, i) => i);
            await this.enrichWithEmbeddings(functions, allIndices);
        }
        const duplicateGroups = this.findDuplicateGroups(functions);
        // Build approved set for fast lookup
        const approvedSet = new Set((this.config.approved_duplications || []).map((s) => s.toLowerCase()));
        for (const group of duplicateGroups) {
            // Skip groups a human has approved
            const pairKey = group.map((f) => f.name).sort().join(':').toLowerCase();
            if (approvedSet.has(pairKey))
                continue;
            const groupFiles = group.map((f) => f.file);
            const locations = group.map((f) => `${f.file}:${f.line} (${f.name})`).join(', ');
            const { similarity, method } = this.measureGroupSimilarity(group);
            const pct = (similarity * 100).toFixed(0);
            failures.push(this.createFailure(`AI Duplication Drift: Function '${group[0].name}' has ${group.length} near-identical copies (${pct}% similar via ${method})`, [...new Set(groupFiles)], `Found duplicate implementations at: ${locations}. Extract to a shared module and import.`, 'Duplication Drift', group[0].line, undefined, 'high'));
        }
        return failures;
    }
    /**
     * Similarity and detection-method label reported for a group, measured
     * between its first two members.
     *
     * @param {Array} group Function records of one duplicate group.
     * @returns {{similarity: number, method: string}}
     */
    measureGroupSimilarity(group) {
        const [first, second] = group;
        if (first.bodyHash === second?.bodyHash) {
            return { similarity: 1.0, method: 'exact-hash' };
        }
        if (first.embedding && second?.embedding) {
            // Report whichever measure scored higher.
            const jaccardSim = this.jaccardSimilarity(first.astTokens, second.astTokens);
            const cosineSim = cosineSimilarity(first.embedding, second.embedding);
            return cosineSim > jaccardSim
                ? { similarity: cosineSim, method: 'semantic-embedding' }
                : { similarity: jaccardSim, method: 'ast-jaccard' };
        }
        const similarity = group.length > 1
            ? this.jaccardSimilarity(first.astTokens, second.astTokens)
            : 1.0;
        return { similarity, method: 'ast-jaccard' };
    }
    // ─── tree-sitter AST Tokenization ───────────────────────────────
    /**
     * Parse `content` with tree-sitter and, for every record in `functions`
     * that belongs to `file`, replace its text-token multiset with the node-type
     * multiset of the function node starting on the record's line.
     *
     * Records without a matching node keep their text tokens. Any failure is
     * logged at debug level and leaves the records untouched.
     *
     * @param {string} content Source text of the file.
     * @param {string} ext File extension, used to pick the grammar.
     * @param {string} file File path as stored on the records.
     * @param {Array} functions Records to enrich (mutated in place).
     */
    async enrichWithASTTokens(content, ext, file, functions) {
        if (!this.parser)
            return;
        const grammarRelPath = GRAMMAR_PATHS[ext];
        if (!grammarRelPath)
            return;
        let tree = null;
        try {
            // Load language (cached)
            let lang = languageCache.get(ext);
            if (!lang) {
                lang = await Parser.Language.load(path.resolve(__dirname, grammarRelPath));
                languageCache.set(ext, lang);
            }
            this.parser.setLanguage(lang);
            tree = this.parser.parse(content);
            if (!tree)
                return;
            for (const fn of functions) {
                if (fn.file !== file)
                    continue;
                const node = this.findFunctionNodeAtLine(tree.rootNode, fn.line);
                if (node) {
                    fn.astTokens = this.collectASTNodeTypes(node);
                }
            }
        }
        catch (e) {
            // tree-sitter parse failed for this file — keep text tokens
            Logger.debug(`tree-sitter parse failed for ${file}: ${e}`);
        }
        finally {
            // The tree is no longer needed once the node types have been copied
            // into plain Maps; release it explicitly instead of leaking it.
            tree?.delete();
        }
    }
    /**
     * Depth-first search for the first function-like node that starts on
     * `targetLine`.
     *
     * @param {object} rootNode tree-sitter node to search from.
     * @param {number} targetLine 1-indexed source line.
     * @returns {object|null} The matching node, or null.
     */
    findFunctionNodeAtLine(rootNode, targetLine) {
        let bestMatch = null;
        const walk = (node) => {
            // tree-sitter rows are 0-indexed, our lines are 1-indexed
            if (AST_FUNCTION_NODE_TYPES.has(node.type) && node.startPosition.row + 1 === targetLine) {
                bestMatch = node;
                return;
            }
            for (let i = 0; i < node.childCount; i++) {
                walk(node.child(i));
                if (bestMatch)
                    return;
            }
        };
        walk(rootNode);
        return bestMatch;
    }
    /**
     * Walk an AST subtree and count its node types.
     *
     * Nodes with children are always counted; leaf nodes only when
     * `isStructuralLeaf` accepts their type, so identifier text never
     * contributes.
     *
     * @param {object} node Root of the subtree.
     * @returns {Map<string, number>} Node type → occurrence count.
     */
    collectASTNodeTypes(node) {
        const types = new Map();
        const walk = (n) => {
            if (n.childCount > 0 || isStructuralLeaf(n.type)) {
                types.set(n.type, (types.get(n.type) || 0) + 1);
            }
            for (let i = 0; i < n.childCount; i++) {
                walk(n.child(i));
            }
        };
        walk(node);
        return types;
    }
    // ─── Fallback Text Tokenization ─────────────────────────────────
    /**
     * Fallback tokenizer used when tree-sitter is not available.
     *
     * Counts keywords and operator/punctuation runs individually and folds all
     * remaining identifiers into a single `_ID_` count, so variable names do
     * not affect similarity.
     *
     * @param {string} normalized Output of `normalizeBody`.
     * @returns {Map<string, number>} Token → occurrence count.
     */
    textTokenize(normalized) {
        const tokens = new Map();
        const structural = normalized.match(/\b(if|else|for|while|return|const|let|var|function|class|import|export|async|await|try|catch|throw|new|switch|case|break|continue|yield|def|self)\b|[{}()\[\];,.:=<>!&|+\-*/%?]+/g) || [];
        for (const token of structural) {
            tokens.set(token, (tokens.get(token) || 0) + 1);
        }
        const identifiers = normalized.match(/\b[a-zA-Z_]\w*\b/g) || [];
        const idCount = identifiers.filter((id) => !TEXT_KEYWORDS.has(id)).length;
        if (idCount > 0)
            tokens.set('_ID_', idCount);
        return tokens;
    }
    // ─── Jaccard Similarity ─────────────────────────────────────────
    /**
     * Jaccard similarity on multisets:
     * sum of min(countA, countB) divided by sum of max(countA, countB).
     *
     * @param {Map<string, number>} a
     * @param {Map<string, number>} b
     * @returns {number} Value in [0, 1]; 0 when both multisets are empty.
     */
    jaccardSimilarity(a, b) {
        const allKeys = new Set([...a.keys(), ...b.keys()]);
        let intersection = 0;
        let union = 0;
        for (const key of allKeys) {
            const countA = a.get(key) || 0;
            const countB = b.get(key) || 0;
            intersection += Math.min(countA, countB);
            union += Math.max(countA, countB);
        }
        return union === 0 ? 0 : intersection / union;
    }
    // ─── Function Extraction ────────────────────────────────────────
    /**
     * Build a function record from an extracted body, or return null when the
     * body is shorter than `min_body_lines`.
     *
     * @param {string} name Function name.
     * @param {string} params Raw parameter list text.
     * @param {string} file File the function was found in.
     * @param {number} line 1-indexed line of the declaration.
     * @param {string[]} body Body lines.
     * @param {string} commentStyle Passed through to `normalizeBody`.
     * @returns {object|null}
     */
    buildFunctionRecord(name, params, file, line, body, commentStyle) {
        if (body.length < this.config.min_body_lines)
            return null;
        const normalized = this.normalizeBody(body.join('\n'), commentStyle);
        return {
            name,
            file,
            line,
            paramCount: countParams(params),
            bodyHash: this.hash(normalized),
            bodyLength: body.length,
            normalized,
            // Start with text tokens, enrichWithASTTokens() upgrades if tree-sitter available
            astTokens: this.textTokenize(normalized),
        };
    }
    /**
     * Find JS/TS functions line-by-line and append a record for each one whose
     * body is long enough.
     *
     * @param {string} content Source text.
     * @param {string} file File path stored on the records.
     * @param {Array} functions Output list (appended to).
     */
    extractJSFunctions(content, file, functions) {
        const lines = content.split('\n');
        for (let i = 0; i < lines.length; i++) {
            let match = null;
            for (const pattern of JS_FUNCTION_PATTERNS) {
                match = lines[i].match(pattern);
                if (match)
                    break;
            }
            if (!match)
                continue;
            const name = match[1];
            // `if (x) {`, `for (...) {`, `catch (e) {` … satisfy the
            // method-shorthand pattern but are not functions.
            if (NON_FUNCTION_KEYWORDS.has(name))
                continue;
            const params = match[2] ?? match[3] ?? '';
            const record = this.buildFunctionRecord(name, params, file, i + 1, this.extractFunctionBody(lines, i), 'c');
            if (record)
                functions.push(record);
        }
    }
    /**
     * Find Python `def` / `async def` functions and append a record for each
     * one whose body is long enough. The body is the run of following lines
     * that are blank or indented deeper than the `def` line.
     *
     * @param {string} content Source text.
     * @param {string} file File path stored on the records.
     * @param {Array} functions Output list (appended to).
     */
    extractPyFunctions(content, file, functions) {
        const lines = content.split('\n');
        const indentOf = (text) => text.match(/^(\s*)/)?.[1]?.length || 0;
        for (let i = 0; i < lines.length; i++) {
            const match = lines[i].match(/^(?:\s*)(?:async\s+)?def\s+(\w+)\s*\(([^)]*)\)/);
            if (!match)
                continue;
            const indent = indentOf(lines[i]);
            const body = [];
            for (let j = i + 1; j < lines.length; j++) {
                if (lines[j].trim() !== '' && indentOf(lines[j]) <= indent)
                    break;
                body.push(lines[j]);
            }
            // Blank lines separating this function from the next statement are
            // not part of its body and must not count towards its length.
            while (body.length > 0 && body[body.length - 1].trim() === '') {
                body.pop();
            }
            const record = this.buildFunctionRecord(match[1], match[2] || '', file, i + 1, body, 'hash');
            if (record)
                functions.push(record);
        }
    }
    /**
     * Collect the lines of the brace-delimited block that opens at or after
     * `startIndex`, by counting `{` and `}` characters. Braces inside strings
     * and comments are counted too.
     *
     * @param {string[]} lines All lines of the file.
     * @param {number} startIndex Index of the declaration line.
     * @returns {string[]} Lines from the first `{` through the line on which the
     *   depth returns to zero; empty when the declaration ends (trailing `;`)
     *   before any block opens.
     */
    extractFunctionBody(lines, startIndex) {
        let braceDepth = 0;
        let started = false;
        const body = [];
        for (let i = startIndex; i < lines.length; i++) {
            const line = lines[i];
            for (const ch of line) {
                if (ch === '{') {
                    braceDepth++;
                    started = true;
                }
                if (ch === '}')
                    braceDepth--;
            }
            if (started) {
                body.push(line);
                if (braceDepth === 0)
                    break;
            }
            else if (line.trimEnd().endsWith(';')) {
                // The statement ended without opening a block (expression-bodied
                // arrow, overload signature). Stop here instead of capturing the
                // next unrelated `{ … }` further down the file.
                break;
            }
        }
        return body;
    }
    /**
     * Normalize a function body so formatting and comments do not affect its
     * hash: strips comments, replaces template literals with "STR", drops
     * `async`, collapses whitespace and unifies quote characters.
     *
     * @param {string} body Raw body text.
     * @param {'any'|'c'|'hash'} [commentStyle='any'] Which comment syntax to strip:
     *   'c' strips `//` and block comments, 'hash' strips `#`, 'any' strips both.
     *   Stripping only the relevant syntax keeps `#private` / '#fff' in JS and
     *   `//` floor division in Python from truncating a line.
     * @returns {string}
     */
    normalizeBody(body, commentStyle = 'any') {
        let text = body;
        if (commentStyle !== 'hash') {
            text = text
                .replace(/\/\/.*/g, '')
                .replace(/\/\*[\s\S]*?\*\//g, '');
        }
        if (commentStyle !== 'c') {
            text = text.replace(/#.*/g, '');
        }
        return text
            .replace(/`[^`]*`/g, '"STR"')
            .replace(/\basync\s+/g, '')
            .replace(/\s+/g, ' ')
            .replace(/['"]/g, '"')
            .trim();
    }
    /**
     * MD5 hex digest of `text`. Used only as a fast equality key for
     * normalized bodies.
     */
    hash(text) {
        return crypto.createHash('md5').update(text).digest('hex');
    }
    // ─── Semantic Embedding ─────────────────────────────────────────
    /**
     * Build the text that is embedded for a function: its name followed by the
     * first 200 identifier-like words of its normalized body.
     *
     * @param {object} fn Function record.
     * @returns {string}
     */
    buildEmbeddingText(fn) {
        const bodyTokens = fn.normalized.match(/\b[a-zA-Z_]\w*\b/g) || [];
        const first200 = bodyTokens.slice(0, 200).join(' ');
        return `${fn.name} ${first200}`;
    }
    /**
     * Attach an `embedding` to each selected function, one after another.
     * A function whose embedding fails is left without one and is skipped by
     * Pass 3.
     *
     * @param {Array} functions All function records.
     * @param {number[]} indices Indices into `functions` to enrich.
     */
    async enrichWithEmbeddings(functions, indices) {
        Logger.info(`Semantic Pass 3: Generating embeddings for ${indices.length} functions`);
        for (const idx of indices) {
            const fn = functions[idx];
            try {
                const text = this.buildEmbeddingText(fn);
                fn.embedding = await generateEmbedding(text);
            }
            catch {
                // Embedding failed — skip this function for Pass 3
                Logger.debug(`Embedding generation failed for ${fn.file}:${fn.name}`);
            }
        }
    }
    // ─── Duplicate Finding (three-pass) ──────────────────────────────
    /**
     * Greedy grouping shared by Pass 2 and Pass 3.
     *
     * Candidates are ordered by body length. Each unclaimed candidate in turn
     * becomes a base and collects every later unclaimed candidate from another
     * file whose body is at most `lengthRatio` times as long and whose
     * similarity to the base reaches `threshold`.
     *
     * @param {Array<{fn: object, idx: number}>} candidates
     * @param {number} lengthRatio Maximum body-length ratio relative to the base.
     * @param {number} threshold Minimum similarity (inclusive).
     * @param {(a: object, b: object) => number} similarityOf
     * @returns {{groups: Array<Array>, claimed: Set<number>}}
     */
    clusterCandidates(candidates, lengthRatio, threshold, similarityOf) {
        const groups = [];
        const claimed = new Set();
        const ordered = [...candidates].sort((a, b) => a.fn.bodyLength - b.fn.bodyLength);
        for (let i = 0; i < ordered.length; i++) {
            const base = ordered[i];
            if (claimed.has(base.idx))
                continue;
            const group = [base.fn];
            const maxLength = base.fn.bodyLength * lengthRatio;
            for (let j = i + 1; j < ordered.length; j++) {
                const other = ordered[j];
                if (claimed.has(other.idx))
                    continue;
                // Sorted by length: everything after this is too long as well.
                if (other.fn.bodyLength > maxLength)
                    break;
                if (other.fn.file === base.fn.file)
                    continue;
                if (similarityOf(base.fn, other.fn) >= threshold) {
                    group.push(other.fn);
                    claimed.add(other.idx);
                }
            }
            // Members always come from a different file than the base, so any
            // group of two or more already spans at least two files.
            if (group.length >= 2) {
                groups.push(group);
                claimed.add(base.idx);
            }
        }
        return { groups, claimed };
    }
    /**
     * Three-pass duplicate detection:
     *   Pass 1: identical body hash across at least two files.
     *   Pass 2: Jaccard similarity of token multisets on what is left.
     *   Pass 3: cosine similarity of embeddings on what is still left
     *           (only when `semantic_enabled`).
     *
     * @param {Array} functions All function records.
     * @returns {Array<Array>} Groups of duplicated function records.
     */
    findDuplicateGroups(functions) {
        const duplicates = [];
        const claimedIndices = new Set();
        // Pass 1: Exact hash match
        const hashGroups = new Map();
        functions.forEach((fn, i) => {
            const bucket = hashGroups.get(fn.bodyHash);
            if (bucket) {
                bucket.push(i);
            }
            else {
                hashGroups.set(fn.bodyHash, [i]);
            }
        });
        for (const indices of hashGroups.values()) {
            if (indices.length < 2)
                continue;
            const group = indices.map((i) => functions[i]);
            const uniqueFiles = new Set(group.map((f) => f.file));
            if (uniqueFiles.size >= 2) {
                duplicates.push(group);
                indices.forEach((i) => claimedIndices.add(i));
            }
        }
        const unclaimed = () => functions
            .map((fn, i) => ({ fn, idx: i }))
            .filter(({ idx }) => !claimedIndices.has(idx));
        // Pass 2: Jaccard on AST tokens for remaining functions
        const structural = this.clusterCandidates(unclaimed(), 1.5, this.config.similarity_threshold, (a, b) => this.jaccardSimilarity(a.astTokens, b.astTokens));
        duplicates.push(...structural.groups);
        for (const idx of structural.claimed)
            claimedIndices.add(idx);
        // Pass 3: Semantic embedding cosine similarity for still-unclaimed functions
        if (this.config.semantic_enabled) {
            const withEmbedding = unclaimed()
                .filter(({ fn }) => fn.embedding && fn.embedding.length > 0);
            // Body length must be within 2x range (semantic allows more variance)
            const semantic = this.clusterCandidates(withEmbedding, 2.0, this.config.semantic_threshold, (a, b) => cosineSimilarity(a.embedding, b.embedding));
            duplicates.push(...semantic.groups);
            if (semantic.claimed.size > 0) {
                Logger.info(`Semantic Pass 3: Found ${semantic.claimed.size} additional semantic duplicates`);
            }
        }
        return duplicates;
    }
}
/**
 * AST node types that are structural even as leaf nodes.
 * These carry semantic meaning without children.
 *
 * Built once at module load: the lookup runs for every AST node visited, so
 * rebuilding the set per call would be pure overhead.
 */
const STRUCTURAL_LEAF_TYPES = new Set([
    'return', 'break', 'continue', 'yield', 'throw',
    'true', 'false', 'null', 'undefined', 'none',
    'self', 'this', 'super',
    'string', 'number', 'template_string',
    // Operators
    '=', '==', '===', '!=', '!==', '<', '>', '<=', '>=',
    '+', '-', '*', '/', '%', '**',
    '&&', '||', '!', '??',
    '=>', '...', '?', ':',
]);
/**
 * Tell whether a childless AST node of the given type should still be counted
 * as a structural token.
 *
 * @param {string} type AST node type name.
 * @returns {boolean} `true` when the type is in the structural-leaf list.
 */
function isStructuralLeaf(type) {
    return STRUCTURAL_LEAF_TYPES.has(type);
}