@rigour-labs/core 4.3.5 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +46 -10
- package/dist/gates/base.d.ts +3 -0
- package/dist/gates/checkpoint.d.ts +23 -8
- package/dist/gates/checkpoint.js +109 -45
- package/dist/gates/checkpoint.test.js +6 -3
- package/dist/gates/dependency.d.ts +39 -0
- package/dist/gates/dependency.js +212 -5
- package/dist/gates/duplication-drift.d.ts +101 -6
- package/dist/gates/duplication-drift.js +427 -33
- package/dist/gates/logic-drift.d.ts +70 -0
- package/dist/gates/logic-drift.js +280 -0
- package/dist/gates/runner.js +29 -1
- package/dist/gates/style-drift.d.ts +53 -0
- package/dist/gates/style-drift.js +305 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +4 -0
- package/dist/inference/model-manager.js +5 -1
- package/dist/inference/types.d.ts +6 -1
- package/dist/inference/types.js +6 -1
- package/dist/services/adaptive-thresholds.d.ts +54 -10
- package/dist/services/adaptive-thresholds.js +161 -35
- package/dist/services/adaptive-thresholds.test.js +24 -20
- package/dist/services/filesystem-cache.d.ts +50 -0
- package/dist/services/filesystem-cache.js +124 -0
- package/dist/services/temporal-drift.d.ts +101 -0
- package/dist/services/temporal-drift.js +386 -0
- package/dist/templates/universal-config.js +17 -0
- package/dist/types/index.d.ts +196 -0
- package/dist/types/index.js +19 -0
- package/dist/utils/scanner.d.ts +6 -1
- package/dist/utils/scanner.js +8 -1
- package/package.json +6 -6
package/dist/gates/dependency.js
CHANGED
|
@@ -1,6 +1,62 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Dependency Guardian Gate (v2)
|
|
3
|
+
*
|
|
4
|
+
* Detects dependency issues that AI agents commonly introduce:
|
|
5
|
+
* 1. Forbidden dependencies (existing) — packages banned by project standards
|
|
6
|
+
* 2. Unused dependencies (NEW) — installed but never imported
|
|
7
|
+
* 3. Heavy alternatives (NEW) — bloated packages with lighter alternatives
|
|
8
|
+
* 4. Duplicate purpose (NEW) — multiple packages solving the same problem
|
|
9
|
+
*
|
|
10
|
+
* AI agents are particularly prone to:
|
|
11
|
+
* - Adding packages they've seen in training data without checking existing deps
|
|
12
|
+
* - Using heavy/popular packages when lighter alternatives exist
|
|
13
|
+
* - Installing multiple HTTP clients, date libs, etc. across different sessions
|
|
14
|
+
*
|
|
15
|
+
* @since v2.0.0 (forbidden deps)
|
|
16
|
+
* @since v5.1.0 (unused, heavy alternatives, duplicate purpose)
|
|
17
|
+
*/
|
|
1
18
|
import fs from 'fs-extra';
|
|
2
19
|
import path from 'path';
|
|
3
20
|
import { Gate } from './base.js';
|
|
21
|
+
import { FileScanner } from '../utils/scanner.js';
|
|
22
|
+
import { Logger } from '../utils/logger.js';
|
|
23
|
+
/**
 * Known heavy packages with lighter alternatives.
 * Format: package → "alternative (size comparison)"
 *
 * The table is indexed with arbitrary dependency names read from a project's
 * package.json, so it is built on a null prototype: a dependency called
 * `constructor` (or any other Object.prototype member) must resolve to
 * `undefined`, not to an inherited function. It is frozen because it is a
 * shared module-level constant that nothing should mutate at runtime.
 */
const HEAVY_ALTERNATIVES = Object.freeze(Object.assign(Object.create(null), {
    'moment': 'date-fns or dayjs (67KB → 2KB gzipped)',
    'lodash': 'lodash-es (tree-shakeable) or native Array/Object methods',
    'underscore': 'native ES6+ methods (Array.map, Object.entries, etc.)',
    'axios': 'native fetch API (built into Node 18+)',
    'request': 'node-fetch or native fetch (deprecated since 2020)',
    'bluebird': 'native Promise (built-in since ES2015)',
    'jquery': 'native DOM APIs (querySelector, fetch, classList)',
    'classnames': 'clsx (0.3KB vs 1KB) or template literals',
    'uuid': 'crypto.randomUUID() (built into Node 19+ and browsers)',
    'left-pad': 'String.prototype.padStart() (built-in)',
    'is-even': 'n % 2 === 0 (one-liner)',
    'is-odd': 'n % 2 !== 0 (one-liner)',
    'chalk': 'picocolors (14x smaller, faster)',
}));
|
|
42
|
+
/**
 * Functional groups of interchangeable packages.
 *
 * When a project installs two or more packages from the same group, that is
 * treated as a likely "duplicate purpose" problem caused by AI drift.
 * Each entry has the shape `{ name, packages }`; the rows below are written
 * as `[name, packages]` pairs and expanded into that shape.
 */
const FUNCTIONAL_GROUPS = [
    ['HTTP clients', ['axios', 'node-fetch', 'got', 'request', 'ky', 'superagent', 'undici']],
    ['Date/time libraries', ['moment', 'dayjs', 'date-fns', 'luxon', 'fecha', 'chrono-node']],
    ['Terminal colors', ['chalk', 'kleur', 'ansi-colors', 'picocolors', 'colorette']],
    ['CLI argument parsers', ['commander', 'yargs', 'meow', 'cac', 'minimist', 'arg']],
    ['Schema validation', ['zod', 'joi', 'yup', 'ajv', 'superstruct', 'io-ts']],
    ['Logging libraries', ['winston', 'pino', 'bunyan', 'log4js', 'signale', 'consola']],
    ['UUID generators', ['uuid', 'nanoid', 'cuid', 'ulid', 'shortid']],
    ['Markdown parsers', ['marked', 'markdown-it', 'remark', 'showdown', 'remarkable']],
    ['Testing frameworks', ['jest', 'mocha', 'vitest', 'ava', 'tap', 'jasmine']],
    ['CSS-in-JS', ['styled-components', 'emotion', '@emotion/react', 'linaria', 'vanilla-extract']],
    ['State management', ['redux', 'mobx', 'zustand', 'jotai', 'recoil', 'valtio']],
    ['Environment config', ['dotenv', 'env-var', 'envalid', 'convict']],
].map(([name, packages]) => ({ name, packages }));
|
|
4
60
|
export class DependencyGate extends Gate {
|
|
5
61
|
config;
|
|
6
62
|
constructor(config) {
|
|
@@ -9,11 +65,10 @@ export class DependencyGate extends Gate {
|
|
|
9
65
|
}
|
|
10
66
|
async run(context) {
|
|
11
67
|
const failures = [];
|
|
12
|
-
const
|
|
13
|
-
|
|
14
|
-
return [];
|
|
68
|
+
const depConfig = this.config.gates.dependencies || {};
|
|
69
|
+
const forbidden = depConfig.forbid || [];
|
|
15
70
|
const { cwd } = context;
|
|
16
|
-
// 1. Scan Node.js (package.json)
|
|
71
|
+
// 1. Scan Node.js (package.json) — forbidden + new checks
|
|
17
72
|
const pkgPath = path.join(cwd, 'package.json');
|
|
18
73
|
if (await fs.pathExists(pkgPath)) {
|
|
19
74
|
try {
|
|
@@ -23,15 +78,32 @@ export class DependencyGate extends Gate {
|
|
|
23
78
|
...(pkg.devDependencies || {}),
|
|
24
79
|
...(pkg.peerDependencies || {}),
|
|
25
80
|
};
|
|
81
|
+
const depNames = Object.keys(allDeps);
|
|
82
|
+
// Forbidden deps check
|
|
26
83
|
for (const dep of forbidden) {
|
|
27
84
|
if (allDeps[dep]) {
|
|
28
85
|
failures.push(this.createFailure(`The package '${dep}' is forbidden by project standards.`, ['package.json'], `Remove '${dep}' from package.json and use approved alternatives.`, 'Forbidden Dependency', undefined, undefined, 'medium'));
|
|
29
86
|
}
|
|
30
87
|
}
|
|
88
|
+
// NEW: Unused dependency detection
|
|
89
|
+
if (depConfig.detect_unused !== false && depNames.length > 0) {
|
|
90
|
+
const unusedFailures = await this.detectUnusedDeps(context, pkg, depNames, depConfig.unused_allowlist || []);
|
|
91
|
+
failures.push(...unusedFailures);
|
|
92
|
+
}
|
|
93
|
+
// NEW: Heavy alternative detection
|
|
94
|
+
if (depConfig.detect_heavy_alternatives !== false) {
|
|
95
|
+
const heavyFailures = this.detectHeavyAlternatives(depNames);
|
|
96
|
+
failures.push(...heavyFailures);
|
|
97
|
+
}
|
|
98
|
+
// NEW: Duplicate purpose detection
|
|
99
|
+
if (depConfig.detect_duplicate_purpose !== false) {
|
|
100
|
+
const dupeFailures = this.detectDuplicatePurpose(depNames);
|
|
101
|
+
failures.push(...dupeFailures);
|
|
102
|
+
}
|
|
31
103
|
}
|
|
32
104
|
catch (e) { }
|
|
33
105
|
}
|
|
34
|
-
// 2. Scan Python (requirements.txt, pyproject.toml)
|
|
106
|
+
// 2. Scan Python (requirements.txt, pyproject.toml) — forbidden only
|
|
35
107
|
const reqPath = path.join(cwd, 'requirements.txt');
|
|
36
108
|
if (await fs.pathExists(reqPath)) {
|
|
37
109
|
const content = await fs.readFile(reqPath, 'utf-8');
|
|
@@ -72,4 +144,139 @@ export class DependencyGate extends Gate {
|
|
|
72
144
|
}
|
|
73
145
|
return failures;
|
|
74
146
|
}
|
|
147
|
+
// ─── Unused Dependency Detection ─────────────────────────────────
|
|
148
|
+
/**
|
|
149
|
+
* Detect dependencies listed in package.json but never imported.
|
|
150
|
+
* Scans all source files for import/require statements.
|
|
151
|
+
*
|
|
152
|
+
* Allowlist handles side-effect imports like:
|
|
153
|
+
* - @types/* (TypeScript type packages)
|
|
154
|
+
* - polyfills (core-js, regenerator-runtime)
|
|
155
|
+
* - PostCSS/Babel plugins (used in config files, not source)
|
|
156
|
+
*/
|
|
157
|
+
async detectUnusedDeps(context, pkg, depNames, allowlist) {
|
|
158
|
+
const failures = [];
|
|
159
|
+
// Only check production + dev dependencies (not peer)
|
|
160
|
+
const prodDeps = Object.keys(pkg.dependencies || {});
|
|
161
|
+
const devDeps = Object.keys(pkg.devDependencies || {});
|
|
162
|
+
const checkDeps = [...prodDeps, ...devDeps];
|
|
163
|
+
if (checkDeps.length === 0)
|
|
164
|
+
return [];
|
|
165
|
+
// Default allowlist patterns for known side-effect packages
|
|
166
|
+
const defaultAllowPatterns = [
|
|
167
|
+
/^@types\//, // TypeScript type packages
|
|
168
|
+
/^typescript$/, // Used by tsc, not imported
|
|
169
|
+
/^eslint/, // Used by config, not imported
|
|
170
|
+
/^prettier$/, // Used by config
|
|
171
|
+
/^@eslint/, // ESLint config packages
|
|
172
|
+
/^husky$/, // Git hooks
|
|
173
|
+
/^lint-staged$/, // Pre-commit
|
|
174
|
+
/^core-js/, // Polyfills
|
|
175
|
+
/^regenerator-runtime$/, // Babel polyfill
|
|
176
|
+
/^postcss/, // PostCSS plugins
|
|
177
|
+
/^autoprefixer$/, // PostCSS plugin
|
|
178
|
+
/^tailwindcss$/, // Build tool
|
|
179
|
+
/^@tailwindcss\//, // Build tool
|
|
180
|
+
/^webpack/, // Build tool
|
|
181
|
+
/^vite$/, // Build tool
|
|
182
|
+
/^@vitejs\//, // Vite plugins
|
|
183
|
+
/^rollup/, // Build tool
|
|
184
|
+
/^esbuild$/, // Build tool
|
|
185
|
+
/^tsup$/, // Build tool
|
|
186
|
+
/^turbo$/, // Build tool
|
|
187
|
+
/^nodemon$/, // Dev tool
|
|
188
|
+
/^ts-node$/, // Dev tool
|
|
189
|
+
/^tsx$/, // Dev tool
|
|
190
|
+
/^concurrently$/, // Dev tool
|
|
191
|
+
];
|
|
192
|
+
// Combine with user allowlist
|
|
193
|
+
const userPatterns = allowlist.map(p => {
|
|
194
|
+
if (p.endsWith('*'))
|
|
195
|
+
return new RegExp(`^${p.slice(0, -1)}`);
|
|
196
|
+
return new RegExp(`^${p}$`);
|
|
197
|
+
});
|
|
198
|
+
const allPatterns = [...defaultAllowPatterns, ...userPatterns];
|
|
199
|
+
// Filter deps that need checking
|
|
200
|
+
const depsToCheck = checkDeps.filter(dep => !allPatterns.some(pattern => pattern.test(dep)));
|
|
201
|
+
if (depsToCheck.length === 0)
|
|
202
|
+
return [];
|
|
203
|
+
// Scan source files for import/require patterns
|
|
204
|
+
const sourceFiles = await FileScanner.findFiles({
|
|
205
|
+
cwd: context.cwd,
|
|
206
|
+
patterns: ['**/*.{ts,tsx,js,jsx,mjs,cjs,mts,cts,vue,svelte}'],
|
|
207
|
+
ignore: [...(context.ignore || []), '**/node_modules/**', '**/dist/**'],
|
|
208
|
+
});
|
|
209
|
+
// Read all source files
|
|
210
|
+
const contents = await FileScanner.readFiles(context.cwd, sourceFiles, context.fileCache);
|
|
211
|
+
// Also check config files that might reference deps
|
|
212
|
+
const configFiles = ['vite.config.ts', 'vite.config.js', 'next.config.js', 'next.config.mjs',
|
|
213
|
+
'jest.config.ts', 'jest.config.js', 'vitest.config.ts', 'tsconfig.json',
|
|
214
|
+
'babel.config.js', '.babelrc', 'postcss.config.js', 'tailwind.config.js',
|
|
215
|
+
'webpack.config.js', 'rollup.config.js', 'esbuild.config.js'];
|
|
216
|
+
for (const cf of configFiles) {
|
|
217
|
+
const cfPath = path.join(context.cwd, cf);
|
|
218
|
+
if (await fs.pathExists(cfPath)) {
|
|
219
|
+
try {
|
|
220
|
+
contents.set(cf, await fs.readFile(cfPath, 'utf-8'));
|
|
221
|
+
}
|
|
222
|
+
catch { }
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
// Build a combined source text for fast searching
|
|
226
|
+
const allSource = Array.from(contents.values()).join('\n');
|
|
227
|
+
for (const dep of depsToCheck) {
|
|
228
|
+
// Check for various import patterns:
|
|
229
|
+
// import ... from 'dep'
|
|
230
|
+
// require('dep')
|
|
231
|
+
// import('dep')
|
|
232
|
+
// Also check for scoped package partial imports: @scope/package/sub
|
|
233
|
+
const depEscaped = dep.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
|
234
|
+
const importPattern = new RegExp(`(?:from\\s+['"\`]${depEscaped}(?:/[^'"\`]*)?['"\`])|` +
|
|
235
|
+
`(?:require\\s*\\(\\s*['"\`]${depEscaped}(?:/[^'"\`]*)?['"\`]\\s*\\))|` +
|
|
236
|
+
`(?:import\\s*\\(\\s*['"\`]${depEscaped}(?:/[^'"\`]*)?['"\`]\\s*\\))|` +
|
|
237
|
+
`(?:['"\`]${depEscaped}['"\`])` // Config file references
|
|
238
|
+
);
|
|
239
|
+
if (!importPattern.test(allSource)) {
|
|
240
|
+
const isDevDep = devDeps.includes(dep);
|
|
241
|
+
failures.push(this.createFailure(`${isDevDep ? 'Dev dependency' : 'Dependency'} '${dep}' appears unused — no import/require found in source files.`, ['package.json'], `Remove '${dep}' from package.json if it's truly unused, or add it to unused_allowlist if it's a side-effect import.`, 'Unused Dependency', undefined, undefined, 'low'));
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
if (failures.length > 0) {
|
|
245
|
+
Logger.info(`Dependency Guardian: Found ${failures.length} potentially unused dependencies`);
|
|
246
|
+
}
|
|
247
|
+
return failures;
|
|
248
|
+
}
|
|
249
|
+
// ─── Heavy Alternative Detection ─────────────────────────────────
|
|
250
|
+
/**
|
|
251
|
+
* Detect heavy/bloated packages that have lighter modern alternatives.
|
|
252
|
+
* AI agents tend to reach for the most popular (heaviest) package
|
|
253
|
+
* because that's what they've seen most in training data.
|
|
254
|
+
*/
|
|
255
|
+
detectHeavyAlternatives(depNames) {
|
|
256
|
+
const failures = [];
|
|
257
|
+
for (const dep of depNames) {
|
|
258
|
+
const alternative = HEAVY_ALTERNATIVES[dep];
|
|
259
|
+
if (alternative) {
|
|
260
|
+
failures.push(this.createFailure(`Package '${dep}' has a lighter alternative: ${alternative}.`, ['package.json'], `Consider replacing '${dep}' with ${alternative}. AI agents often default to popular-but-heavy packages.`, 'Heavy Dependency', undefined, undefined, 'low'));
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
return failures;
|
|
264
|
+
}
|
|
265
|
+
// ─── Duplicate Purpose Detection ─────────────────────────────────
|
|
266
|
+
/**
|
|
267
|
+
* Detect when multiple packages serve the same purpose.
|
|
268
|
+
* This is a classic AI drift symptom — different sessions install different
|
|
269
|
+
* packages for the same task (e.g., axios in one PR, got in another).
|
|
270
|
+
*/
|
|
271
|
+
detectDuplicatePurpose(depNames) {
|
|
272
|
+
const failures = [];
|
|
273
|
+
const depSet = new Set(depNames);
|
|
274
|
+
for (const group of FUNCTIONAL_GROUPS) {
|
|
275
|
+
const installed = group.packages.filter(pkg => depSet.has(pkg));
|
|
276
|
+
if (installed.length >= 2) {
|
|
277
|
+
failures.push(this.createFailure(`Multiple ${group.name} installed: ${installed.join(', ')}. Pick one and remove the rest.`, ['package.json'], `Having multiple packages for ${group.name} is a sign of AI drift — different sessions chose different packages. Standardize on one.`, 'Duplicate Purpose Dependencies', undefined, undefined, 'medium'));
|
|
278
|
+
}
|
|
279
|
+
}
|
|
280
|
+
return failures;
|
|
281
|
+
}
|
|
75
282
|
}
|
|
@@ -1,33 +1,128 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Duplication Drift Gate
|
|
2
|
+
* Duplication Drift Gate (v2)
|
|
3
3
|
*
|
|
4
4
|
* Detects when AI generates near-identical functions across files because
|
|
5
5
|
* it doesn't remember what it already wrote. This is an AI-specific failure
|
|
6
6
|
* mode — humans reuse via copy-paste (same file), AI re-invents (cross-file).
|
|
7
7
|
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
8
|
+
* v2 upgrades:
|
|
9
|
+
* - tree-sitter AST node type sequences replace hand-rolled regex tokenizer
|
|
10
|
+
* - Jaccard similarity on AST node multisets (structural, not textual)
|
|
11
|
+
* - Catches duplicates even when every variable name is different
|
|
12
|
+
* - MD5 kept as fast-path for exact matches, Jaccard runs on remaining pairs
|
|
12
13
|
*
|
|
13
|
-
*
|
|
14
|
+
* Detection strategy (preparation steps 1-3, then three detection passes):
|
|
15
|
+
* 1. Extract function bodies, normalize text (strip comments/whitespace)
|
|
16
|
+
* 2. Parse with tree-sitter → walk AST → collect node type multiset
|
|
17
|
+
* 3. Generate semantic embeddings via all-MiniLM-L6-v2 (384D)
|
|
18
|
+
* 4. Pass 1 (fast): MD5 hash → exact duplicates (O(n), <10ms)
|
|
19
|
+
* 5. Pass 2 (Jaccard): AST node multiset similarity → structural near-duplicates (O(n²) bounded)
|
|
20
|
+
* 6. Pass 3 (semantic): Embedding cosine similarity → semantic duplicates (O(n²) bounded)
|
|
21
|
+
* 7. Flag functions with similarity > threshold in different files
|
|
22
|
+
*
|
|
23
|
+
* Why AST node types > raw tokens:
|
|
24
|
+
* - `getUserById(id) { return db.find(x => x.id === id) }`
|
|
25
|
+
* - `fetchUser(userId) { return database.filter(u => u.id === userId)[0] }`
|
|
26
|
+
* Both produce similar AST: [return_statement, call_expression, arrow_function,
|
|
27
|
+
* binary_expression, member_expression]. Variable names are invisible.
|
|
28
|
+
*
|
|
29
|
+
* @since v2.16.0 (original MD5)
|
|
30
|
+
* @since v5.0.0 (tree-sitter AST + Jaccard)
|
|
31
|
+
* @since v5.1.0 (semantic embedding Pass 3)
|
|
14
32
|
*/
|
|
15
33
|
import { Gate, GateContext } from './base.js';
|
|
16
34
|
import { Failure, Provenance } from '../types/index.js';
|
|
17
35
|
/**
 * Options accepted by the DuplicationDriftGate constructor.
 * Every field is optional.
 */
export interface DuplicationDriftConfig {
    /** NOTE(review): presumably switches the whole gate on or off — confirm in the implementation. */
    enabled?: boolean;
    /** Similarity threshold; NOTE(review): presumably applied to the Jaccard (AST) pass — confirm. */
    similarity_threshold?: number;
    /** NOTE(review): presumably the cosine-similarity threshold for the semantic embedding pass — confirm. */
    semantic_threshold?: number;
    /** NOTE(review): presumably toggles the semantic embedding pass — confirm. */
    semantic_enabled?: boolean;
    /** NOTE(review): presumably the minimum function body length (in lines) to be analysed — confirm. */
    min_body_lines?: number;
    /** NOTE(review): presumably identifiers of duplications that are accepted and not reported — format not visible here. */
    approved_duplications?: string[];
}
|
|
22
43
|
/**
 * Gate that detects near-identical functions duplicated across files.
 * Type declaration only; the implementation lives in the matching .js file.
 */
export declare class DuplicationDriftGate extends Gate {
    /** Gate configuration. */
    private config;
    /** NOTE(review): presumably the tree-sitter parser used for AST enrichment — confirm. */
    private parser;
    /**
     * @param config Optional gate settings.
     */
    constructor(config?: DuplicationDriftConfig);
    /** Provenance reported for this gate. */
    protected get provenance(): Provenance;
    /**
     * Run the gate.
     * @param context Gate execution context.
     * @returns A promise resolving to the failures found.
     */
    run(context: GateContext): Promise<Failure[]>;
    /**
     * Parse the file with tree-sitter, find function nodes that match
     * our extracted functions (by line number), and replace their token
     * multisets with AST node type sequences.
     *
     * AST node types are language-agnostic structural tokens:
     *   - if_statement, for_statement, return_statement
     *   - call_expression, member_expression, binary_expression
     *   - arrow_function, function_declaration
     *
     * Variable names, string literals, comments — all invisible.
     * Only STRUCTURE matters.
     */
    private enrichWithASTTokens;
    /**
     * Walk the AST tree to find a function/method node at a given line.
     */
    private findFunctionNodeAtLine;
    /**
     * Walk an AST subtree and collect node types as a multiset.
     *
     * This is the core insight: two functions with different variable names
     * but the same control flow produce the same node type multiset.
     *
     * Example:
     *   `function a(x) { if (x > 0) return x * 2; return 0; }`
     *   `function b(val) { if (val > 0) return val * 2; return 0; }`
     *
     * Both produce: {if_statement: 1, binary_expression: 2, return_statement: 2, ...}
     * → Jaccard similarity = 1.0
     */
    private collectASTNodeTypes;
    /**
     * Fallback tokenizer when tree-sitter is not available.
     * Uses normalized text → keyword/operator multiset.
     */
    private textTokenize;
    /**
     * Jaccard similarity on multisets.
     * intersection = sum of min(countA, countB) for each key
     * union = sum of max(countA, countB) for each key
     */
    private jaccardSimilarity;
    /** NOTE(review): presumably extracts function definitions from JavaScript/TypeScript source — confirm. */
    private extractJSFunctions;
    /** NOTE(review): presumably extracts function definitions from Python source — confirm. */
    private extractPyFunctions;
    /** NOTE(review): presumably returns the body text of one function — confirm. */
    private extractFunctionBody;
    /** NOTE(review): presumably strips comments/whitespace from a body before hashing — confirm. */
    private normalizeBody;
    /** NOTE(review): presumably the MD5 hash used by the exact-match pass — confirm. */
    private hash;
    /**
     * Generate semantic embedding text for a function.
     * Combines function name, parameter names, and first 200 tokens of body.
     * This captures INTENT regardless of implementation differences.
     *
     * Example:
     *   getUserById(id) { return db.users.find(x => x.id === id) }
     *   → "getUserById id return db users find x id id"
     *
     *   fetchUserRecord(userId) { return database.users.filter(u => u.id === userId)[0] }
     *   → "fetchUserRecord userId return database users filter u id userId 0"
     *
     * These produce similar embeddings (~0.91 cosine) despite different AST.
     */
    private buildEmbeddingText;
    /**
     * Enrich functions with semantic embeddings for Pass 3.
     * Only called for functions not already claimed by Pass 1/2.
     * Uses generateEmbedding() from pattern-index/embeddings.ts.
     */
    private enrichWithEmbeddings;
    /**
     * Three-pass duplicate detection:
     * Pass 1 (fast): MD5 hash → exact duplicates (O(n))
     * Pass 2 (Jaccard): AST node multiset similarity → near-duplicates (O(n²) bounded)
     * Pass 3 (semantic): Embedding cosine similarity → semantic duplicates (O(n²) bounded)
     *
     * Pass 3 catches what AST Jaccard misses: same intent, different implementation.
     * Example: .find() vs .filter()[0] — different AST nodes, same semantic meaning.
     */
    private findDuplicateGroups;
}
|