@rigour-labs/core 4.3.6 → 5.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,62 @@
1
+ /**
2
+ * Dependency Guardian Gate (v2)
3
+ *
4
+ * Detects dependency issues that AI agents commonly introduce:
5
+ * 1. Forbidden dependencies (existing) — packages banned by project standards
6
+ * 2. Unused dependencies (NEW) — installed but never imported
7
+ * 3. Heavy alternatives (NEW) — bloated packages with lighter alternatives
8
+ * 4. Duplicate purpose (NEW) — multiple packages solving the same problem
9
+ *
10
+ * AI agents are particularly prone to:
11
+ * - Adding packages they've seen in training data without checking existing deps
12
+ * - Using heavy/popular packages when lighter alternatives exist
13
+ * - Installing multiple HTTP clients, date libs, etc. across different sessions
14
+ *
15
+ * @since v2.0.0 (forbidden deps)
16
+ * @since v5.1.0 (unused, heavy alternatives, duplicate purpose)
17
+ */
1
18
  import fs from 'fs-extra';
2
19
  import path from 'path';
3
20
  import { Gate } from './base.js';
21
+ import { FileScanner } from '../utils/scanner.js';
22
+ import { Logger } from '../utils/logger.js';
23
/**
 * Lookup table of heavyweight packages and the lighter option to suggest.
 *
 * Key: npm package name as it appears in package.json.
 * Value: human-readable suggestion text; it is interpolated verbatim into
 * the "Heavy Dependency" failure messages.
 */
const HEAVY_ALTERNATIVES = {
    moment: 'date-fns or dayjs (67KB → 2KB gzipped)',
    lodash: 'lodash-es (tree-shakeable) or native Array/Object methods',
    underscore: 'native ES6+ methods (Array.map, Object.entries, etc.)',
    axios: 'native fetch API (built into Node 18+)',
    request: 'node-fetch or native fetch (deprecated since 2020)',
    bluebird: 'native Promise (built-in since ES2015)',
    jquery: 'native DOM APIs (querySelector, fetch, classList)',
    classnames: 'clsx (0.3KB vs 1KB) or template literals',
    uuid: 'crypto.randomUUID() (built into Node 19+ and browsers)',
    // Hyphenated names are not valid identifiers, so these keys stay quoted.
    'left-pad': 'String.prototype.padStart() (built-in)',
    'is-even': 'n % 2 === 0 (one-liner)',
    'is-odd': 'n % 2 !== 0 (one-liner)',
    chalk: 'picocolors (14x smaller, faster)',
};
42
/**
 * Groups of packages that serve the same purpose.
 *
 * Each entry is `{ name, packages }`: `name` is the label used in failure
 * messages and `packages` lists the npm names belonging to that group.
 * Two or more installed packages from one group are reported as a
 * duplicate-purpose issue.
 */
const FUNCTIONAL_GROUPS = [
    ['HTTP clients', ['axios', 'node-fetch', 'got', 'request', 'ky', 'superagent', 'undici']],
    ['Date/time libraries', ['moment', 'dayjs', 'date-fns', 'luxon', 'fecha', 'chrono-node']],
    ['Terminal colors', ['chalk', 'kleur', 'ansi-colors', 'picocolors', 'colorette']],
    ['CLI argument parsers', ['commander', 'yargs', 'meow', 'cac', 'minimist', 'arg']],
    ['Schema validation', ['zod', 'joi', 'yup', 'ajv', 'superstruct', 'io-ts']],
    ['Logging libraries', ['winston', 'pino', 'bunyan', 'log4js', 'signale', 'consola']],
    ['UUID generators', ['uuid', 'nanoid', 'cuid', 'ulid', 'shortid']],
    ['Markdown parsers', ['marked', 'markdown-it', 'remark', 'showdown', 'remarkable']],
    ['Testing frameworks', ['jest', 'mocha', 'vitest', 'ava', 'tap', 'jasmine']],
    ['CSS-in-JS', ['styled-components', 'emotion', '@emotion/react', 'linaria', 'vanilla-extract']],
    ['State management', ['redux', 'mobx', 'zustand', 'jotai', 'recoil', 'valtio']],
    ['Environment config', ['dotenv', 'env-var', 'envalid', 'convict']],
].map(([name, packages]) => ({ name, packages }));
4
60
  export class DependencyGate extends Gate {
5
61
  config;
6
62
  constructor(config) {
@@ -9,11 +65,10 @@ export class DependencyGate extends Gate {
9
65
  }
10
66
  async run(context) {
11
67
  const failures = [];
12
- const forbidden = this.config.gates.dependencies?.forbid || [];
13
- if (forbidden.length === 0)
14
- return [];
68
+ const depConfig = this.config.gates.dependencies || {};
69
+ const forbidden = depConfig.forbid || [];
15
70
  const { cwd } = context;
16
- // 1. Scan Node.js (package.json)
71
+ // 1. Scan Node.js (package.json) — forbidden + new checks
17
72
  const pkgPath = path.join(cwd, 'package.json');
18
73
  if (await fs.pathExists(pkgPath)) {
19
74
  try {
@@ -23,15 +78,32 @@ export class DependencyGate extends Gate {
23
78
  ...(pkg.devDependencies || {}),
24
79
  ...(pkg.peerDependencies || {}),
25
80
  };
81
+ const depNames = Object.keys(allDeps);
82
+ // Forbidden deps check
26
83
  for (const dep of forbidden) {
27
84
  if (allDeps[dep]) {
28
85
  failures.push(this.createFailure(`The package '${dep}' is forbidden by project standards.`, ['package.json'], `Remove '${dep}' from package.json and use approved alternatives.`, 'Forbidden Dependency', undefined, undefined, 'medium'));
29
86
  }
30
87
  }
88
+ // NEW: Unused dependency detection
89
+ if (depConfig.detect_unused !== false && depNames.length > 0) {
90
+ const unusedFailures = await this.detectUnusedDeps(context, pkg, depNames, depConfig.unused_allowlist || []);
91
+ failures.push(...unusedFailures);
92
+ }
93
+ // NEW: Heavy alternative detection
94
+ if (depConfig.detect_heavy_alternatives !== false) {
95
+ const heavyFailures = this.detectHeavyAlternatives(depNames);
96
+ failures.push(...heavyFailures);
97
+ }
98
+ // NEW: Duplicate purpose detection
99
+ if (depConfig.detect_duplicate_purpose !== false) {
100
+ const dupeFailures = this.detectDuplicatePurpose(depNames);
101
+ failures.push(...dupeFailures);
102
+ }
31
103
  }
32
104
  catch (e) { }
33
105
  }
34
- // 2. Scan Python (requirements.txt, pyproject.toml)
106
+ // 2. Scan Python (requirements.txt, pyproject.toml) — forbidden only
35
107
  const reqPath = path.join(cwd, 'requirements.txt');
36
108
  if (await fs.pathExists(reqPath)) {
37
109
  const content = await fs.readFile(reqPath, 'utf-8');
@@ -72,4 +144,139 @@ export class DependencyGate extends Gate {
72
144
  }
73
145
  return failures;
74
146
  }
147
+ // ─── Unused Dependency Detection ─────────────────────────────────
148
+ /**
149
+ * Detect dependencies listed in package.json but never imported.
150
+ * Scans all source files for import/require statements.
151
+ *
152
+ * Allowlist handles side-effect imports like:
153
+ * - @types/* (TypeScript type packages)
154
+ * - polyfills (core-js, regenerator-runtime)
155
+ * - PostCSS/Babel plugins (used in config files, not source)
156
+ */
157
+ async detectUnusedDeps(context, pkg, depNames, allowlist) {
158
+ const failures = [];
159
+ // Only check production + dev dependencies (not peer)
160
+ const prodDeps = Object.keys(pkg.dependencies || {});
161
+ const devDeps = Object.keys(pkg.devDependencies || {});
162
+ const checkDeps = [...prodDeps, ...devDeps];
163
+ if (checkDeps.length === 0)
164
+ return [];
165
+ // Default allowlist patterns for known side-effect packages
166
+ const defaultAllowPatterns = [
167
+ /^@types\//, // TypeScript type packages
168
+ /^typescript$/, // Used by tsc, not imported
169
+ /^eslint/, // Used by config, not imported
170
+ /^prettier$/, // Used by config
171
+ /^@eslint/, // ESLint config packages
172
+ /^husky$/, // Git hooks
173
+ /^lint-staged$/, // Pre-commit
174
+ /^core-js/, // Polyfills
175
+ /^regenerator-runtime$/, // Babel polyfill
176
+ /^postcss/, // PostCSS plugins
177
+ /^autoprefixer$/, // PostCSS plugin
178
+ /^tailwindcss$/, // Build tool
179
+ /^@tailwindcss\//, // Build tool
180
+ /^webpack/, // Build tool
181
+ /^vite$/, // Build tool
182
+ /^@vitejs\//, // Vite plugins
183
+ /^rollup/, // Build tool
184
+ /^esbuild$/, // Build tool
185
+ /^tsup$/, // Build tool
186
+ /^turbo$/, // Build tool
187
+ /^nodemon$/, // Dev tool
188
+ /^ts-node$/, // Dev tool
189
+ /^tsx$/, // Dev tool
190
+ /^concurrently$/, // Dev tool
191
+ ];
192
+ // Combine with user allowlist
193
+ const userPatterns = allowlist.map(p => {
194
+ if (p.endsWith('*'))
195
+ return new RegExp(`^${p.slice(0, -1)}`);
196
+ return new RegExp(`^${p}$`);
197
+ });
198
+ const allPatterns = [...defaultAllowPatterns, ...userPatterns];
199
+ // Filter deps that need checking
200
+ const depsToCheck = checkDeps.filter(dep => !allPatterns.some(pattern => pattern.test(dep)));
201
+ if (depsToCheck.length === 0)
202
+ return [];
203
+ // Scan source files for import/require patterns
204
+ const sourceFiles = await FileScanner.findFiles({
205
+ cwd: context.cwd,
206
+ patterns: ['**/*.{ts,tsx,js,jsx,mjs,cjs,mts,cts,vue,svelte}'],
207
+ ignore: [...(context.ignore || []), '**/node_modules/**', '**/dist/**'],
208
+ });
209
+ // Read all source files
210
+ const contents = await FileScanner.readFiles(context.cwd, sourceFiles, context.fileCache);
211
+ // Also check config files that might reference deps
212
+ const configFiles = ['vite.config.ts', 'vite.config.js', 'next.config.js', 'next.config.mjs',
213
+ 'jest.config.ts', 'jest.config.js', 'vitest.config.ts', 'tsconfig.json',
214
+ 'babel.config.js', '.babelrc', 'postcss.config.js', 'tailwind.config.js',
215
+ 'webpack.config.js', 'rollup.config.js', 'esbuild.config.js'];
216
+ for (const cf of configFiles) {
217
+ const cfPath = path.join(context.cwd, cf);
218
+ if (await fs.pathExists(cfPath)) {
219
+ try {
220
+ contents.set(cf, await fs.readFile(cfPath, 'utf-8'));
221
+ }
222
+ catch { }
223
+ }
224
+ }
225
+ // Build a combined source text for fast searching
226
+ const allSource = Array.from(contents.values()).join('\n');
227
+ for (const dep of depsToCheck) {
228
+ // Check for various import patterns:
229
+ // import ... from 'dep'
230
+ // require('dep')
231
+ // import('dep')
232
+ // Also check for scoped package partial imports: @scope/package/sub
233
+ const depEscaped = dep.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
234
+ const importPattern = new RegExp(`(?:from\\s+['"\`]${depEscaped}(?:/[^'"\`]*)?['"\`])|` +
235
+ `(?:require\\s*\\(\\s*['"\`]${depEscaped}(?:/[^'"\`]*)?['"\`]\\s*\\))|` +
236
+ `(?:import\\s*\\(\\s*['"\`]${depEscaped}(?:/[^'"\`]*)?['"\`]\\s*\\))|` +
237
+ `(?:['"\`]${depEscaped}['"\`])` // Config file references
238
+ );
239
+ if (!importPattern.test(allSource)) {
240
+ const isDevDep = devDeps.includes(dep);
241
+ failures.push(this.createFailure(`${isDevDep ? 'Dev dependency' : 'Dependency'} '${dep}' appears unused — no import/require found in source files.`, ['package.json'], `Remove '${dep}' from package.json if it's truly unused, or add it to unused_allowlist if it's a side-effect import.`, 'Unused Dependency', undefined, undefined, 'low'));
242
+ }
243
+ }
244
+ if (failures.length > 0) {
245
+ Logger.info(`Dependency Guardian: Found ${failures.length} potentially unused dependencies`);
246
+ }
247
+ return failures;
248
+ }
249
+ // ─── Heavy Alternative Detection ─────────────────────────────────
250
+ /**
251
+ * Detect heavy/bloated packages that have lighter modern alternatives.
252
+ * AI agents tend to reach for the most popular (heaviest) package
253
+ * because that's what they've seen most in training data.
254
+ */
255
+ detectHeavyAlternatives(depNames) {
256
+ const failures = [];
257
+ for (const dep of depNames) {
258
+ const alternative = HEAVY_ALTERNATIVES[dep];
259
+ if (alternative) {
260
+ failures.push(this.createFailure(`Package '${dep}' has a lighter alternative: ${alternative}.`, ['package.json'], `Consider replacing '${dep}' with ${alternative}. AI agents often default to popular-but-heavy packages.`, 'Heavy Dependency', undefined, undefined, 'low'));
261
+ }
262
+ }
263
+ return failures;
264
+ }
265
+ // ─── Duplicate Purpose Detection ─────────────────────────────────
266
+ /**
267
+ * Detect when multiple packages serve the same purpose.
268
+ * This is a classic AI drift symptom — different sessions install different
269
+ * packages for the same task (e.g., axios in one PR, got in another).
270
+ */
271
+ detectDuplicatePurpose(depNames) {
272
+ const failures = [];
273
+ const depSet = new Set(depNames);
274
+ for (const group of FUNCTIONAL_GROUPS) {
275
+ const installed = group.packages.filter(pkg => depSet.has(pkg));
276
+ if (installed.length >= 2) {
277
+ failures.push(this.createFailure(`Multiple ${group.name} installed: ${installed.join(', ')}. Pick one and remove the rest.`, ['package.json'], `Having multiple packages for ${group.name} is a sign of AI drift — different sessions chose different packages. Standardize on one.`, 'Duplicate Purpose Dependencies', undefined, undefined, 'medium'));
278
+ }
279
+ }
280
+ return failures;
281
+ }
75
282
  }
@@ -1,33 +1,128 @@
1
1
  /**
2
- * Duplication Drift Gate
2
+ * Duplication Drift Gate (v2)
3
3
  *
4
4
  * Detects when AI generates near-identical functions across files because
5
5
  * it doesn't remember what it already wrote. This is an AI-specific failure
6
6
  * mode — humans reuse via copy-paste (same file), AI re-invents (cross-file).
7
7
  *
8
- * Detection strategy:
9
- * 1. Extract all function bodies (normalized: strip whitespace, comments)
10
- * 2. Compare function signatures + body hashes across files
11
- * 3. Flag functions with >80% similarity in different files
8
+ * v2 upgrades:
9
+ * - tree-sitter AST node type sequences replace hand-rolled regex tokenizer
10
+ * - Jaccard similarity on AST node multisets (structural, not textual)
11
+ * - Catches duplicates even when every variable name is different
12
+ * - MD5 kept as fast-path for exact matches, Jaccard runs on remaining pairs
12
13
  *
13
- * @since v2.16.0
14
+ * Detection strategy (preparation in steps 1–3, then three passes):
15
+ * 1. Extract function bodies, normalize text (strip comments/whitespace)
16
+ * 2. Parse with tree-sitter → walk AST → collect node type multiset
17
+ * 3. Generate semantic embeddings via all-MiniLM-L6-v2 (384D)
18
+ * 4. Pass 1 (fast): MD5 hash → exact duplicates (O(n), <10ms)
19
+ * 5. Pass 2 (Jaccard): AST node multiset similarity → structural near-duplicates (O(n²) bounded)
20
+ * 6. Pass 3 (semantic): Embedding cosine similarity → semantic duplicates (O(n²) bounded)
21
+ * 7. Flag functions with similarity > threshold in different files
22
+ *
23
+ * Why AST node types > raw tokens:
24
+ * - `getUserById(id) { return db.find(x => x.id === id) }`
25
+ * - `fetchUser(userId) { return database.filter(u => u.id === userId)[0] }`
26
+ * Both produce similar AST: [return_statement, call_expression, arrow_function,
27
+ * binary_expression, member_expression]. Variable names are invisible.
28
+ *
29
+ * @since v2.16.0 (original MD5)
30
+ * @since v5.0.0 (tree-sitter AST + Jaccard)
31
+ * @since v5.1.0 (semantic embedding Pass 3)
14
32
  */
15
33
  import { Gate, GateContext } from './base.js';
16
34
  import { Failure, Provenance } from '../types/index.js';
17
35
  export interface DuplicationDriftConfig {
18
36
  enabled?: boolean;
19
37
  similarity_threshold?: number;
38
+ semantic_threshold?: number;
39
+ semantic_enabled?: boolean;
20
40
  min_body_lines?: number;
41
+ approved_duplications?: string[];
21
42
  }
22
43
  export declare class DuplicationDriftGate extends Gate {
23
44
  private config;
45
+ private parser;
24
46
  constructor(config?: DuplicationDriftConfig);
25
47
  protected get provenance(): Provenance;
26
48
  run(context: GateContext): Promise<Failure[]>;
49
+ /**
50
+ * Parse the file with tree-sitter, find function nodes that match
51
+ * our extracted functions (by line number), and replace their token
52
+ * multisets with AST node type sequences.
53
+ *
54
+ * AST node types are language-agnostic structural tokens:
55
+ * - if_statement, for_statement, return_statement
56
+ * - call_expression, member_expression, binary_expression
57
+ * - arrow_function, function_declaration
58
+ *
59
+ * Variable names, string literals, comments — all invisible.
60
+ * Only STRUCTURE matters.
61
+ */
62
+ private enrichWithASTTokens;
63
+ /**
64
+ * Walk the AST tree to find a function/method node at a given line.
65
+ */
66
+ private findFunctionNodeAtLine;
67
+ /**
68
+ * Walk an AST subtree and collect node types as a multiset.
69
+ *
70
+ * This is the core insight: two functions with different variable names
71
+ * but the same control flow produce the same node type multiset.
72
+ *
73
+ * Example:
74
+ * `function a(x) { if (x > 0) return x * 2; return 0; }`
75
+ * `function b(val) { if (val > 0) return val * 2; return 0; }`
76
+ *
77
+ * Both produce: {if_statement: 1, binary_expression: 2, return_statement: 2, ...}
78
+ * → Jaccard similarity = 1.0
79
+ */
80
+ private collectASTNodeTypes;
81
+ /**
82
+ * Fallback tokenizer when tree-sitter is not available.
83
+ * Uses normalized text → keyword/operator multiset.
84
+ */
85
+ private textTokenize;
86
+ /**
87
+ * Jaccard similarity on multisets.
88
+ * intersection = sum of min(countA, countB) for each key
89
+ * union = sum of max(countA, countB) for each key
90
+ */
91
+ private jaccardSimilarity;
27
92
  private extractJSFunctions;
28
93
  private extractPyFunctions;
29
94
  private extractFunctionBody;
30
95
  private normalizeBody;
31
96
  private hash;
97
+ /**
98
+ * Generate semantic embedding text for a function.
99
+ * Combines function name, parameter names, and first 200 tokens of body.
100
+ * This captures INTENT regardless of implementation differences.
101
+ *
102
+ * Example:
103
+ * getUserById(id) { return db.users.find(x => x.id === id) }
104
+ * → "getUserById id return db users find x id id"
105
+ *
106
+ * fetchUserRecord(userId) { return database.users.filter(u => u.id === userId)[0] }
107
+ * → "fetchUserRecord userId return database users filter u id userId 0"
108
+ *
109
+ * These produce similar embeddings (~0.91 cosine) despite different AST.
110
+ */
111
+ private buildEmbeddingText;
112
+ /**
113
+ * Enrich functions with semantic embeddings for Pass 3.
114
+ * Only called for functions not already claimed by Pass 1/2.
115
+ * Uses generateEmbedding() from pattern-index/embeddings.ts.
116
+ */
117
+ private enrichWithEmbeddings;
118
+ /**
119
+ * Three-pass duplicate detection:
120
+ * Pass 1 (fast): MD5 hash → exact duplicates (O(n))
121
+ * Pass 2 (Jaccard): AST node multiset similarity → near-duplicates (O(n²) bounded)
122
+ * Pass 3 (semantic): Embedding cosine similarity → semantic duplicates (O(n²) bounded)
123
+ *
124
+ * Pass 3 catches what AST Jaccard misses: same intent, different implementation.
125
+ * Example: .find() vs .filter()[0] — different AST nodes, same semantic meaning.
126
+ */
32
127
  private findDuplicateGroups;
33
128
  }