npm - @getmikk/intent-engine - Versions diffs - 1.5.0 → 1.6.0 - Mend

@getmikk/intent-engine 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/README.md +61 -6
package/package.json +13 -4
package/src/index.ts +2 -0
package/src/semantic-searcher.ts +170 -0
package/src/xeno-transformers.d.ts +9 -0
package/tests/semantic-searcher.test.ts +178 -0

package/README.md CHANGED Viewed

@@ -1,17 +1,15 @@
 # @getmikk/intent-engine
-> AI pre-flight system — parses natural-language prompts into structured intents, detects constraint conflicts, and generates implementation suggestions before any code changes happen.
+> Architectural pre-flight — check if your idea is safe before writing a single line.
 [![npm](https://img.shields.io/npm/v/@getmikk/intent-engine)](https://www.npmjs.com/package/@getmikk/intent-engine)
 [![License: Apache-2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](../../LICENSE)
-`@getmikk/intent-engine` is the "pre-flight check" layer. Given a developer's natural-language description of what they want to do (e.g., *"add a caching layer to the auth module"*), the engine:
+`@getmikk/intent-engine` is the pre-flight check layer. You describe what you want to build in plain English — *"add a caching layer to the auth module"* — and before any code is written, the engine interprets your intent into structured objects, checks it against every architectural constraint in `mikk.json`, detects conflicts and layer violations, and generates a concrete implementation plan with which files to touch and what to create.
-1. **Interprets** the prompt into structured `Intent` objects
-2. **Detects** conflicts against architectural constraints defined in `mikk.json`
-3. **Suggests** which files to touch, what to create, and the estimated blast radius
+For AI coding agents, this is the guardrail that prevents architecturally unsafe code generation. For human developers, it's the equivalent of running your idea past a senior architect who knows every constraint in the codebase.
-All of this happens *before* any code is written — giving AI coding agents (or human developers) a guardrail system.
+> Part of [Mikk](../../README.md) — the codebase nervous system for AI-assisted development.
 ---
@@ -212,6 +210,63 @@ for (const s of suggestions) {
 ---
+### SemanticSearcher
+Finds functions semantically similar to a natural-language query using local embeddings via [`@xenova/transformers`](https://github.com/xenova/transformers.js). No API key required — after the one-time model download, inference runs entirely on your machine.
+**Model:** `Xenova/all-MiniLM-L6-v2` (~22 MB, downloaded once to `~/.cache/huggingface` on first use)
+**Optional peer dependency:** `@xenova/transformers >= 2`
+```bash
+bun add @xenova/transformers   # only needed if you use SemanticSearcher
+```
+```typescript
+import { SemanticSearcher } from '@getmikk/intent-engine'
+// Check if @xenova/transformers is installed before using
+if (await SemanticSearcher.isAvailable()) {
+  const searcher = new SemanticSearcher(projectRoot)
+  // index() builds embeddings on first use; later calls with an unchanged lock load them from the on-disk cache
+  await searcher.index(lock)
+  // search() returns the topK most relevant functions
+  const results = await searcher.search('validate JWT and return user payload', lock, 5)
+  for (const r of results) {
+    console.log(`${r.name} (${r.file}:${r.lines}) — score: ${r.score}`)
+    console.log(`  ${r.purpose}`)
+  }
+}
+```
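+**Cache behaviour:** Embeddings are persisted to `{projectRoot}/.mikk/embeddings.json` and fingerprinted by function count + first 20 sorted IDs. Re-indexing re-embeds only when that fingerprint (or the model name) changes — e.g. after `mikk sync` adds or removes functions. A change that leaves the fingerprint untouched (such as an edited `purpose`) keeps using the cached vectors until the cache file is deleted. A cache hit costs a single disk read.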
+**`SemanticMatch`:**
+| Field | Type | Description |
+|-------|------|-------------|
+| `id` | `string` | Function ID (`fn:module:name`) |
+| `name` | `string` | Function name |
+| `file` | `string` | Source file path |
+| `moduleId` | `string` | Owning module |
+| `purpose` | `string` | One-line purpose from the lock |
+| `lines` | `string` | Line range, e.g. `"12-34"` |
+| `score` | `number` | Cosine similarity, rounded to 3 decimals and clamped to `[-1, 1]` (relevant matches are typically positive) — higher is more relevant |
+**API:**
+| Method | Description |
+|--------|-------------|
+| `SemanticSearcher.isAvailable()` | Returns `true` if `@xenova/transformers` is importable |
+| `new SemanticSearcher(projectRoot)` | Creates an instance scoped to a project root |
+| `.index(lock)` | Builds/loads embeddings for all functions in the lock |
+| `.search(query, lock, topK?)` | Returns top `topK` (default 10) semantically similar functions |
+> **Note:** Call `index()` before `search()`, otherwise `search()` throws `"Call index() before search()"`. The MCP server keeps a per-project singleton to avoid repeated model loads.
+---
 ## Usage with AI Agents
 The intent engine is designed to be called by AI coding agents as a pre-flight check:

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
     "name": "@getmikk/intent-engine",
-    "version": "1.5.0",
+    "version": "1.6.0",
     "license": "Apache-2.0",
     "repository": {
         "type": "git",
@@ -21,11 +21,20 @@
         "dev": "tsc --watch"
     },
     "dependencies": {
-        "@getmikk/core": "^1.5.0",
+        "@getmikk/core": "^1.6.0",
         "zod": "^3.22.0"
     },
+    "peerDependencies": {
+        "@xenova/transformers": ">=2"
+    },
+    "peerDependenciesMeta": {
+        "@xenova/transformers": {
+            "optional": true
+        }
+    },
     "devDependencies": {
-        "typescript": "^5.7.0",
-        "@types/node": "^22.0.0"
+        "@types/bun": "^1.3.10",
+        "@types/node": "^22.0.0",
+        "typescript": "^5.7.0"
     }
 }

package/src/index.ts CHANGED Viewed

@@ -2,5 +2,7 @@ export { IntentInterpreter } from './interpreter.js'
 export { ConflictDetector } from './conflict-detector.js'
 export { Suggester } from './suggester.js'
 export { PreflightPipeline } from './preflight.js'
+export { SemanticSearcher } from './semantic-searcher.js'
+export type { SemanticMatch } from './semantic-searcher.js'
 export type { Intent, Conflict, ConflictResult, Suggestion, PreflightResult, AIProviderConfig } from './types.js'
 export { IntentSchema } from './types.js'

package/src/semantic-searcher.ts ADDED Viewed

@@ -0,0 +1,170 @@
+import * as path from 'node:path'
+import * as fs from 'node:fs/promises'
+import type { MikkLock } from '@getmikk/core'
+interface EmbeddingCache { // shape of the cache SemanticSearcher.index() reads from / writes to embeddings.json
+    lockFingerprint: string // lockFingerprint(lock) value the embeddings were built for
+    model: string // model id the embeddings were built with; compared against SemanticSearcher.MODEL on load
+    embeddings: Record<string, number[]> // fnId → unit-normed vector
+}
+export interface SemanticMatch { // one ranked result returned by SemanticSearcher.search()
+    id: string // key of the function in lock.functions
+    name: string // copied from the lock entry's name
+    file: string // lock entry's file, or '' when it has none
+    moduleId: string // lock entry's moduleId, or '' when it has none
+    purpose: string // lock entry's purpose, or '' when it has none
+    lines: string // `${startLine}-${endLine}` of the lock entry
+    score: number // cosine similarity (dot product of unit-normed vectors), clamped to [-1, 1] and rounded to 3 decimals
+}
+/**
+ * SemanticSearcher — finds functions semantically similar to a natural-language
+ * query using local embeddings via @xenova/transformers.
+ *
+ * Model: Xenova/all-MiniLM-L6-v2 (~22 MB, downloads once to ~/.cache/huggingface).
+ * Embeddings are cached in {projectRoot}/.mikk/embeddings.json and rebuilt in
+ * full whenever the cached fingerprint or model id no longer matches.
+ *
+ * Usage:
+ *   const searcher = new SemanticSearcher(projectRoot)
+ *   await searcher.index(lock)
+ *   const results = await searcher.search('validate JWT token', lock)
+ */
+export class SemanticSearcher {
+    /** Model id passed to the feature-extraction pipeline and recorded in the cache. */
+    static readonly MODEL = 'Xenova/all-MiniLM-L6-v2'
+    /** Location of the persisted embedding cache: {projectRoot}/.mikk/embeddings.json. */
+    private readonly cachePath: string
+    // Created lazily by ensurePipeline(). Left as `any` because the package's
+    // pipeline() factory is itself declared as returning Promise<any>.
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    private pipeline: any = null
+    /** Embeddings loaded or built by the most recent successful index() call. */
+    private cache: EmbeddingCache | null = null
+    constructor(private readonly projectRoot: string) {
+        this.cachePath = path.join(projectRoot, '.mikk', 'embeddings.json')
+    }
+    /**
+     * Returns true when @xenova/transformers is installed and importable.
+     * The MCP tool calls this to decide whether to surface the semantic search tool.
+     */
+    static async isAvailable(): Promise<boolean> {
+        try {
+            await import('@xenova/transformers')
+            return true
+        } catch {
+            return false
+        }
+    }
+    /**
+     * Build (or load from cache) embeddings for every function in the lock.
+     *
+     * The cache file is re-read on every call, so deleting it forces a rebuild.
+     * A cache hit costs one disk read plus a JSON parse; a miss embeds every
+     * function in batches and rewrites the cache file.
+     *
+     * @param lock Lock whose `functions` are embedded.
+     * @throws If the model cannot be loaded or the cache file cannot be written.
+     */
+    async index(lock: MikkLock): Promise<void> {
+        const fingerprint = lockFingerprint(lock)
+        // ── Cache hit ──────────────────────────────────────────────────────
+        try {
+            // Parse as `unknown` and narrow: the file is external input and may be stale or corrupt.
+            const parsed: unknown = JSON.parse(await fs.readFile(this.cachePath, 'utf-8'))
+            if (
+                SemanticSearcher.isCacheShape(parsed) &&
+                parsed.lockFingerprint === fingerprint &&
+                parsed.model === SemanticSearcher.MODEL
+            ) {
+                this.cache = parsed
+                return
+            }
+        } catch { /* miss or corrupt — rebuild */ }
+        // ── Empty lock fast-path — nothing to embed ────────────────────────
+        const fns = Object.values(lock.functions)
+        if (fns.length === 0) {
+            this.cache = { lockFingerprint: fingerprint, model: SemanticSearcher.MODEL, embeddings: {} }
+            return
+        }
+        // Text representation: name + purpose + param names (no bodies, keeps it fast)
+        const texts = fns.map(fn => {
+            const parts: string[] = [fn.name]
+            if (fn.purpose) parts.push(fn.purpose)
+            if (fn.params?.length) parts.push(fn.params.map((p: any) => p.name).join(' '))
+            if (fn.returnType && fn.returnType !== 'void' && fn.returnType !== 'any') {
+                parts.push('returns ' + fn.returnType)
+            }
+            return parts.join(' ')
+        })
+        await this.ensurePipeline()
+        const embeddings: Record<string, number[]> = {}
+        const BATCH = 64
+        for (let i = 0; i < fns.length; i += BATCH) {
+            const batch = texts.slice(i, i + BATCH)
+            const output = await this.pipeline(batch, { pooling: 'mean', normalize: true })
+            for (let j = 0; j < batch.length; j++) {
+                // Float32Array → plain number[] so the vector survives JSON serialisation
+                embeddings[fns[i + j].id] = Array.from(output[j].data as Float32Array)
+            }
+        }
+        this.cache = { lockFingerprint: fingerprint, model: SemanticSearcher.MODEL, embeddings }
+        await fs.mkdir(path.dirname(this.cachePath), { recursive: true })
+        await fs.writeFile(this.cachePath, JSON.stringify(this.cache))
+    }
+    /**
+     * Find the `topK` functions most semantically similar to `query`.
+     * Call index() first.
+     *
+     * Cached IDs that are no longer present in `lock` are discarded before
+     * ranking, so stale entries never take up one of the `topK` slots.
+     *
+     * @param query Natural-language description of what to look for.
+     * @param lock Lock used to resolve cached IDs to function metadata.
+     * @param topK Maximum number of matches; fractions are floored, and values
+     *             below 1 (or NaN) yield an empty array.
+     * @returns Matches sorted by descending score, at most `topK` long.
+     * @throws Error('Call index() before search()') when index() has not completed.
+     */
+    async search(query: string, lock: MikkLock, topK = 10): Promise<SemanticMatch[]> {
+        if (!this.cache) throw new Error('Call index() before search()')
+        const limit = Math.max(0, Math.floor(topK))
+        // Keep only cached vectors whose function still exists in the lock
+        // (the cache can be older than the lock the caller passes in).
+        const live: Array<{ id: string; fn: MikkLock['functions'][string]; vec: number[] }> = []
+        for (const [id, vec] of Object.entries(this.cache.embeddings)) {
+            const fn = lock.functions[id]
+            if (fn) live.push({ id, fn, vec })
+        }
+        // Nothing can be returned — avoid loading the model just to produce [].
+        // `!(limit > 0)` also covers a NaN limit.
+        if (!(limit > 0) || live.length === 0) return []
+        await this.ensurePipeline()
+        const queryOut = await this.pipeline([query], { pooling: 'mean', normalize: true })
+        const queryVec: number[] = Array.from(queryOut[0].data as Float32Array)
+        const ranked = live
+            // A vector of a different dimension cannot be compared with the query
+            .filter(({ vec }) => Array.isArray(vec) && vec.length === queryVec.length)
+            .map(({ id, fn, vec }) => ({ id, fn, score: cosineSimilarity(queryVec, vec) }))
+            // NaN scores would make the sort comparator inconsistent
+            .filter(({ score }) => !Number.isNaN(score))
+            .sort((a, b) => b.score - a.score)
+            .slice(0, limit)
+        return ranked.map(({ id, fn, score }) => ({
+            id,
+            name: fn.name,
+            file: fn.file ?? '',
+            moduleId: fn.moduleId ?? '',
+            purpose: fn.purpose ?? '',
+            lines: `${fn.startLine}-${fn.endLine}`,
+            score: Math.round(score * 1000) / 1000,
+        }))
+    }
+    /** Structural check that a parsed cache file has the fields index() relies on. */
+    private static isCacheShape(value: unknown): value is EmbeddingCache {
+        if (typeof value !== 'object' || value === null) return false
+        const candidate = value as Partial<Record<keyof EmbeddingCache, unknown>>
+        return (
+            typeof candidate.lockFingerprint === 'string' &&
+            typeof candidate.model === 'string' &&
+            typeof candidate.embeddings === 'object' &&
+            candidate.embeddings !== null &&
+            !Array.isArray(candidate.embeddings)
+        )
+    }
+    /** Loads the feature-extraction pipeline on first use and reuses it afterwards. */
+    private async ensurePipeline(): Promise<void> {
+        if (this.pipeline) return
+        const { pipeline } = await import('@xenova/transformers')
+        this.pipeline = await pipeline('feature-extraction', SemanticSearcher.MODEL)
+    }
+}
+// ─── Helpers ─────────────────────────────────────────────────────────────────
+/**
+ * Cheap cache key for a lock: `<function count>:<first 20 function IDs, sorted, joined by "|">`.
+ * Only the count and those leading IDs are inspected, so other changes to the
+ * lock produce the same fingerprint.
+ */
+function lockFingerprint(lock: MikkLock): string {
+    const allIds = Object.keys(lock.functions)
+    const leadingIds = [...allIds].sort().slice(0, 20)
+    return [allIds.length, leadingIds.join('|')].join(':')
+}
+/**
+ * Dot product of two embedding vectors, clamped to [-1, 1].
+ *
+ * Callers pass vectors produced with `normalize: true`, so for those inputs the
+ * dot product equals the cosine similarity and the clamp only absorbs
+ * floating-point drift.
+ *
+ * @returns The clamped dot product, or 0 when the vectors differ in length or
+ *          the product is NaN (e.g. a corrupt cached vector), so a bad entry
+ *          gets a neutral score instead of poisoning the ranking.
+ */
+function cosineSimilarity(a: number[], b: number[]): number {
+    // Different dimensions cannot be compared
+    if (a.length !== b.length) return 0
+    let dot = 0
+    for (let i = 0; i < a.length; i++) dot += a[i] * b[i]
+    if (Number.isNaN(dot)) return 0
+    // Vectors are already unit-normed by the model (normalize: true), so |a|=|b|=1
+    return Math.max(-1, Math.min(1, dot))
+}

package/src/xeno-transformers.d.ts ADDED Viewed

@@ -0,0 +1,9 @@
+/**
+ * Ambient stub for the optional peer dependency @xenova/transformers.
+ * The real types are only available when the package is installed.
+ * We use dynamic import + `any` everywhere so this stub is sufficient.
+ *
+ * Only `pipeline` is declared: it is the one export SemanticSearcher
+ * destructures from the dynamic import, and whatever it resolves to is left
+ * as `any`.
+ */
+declare module '@xenova/transformers' {
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    export function pipeline(task: string, model?: string, options?: any): Promise<any>
+}

package/tests/semantic-searcher.test.ts ADDED Viewed

@@ -0,0 +1,178 @@
+import { describe, test, expect, beforeAll } from 'bun:test'
+import { SemanticSearcher } from '../src/semantic-searcher'
+import type { MikkLock } from '@getmikk/core'
+/* ── Minimal mock lock ─────────────────────────────────────────────────────────
+ * Fixture shared by every test below: four functions (auth, db, email, api),
+ * each with a distinct `purpose`, so ranking assertions have an unambiguous
+ * expected winner. `modules` and `files` are left empty.
+ */
+const mockLock: MikkLock = {
+    version: '1.0.0',
+    generatedAt: new Date().toISOString(),
+    generatorVersion: '1.1.0',
+    projectRoot: '/test',
+    syncState: { status: 'clean', lastSyncAt: new Date().toISOString(), lockHash: 'a', contractHash: 'b' },
+    modules: {},
+    functions: {
+        'fn:auth:verifyToken': {
+            id: 'fn:auth:verifyToken',
+            name: 'verifyToken',
+            file: 'src/auth/verify.ts',
+            startLine: 1, endLine: 10,
+            hash: 'h1', calls: [], calledBy: [],
+            moduleId: 'auth',
+            purpose: 'Verify and decode a JWT token',
+            params: [{ name: 'token', type: 'string', optional: false }],
+            returnType: 'UserPayload',
+        },
+        'fn:db:saveUser': {
+            id: 'fn:db:saveUser',
+            name: 'saveUser',
+            file: 'src/db/users.ts',
+            startLine: 1, endLine: 15,
+            hash: 'h2', calls: [], calledBy: [],
+            moduleId: 'db',
+            purpose: 'Persist a user record to the database',
+            params: [{ name: 'user', type: 'User', optional: false }],
+            returnType: 'Promise<void>',
+        },
+        'fn:email:sendWelcome': {
+            id: 'fn:email:sendWelcome',
+            name: 'sendWelcomeEmail',
+            file: 'src/email/sender.ts',
+            startLine: 1, endLine: 20,
+            hash: 'h3', calls: [], calledBy: [],
+            moduleId: 'email',
+            purpose: 'Send a welcome email to a newly registered user',
+            params: [{ name: 'to', type: 'string', optional: false }],
+            returnType: 'Promise<void>',
+        },
+        'fn:api:handleLogin': {
+            id: 'fn:api:handleLogin',
+            name: 'handleLogin',
+            file: 'src/api/login.ts',
+            startLine: 1, endLine: 30,
+            hash: 'h4', calls: [], calledBy: [],
+            moduleId: 'api',
+            purpose: 'Handle HTTP login request and return JWT',
+            params: [{ name: 'req', type: 'Request', optional: false }],
+            returnType: 'Promise<Response>',
+        },
+    },
+    files: {},
+    graph: { nodes: 4, edges: 0, rootHash: 'root' },
+}
+// ── Tests ─────────────────────────────────────────────────────────────────────
+// Integration suite for SemanticSearcher. It exercises the real embedding model,
+// so the first run may download it (hence the generous hook/test timeouts).
+describe('SemanticSearcher', () => {
+    let searcher: SemanticSearcher
+    test('isAvailable() returns true (package is installed)', async () => {
+        const ok = await SemanticSearcher.isAvailable()
+        expect(ok).toBe(true)
+    })
+    describe('with indexed lock', () => {
+        beforeAll(async () => {
+            // Unique directory per run so a cache left by an earlier run is never reused
+            searcher = new SemanticSearcher('/tmp/mikk-test-' + Date.now())
+            await searcher.index(mockLock)
+        }, 60_000) // model download can take time on first run
+        test('returns results for any query', async () => {
+            const results = await searcher.search('authenticate user', mockLock, 4)
+            expect(results.length).toBeGreaterThan(0)
+            expect(results[0].score).toBeGreaterThanOrEqual(0)
+            expect(results[0].score).toBeLessThanOrEqual(1)
+        })
+        test('JWT-related query ranks verifyToken or handleLogin highest', async () => {
+            const results = await searcher.search('validate JWT token', mockLock, 4)
+            const top2Names = results.slice(0, 2).map(r => r.name)
+            const hasJwtMatch = top2Names.some(n => n === 'verifyToken' || n === 'handleLogin')
+            expect(hasJwtMatch).toBe(true)
+        })
+        test('email query ranks sendWelcomeEmail highest', async () => {
+            const results = await searcher.search('send email to new user', mockLock, 4)
+            expect(results[0].name).toBe('sendWelcomeEmail')
+        })
+        test('database persistence query ranks saveUser highest', async () => {
+            const results = await searcher.search('save user to database', mockLock, 4)
+            expect(results[0].name).toBe('saveUser')
+        })
+        test('results include required fields', async () => {
+            const results = await searcher.search('login', mockLock, 2)
+            for (const r of results) {
+                expect(typeof r.id).toBe('string')
+                expect(typeof r.name).toBe('string')
+                expect(typeof r.file).toBe('string')
+                expect(typeof r.score).toBe('number')
+                expect(typeof r.lines).toBe('string')
+            }
+        })
+        test('topK limits the number of results', async () => {
+            const results = await searcher.search('function', mockLock, 2)
+            expect(results.length).toBeLessThanOrEqual(2)
+        })
+        test('second call uses cache (no re-embedding)', async () => {
+            const t0 = Date.now()
+            await searcher.index(mockLock) // should be cache hit
+            const elapsed = Date.now() - t0
+            // A cache hit is a single disk read; 500ms leaves headroom for slow CI disks
+            expect(elapsed).toBeLessThan(500)
+        })
+        test('topK = 0 returns empty array', async () => {
+            const results = await searcher.search('login', mockLock, 0)
+            expect(results).toEqual([])
+        })
+        test('empty query string does not crash', async () => {
+            const results = await searcher.search('', mockLock, 2)
+            expect(Array.isArray(results)).toBe(true)
+        })
+        test('stale cache IDs not present in lock are silently skipped', async () => {
+            // Build a lock that keeps only one of the four functions that were cached
+            const shrunkLock = { ...mockLock, functions: { 'fn:auth:verifyToken': mockLock.functions['fn:auth:verifyToken']! } }
+            // searcher.cache still has embeddings for all 4 IDs from beforeAll
+            const results = await searcher.search('user', shrunkLock, 10)
+            // Exactly the one surviving function must come back; asserting the full
+            // list (rather than `every`) also fails if the result is unexpectedly empty
+            expect(results.map(r => r.id)).toEqual(['fn:auth:verifyToken'])
+        })
+        test('re-index with changed lock rebuilds embeddings', async () => {
+            const newFn = {
+                ...mockLock.functions['fn:auth:verifyToken']!,
+                id: 'fn:new:brandNewFn',
+                name: 'brandNewFn',
+                purpose: 'A completely unique purpose for testing fingerprint change',
+            }
+            const changedLock: MikkLock = {
+                ...mockLock,
+                functions: { ...mockLock.functions, 'fn:new:brandNewFn': newFn },
+            }
+            const freshSearcher = new SemanticSearcher('/tmp/mikk-reindex-' + Date.now())
+            await freshSearcher.index(changedLock) // fingerprint differs → full recompute
+            const results = await freshSearcher.search('unique purpose', changedLock, 5)
+            expect(results.some(r => r.name === 'brandNewFn')).toBe(true)
+        }, 30_000)
+    })
+    describe('edge cases', () => {
+        test('search() before index() throws', async () => {
+            const fresh = new SemanticSearcher('/tmp/mikk-never-indexed-' + Date.now())
+            await expect(fresh.search('query', mockLock)).rejects.toThrow('Call index() before search()')
+        })
+        test('empty lock: index() and search() succeed, return empty array', async () => {
+            const emptyLock: MikkLock = { ...mockLock, functions: {} }
+            const s = new SemanticSearcher('/tmp/mikk-empty-' + Date.now())
+            await s.index(emptyLock)
+            const results = await s.search('anything', emptyLock, 5)
+            expect(results).toEqual([])
+        })
+    })
+})