npm - @shrkcrft/ai - Versions diffs - 0.1.0-alpha.11 → 0.1.0-alpha.13 - Mend

@shrkcrft/ai 0.1.0-alpha.11 → 0.1.0-alpha.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

package/dist/ai-request.d.ts +15 -0
package/dist/ai-request.d.ts.map +1 -1
package/dist/gemini/gemini-provider.d.ts +24 -0
package/dist/gemini/gemini-provider.d.ts.map +1 -0
package/dist/gemini/gemini-provider.js +97 -0
package/dist/index.d.ts +7 -0
package/dist/index.d.ts.map +1 -1
package/dist/index.js +7 -0
package/dist/llamacpp/llama-cpp-provider.d.ts +56 -0
package/dist/llamacpp/llama-cpp-provider.d.ts.map +1 -0
package/dist/llamacpp/llama-cpp-provider.js +268 -0
package/dist/llm-hints.d.ts +36 -0
package/dist/llm-hints.d.ts.map +1 -0
package/dist/llm-hints.js +92 -0
package/dist/llm-recommendations.d.ts +72 -0
package/dist/llm-recommendations.d.ts.map +1 -0
package/dist/llm-recommendations.js +188 -0
package/dist/ollama/ollama-provider.d.ts +47 -0
package/dist/ollama/ollama-provider.d.ts.map +1 -0
package/dist/ollama/ollama-provider.js +166 -0
package/dist/pipeline/enhancement-pipeline.d.ts +123 -0
package/dist/pipeline/enhancement-pipeline.d.ts.map +1 -0
package/dist/pipeline/enhancement-pipeline.js +295 -0
package/dist/provider-resolver.d.ts +28 -0
package/dist/provider-resolver.d.ts.map +1 -0
package/dist/provider-resolver.js +80 -0
package/package.json +5 -4

package/dist/ai-request.d.ts CHANGED Viewed

@@ -13,6 +13,16 @@ export interface IAiRequest {
     maxTokens?: number;
     temperature?: number;
     context?: string;
+    responseFormat?: IAiResponseFormat;
+    /**
+     * Optional callback invoked with each newly-decoded token chunk as
+     * generation streams in. Providers that don't natively stream
+     * (HTTP Gemini / Claude in our non-SSE adapter) ignore this; the
+     * llamacpp provider forwards chunks live. Useful for stderr "live
+     * preview" in CLI commands and for an agent who wants to display
+     * progress without the synchronous wait.
+     */
+    onTokenStream?: (chunk: string) => void;
 }
 export interface IAiResponse {
     content: string;
@@ -30,4 +40,9 @@ export interface IAiProviderConfig {
     model?: string;
     timeoutMs?: number;
 }
+/**
+ * Provider-neutral request for structured (JSON) output.
+ *
+ * In this package version the Gemini adapter switches the response MIME
+ * type to `application/json` whenever a format is present, and the
+ * llama.cpp adapter builds a JSON-schema grammar only when `schema` is
+ * supplied.
+ */
+export interface IAiResponseFormat {
+    /** Requested output mode. */
+    type: 'json_object' | 'json_schema';
+    /** JSON Schema for the expected object; presumably only meaningful with `json_schema` — TODO confirm. */
+    schema?: Record<string, unknown>;
+    /** Label for the schema. NOTE(review): neither the Gemini nor the llama.cpp adapter reads it — confirm the intended consumer. */
+    schemaName?: string;
+}
 //# sourceMappingURL=ai-request.d.ts.map

package/dist/ai-request.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"ai-request.d.ts","sourceRoot":"","sources":["../src/ai-request.ts"],"names":[],"mappings":"AAAA,oBAAY,aAAa;IACvB,MAAM,WAAW;IACjB,IAAI,SAAS;IACb,SAAS,cAAc;CACxB;AAED,MAAM,WAAW,UAAU;IACzB,IAAI,EAAE,aAAa,CAAC;IACpB,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,UAAU;IACzB,QAAQ,EAAE,SAAS,UAAU,EAAE,CAAC;IAChC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,CAAC;~~CAClB~~;AAED,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,KAAK,CAAC,EAAE;QAAE,WAAW,CAAC,EAAE,MAAM,CAAC;QAAC,YAAY,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IACxD,GAAG,CAAC,EAAE,OAAO,CAAC;CACf;AAED,MAAM,WAAW,iBAAiB;IAChC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB"}
1	+ {"version":3,"file":"ai-request.d.ts","sourceRoot":"","sources":["../src/ai-request.ts"],"names":[],"mappings":"AAAA,oBAAY,aAAa;IACvB,MAAM,WAAW;IACjB,IAAI,SAAS;IACb,SAAS,cAAc;CACxB;AAED,MAAM,WAAW,UAAU;IACzB,IAAI,EAAE,aAAa,CAAC;IACpB,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,UAAU;IACzB,QAAQ,EAAE,SAAS,UAAU,EAAE,CAAC;IAChC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,cAAc,CAAC,EAAE,iBAAiB,CAAC;IACnC;;;;;;;OAOG;IACH,aAAa,CAAC,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;CACzC;AAED,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,KAAK,CAAC,EAAE;QAAE,WAAW,CAAC,EAAE,MAAM,CAAC;QAAC,YAAY,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IACxD,GAAG,CAAC,EAAE,OAAO,CAAC;CACf;AAED,MAAM,WAAW,iBAAiB;IAChC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,aAAa,GAAG,aAAa,CAAC;IACpC,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACjC,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB"}

package/dist/gemini/gemini-provider.d.ts ADDED Viewed

@@ -0,0 +1,24 @@
+import { type AppError, type Result } from '@shrkcrft/core';
+import { AbstractAiProvider } from '../ai-provider.js';
+import { type IAiRequest, type IAiResponse } from '../ai-request.js';
+/**
+ * HTTP adapter for Google's Gemini (Generative Language API).
+ *
+ * Reads `GEMINI_API_KEY` from env (or `IAiProviderConfig.apiKey`). When
+ * the key is missing `isReady()` returns false and `send()` reports an
+ * actionable error — same contract as `ClaudeProvider`.
+ *
+ * The Gemini REST surface differs from Anthropic's: system messages are
+ * passed as a top-level `systemInstruction`, conversation turns become
+ * `contents[]` with roles `user`/`model`, and the response token cap is
+ * `generationConfig.maxOutputTokens` (not `max_tokens`). This adapter
+ * translates the provider-neutral `IAiRequest` shape into that wire
+ * format and back.
+ */
+export declare class GeminiProvider extends AbstractAiProvider {
+    /** Stable provider identifier. */
+    readonly id = "gemini";
+    /** Human-readable provider label. */
+    readonly name = "Google Gemini (HTTP)";
+    /** True when an API key is available from config or `GEMINI_API_KEY`. */
+    isReady(): boolean;
+    /**
+     * Sends the request to the model's `generateContent` endpoint. A
+     * missing key, a non-2xx status or a network failure is returned as
+     * `err(AppError)` rather than thrown.
+     */
+    send(request: IAiRequest): Promise<Result<IAiResponse, AppError>>;
+}
+//# sourceMappingURL=gemini-provider.d.ts.map

package/dist/gemini/gemini-provider.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"gemini-provider.d.ts","sourceRoot":"","sources":["../../src/gemini/gemini-provider.ts"],"names":[],"mappings":"AAAA,OAAO,EAAsC,KAAK,QAAQ,EAAE,KAAK,MAAM,EAAE,MAAM,gBAAgB,CAAC;AAChG,OAAO,EAAE,kBAAkB,EAAE,MAAM,mBAAmB,CAAC;AACvD,OAAO,EAAkC,KAAK,UAAU,EAAE,KAAK,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAErG;;;;;;;;;;;;;GAaG;AACH,qBAAa,cAAe,SAAQ,kBAAkB;IACpD,QAAQ,CAAC,EAAE,YAAY;IACvB,QAAQ,CAAC,IAAI,0BAA0B;IAEvC,OAAO,IAAI,OAAO;IAIZ,IAAI,CAAC,OAAO,EAAE,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;CAyExE"}

package/dist/gemini/gemini-provider.js ADDED Viewed

@@ -0,0 +1,97 @@
+import { AppErrorImpl, ERROR_CODES, err, ok } from '@shrkcrft/core';
+import { AbstractAiProvider } from "../ai-provider.js";
+import { AiMessageRole } from "../ai-request.js";
+/**
+ * HTTP adapter for Google's Gemini (Generative Language API).
+ *
+ * Reads `GEMINI_API_KEY` from env (or `IAiProviderConfig.apiKey`). When
+ * the key is missing `isReady()` returns false and `send()` reports an
+ * actionable error — same contract as `ClaudeProvider`.
+ *
+ * The Gemini REST surface differs from Anthropic's: system messages are
+ * passed as a top-level `systemInstruction`, conversation turns become
+ * `contents[]` with roles `user`/`model`, and the response token cap is
+ * `generationConfig.maxOutputTokens` (not `max_tokens`). This adapter
+ * translates the provider-neutral `IAiRequest` shape into that wire
+ * format and back.
+ */
+export class GeminiProvider extends AbstractAiProvider {
+    id = 'gemini';
+    name = 'Google Gemini (HTTP)';
+    /** True when an API key is available from config or `GEMINI_API_KEY`. */
+    isReady() {
+        return Boolean(this.config.apiKey ?? process.env.GEMINI_API_KEY);
+    }
+    /**
+     * Translates `request` into a `generateContent` call and maps the
+     * first candidate back into an `IAiResponse`.
+     *
+     * Failures are reported through the returned `Result`: a missing key
+     * yields `INVALID_INPUT`; a non-2xx status, network failure or
+     * timeout yields `IO_ERROR`.
+     *
+     * @param request Provider-neutral request. `model` falls back to the
+     *   configured model, then `gemini-2.5-flash`; `maxTokens` falls back
+     *   to 4096.
+     * @returns `ok(response)` on success, `err(AppError)` otherwise.
+     */
+    async send(request) {
+        const apiKey = this.config.apiKey ?? process.env.GEMINI_API_KEY;
+        if (!apiKey) {
+            return err(new AppErrorImpl(ERROR_CODES.INVALID_INPUT, 'GEMINI_API_KEY is not set — cannot reach Gemini', { suggestion: 'Put GEMINI_API_KEY=... in .env or `export GEMINI_API_KEY=...`' }));
+        }
+        // Strip trailing slashes so a configured base URL such as
+        // `https://proxy.example/` does not produce `//v1beta/...`.
+        const baseUrl = (this.config.baseUrl ?? 'https://generativelanguage.googleapis.com').replace(/\/+$/, '');
+        const model = request.model ?? this.config.model ?? 'gemini-2.5-flash';
+        const maxTokens = request.maxTokens ?? 4096;
+        const systemInstructionText = collectSystem(request.messages);
+        const contents = collectContents(request.messages);
+        const body = {
+            contents,
+            generationConfig: {
+                maxOutputTokens: maxTokens,
+                ...(request.temperature !== undefined ? { temperature: request.temperature } : {}),
+                // Any response format asks Gemini for a JSON body; the schema
+                // itself is not forwarded.
+                ...(request.responseFormat
+                    ? {
+                        responseMimeType: 'application/json',
+                    }
+                    : {}),
+            },
+        };
+        if (systemInstructionText) {
+            body.systemInstruction = { role: 'system', parts: [{ text: systemInstructionText }] };
+        }
+        const timeoutMs = this.config.timeoutMs;
+        try {
+            const url = `${baseUrl}/v1beta/models/${encodeURIComponent(model)}:generateContent`;
+            const res = await fetch(url, {
+                method: 'POST',
+                // The key travels in a header rather than the query string so
+                // it does not end up in URLs recorded by proxies or logs.
+                headers: { 'content-type': 'application/json', 'x-goog-api-key': apiKey },
+                body: JSON.stringify(body),
+                // Honour the configured timeout; without a signal a stalled
+                // connection would never settle.
+                ...(typeof timeoutMs === 'number' && timeoutMs > 0
+                    ? { signal: AbortSignal.timeout(timeoutMs) }
+                    : {}),
+            });
+            if (!res.ok) {
+                const text = await res.text();
+                return err(new AppErrorImpl(ERROR_CODES.IO_ERROR, `Gemini API ${res.status}: ${text.slice(0, 500)}`));
+            }
+            const json = (await res.json());
+            // Concatenate the text parts of the first candidate only.
+            const content = (json.candidates?.[0]?.content?.parts ?? [])
+                .map((p) => (typeof p.text === 'string' ? p.text : ''))
+                .join('');
+            return ok({
+                content,
+                model: json.modelVersion ?? model,
+                finishReason: json.candidates?.[0]?.finishReason,
+                usage: {
+                    inputTokens: json.usageMetadata?.promptTokenCount,
+                    outputTokens: json.usageMetadata?.candidatesTokenCount,
+                },
+                raw: json,
+            });
+        }
+        catch (e) {
+            // Thrown values are not guaranteed to be Error instances.
+            const reason = e instanceof Error ? e.message : String(e);
+            return err(new AppErrorImpl(ERROR_CODES.IO_ERROR, `Failed to call Gemini: ${reason}`, {
+                cause: e,
+            }));
+        }
+    }
+}
+/**
+ * Joins the content of every system message with a blank line between
+ * entries. Returns `undefined` when the conversation has no system
+ * message at all.
+ */
+function collectSystem(messages) {
+    const systemTexts = [];
+    for (const message of messages) {
+        if (message.role === AiMessageRole.System) {
+            systemTexts.push(message.content);
+        }
+    }
+    if (systemTexts.length === 0) {
+        return undefined;
+    }
+    return systemTexts.join('\n\n');
+}
+/**
+ * Maps the non-system messages, in order, to Gemini `contents[]`
+ * entries: assistant turns take the `model` role, everything else is
+ * sent as `user`.
+ */
+function collectContents(messages) {
+    return messages
+        .filter((message) => message.role !== AiMessageRole.System)
+        .map((message) => ({
+            role: message.role === AiMessageRole.Assistant ? 'model' : 'user',
+            parts: [{ text: message.content }],
+        }));
+}

package/dist/index.d.ts CHANGED Viewed

@@ -3,4 +3,11 @@ export * from './ai-request.js';
 export * from './prompt/prompt-builder.js';
 export * from './claude/claude-provider.js';
 export * from './claude/claude-cli-adapter.js';
+export * from './gemini/gemini-provider.js';
+export * from './ollama/ollama-provider.js';
+export * from './llamacpp/llama-cpp-provider.js';
+export * from './provider-resolver.js';
+export * from './pipeline/enhancement-pipeline.js';
+export * from './llm-hints.js';
+export * from './llm-recommendations.js';
 //# sourceMappingURL=index.d.ts.map

package/dist/index.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,kBAAkB,CAAC;AACjC,cAAc,iBAAiB,CAAC;AAChC,cAAc,4BAA4B,CAAC;AAC3C,cAAc,6BAA6B,CAAC;AAC5C,cAAc,gCAAgC,CAAC"}
1	+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,kBAAkB,CAAC;AACjC,cAAc,iBAAiB,CAAC;AAChC,cAAc,4BAA4B,CAAC;AAC3C,cAAc,6BAA6B,CAAC;AAC5C,cAAc,gCAAgC,CAAC;AAC/C,cAAc,6BAA6B,CAAC;AAC5C,cAAc,6BAA6B,CAAC;AAC5C,cAAc,kCAAkC,CAAC;AACjD,cAAc,wBAAwB,CAAC;AACvC,cAAc,oCAAoC,CAAC;AACnD,cAAc,gBAAgB,CAAC;AAC/B,cAAc,0BAA0B,CAAC"}

package/dist/index.js CHANGED Viewed

@@ -3,3 +3,10 @@ export * from "./ai-request.js";
 export * from "./prompt/prompt-builder.js";
 export * from "./claude/claude-provider.js";
 export * from "./claude/claude-cli-adapter.js";
+export * from "./gemini/gemini-provider.js";
+export * from "./ollama/ollama-provider.js";
+export * from "./llamacpp/llama-cpp-provider.js";
+export * from "./provider-resolver.js";
+export * from "./pipeline/enhancement-pipeline.js";
+export * from "./llm-hints.js";
+export * from "./llm-recommendations.js";

package/dist/llamacpp/llama-cpp-provider.d.ts ADDED Viewed

@@ -0,0 +1,56 @@
+import { type AppError, type Result } from '@shrkcrft/core';
+import { AbstractAiProvider } from '../ai-provider.js';
+import { type IAiRequest, type IAiResponse } from '../ai-request.js';
+/**
+ * In-process generative provider backed by `node-llama-cpp` (a Node
+ * binding for llama.cpp). No HTTP. No daemon. The model is loaded
+ * once into process memory and reused across requests.
+ *
+ * Configuration (env or `IAiProviderConfig`):
+ *   - `LLAMACPP_MODEL_PATH`   — absolute or repo-relative path to a
+ *                               local `.gguf` file. If unset, the
+ *                               provider is `isReady() === false`.
+ *   - `LLAMACPP_CONTEXT_SIZE` — context window in tokens (default 8192).
+ *   - `LLAMACPP_GPU`          — `auto` (default) | `metal` | `cuda` | `off`.
+ *
+ * The first `send()` call pays the model-load cost (typically 1–10 s
+ * for a 3B Q4 model on Apple Silicon). Subsequent calls reuse
+ * the same `LlamaModel` + `LlamaContext`. A fresh `LlamaChatSession`
+ * is created per request so context isn't leaked between unrelated
+ * tasks.
+ *
+ * Tests can inject a fake generator via `_overrideForTests` to avoid
+ * pulling in the native binding and a 2 GB model file.
+ */
+export declare class LlamaCppProvider extends AbstractAiProvider {
+    /** Stable provider identifier. */
+    readonly id = "llamacpp";
+    /** Human-readable provider label. */
+    readonly name = "llama.cpp (in-process)";
+    /** Test hook — bypasses the native binding when set. */
+    static _overrideForTests: ((request: IAiRequest, modelPath: string) => Promise<IAiResponse>) | null;
+    /**
+     * Reads the module-level cache to expose the active model path for
+     * tools that need it (mostly the disposer). Returns null when no
+     * model has been loaded in this process.
+     */
+    static activeModelPath(): string | null;
+    /** True when the configured model, or `LLAMACPP_MODEL_PATH`, resolves to an existing file. */
+    isReady(): boolean;
+    /**
+     * Runs one generation against the in-process model. A missing model
+     * file or a failure inside the binding is returned as `err(AppError)`.
+     */
+    send(request: IAiRequest): Promise<Result<IAiResponse, AppError>>;
+    /** Loads the runtime, model and context for a path, or reuses the cached ones for the same path. */
+    private ensureLoaded;
+}
+/**
+ * Release the loaded llama.cpp model + context so the process can
+ * exit cleanly.
+ *
+ * Without this, the libc++ destructor for the Metal device list
+ * aborts on `exit()` with `ggml_metal_device_free` because the
+ * device list isn't empty — same shape of teardown crash as the
+ * ONNX mutex issue, different native library. Disposing in the
+ * order context → model → llama lets the destructors run while the
+ * JS runtime is still healthy. (Chat sessions are released per
+ * request inside `send()`, not here.)
+ *
+ * Safe to call multiple times. Safe to call when no model was
+ * loaded. Errors during dispose are swallowed (the alternative is
+ * the abort we're trying to prevent).
+ *
+ * @returns A promise that settles once every cached resource has been
+ *   asked to dispose.
+ */
+export declare function disposeLlamaCppRuntime(): Promise<void>;
+//# sourceMappingURL=llama-cpp-provider.d.ts.map

package/dist/llamacpp/llama-cpp-provider.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"llama-cpp-provider.d.ts","sourceRoot":"","sources":["../../src/llamacpp/llama-cpp-provider.ts"],"names":[],"mappings":"AAEA,OAAO,EAAsC,KAAK,QAAQ,EAAE,KAAK,MAAM,EAAE,MAAM,gBAAgB,CAAC;AAChG,OAAO,EAAE,kBAAkB,EAAE,MAAM,mBAAmB,CAAC;AACvD,OAAO,EAAkC,KAAK,UAAU,EAAE,KAAK,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAKrG;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,qBAAa,gBAAiB,SAAQ,kBAAkB;IACtD,QAAQ,CAAC,EAAE,cAAc;IACzB,QAAQ,CAAC,IAAI,4BAA4B;IAEzC,wDAAwD;IACxD,MAAM,CAAC,iBAAiB,EACpB,CAAC,CAAC,OAAO,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,KAAK,OAAO,CAAC,WAAW,CAAC,CAAC,GAClE,IAAI,CAAQ;IAEhB;;;;OAIG;IACH,MAAM,CAAC,eAAe,IAAI,MAAM,GAAG,IAAI;IAIvC,OAAO,IAAI,OAAO;IAIZ,IAAI,CAAC,OAAO,EAAE,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;YA4IzD,YAAY;CAwB3B;AAWD;;;;;;;;;;;;;;GAcG;AACH,wBAAsB,sBAAsB,IAAI,OAAO,CAAC,IAAI,CAAC,CAU5D"}

package/dist/llamacpp/llama-cpp-provider.js ADDED Viewed

@@ -0,0 +1,268 @@
+import { existsSync } from 'node:fs';
+import * as nodePath from 'node:path';
+import { AppErrorImpl, ERROR_CODES, err, ok } from '@shrkcrft/core';
+import { AbstractAiProvider } from "../ai-provider.js";
+import { AiMessageRole } from "../ai-request.js";
+const DEFAULT_CONTEXT_SIZE = 8192;
+const DEFAULT_MAX_TOKENS = 1024;
+/**
+ * In-process generative provider backed by `node-llama-cpp` (a Node
+ * binding for llama.cpp). No HTTP. No daemon. The model is loaded
+ * once into process memory and reused across requests.
+ *
+ * Configuration (env or `IAiProviderConfig`):
+ *   - `LLAMACPP_MODEL_PATH`   — absolute or repo-relative path to a
+ *                               local `.gguf` file. If unset, the
+ *                               provider is `isReady() === false`.
+ *   - `LLAMACPP_CONTEXT_SIZE` — context window in tokens (default 8192).
+ *   - `LLAMACPP_GPU`          — `auto` (default) | `metal` | `cuda` | `off`.
+ *
+ * The first `send()` call pays the model-load cost (typically 1–10 s
+ * for a 3B Q4 model on Apple Silicon). Subsequent calls reuse
+ * the same `LlamaModel` + `LlamaContext`. A fresh `LlamaChatSession`
+ * is created per request so context isn't leaked between unrelated
+ * tasks.
+ *
+ * Tests can inject a fake generator via `_overrideForTests` to avoid
+ * pulling in the native binding and a 2 GB model file.
+ */
+export class LlamaCppProvider extends AbstractAiProvider {
+    id = 'llamacpp';
+    name = 'llama.cpp (in-process)';
+    /** Test hook — bypasses the native binding when set. */
+    static _overrideForTests = null;
+    /**
+     * Reads the module-level cache to expose the active model path for
+     * tools that need it (mostly the disposer). Returns null when no
+     * model has been loaded in this process.
+     */
+    static activeModelPath() {
+        return sharedLlamaState?.modelPath ?? null;
+    }
+    /** True when the configured model, or `LLAMACPP_MODEL_PATH`, resolves to an existing file. */
+    isReady() {
+        return resolveModelPath(this.config.model) !== null;
+    }
+    /**
+     * Runs one chat completion against the in-process model.
+     *
+     * System messages become the session's system prompt, earlier turns
+     * are replayed into a fresh session, and the trailing user turn is
+     * the prompt that gets answered. The session and its context
+     * sequence are released on every exit path, including failures.
+     *
+     * @param request Provider-neutral request. `model` may carry a path
+     *   that overrides the configured one; `maxTokens` defaults to 1024.
+     * @returns `ok(response)` on success; `err(AppError)` with
+     *   `INVALID_INPUT` when no model file resolves, or `IO_ERROR` when
+     *   the binding fails.
+     */
+    async send(request) {
+        const modelPath = resolveModelPath(request.model ?? this.config.model);
+        if (modelPath === null) {
+            return err(new AppErrorImpl(ERROR_CODES.INVALID_INPUT, 'LLAMACPP_MODEL_PATH is not set or the file does not exist.', {
+                suggestion: 'Set LLAMACPP_MODEL_PATH=/path/to/qwen2.5-coder-3b.gguf in .env, or pass --model <path> on the CLI.',
+            }));
+        }
+        if (LlamaCppProvider._overrideForTests) {
+            try {
+                const value = await LlamaCppProvider._overrideForTests(request, modelPath);
+                return ok(value);
+            }
+            catch (e) {
+                const reason = e instanceof Error ? e.message : String(e);
+                return err(new AppErrorImpl(ERROR_CODES.IO_ERROR, `Test override failed: ${reason}`, {
+                    cause: e,
+                }));
+            }
+        }
+        // Declared outside the try so the finally block can release them
+        // whether generation succeeds or throws.
+        let sequence;
+        let session;
+        try {
+            const tf = (await import('node-llama-cpp'));
+            const { LlamaChatSession } = tf;
+            const { context } = await this.ensureLoaded(modelPath);
+            sequence = context.getSequence();
+            session = new LlamaChatSession({
+                contextSequence: sequence,
+                systemPrompt: collectSystemPrompt(request.messages),
+            });
+            // Prior assistant/user turns get fed into the session in order so
+            // the model sees the conversation history. The trailing user turn
+            // is what we ask `prompt()` to respond to.
+            const turns = nonSystemTurns(request.messages);
+            for (let i = 0; i < turns.length - 1; i += 1) {
+                const turn = turns[i];
+                if (turn.role === AiMessageRole.Assistant) {
+                    // node-llama-cpp 3.x exposes session.addAssistantMessage in some
+                    // versions; older versions don't. Best effort: skip silently.
+                    const fn = session.addAssistantMessage;
+                    if (typeof fn === 'function')
+                        fn.call(session, turn.content);
+                    continue;
+                }
+                // For user turns that aren't the trailing one, prime them so the
+                // assistant response gets folded back into the context too.
+                await session.prompt(turn.content, {
+                    maxTokens: 1,
+                    stopOnAbortSignal: true,
+                });
+            }
+            const lastUser = turns[turns.length - 1];
+            const userPrompt = lastUser && lastUser.role === AiMessageRole.User ? lastUser.content : '';
+            const maxTokens = request.maxTokens ?? DEFAULT_MAX_TOKENS;
+            const wantsJson = !!request.responseFormat;
+            // When the caller wants JSON, ask llama.cpp to enforce it at
+            // sample time via a grammar. This eliminates a whole class of
+            // parse failures (preamble prose, trailing markdown, runaway
+            // continuation) that small models routinely produce. Best effort:
+            // if the grammar constructor isn't available in this version we
+            // fall back to plain prompting + trim.
+            let grammar = undefined;
+            if (wantsJson) {
+                try {
+                    const Ctor = tf.LlamaJsonSchemaGrammar;
+                    // CRITICAL: pass the *same* Llama instance the model was
+                    // loaded with. node-llama-cpp rejects mixing grammars from
+                    // one instance with a session from another ("The
+                    // LlamaGrammar … was created with a different Llama
+                    // instance"). Calling getLlama() again would also leak a
+                    // second native Metal device, which then crashes the
+                    // process on exit (`ggml_metal_device_free`).
+                    const sharedLlama = sharedLlamaState?.llama;
+                    if (Ctor && request.responseFormat?.schema && sharedLlama) {
+                        grammar = new Ctor(sharedLlama, request.responseFormat.schema);
+                    }
+                }
+                catch {
+                    grammar = undefined;
+                }
+            }
+            const start = Date.now();
+            const onChunk = request.onTokenStream;
+            const text = await session.prompt(userPrompt, {
+                maxTokens,
+                ...(request.temperature !== undefined ? { temperature: request.temperature } : {}),
+                ...(wantsJson ? { trimWhitespaceSuffix: true } : {}),
+                ...(grammar ? { grammar: grammar } : {}),
+                ...(onChunk
+                    ? {
+                        onTextChunk: (chunk) => {
+                            try {
+                                onChunk(chunk);
+                            }
+                            catch {
+                                // never let a callback failure break inference
+                            }
+                        },
+                    }
+                    : {}),
+            });
+            const elapsedMs = Date.now() - start;
+            return ok({
+                content: text,
+                model: nodePath.basename(modelPath),
+                finishReason: 'stop',
+                usage: {
+                // node-llama-cpp does not surface input/output token counts in a
+                // stable v3 API path; we leave usage undefined and let callers
+                // approximate from char count if needed.
+                },
+                raw: { backend: 'node-llama-cpp', modelPath, elapsedMs },
+            });
+        }
+        catch (e) {
+            const reason = e instanceof Error ? e.message : String(e);
+            return err(new AppErrorImpl(ERROR_CODES.IO_ERROR, `node-llama-cpp call failed: ${reason}`, {
+                cause: e,
+                suggestion: 'Verify LLAMACPP_MODEL_PATH points to a valid .gguf file readable by llama.cpp.',
+            }));
+        }
+        finally {
+            // Release the LlamaContext sequence so the next send() can take it.
+            // Without this we hit "No sequences left" on the next call — and
+            // that must hold after a failed prompt too, hence the finally.
+            // The LlamaModel + LlamaContext themselves stay loaded across
+            // calls. callMaybeDispose ignores undefined and swallows dispose
+            // errors, so cleanup can never replace the result computed above.
+            await callMaybeDispose(session);
+            await callMaybeDispose(sequence);
+        }
+    }
+    /**
+     * Returns the cached model + context for `modelPath`, loading them
+     * (and tearing down any previously loaded model) when the path
+     * differs from the cached one.
+     *
+     * @param modelPath Resolved path of the `.gguf` file to load.
+     * @returns The `{ model, context }` pair to run generation with.
+     * @throws Whatever the binding throws while loading; partially
+     *   created native resources are disposed before rethrowing.
+     */
+    async ensureLoaded(modelPath) {
+        // Cached at MODULE scope so the disposer can find it on process
+        // exit. (Per-instance caching used to live here, but the disposer
+        // doesn't know which provider instance to ask.)
+        if (sharedLlamaState && sharedLlamaState.modelPath === modelPath) {
+            return { model: sharedLlamaState.model, context: sharedLlamaState.context };
+        }
+        if (sharedLlamaState) {
+            // Different model requested — tear down the old one before
+            // loading a new one. Best-effort; failures are tolerated.
+            await disposeLlamaCppRuntime();
+        }
+        // Context window comes from LLAMACPP_CONTEXT_SIZE; anything that is
+        // not a positive integer (unset, empty, non-numeric) falls back to
+        // the default instead of reaching the binding as NaN.
+        const requestedContextSize = Number(process.env.LLAMACPP_CONTEXT_SIZE);
+        const contextSize = Number.isInteger(requestedContextSize) && requestedContextSize > 0
+            ? requestedContextSize
+            : DEFAULT_CONTEXT_SIZE;
+        const { getLlama } = (await import('node-llama-cpp'));
+        const llama = await getLlama({
+            gpu: resolveGpuChoice(this.config.baseUrl),
+        });
+        let model;
+        try {
+            model = await llama.loadModel({ modelPath });
+            const context = await model.createContext({ contextSize });
+            sharedLlamaState = { llama, model, context, modelPath };
+            return { model, context };
+        }
+        catch (e) {
+            // Nothing was cached, so the disposer cannot reach these
+            // handles later; release them now rather than leak them.
+            await callMaybeDispose(model);
+            await callMaybeDispose(llama);
+            throw e;
+        }
+    }
+}
+// Module-scope cache of the loaded runtime: `{ llama, model, context, modelPath }`
+// once `ensureLoaded()` has succeeded; reset to null by `disposeLlamaCppRuntime()`.
+let sharedLlamaState = null;
+/**
+ * Tear down the cached llama.cpp runtime so the process can exit
+ * cleanly.
+ *
+ * Without this, the libc++ destructor for the Metal device list
+ * aborts on `exit()` with `ggml_metal_device_free` because the
+ * device list isn't empty — same shape of teardown crash as the
+ * ONNX mutex issue, different native library. Resources are
+ * released dependents-first (context → model → llama) so the
+ * destructors run while the JS runtime is still healthy.
+ *
+ * Safe to call repeatedly and when nothing was ever loaded. Errors
+ * raised while disposing are swallowed (the alternative is the
+ * abort we're trying to prevent).
+ */
+export async function disposeLlamaCppRuntime() {
+    const cached = sharedLlamaState;
+    // Cleared before the first await, so a second call made while
+    // disposal is in flight finds nothing left to release.
+    sharedLlamaState = null;
+    if (!cached)
+        return;
+    // The context holds the sequence pool that depends on the model, the
+    // model depends on the llama runtime, and the Llama instance itself
+    // releases the Metal device — hence this order.
+    const teardownOrder = [cached.context, cached.model, cached.llama];
+    for (const resource of teardownOrder) {
+        await callMaybeDispose(resource);
+    }
+}
+/**
+ * Calls `target.dispose()` when `target` is a non-null object that has
+ * such a method, awaiting the result if it is thenable. Anything the
+ * call throws or rejects with is deliberately ignored.
+ */
+async function callMaybeDispose(target) {
+    const isObject = typeof target === 'object' && target !== null;
+    if (!isObject || typeof target.dispose !== 'function') {
+        return;
+    }
+    try {
+        const pending = target.dispose();
+        if (pending && typeof pending.then === 'function') {
+            await pending.catch(() => undefined);
+        }
+    }
+    catch {
+        // best effort: disposal failures must not surface
+    }
+}
+/**
+ * Picks the model path — a non-empty `explicit` value wins over
+ * `LLAMACPP_MODEL_PATH` — and returns it only if the file exists.
+ * Relative paths are resolved against the current working directory.
+ * Returns null when no path is configured or the file is missing.
+ */
+function resolveModelPath(explicit) {
+    const chosen = explicit && explicit.length > 0 ? explicit : process.env.LLAMACPP_MODEL_PATH;
+    if (!chosen) {
+        return null;
+    }
+    // Absolute paths are returned untouched; only relative ones are resolved.
+    const location = nodePath.isAbsolute(chosen)
+        ? chosen
+        : nodePath.resolve(process.cwd(), chosen);
+    return existsSync(location) ? location : null;
+}
+/**
+ * Maps `LLAMACPP_GPU` (trimmed, case-insensitive) to a GPU setting:
+ * `metal` and `cuda` pass through, `off`/`false`/`no`/`cpu` yield
+ * `false`, and anything else — including an unset variable — yields
+ * `auto`. The `_baseUrl` argument is not used.
+ */
+function resolveGpuChoice(_baseUrl) {
+    const raw = process.env.LLAMACPP_GPU ?? 'auto';
+    switch (raw.trim().toLowerCase()) {
+        case 'metal':
+            return 'metal';
+        case 'cuda':
+            return 'cuda';
+        case 'off':
+        case 'false':
+        case 'no':
+        case 'cpu':
+            return false;
+        default:
+            return 'auto';
+    }
+}
+/**
+ * Joins the content of every system message with a blank line between
+ * entries; yields an empty string when there are none.
+ */
+function collectSystemPrompt(messages) {
+    const systemTexts = [];
+    for (const message of messages) {
+        if (message.role === AiMessageRole.System) {
+            systemTexts.push(message.content);
+        }
+    }
+    return systemTexts.join('\n\n');
+}
+/** Returns a new array holding every message whose role is not System, in the original order. */
+function nonSystemTurns(messages) {
+    const turns = [];
+    for (const message of messages) {
+        if (message.role !== AiMessageRole.System) {
+            turns.push(message);
+        }
+    }
+    return turns;
+}

package/dist/llm-hints.d.ts ADDED Viewed

@@ -0,0 +1,36 @@
+import type { AiProviderKind } from './provider-resolver.js';
+import type { IAiProvider } from './ai-provider.js';
+export type AiHintLevel = 'setup' | 'upgrade' | 'info';
+/** One piece of guidance attached to an AI block; `renderAiBlockMarkdown` renders it as a titled bullet list. */
+export interface IAiHint {
+    /** Kind of guidance: `buildAiBlock` emits `setup` when no provider is reachable, `upgrade` for tuning advice, `info` for status. */
+    level: AiHintLevel;
+    /** Short heading for the hint. */
+    title: string;
+    /** Ordered lines of advice, one bullet each when rendered. */
+    steps: readonly string[];
+}
+/** Structured summary of the AI configuration, as produced by `buildAiBlock`. */
+export interface IAiBlock {
+    /** True when the selection carried a provider (i.e. `provider` was not null). */
+    reachable: boolean;
+    /** The provider kind that was requested (`selection.requested`). */
+    requestedProvider: AiProviderKind;
+    /** `id` of the selected provider, or null when none was selected. */
+    providerId: string | null;
+    /** Mirrors the caller's `userOptedOut` flag. */
+    enhancementSkipped: boolean;
+    /** Guidance matching the state above. */
+    hints: readonly IAiHint[];
+}
+/** Optional inputs for `buildAiBlock`; every field may be omitted. */
+export interface IBuildAiBlockInput {
+    /**
+     * What `selectAiProvider` returned, or null if the caller didn't try.
+     * When omitted or null and the user has not opted out, `buildAiBlock`
+     * calls `selectAiProvider` itself.
+     */
+    selection?: {
+        requested: AiProviderKind;
+        provider: IAiProvider | null;
+    } | null;
+    /** True when --no-enhance was passed (user opted out — don't nag). */
+    userOptedOut?: boolean;
+}
+/**
+ * Produces the structured `ai` block that lives on every audit report
+ * and any command using `enrichWithLlmRecommendations`. Without the
+ * AI block, `--no-enhance` and "no provider reachable" look the same
+ * to a downstream agent. The block disambiguates.
+ *
+ * Lives in `@shrkcrft/ai` so any package (CLI, packs, MCP server's
+ * read-only surfaces) can construct the same shape.
+ */
+export declare function buildAiBlock(input?: IBuildAiBlockInput): IAiBlock;
+/**
+ * Renders an `IAiBlock` as Markdown: a `## AI configuration — <status>`
+ * heading, then one `### [level] title` subsection per hint with each
+ * step as a bullet.
+ */
+export declare function renderAiBlockMarkdown(block: IAiBlock): string;
+//# sourceMappingURL=llm-hints.d.ts.map

package/dist/llm-hints.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"llm-hints.d.ts","sourceRoot":"","sources":["../src/llm-hints.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AAC7D,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAEpD,MAAM,MAAM,WAAW,GAAG,OAAO,GAAG,SAAS,GAAG,MAAM,CAAC;AAEvD,MAAM,WAAW,OAAO;IACtB,KAAK,EAAE,WAAW,CAAC;IACnB,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,SAAS,MAAM,EAAE,CAAC;CAC1B;AAED,MAAM,WAAW,QAAQ;IACvB,SAAS,EAAE,OAAO,CAAC;IACnB,iBAAiB,EAAE,cAAc,CAAC;IAClC,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,kBAAkB,EAAE,OAAO,CAAC;IAC5B,KAAK,EAAE,SAAS,OAAO,EAAE,CAAC;CAC3B;AAED,MAAM,WAAW,kBAAkB;IACjC,0EAA0E;IAC1E,SAAS,CAAC,EAAE;QAAE,SAAS,EAAE,cAAc,CAAC;QAAC,QAAQ,EAAE,WAAW,GAAG,IAAI,CAAA;KAAE,GAAG,IAAI,CAAC;IAC/E,sEAAsE;IACtE,YAAY,CAAC,EAAE,OAAO,CAAC;CACxB;AAED;;;;;;;;GAQG;AACH,wBAAgB,YAAY,CAAC,KAAK,GAAE,kBAAuB,GAAG,QAAQ,CAiErE;AAED,wBAAgB,qBAAqB,CAAC,KAAK,EAAE,QAAQ,GAAG,MAAM,CAiB7D"}

package/dist/llm-hints.js ADDED Viewed

@@ -0,0 +1,92 @@
+import { selectAiProvider } from "./provider-resolver.js";
+/**
+ * Builds the structured `ai` block attached to every audit report and
+ * to any command using `enrichWithLlmRecommendations`. The block lets
+ * a downstream agent tell `--no-enhance` apart from "no provider
+ * reachable", which would otherwise look identical.
+ *
+ * Lives in `@shrkcrft/ai` so any package (CLI, packs, MCP server's
+ * read-only surfaces) can construct the same shape.
+ *
+ * A selection supplied by the caller is used as-is. Only when none is
+ * supplied does the function fall back: to an empty `auto` selection
+ * if the user opted out, otherwise to a live `selectAiProvider` probe.
+ */
+export function buildAiBlock(input = {}) {
+    const optedOut = Boolean(input.userOptedOut);
+    let chosen;
+    if (input.selection !== undefined && input.selection !== null) {
+        chosen = input.selection;
+    }
+    else if (optedOut) {
+        // Opted out and nothing supplied: don't probe the auto chain.
+        chosen = { requested: 'auto', provider: null };
+    }
+    else {
+        chosen = selectAiProvider(undefined);
+    }
+    const hasProvider = chosen.provider !== null;
+    const activeId = chosen.provider?.id ?? null;
+    let hints;
+    if (hasProvider) {
+        hints = [
+            {
+                level: 'info',
+                title: `LLM enrichment active via ${activeId}`,
+                steps: [
+                    'LLM-derived findings appear with `[llm]` tags and a confidence score.',
+                    'Tune behavior: --provider ollama|llamacpp, --model <id>, AI_PROVIDER env var (overrides --provider when unset).',
+                ],
+            },
+            {
+                level: 'upgrade',
+                title: 'Sharpen LLM output if findings feel thin',
+                steps: [
+                    'Prefer a code-aware model for technical staleness checks (e.g. qwen2.5-coder:7b, deepseek-coder-v2).',
+                    'Larger models notice more drift but cost latency — try 7B for code, 14B+ for nuanced doc-content review.',
+                    'For fix-plan enrichment, the same provider is reused; no separate config needed.',
+                ],
+            },
+        ];
+    }
+    else if (optedOut) {
+        hints = [
+            {
+                level: 'info',
+                title: 'LLM enrichment disabled by --no-enhance',
+                steps: [
+                    'Deterministic findings are first-class; LLM is purely additive.',
+                    'Drop --no-enhance to layer LLM critique on top when a provider is available.',
+                ],
+            },
+        ];
+    }
+    else {
+        hints = [
+            {
+                level: 'setup',
+                title: 'Enable LLM enrichment for deeper analysis',
+                steps: [
+                    'Local-first: install Ollama (https://ollama.com/download) or set LLAMACPP_MODEL_PATH for in-process inference.',
+                    'Pull a model that fits your machine — e.g. `ollama pull llama3.2` (good general-purpose) or `ollama pull qwen2.5-coder:7b` (code-aware).',
+                    'Optional: export OLLAMA_HOST=http://localhost:11434 (default) or point at a remote daemon.',
+                    'Optional: export OLLAMA_MODEL=<id> to pin the model used by shrk.',
+                    'Re-run without --no-enhance. The deterministic findings are unchanged; LLM critique appears under `llmFindings`.',
+                ],
+            },
+        ];
+    }
+    return {
+        reachable: hasProvider,
+        requestedProvider: chosen.requested,
+        providerId: activeId,
+        enhancementSkipped: optedOut,
+        hints,
+    };
+}
+/**
+ * Renders an AI block as Markdown: a `## AI configuration — <status>`
+ * heading and a blank line, then for each hint a `### [level] title`
+ * line, one bullet per step and a trailing blank line.
+ */
+export function renderAiBlockMarkdown(block) {
+    let status;
+    if (block.reachable) {
+        status = `active via \`${block.providerId}\``;
+    }
+    else if (block.enhancementSkipped) {
+        status = 'disabled by `--no-enhance`';
+    }
+    else {
+        status = 'unavailable (no local LLM detected)';
+    }
+    const sections = block.hints.flatMap((hint) => [
+        `### [${hint.level}] ${hint.title}`,
+        ...hint.steps.map((step) => `- ${step}`),
+        '',
+    ]);
+    return [`## AI configuration — ${status}`, '', ...sections].join('\n');
+}