@shrkcrft/ai 0.1.0-alpha.11 → 0.1.0-alpha.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,6 +13,16 @@ export interface IAiRequest {
13
13
  maxTokens?: number;
14
14
  temperature?: number;
15
15
  context?: string;
16
+ responseFormat?: IAiResponseFormat;
17
+ /**
18
+ * Optional callback invoked with each newly-decoded token chunk as
19
+ * generation streams in. Providers that don't natively stream
20
+ * (HTTP Gemini / Claude in our non-SSE adapter) ignore this; the
21
+ * llamacpp provider forwards chunks live. Useful for stderr "live
22
+ * preview" in CLI commands and for an agent who wants to display
23
+ * progress without the synchronous wait.
24
+ */
25
+ onTokenStream?: (chunk: string) => void;
16
26
  }
17
27
  export interface IAiResponse {
18
28
  content: string;
@@ -30,4 +40,9 @@ export interface IAiProviderConfig {
30
40
  model?: string;
31
41
  timeoutMs?: number;
32
42
  }
43
/**
 * Structured-output request a caller can attach to an AI request via
 * its optional `responseFormat` field.
 */
export interface IAiResponseFormat {
    /** Requested output mode: free-form JSON or JSON constrained by a schema. */
    type: 'json_object' | 'json_schema';
    /**
     * Schema describing the expected JSON value.
     * NOTE(review): presumably only meaningful with `type: 'json_schema'` — confirm.
     */
    schema?: Record<string, unknown>;
    /** Optional label for `schema`. */
    schemaName?: string;
}
33
48
  //# sourceMappingURL=ai-request.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"ai-request.d.ts","sourceRoot":"","sources":["../src/ai-request.ts"],"names":[],"mappings":"AAAA,oBAAY,aAAa;IACvB,MAAM,WAAW;IACjB,IAAI,SAAS;IACb,SAAS,cAAc;CACxB;AAED,MAAM,WAAW,UAAU;IACzB,IAAI,EAAE,aAAa,CAAC;IACpB,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,UAAU;IACzB,QAAQ,EAAE,SAAS,UAAU,EAAE,CAAC;IAChC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,KAAK,CAAC,EAAE;QAAE,WAAW,CAAC,EAAE,MAAM,CAAC;QAAC,YAAY,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IACxD,GAAG,CAAC,EAAE,OAAO,CAAC;CACf;AAED,MAAM,WAAW,iBAAiB;IAChC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB"}
1
+ {"version":3,"file":"ai-request.d.ts","sourceRoot":"","sources":["../src/ai-request.ts"],"names":[],"mappings":"AAAA,oBAAY,aAAa;IACvB,MAAM,WAAW;IACjB,IAAI,SAAS;IACb,SAAS,cAAc;CACxB;AAED,MAAM,WAAW,UAAU;IACzB,IAAI,EAAE,aAAa,CAAC;IACpB,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,UAAU;IACzB,QAAQ,EAAE,SAAS,UAAU,EAAE,CAAC;IAChC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,cAAc,CAAC,EAAE,iBAAiB,CAAC;IACnC;;;;;;;OAOG;IACH,aAAa,CAAC,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;CACzC;AAED,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,KAAK,CAAC,EAAE;QAAE,WAAW,CAAC,EAAE,MAAM,CAAC;QAAC,YAAY,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IACxD,GAAG,CAAC,EAAE,OAAO,CAAC;CACf;AAED,MAAM,WAAW,iBAAiB;IAChC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,aAAa,GAAG,aAAa,CAAC;IACpC,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACjC,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB"}
@@ -0,0 +1,24 @@
1
+ import { type AppError, type Result } from '@shrkcrft/core';
2
+ import { AbstractAiProvider } from '../ai-provider.js';
3
+ import { type IAiRequest, type IAiResponse } from '../ai-request.js';
4
/**
 * Provider that talks to Google's Gemini (Generative Language API)
 * over HTTP.
 *
 * The API key comes from `IAiProviderConfig.apiKey` or the
 * `GEMINI_API_KEY` environment variable. Without a key, `isReady()`
 * is false and `send()` resolves to an actionable error, matching the
 * `ClaudeProvider` contract.
 *
 * Gemini's wire format is not Anthropic's: system messages travel in a
 * top-level `systemInstruction`, conversation turns are `contents[]`
 * entries with role `user` or `model`, and the output cap is
 * `generationConfig.maxOutputTokens` rather than `max_tokens`. The
 * adapter converts the provider-neutral `IAiRequest` to that format
 * and converts the reply back.
 */
export declare class GeminiProvider extends AbstractAiProvider {
    readonly id = "gemini";
    readonly name = "Google Gemini (HTTP)";
    /** Whether an API key is available. */
    isReady(): boolean;
    /** Sends one request; failures are returned as `err`, not thrown. */
    send(request: IAiRequest): Promise<Result<IAiResponse, AppError>>;
}
24
+ //# sourceMappingURL=gemini-provider.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"gemini-provider.d.ts","sourceRoot":"","sources":["../../src/gemini/gemini-provider.ts"],"names":[],"mappings":"AAAA,OAAO,EAAsC,KAAK,QAAQ,EAAE,KAAK,MAAM,EAAE,MAAM,gBAAgB,CAAC;AAChG,OAAO,EAAE,kBAAkB,EAAE,MAAM,mBAAmB,CAAC;AACvD,OAAO,EAAkC,KAAK,UAAU,EAAE,KAAK,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAErG;;;;;;;;;;;;;GAaG;AACH,qBAAa,cAAe,SAAQ,kBAAkB;IACpD,QAAQ,CAAC,EAAE,YAAY;IACvB,QAAQ,CAAC,IAAI,0BAA0B;IAEvC,OAAO,IAAI,OAAO;IAIZ,IAAI,CAAC,OAAO,EAAE,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;CAyExE"}
@@ -0,0 +1,97 @@
1
+ import { AppErrorImpl, ERROR_CODES, err, ok } from '@shrkcrft/core';
2
+ import { AbstractAiProvider } from "../ai-provider.js";
3
+ import { AiMessageRole } from "../ai-request.js";
4
/**
 * HTTP adapter for Google's Gemini (Generative Language API).
 *
 * Reads `GEMINI_API_KEY` from env (or `IAiProviderConfig.apiKey`). When
 * the key is missing `isReady()` returns false and `send()` reports an
 * actionable error — same contract as `ClaudeProvider`.
 *
 * The Gemini REST surface differs from Anthropic's: system messages are
 * passed as a top-level `systemInstruction`, conversation turns become
 * `contents[]` with roles `user`/`model`, and the response token cap is
 * `generationConfig.maxOutputTokens` (not `max_tokens`). This adapter
 * translates the provider-neutral `IAiRequest` shape into that wire
 * format and back.
 *
 * The API key is sent in the `x-goog-api-key` header rather than the
 * URL so it cannot end up in proxy or access logs. When
 * `IAiProviderConfig.timeoutMs` is a positive number the request is
 * aborted after that many milliseconds.
 */
export class GeminiProvider extends AbstractAiProvider {
    id = 'gemini';
    name = 'Google Gemini (HTTP)';
    /**
     * @returns true when a non-empty API key is available from config or env.
     */
    isReady() {
        // `||` (not `??`) so an empty-string config key falls through to the env var.
        return Boolean(this.config.apiKey || process.env.GEMINI_API_KEY);
    }
    /**
     * Sends `request` to the `generateContent` endpoint.
     *
     * @param request provider-neutral request; `model`, `maxTokens`,
     *   `temperature` and `responseFormat` are honoured.
     * @returns `ok` with the concatenated text parts of the first
     *   candidate, or `err` (`INVALID_INPUT` when no key is configured,
     *   `IO_ERROR` for HTTP, network, timeout and parse failures).
     *   Never throws.
     */
    async send(request) {
        const apiKey = this.config.apiKey || process.env.GEMINI_API_KEY;
        if (!apiKey) {
            return err(new AppErrorImpl(ERROR_CODES.INVALID_INPUT, 'GEMINI_API_KEY is not set — cannot reach Gemini', { suggestion: 'Put GEMINI_API_KEY=... in .env or `export GEMINI_API_KEY=...`' }));
        }
        // Strip trailing slashes so a configured "https://host/" doesn't yield "//v1beta".
        const baseUrl = (this.config.baseUrl ?? 'https://generativelanguage.googleapis.com').replace(/\/+$/, '');
        const model = request.model ?? this.config.model ?? 'gemini-2.5-flash';
        const maxTokens = request.maxTokens ?? 4096;
        const systemInstructionText = collectSystem(request.messages);
        const contents = collectContents(request.messages);
        const body = {
            contents,
            generationConfig: {
                maxOutputTokens: maxTokens,
                ...(request.temperature !== undefined ? { temperature: request.temperature } : {}),
                // Any requested response format switches the reply to JSON.
                ...(request.responseFormat ? { responseMimeType: 'application/json' } : {}),
            },
        };
        if (systemInstructionText) {
            body.systemInstruction = { role: 'system', parts: [{ text: systemInstructionText }] };
        }
        const timeoutMs = this.config.timeoutMs;
        const hasTimeout = typeof timeoutMs === 'number' && Number.isFinite(timeoutMs) && timeoutMs > 0;
        try {
            const url = `${baseUrl}/v1beta/models/${encodeURIComponent(model)}:generateContent`;
            const res = await fetch(url, {
                method: 'POST',
                headers: {
                    'content-type': 'application/json',
                    'x-goog-api-key': apiKey,
                },
                body: JSON.stringify(body),
                ...(hasTimeout ? { signal: AbortSignal.timeout(timeoutMs) } : {}),
            });
            if (!res.ok) {
                const text = await res.text();
                // Cap the echoed body so a large error page doesn't flood the message.
                return err(new AppErrorImpl(ERROR_CODES.IO_ERROR, `Gemini API ${res.status}: ${text.slice(0, 500)}`));
            }
            const json = await res.json();
            const firstCandidate = json.candidates?.[0];
            const content = (firstCandidate?.content?.parts ?? [])
                .map((p) => (typeof p.text === 'string' ? p.text : ''))
                .join('');
            return ok({
                content,
                model: json.modelVersion ?? model,
                finishReason: firstCandidate?.finishReason,
                usage: {
                    inputTokens: json.usageMetadata?.promptTokenCount,
                    outputTokens: json.usageMetadata?.candidatesTokenCount,
                },
                raw: json,
            });
        }
        catch (e) {
            // Anything can be thrown; only read `.message` from real Errors.
            const reason = e instanceof Error ? e.message : String(e);
            return err(new AppErrorImpl(ERROR_CODES.IO_ERROR, `Failed to call Gemini: ${reason}`, {
                cause: e,
            }));
        }
    }
}
82
/**
 * Joins the content of every system-role message with a blank line
 * between entries.
 *
 * @returns the combined text, or `undefined` when there is no system message.
 */
function collectSystem(messages) {
    const systemTexts = [];
    for (const message of messages) {
        if (message.role === AiMessageRole.System) {
            systemTexts.push(message.content);
        }
    }
    if (systemTexts.length === 0) {
        return undefined;
    }
    return systemTexts.join('\n\n');
}
86
/**
 * Converts the non-system messages into Gemini `contents[]` entries.
 * Assistant turns get role `model`; every other turn gets role `user`.
 * Message order is preserved.
 */
function collectContents(messages) {
    return messages
        .filter((message) => message.role !== AiMessageRole.System)
        .map((message) => {
            const role = message.role === AiMessageRole.Assistant ? 'model' : 'user';
            return { role, parts: [{ text: message.content }] };
        });
}
package/dist/index.d.ts CHANGED
@@ -3,4 +3,11 @@ export * from './ai-request.js';
3
3
  export * from './prompt/prompt-builder.js';
4
4
  export * from './claude/claude-provider.js';
5
5
  export * from './claude/claude-cli-adapter.js';
6
+ export * from './gemini/gemini-provider.js';
7
+ export * from './ollama/ollama-provider.js';
8
+ export * from './llamacpp/llama-cpp-provider.js';
9
+ export * from './provider-resolver.js';
10
+ export * from './pipeline/enhancement-pipeline.js';
11
+ export * from './llm-hints.js';
12
+ export * from './llm-recommendations.js';
6
13
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,kBAAkB,CAAC;AACjC,cAAc,iBAAiB,CAAC;AAChC,cAAc,4BAA4B,CAAC;AAC3C,cAAc,6BAA6B,CAAC;AAC5C,cAAc,gCAAgC,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,kBAAkB,CAAC;AACjC,cAAc,iBAAiB,CAAC;AAChC,cAAc,4BAA4B,CAAC;AAC3C,cAAc,6BAA6B,CAAC;AAC5C,cAAc,gCAAgC,CAAC;AAC/C,cAAc,6BAA6B,CAAC;AAC5C,cAAc,6BAA6B,CAAC;AAC5C,cAAc,kCAAkC,CAAC;AACjD,cAAc,wBAAwB,CAAC;AACvC,cAAc,oCAAoC,CAAC;AACnD,cAAc,gBAAgB,CAAC;AAC/B,cAAc,0BAA0B,CAAC"}
package/dist/index.js CHANGED
@@ -3,3 +3,10 @@ export * from "./ai-request.js";
3
3
  export * from "./prompt/prompt-builder.js";
4
4
  export * from "./claude/claude-provider.js";
5
5
  export * from "./claude/claude-cli-adapter.js";
6
+ export * from "./gemini/gemini-provider.js";
7
+ export * from "./ollama/ollama-provider.js";
8
+ export * from "./llamacpp/llama-cpp-provider.js";
9
+ export * from "./provider-resolver.js";
10
+ export * from "./pipeline/enhancement-pipeline.js";
11
+ export * from "./llm-hints.js";
12
+ export * from "./llm-recommendations.js";
@@ -0,0 +1,56 @@
1
+ import { type AppError, type Result } from '@shrkcrft/core';
2
+ import { AbstractAiProvider } from '../ai-provider.js';
3
+ import { type IAiRequest, type IAiResponse } from '../ai-request.js';
4
/**
 * Generative provider that runs in-process on `node-llama-cpp` (the
 * Node binding for llama.cpp). There is no HTTP hop and no daemon: the
 * model is loaded into process memory once and shared by later
 * requests.
 *
 * Configuration (env or `IAiProviderConfig`):
 *   - `LLAMACPP_MODEL_PATH`   — absolute or repo-relative path to a
 *                               local `.gguf` file. When unset,
 *                               `isReady()` is false.
 *   - `LLAMACPP_CONTEXT_SIZE` — context window in tokens (default 8192).
 *   - `LLAMACPP_GPU`          — `auto` (default) | `metal` | `cuda` | `off`.
 *
 * The first `send()` pays the model-load cost (typically 1–10 s for a
 * 3B Q4 model on Apple Silicon). Later calls reuse the same
 * `LlamaModel` and `LlamaContext`, while each request gets its own
 * `LlamaChatSession` so unrelated tasks never share conversation
 * state.
 *
 * Tests can install a fake generator through `_overrideForTests` and
 * avoid both the native binding and a 2 GB model file.
 */
export declare class LlamaCppProvider extends AbstractAiProvider {
    readonly id = "llamacpp";
    readonly name = "llama.cpp (in-process)";
    /** Test hook: when non-null it is called instead of the native binding. */
    static _overrideForTests: ((request: IAiRequest, modelPath: string) => Promise<IAiResponse>) | null;
    /**
     * Path of the model held in the module-level cache, for tools that
     * need it (mostly the disposer). `null` when nothing has been loaded
     * in this process.
     */
    static activeModelPath(): string | null;
    /** Whether a model path resolves to an existing file. */
    isReady(): boolean;
    /** Runs one request; failures are returned as `err`, not thrown. */
    send(request: IAiRequest): Promise<Result<IAiResponse, AppError>>;
    private ensureLoaded;
}
40
/**
 * Frees the loaded llama.cpp model and context so the process can
 * shut down cleanly.
 *
 * If this is skipped, the libc++ destructor for the Metal device list
 * aborts during `exit()` with `ggml_metal_device_free` because the
 * list is not empty — the same kind of teardown crash as the ONNX
 * mutex issue, in a different native library. Disposing in the order
 * session → context → model → llama lets the destructors run while
 * the JS runtime is still healthy.
 *
 * Idempotent, and a no-op when no model was ever loaded. Errors
 * raised while disposing are swallowed, since the alternative is the
 * very abort this function exists to prevent.
 */
export declare function disposeLlamaCppRuntime(): Promise<void>;
56
+ //# sourceMappingURL=llama-cpp-provider.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"llama-cpp-provider.d.ts","sourceRoot":"","sources":["../../src/llamacpp/llama-cpp-provider.ts"],"names":[],"mappings":"AAEA,OAAO,EAAsC,KAAK,QAAQ,EAAE,KAAK,MAAM,EAAE,MAAM,gBAAgB,CAAC;AAChG,OAAO,EAAE,kBAAkB,EAAE,MAAM,mBAAmB,CAAC;AACvD,OAAO,EAAkC,KAAK,UAAU,EAAE,KAAK,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAKrG;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,qBAAa,gBAAiB,SAAQ,kBAAkB;IACtD,QAAQ,CAAC,EAAE,cAAc;IACzB,QAAQ,CAAC,IAAI,4BAA4B;IAEzC,wDAAwD;IACxD,MAAM,CAAC,iBAAiB,EACpB,CAAC,CAAC,OAAO,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,KAAK,OAAO,CAAC,WAAW,CAAC,CAAC,GAClE,IAAI,CAAQ;IAEhB;;;;OAIG;IACH,MAAM,CAAC,eAAe,IAAI,MAAM,GAAG,IAAI;IAIvC,OAAO,IAAI,OAAO;IAIZ,IAAI,CAAC,OAAO,EAAE,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;YA4IzD,YAAY;CAwB3B;AAWD;;;;;;;;;;;;;;GAcG;AACH,wBAAsB,sBAAsB,IAAI,OAAO,CAAC,IAAI,CAAC,CAU5D"}
@@ -0,0 +1,268 @@
1
+ import { existsSync } from 'node:fs';
2
+ import * as nodePath from 'node:path';
3
+ import { AppErrorImpl, ERROR_CODES, err, ok } from '@shrkcrft/core';
4
+ import { AbstractAiProvider } from "../ai-provider.js";
5
+ import { AiMessageRole } from "../ai-request.js";
6
const DEFAULT_CONTEXT_SIZE = 8192;
const DEFAULT_MAX_TOKENS = 1024;
/**
 * In-process generative provider backed by `node-llama-cpp` (a Node
 * binding for llama.cpp). No HTTP. No daemon. The model is loaded
 * once into process memory and reused across requests.
 *
 * Configuration (env or `IAiProviderConfig`):
 *   - `LLAMACPP_MODEL_PATH`   — absolute or repo-relative path to a
 *                               local `.gguf` file. If unset, the
 *                               provider is `isReady() === false`.
 *   - `LLAMACPP_CONTEXT_SIZE` — context window in tokens (default 8192).
 *                               Values that are not positive integers
 *                               fall back to the default.
 *   - `LLAMACPP_GPU`          — `auto` (default) | `metal` | `cuda` | `off`.
 *
 * The first `send()` call pays the model-load cost (typically 1–10 s
 * for a 3B Q4 model on Apple Silicon). Subsequent calls reuse
 * the same `LlamaModel` + `LlamaContext`. A fresh `LlamaChatSession`
 * is created per request so context isn't leaked between unrelated
 * tasks.
 *
 * Tests can inject a fake generator via `_overrideForTests` to avoid
 * pulling in the native binding and a 2 GB model file.
 */
export class LlamaCppProvider extends AbstractAiProvider {
    id = 'llamacpp';
    name = 'llama.cpp (in-process)';
    /** Test hook — bypasses the native binding when set. */
    static _overrideForTests = null;
    /**
     * Reads the module-level cache to expose the active model path for
     * tools that need it (mostly the disposer). Returns null when no
     * model has been loaded in this process.
     */
    static activeModelPath() {
        return sharedLlamaState?.modelPath ?? null;
    }
    /**
     * @returns true when the configured (or env-provided) model path
     *   points at an existing file.
     */
    isReady() {
        return resolveModelPath(this.config.model) !== null;
    }
    /**
     * Runs one chat completion against the locally loaded model.
     *
     * @param request provider-neutral request. `model` overrides the
     *   configured path; `responseFormat.schema` enables grammar-constrained
     *   JSON when the binding supports it; `onTokenStream` receives text
     *   chunks as they are produced.
     * @returns `ok` with the generated text, or `err` (`INVALID_INPUT`
     *   when no model file resolves, `IO_ERROR` for load or inference
     *   failures). Never throws.
     */
    async send(request) {
        const modelPath = resolveModelPath(request.model ?? this.config.model);
        if (modelPath === null) {
            return err(new AppErrorImpl(ERROR_CODES.INVALID_INPUT, 'LLAMACPP_MODEL_PATH is not set or the file does not exist.', {
                suggestion: 'Set LLAMACPP_MODEL_PATH=/path/to/qwen2.5-coder-3b.gguf in .env, or pass --model <path> on the CLI.',
            }));
        }
        if (LlamaCppProvider._overrideForTests) {
            try {
                const value = await LlamaCppProvider._overrideForTests(request, modelPath);
                return ok(value);
            }
            catch (e) {
                const reason = e instanceof Error ? e.message : String(e);
                return err(new AppErrorImpl(ERROR_CODES.IO_ERROR, `Test override failed: ${reason}`, {
                    cause: e,
                }));
            }
        }
        // Declared outside the try so the finally block can release them
        // on every exit path, including a throwing prompt().
        let sequence;
        let session;
        try {
            const tf = await import('node-llama-cpp');
            const { LlamaChatSession } = tf;
            const { context } = await this.ensureLoaded(modelPath);
            sequence = context.getSequence();
            session = new LlamaChatSession({
                contextSequence: sequence,
                systemPrompt: collectSystemPrompt(request.messages),
            });
            // Prior assistant/user turns get fed into the session in order so
            // the model sees the conversation history. The trailing user turn
            // is what we ask `prompt()` to respond to.
            const turns = nonSystemTurns(request.messages);
            for (let i = 0; i < turns.length - 1; i += 1) {
                const turn = turns[i];
                if (turn.role === AiMessageRole.Assistant) {
                    // node-llama-cpp 3.x exposes session.addAssistantMessage in some
                    // versions; older versions don't. Best effort: skip silently.
                    const fn = session.addAssistantMessage;
                    if (typeof fn === 'function')
                        fn.call(session, turn.content);
                    continue;
                }
                // For user turns that aren't the trailing one, prime them so the
                // assistant response gets folded back into the context too.
                await session.prompt(turn.content, {
                    maxTokens: 1,
                    stopOnAbortSignal: true,
                });
            }
            const lastUser = turns[turns.length - 1];
            const userPrompt = lastUser && lastUser.role === AiMessageRole.User ? lastUser.content : '';
            const maxTokens = request.maxTokens ?? DEFAULT_MAX_TOKENS;
            const wantsJson = !!request.responseFormat;
            // When the caller wants JSON, ask llama.cpp to enforce it at
            // sample time via a grammar. This eliminates a whole class of
            // parse failures (preamble prose, trailing markdown, runaway
            // continuation) that small models routinely produce. Best effort:
            // if the grammar constructor isn't available in this version we
            // fall back to plain prompting + trim.
            let grammar = undefined;
            if (wantsJson) {
                try {
                    const Ctor = tf.LlamaJsonSchemaGrammar;
                    // CRITICAL: pass the *same* Llama instance the model was
                    // loaded with. node-llama-cpp rejects mixing grammars from
                    // one instance with a session from another ("The
                    // LlamaGrammar … was created with a different Llama
                    // instance"). Calling getLlama() again would also leak a
                    // second native Metal device, which then crashes the
                    // process on exit (`ggml_metal_device_free`).
                    const sharedLlama = sharedLlamaState?.llama;
                    if (Ctor && request.responseFormat?.schema && sharedLlama) {
                        grammar = new Ctor(sharedLlama, request.responseFormat.schema);
                    }
                }
                catch {
                    grammar = undefined;
                }
            }
            const start = Date.now();
            const onChunk = request.onTokenStream;
            const text = await session.prompt(userPrompt, {
                maxTokens,
                ...(request.temperature !== undefined ? { temperature: request.temperature } : {}),
                ...(wantsJson ? { trimWhitespaceSuffix: true } : {}),
                ...(grammar ? { grammar: grammar } : {}),
                ...(onChunk
                    ? {
                        onTextChunk: (chunk) => {
                            try {
                                onChunk(chunk);
                            }
                            catch {
                                // never let a callback failure break inference
                            }
                        },
                    }
                    : {}),
            });
            const elapsedMs = Date.now() - start;
            return ok({
                content: text,
                model: nodePath.basename(modelPath),
                finishReason: 'stop',
                usage: {
                // node-llama-cpp does not surface input/output token counts in a
                // stable v3 API path; we leave usage undefined and let callers
                // approximate from char count if needed.
                },
                raw: { backend: 'node-llama-cpp', modelPath, elapsedMs },
            });
        }
        catch (e) {
            const reason = e instanceof Error ? e.message : String(e);
            return err(new AppErrorImpl(ERROR_CODES.IO_ERROR, `node-llama-cpp call failed: ${reason}`, {
                cause: e,
                suggestion: 'Verify LLAMACPP_MODEL_PATH points to a valid .gguf file readable by llama.cpp.',
            }));
        }
        finally {
            // Release the LlamaContext sequence so the next send() can take it.
            // Without this we hit "No sequences left" on the following call —
            // and that must hold after a failed prompt too, hence `finally`.
            // The LlamaModel + LlamaContext themselves stay loaded across calls.
            await callMaybeDispose(session);
            await callMaybeDispose(sequence);
        }
    }
    /**
     * Returns the shared model + context for `modelPath`, loading them
     * (and tearing down a previously loaded different model) if needed.
     *
     * @throws whatever `node-llama-cpp` throws while loading; any
     *   partially created native objects are disposed first.
     */
    async ensureLoaded(modelPath) {
        // Cached at MODULE scope so the disposer can find it on process
        // exit. (Per-instance caching used to live here, but the disposer
        // doesn't know which provider instance to ask.)
        if (sharedLlamaState && sharedLlamaState.modelPath === modelPath) {
            return { model: sharedLlamaState.model, context: sharedLlamaState.context };
        }
        if (sharedLlamaState) {
            // Different model requested — tear down the old one before
            // loading a new one. Best-effort; failures are tolerated.
            await disposeLlamaCppRuntime();
        }
        const { getLlama } = await import('node-llama-cpp');
        const llama = await getLlama({
            gpu: resolveGpuChoice(this.config.baseUrl),
        });
        let model;
        try {
            model = await llama.loadModel({ modelPath });
            const context = await model.createContext({ contextSize: resolveContextSize() });
            sharedLlamaState = { llama, model, context, modelPath };
            return { model, context };
        }
        catch (e) {
            // Nothing was cached, so the disposer can't reach these. Free them
            // here; a stray Llama instance is what triggers the exit-time abort.
            await callMaybeDispose(model);
            await callMaybeDispose(llama);
            throw e;
        }
    }
}
/**
 * Reads `LLAMACPP_CONTEXT_SIZE`, falling back to the default when the
 * variable is unset, blank, or not a positive integer.
 */
function resolveContextSize() {
    const raw = process.env.LLAMACPP_CONTEXT_SIZE;
    if (raw === undefined || raw.trim() === '') {
        return DEFAULT_CONTEXT_SIZE;
    }
    const parsed = Number(raw);
    return Number.isInteger(parsed) && parsed > 0 ? parsed : DEFAULT_CONTEXT_SIZE;
}
197
let sharedLlamaState = null;
/**
 * Frees the loaded llama.cpp model and context so the process can
 * shut down cleanly.
 *
 * If this is skipped, the libc++ destructor for the Metal device list
 * aborts during `exit()` with `ggml_metal_device_free` because the
 * list is not empty — the same kind of teardown crash as the ONNX
 * mutex issue, in a different native library. Disposing in the order
 * session → context → model → llama lets the destructors run while
 * the JS runtime is still healthy.
 *
 * Idempotent, and a no-op when no model was ever loaded. Errors
 * raised while disposing are swallowed, since the alternative is the
 * very abort this function exists to prevent.
 */
export async function disposeLlamaCppRuntime() {
    if (!sharedLlamaState) {
        return;
    }
    const { context, model, llama } = sharedLlamaState;
    // Clear the cache before the first await so it never exposes a
    // half-disposed runtime.
    sharedLlamaState = null;
    // Dependency order: the context holds the sequence pool that depends
    // on the model, the model depends on the llama runtime, and the Llama
    // instance goes last (it releases the Metal device).
    for (const resource of [context, model, llama]) {
        await callMaybeDispose(resource);
    }
}
225
/**
 * Best-effort disposal: calls `target.dispose()` when `target` is an
 * object that has such a method, waits for the result if it is
 * thenable, and swallows both synchronous throws and rejections.
 */
async function callMaybeDispose(target) {
    const isDisposable = typeof target === 'object' && target !== null && typeof target.dispose === 'function';
    if (!isDisposable) {
        return;
    }
    try {
        const outcome = target.dispose();
        const isThenable = Boolean(outcome) && typeof outcome.then === 'function';
        if (isThenable) {
            await outcome.catch(() => undefined);
        }
    }
    catch {
        // ignore
    }
}
241
/**
 * Picks the model path (a non-empty `explicit` value wins over
 * `LLAMACPP_MODEL_PATH`), resolves a relative path against the current
 * working directory, and checks that the file exists.
 *
 * @returns the absolute path, or `null` when no path is configured or
 *   the file is missing.
 */
function resolveModelPath(explicit) {
    const chosen = explicit && explicit.length > 0 ? explicit : process.env.LLAMACPP_MODEL_PATH;
    if (!chosen) {
        return null;
    }
    const absolutePath = nodePath.isAbsolute(chosen)
        ? chosen
        : nodePath.resolve(process.cwd(), chosen);
    if (!existsSync(absolutePath)) {
        return null;
    }
    return absolutePath;
}
252
/**
 * Maps the `LLAMACPP_GPU` environment variable (trimmed,
 * case-insensitive) to a GPU setting: `'metal'`, `'cuda'`, `false`
 * for `off`/`false`/`no`/`cpu`, and `'auto'` for anything else or
 * when the variable is unset. The parameter is unused.
 */
function resolveGpuChoice(_baseUrl) {
    const requested = process.env.LLAMACPP_GPU ?? 'auto';
    switch (requested.trim().toLowerCase()) {
        case 'metal':
            return 'metal';
        case 'cuda':
            return 'cuda';
        case 'off':
        case 'false':
        case 'no':
        case 'cpu':
            return false;
        default:
            return 'auto';
    }
}
262
/**
 * Joins the content of every system-role message with a blank line
 * between entries. Returns an empty string when there is none.
 */
function collectSystemPrompt(messages) {
    const systemTexts = [];
    for (const message of messages) {
        if (message.role === AiMessageRole.System) {
            systemTexts.push(message.content);
        }
    }
    return systemTexts.join('\n\n');
}
266
/**
 * Returns the messages whose role is not system, in their original
 * order, as a new array.
 */
function nonSystemTurns(messages) {
    const turns = [];
    for (const message of messages) {
        if (message.role === AiMessageRole.System) {
            continue;
        }
        turns.push(message);
    }
    return turns;
}
@@ -0,0 +1,36 @@
1
+ import type { AiProviderKind } from './provider-resolver.js';
2
+ import type { IAiProvider } from './ai-provider.js';
3
/** Severity/category tag of a hint. */
export type AiHintLevel = 'setup' | 'upgrade' | 'info';
/** One titled list of steps shown to the user or a downstream agent. */
export interface IAiHint {
    level: AiHintLevel;
    title: string;
    steps: readonly string[];
}
/** Structured summary of the LLM-enrichment state of a run. */
export interface IAiBlock {
    /** True when a provider was selected. */
    reachable: boolean;
    /** The provider kind that was asked for. */
    requestedProvider: AiProviderKind;
    /** `id` of the selected provider, or null when none was selected. */
    providerId: string | null;
    /** True when the user opted out of enhancement. */
    enhancementSkipped: boolean;
    hints: readonly IAiHint[];
}
/** Input accepted by `buildAiBlock`. */
export interface IBuildAiBlockInput {
    /** What `selectAiProvider` returned, or null if the caller didn't try. */
    selection?: {
        requested: AiProviderKind;
        provider: IAiProvider | null;
    } | null;
    /** True when --no-enhance was passed (user opted out — don't nag). */
    userOptedOut?: boolean;
}
/**
 * Builds the structured `ai` block carried by every audit report and
 * by any command that uses `enrichWithLlmRecommendations`. Without
 * it, a downstream agent cannot tell `--no-enhance` apart from "no
 * provider reachable".
 *
 * It lives in `@shrkcrft/ai` so that every package (CLI, packs, the
 * MCP server's read-only surfaces) produces the same shape.
 */
export declare function buildAiBlock(input?: IBuildAiBlockInput): IAiBlock;
/** Renders an `IAiBlock` as a Markdown section. */
export declare function renderAiBlockMarkdown(block: IAiBlock): string;
+ //# sourceMappingURL=llm-hints.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"llm-hints.d.ts","sourceRoot":"","sources":["../src/llm-hints.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AAC7D,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAEpD,MAAM,MAAM,WAAW,GAAG,OAAO,GAAG,SAAS,GAAG,MAAM,CAAC;AAEvD,MAAM,WAAW,OAAO;IACtB,KAAK,EAAE,WAAW,CAAC;IACnB,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,SAAS,MAAM,EAAE,CAAC;CAC1B;AAED,MAAM,WAAW,QAAQ;IACvB,SAAS,EAAE,OAAO,CAAC;IACnB,iBAAiB,EAAE,cAAc,CAAC;IAClC,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,kBAAkB,EAAE,OAAO,CAAC;IAC5B,KAAK,EAAE,SAAS,OAAO,EAAE,CAAC;CAC3B;AAED,MAAM,WAAW,kBAAkB;IACjC,0EAA0E;IAC1E,SAAS,CAAC,EAAE;QAAE,SAAS,EAAE,cAAc,CAAC;QAAC,QAAQ,EAAE,WAAW,GAAG,IAAI,CAAA;KAAE,GAAG,IAAI,CAAC;IAC/E,sEAAsE;IACtE,YAAY,CAAC,EAAE,OAAO,CAAC;CACxB;AAED;;;;;;;;GAQG;AACH,wBAAgB,YAAY,CAAC,KAAK,GAAE,kBAAuB,GAAG,QAAQ,CAiErE;AAED,wBAAgB,qBAAqB,CAAC,KAAK,EAAE,QAAQ,GAAG,MAAM,CAiB7D"}
@@ -0,0 +1,92 @@
1
+ import { selectAiProvider } from "./provider-resolver.js";
2
/**
 * Builds the structured `ai` block carried by every audit report and
 * by any command that uses `enrichWithLlmRecommendations`. Without
 * it, a downstream agent cannot tell `--no-enhance` apart from "no
 * provider reachable".
 *
 * It lives in `@shrkcrft/ai` so that every package (CLI, packs, the
 * MCP server's read-only surfaces) produces the same shape.
 *
 * @param input optional explicit provider selection and opt-out flag.
 * @returns reachability, requested/selected provider and hints.
 */
export function buildAiBlock(input = {}) {
    const optedOut = Boolean(input.userOptedOut);
    // An explicitly supplied selection always wins (including
    // `{ provider: null }` under --no-enhance). The auto chain is only
    // probed when no selection was given and the user did not opt out.
    let chosen;
    if (input.selection !== undefined && input.selection !== null) {
        chosen = input.selection;
    }
    else if (optedOut) {
        chosen = { requested: 'auto', provider: null };
    }
    else {
        chosen = selectAiProvider(undefined);
    }
    const isReachable = chosen.provider !== null;
    const activeProviderId = chosen.provider?.id ?? null;
    let hints;
    if (isReachable) {
        hints = [
            {
                level: 'info',
                title: `LLM enrichment active via ${activeProviderId}`,
                steps: [
                    'LLM-derived findings appear with `[llm]` tags and a confidence score.',
                    'Tune behavior: --provider ollama|llamacpp, --model <id>, AI_PROVIDER env var (overrides --provider when unset).',
                ],
            },
            {
                level: 'upgrade',
                title: 'Sharpen LLM output if findings feel thin',
                steps: [
                    'Prefer a code-aware model for technical staleness checks (e.g. qwen2.5-coder:7b, deepseek-coder-v2).',
                    'Larger models notice more drift but cost latency — try 7B for code, 14B+ for nuanced doc-content review.',
                    'For fix-plan enrichment, the same provider is reused; no separate config needed.',
                ],
            },
        ];
    }
    else if (optedOut) {
        hints = [
            {
                level: 'info',
                title: 'LLM enrichment disabled by --no-enhance',
                steps: [
                    'Deterministic findings are first-class; LLM is purely additive.',
                    'Drop --no-enhance to layer LLM critique on top when a provider is available.',
                ],
            },
        ];
    }
    else {
        hints = [
            {
                level: 'setup',
                title: 'Enable LLM enrichment for deeper analysis',
                steps: [
                    'Local-first: install Ollama (https://ollama.com/download) or set LLAMACPP_MODEL_PATH for in-process inference.',
                    'Pull a model that fits your machine — e.g. `ollama pull llama3.2` (good general-purpose) or `ollama pull qwen2.5-coder:7b` (code-aware).',
                    'Optional: export OLLAMA_HOST=http://localhost:11434 (default) or point at a remote daemon.',
                    'Optional: export OLLAMA_MODEL=<id> to pin the model used by shrk.',
                    'Re-run without --no-enhance. The deterministic findings are unchanged; LLM critique appears under `llmFindings`.',
                ],
            },
        ];
    }
    return {
        reachable: isReachable,
        requestedProvider: chosen.requested,
        providerId: activeProviderId,
        enhancementSkipped: optedOut,
        hints,
    };
}
75
/**
 * Renders an AI block as Markdown: a level-2 heading carrying the
 * status, then one level-3 section per hint with its steps as bullet
 * items. Every section, and the heading, is followed by a blank line.
 *
 * @param block the block to render.
 * @returns the Markdown text, ending with a newline.
 */
export function renderAiBlockMarkdown(block) {
    let status;
    if (block.reachable) {
        status = `active via \`${block.providerId}\``;
    }
    else if (block.enhancementSkipped) {
        status = 'disabled by `--no-enhance`';
    }
    else {
        status = 'unavailable (no local LLM detected)';
    }
    const sections = block.hints.flatMap((hint) => [
        `### [${hint.level}] ${hint.title}`,
        ...hint.steps.map((step) => `- ${step}`),
        '',
    ]);
    return [`## AI configuration — ${status}`, '', ...sections].join('\n');
}