@shrkcrft/ai 0.1.0-alpha.10 → 0.1.0-alpha.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai-request.d.ts +15 -0
- package/dist/ai-request.d.ts.map +1 -1
- package/dist/gemini/gemini-provider.d.ts +24 -0
- package/dist/gemini/gemini-provider.d.ts.map +1 -0
- package/dist/gemini/gemini-provider.js +97 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +5 -0
- package/dist/llamacpp/llama-cpp-provider.d.ts +56 -0
- package/dist/llamacpp/llama-cpp-provider.d.ts.map +1 -0
- package/dist/llamacpp/llama-cpp-provider.js +268 -0
- package/dist/ollama/ollama-provider.d.ts +47 -0
- package/dist/ollama/ollama-provider.d.ts.map +1 -0
- package/dist/ollama/ollama-provider.js +166 -0
- package/dist/pipeline/enhancement-pipeline.d.ts +123 -0
- package/dist/pipeline/enhancement-pipeline.d.ts.map +1 -0
- package/dist/pipeline/enhancement-pipeline.js +295 -0
- package/dist/provider-resolver.d.ts +28 -0
- package/dist/provider-resolver.d.ts.map +1 -0
- package/dist/provider-resolver.js +80 -0
- package/package.json +5 -4
package/dist/ai-request.d.ts
CHANGED
|
@@ -13,6 +13,16 @@ export interface IAiRequest {
|
|
|
13
13
|
maxTokens?: number;
|
|
14
14
|
temperature?: number;
|
|
15
15
|
context?: string;
|
|
16
|
+
responseFormat?: IAiResponseFormat;
|
|
17
|
+
/**
|
|
18
|
+
* Optional callback invoked with each newly-decoded token chunk as
|
|
19
|
+
* generation streams in. Providers that don't natively stream
|
|
20
|
+
* (HTTP Gemini / Claude in our non-SSE adapter) ignore this; the
|
|
21
|
+
* llamacpp provider forwards chunks live. Useful for stderr "live
|
|
22
|
+
* preview" in CLI commands and for an agent who wants to display
|
|
23
|
+
* progress without the synchronous wait.
|
|
24
|
+
*/
|
|
25
|
+
onTokenStream?: (chunk: string) => void;
|
|
16
26
|
}
|
|
17
27
|
export interface IAiResponse {
|
|
18
28
|
content: string;
|
|
@@ -30,4 +40,9 @@ export interface IAiProviderConfig {
|
|
|
30
40
|
model?: string;
|
|
31
41
|
timeoutMs?: number;
|
|
32
42
|
}
|
|
43
|
+
export interface IAiResponseFormat {
|
|
44
|
+
type: 'json_object' | 'json_schema';
|
|
45
|
+
schema?: Record<string, unknown>;
|
|
46
|
+
schemaName?: string;
|
|
47
|
+
}
|
|
33
48
|
//# sourceMappingURL=ai-request.d.ts.map
|
package/dist/ai-request.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-request.d.ts","sourceRoot":"","sources":["../src/ai-request.ts"],"names":[],"mappings":"AAAA,oBAAY,aAAa;IACvB,MAAM,WAAW;IACjB,IAAI,SAAS;IACb,SAAS,cAAc;CACxB;AAED,MAAM,WAAW,UAAU;IACzB,IAAI,EAAE,aAAa,CAAC;IACpB,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,UAAU;IACzB,QAAQ,EAAE,SAAS,UAAU,EAAE,CAAC;IAChC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,CAAC;
|
|
1
|
+
{"version":3,"file":"ai-request.d.ts","sourceRoot":"","sources":["../src/ai-request.ts"],"names":[],"mappings":"AAAA,oBAAY,aAAa;IACvB,MAAM,WAAW;IACjB,IAAI,SAAS;IACb,SAAS,cAAc;CACxB;AAED,MAAM,WAAW,UAAU;IACzB,IAAI,EAAE,aAAa,CAAC;IACpB,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,UAAU;IACzB,QAAQ,EAAE,SAAS,UAAU,EAAE,CAAC;IAChC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,cAAc,CAAC,EAAE,iBAAiB,CAAC;IACnC;;;;;;;OAOG;IACH,aAAa,CAAC,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;CACzC;AAED,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,KAAK,CAAC,EAAE;QAAE,WAAW,CAAC,EAAE,MAAM,CAAC;QAAC,YAAY,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IACxD,GAAG,CAAC,EAAE,OAAO,CAAC;CACf;AAED,MAAM,WAAW,iBAAiB;IAChC,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,aAAa,GAAG,aAAa,CAAC;IACpC,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACjC,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { type AppError, type Result } from '@shrkcrft/core';
|
|
2
|
+
import { AbstractAiProvider } from '../ai-provider.js';
|
|
3
|
+
import { type IAiRequest, type IAiResponse } from '../ai-request.js';
|
|
4
|
+
/**
|
|
5
|
+
* HTTP adapter for Google's Gemini (Generative Language API).
|
|
6
|
+
*
|
|
7
|
+
* Reads `GEMINI_API_KEY` from env (or `IAiProviderConfig.apiKey`). When
|
|
8
|
+
* the key is missing `isReady()` returns false and `send()` reports an
|
|
9
|
+
* actionable error — same contract as `ClaudeProvider`.
|
|
10
|
+
*
|
|
11
|
+
* The Gemini REST surface differs from Anthropic's: system messages are
|
|
12
|
+
* passed as a top-level `systemInstruction`, conversation turns become
|
|
13
|
+
* `contents[]` with roles `user`/`model`, and the response token cap is
|
|
14
|
+
* `generationConfig.maxOutputTokens` (not `max_tokens`). This adapter
|
|
15
|
+
* translates the provider-neutral `IAiRequest` shape into that wire
|
|
16
|
+
* format and back.
|
|
17
|
+
*/
|
|
18
|
+
export declare class GeminiProvider extends AbstractAiProvider {
|
|
19
|
+
readonly id = "gemini";
|
|
20
|
+
readonly name = "Google Gemini (HTTP)";
|
|
21
|
+
isReady(): boolean;
|
|
22
|
+
send(request: IAiRequest): Promise<Result<IAiResponse, AppError>>;
|
|
23
|
+
}
|
|
24
|
+
//# sourceMappingURL=gemini-provider.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"gemini-provider.d.ts","sourceRoot":"","sources":["../../src/gemini/gemini-provider.ts"],"names":[],"mappings":"AAAA,OAAO,EAAsC,KAAK,QAAQ,EAAE,KAAK,MAAM,EAAE,MAAM,gBAAgB,CAAC;AAChG,OAAO,EAAE,kBAAkB,EAAE,MAAM,mBAAmB,CAAC;AACvD,OAAO,EAAkC,KAAK,UAAU,EAAE,KAAK,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAErG;;;;;;;;;;;;;GAaG;AACH,qBAAa,cAAe,SAAQ,kBAAkB;IACpD,QAAQ,CAAC,EAAE,YAAY;IACvB,QAAQ,CAAC,IAAI,0BAA0B;IAEvC,OAAO,IAAI,OAAO;IAIZ,IAAI,CAAC,OAAO,EAAE,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;CAyExE"}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import { AppErrorImpl, ERROR_CODES, err, ok } from '@shrkcrft/core';
|
|
2
|
+
import { AbstractAiProvider } from "../ai-provider.js";
|
|
3
|
+
import { AiMessageRole } from "../ai-request.js";
|
|
4
|
+
/**
|
|
5
|
+
* HTTP adapter for Google's Gemini (Generative Language API).
|
|
6
|
+
*
|
|
7
|
+
* Reads `GEMINI_API_KEY` from env (or `IAiProviderConfig.apiKey`). When
|
|
8
|
+
* the key is missing `isReady()` returns false and `send()` reports an
|
|
9
|
+
* actionable error — same contract as `ClaudeProvider`.
|
|
10
|
+
*
|
|
11
|
+
* The Gemini REST surface differs from Anthropic's: system messages are
|
|
12
|
+
* passed as a top-level `systemInstruction`, conversation turns become
|
|
13
|
+
* `contents[]` with roles `user`/`model`, and the response token cap is
|
|
14
|
+
* `generationConfig.maxOutputTokens` (not `max_tokens`). This adapter
|
|
15
|
+
* translates the provider-neutral `IAiRequest` shape into that wire
|
|
16
|
+
* format and back.
|
|
17
|
+
*/
|
|
18
|
+
export class GeminiProvider extends AbstractAiProvider {
    id = 'gemini';
    name = 'Google Gemini (HTTP)';
    /**
     * Reports whether an API key is available, either from the provider
     * config or from the `GEMINI_API_KEY` environment variable.
     */
    isReady() {
        return Boolean(this.config.apiKey ?? process.env.GEMINI_API_KEY);
    }
    /**
     * Sends one non-streaming `generateContent` request and maps the reply
     * onto the provider-neutral response shape.
     *
     * Never throws: a missing key, a non-2xx status, a transport failure
     * or a timeout are all reported through the returned `err(...)`.
     *
     * @param request Provider-neutral request; `onTokenStream` is ignored
     *   because this adapter does not stream.
     * @returns `ok` with the concatenated text parts of the first
     *   candidate, or `err` describing why the call failed.
     */
    async send(request) {
        const apiKey = this.config.apiKey ?? process.env.GEMINI_API_KEY;
        if (!apiKey) {
            return err(new AppErrorImpl(ERROR_CODES.INVALID_INPUT, 'GEMINI_API_KEY is not set — cannot reach Gemini', { suggestion: 'Put GEMINI_API_KEY=... in .env or `export GEMINI_API_KEY=...`' }));
        }
        // Strip trailing slashes so a configured "https://host/" does not
        // produce "https://host//v1beta/...".
        const baseUrl = (this.config.baseUrl ?? 'https://generativelanguage.googleapis.com').replace(/\/+$/, '');
        const model = request.model ?? this.config.model ?? 'gemini-2.5-flash';
        const systemInstructionText = collectSystem(request.messages);
        const body = {
            contents: collectContents(request.messages),
            generationConfig: {
                maxOutputTokens: request.maxTokens ?? 4096,
                ...(request.temperature !== undefined ? { temperature: request.temperature } : {}),
                // Any responseFormat switches the reply to JSON; the schema
                // itself is not forwarded by this adapter.
                ...(request.responseFormat ? { responseMimeType: 'application/json' } : {}),
            },
            ...(systemInstructionText
                ? { systemInstruction: { role: 'system', parts: [{ text: systemInstructionText }] } }
                : {}),
        };
        // Honour the configured timeout when there is one; otherwise keep
        // the previous behaviour of waiting for as long as fetch does.
        const timeoutMs = this.config.timeoutMs;
        const signal = typeof timeoutMs === 'number' && Number.isFinite(timeoutMs) && timeoutMs > 0
            ? AbortSignal.timeout(timeoutMs)
            : undefined;
        try {
            // The key travels in a header rather than the query string so it
            // does not end up in access logs, proxies or error traces.
            const url = `${baseUrl}/v1beta/models/${encodeURIComponent(model)}:generateContent`;
            const res = await fetch(url, {
                method: 'POST',
                headers: { 'content-type': 'application/json', 'x-goog-api-key': apiKey },
                body: JSON.stringify(body),
                ...(signal ? { signal } : {}),
            });
            if (!res.ok) {
                const text = await res.text();
                return err(new AppErrorImpl(ERROR_CODES.IO_ERROR, `Gemini API ${res.status}: ${text.slice(0, 500)}`));
            }
            const json = (await res.json());
            const firstCandidate = json.candidates?.[0];
            const content = (firstCandidate?.content?.parts ?? [])
                .map((p) => (typeof p.text === 'string' ? p.text : ''))
                .join('');
            return ok({
                content,
                model: json.modelVersion ?? model,
                // When the prompt itself was blocked there is no candidate;
                // surface the block reason instead of leaving this undefined.
                finishReason: firstCandidate?.finishReason ?? json.promptFeedback?.blockReason,
                usage: {
                    inputTokens: json.usageMetadata?.promptTokenCount,
                    outputTokens: json.usageMetadata?.candidatesTokenCount,
                },
                raw: json,
            });
        }
        catch (e) {
            // Anything can be thrown in JavaScript; only Error has `.message`.
            const reason = e instanceof Error ? e.message : String(e);
            return err(new AppErrorImpl(ERROR_CODES.IO_ERROR, `Failed to call Gemini: ${reason}`, {
                cause: e,
            }));
        }
    }
}
|
|
82
|
+
/**
 * Gathers every system-role message into one instruction string.
 *
 * Returns the contents joined by a blank line, or `undefined` when the
 * conversation has no system messages.
 */
function collectSystem(messages) {
    const systemTexts = [];
    for (const message of messages) {
        if (message.role === AiMessageRole.System) {
            systemTexts.push(message.content);
        }
    }
    if (systemTexts.length === 0) {
        return undefined;
    }
    return systemTexts.join('\n\n');
}
|
|
86
|
+
/**
 * Converts the non-system messages into Gemini `contents[]` entries.
 *
 * Assistant turns are sent with role `model`; every other non-system
 * turn is sent with role `user`. Order is preserved.
 */
function collectContents(messages) {
    return messages
        .filter((message) => message.role !== AiMessageRole.System)
        .map((message) => {
            const role = message.role === AiMessageRole.Assistant ? 'model' : 'user';
            return { role, parts: [{ text: message.content }] };
        });
}
|
package/dist/index.d.ts
CHANGED
|
@@ -3,4 +3,9 @@ export * from './ai-request.js';
|
|
|
3
3
|
export * from './prompt/prompt-builder.js';
|
|
4
4
|
export * from './claude/claude-provider.js';
|
|
5
5
|
export * from './claude/claude-cli-adapter.js';
|
|
6
|
+
export * from './gemini/gemini-provider.js';
|
|
7
|
+
export * from './ollama/ollama-provider.js';
|
|
8
|
+
export * from './llamacpp/llama-cpp-provider.js';
|
|
9
|
+
export * from './provider-resolver.js';
|
|
10
|
+
export * from './pipeline/enhancement-pipeline.js';
|
|
6
11
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,kBAAkB,CAAC;AACjC,cAAc,iBAAiB,CAAC;AAChC,cAAc,4BAA4B,CAAC;AAC3C,cAAc,6BAA6B,CAAC;AAC5C,cAAc,gCAAgC,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,cAAc,kBAAkB,CAAC;AACjC,cAAc,iBAAiB,CAAC;AAChC,cAAc,4BAA4B,CAAC;AAC3C,cAAc,6BAA6B,CAAC;AAC5C,cAAc,gCAAgC,CAAC;AAC/C,cAAc,6BAA6B,CAAC;AAC5C,cAAc,6BAA6B,CAAC;AAC5C,cAAc,kCAAkC,CAAC;AACjD,cAAc,wBAAwB,CAAC;AACvC,cAAc,oCAAoC,CAAC"}
|
package/dist/index.js
CHANGED
|
@@ -3,3 +3,8 @@ export * from "./ai-request.js";
|
|
|
3
3
|
export * from "./prompt/prompt-builder.js";
|
|
4
4
|
export * from "./claude/claude-provider.js";
|
|
5
5
|
export * from "./claude/claude-cli-adapter.js";
|
|
6
|
+
export * from "./gemini/gemini-provider.js";
|
|
7
|
+
export * from "./ollama/ollama-provider.js";
|
|
8
|
+
export * from "./llamacpp/llama-cpp-provider.js";
|
|
9
|
+
export * from "./provider-resolver.js";
|
|
10
|
+
export * from "./pipeline/enhancement-pipeline.js";
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { type AppError, type Result } from '@shrkcrft/core';
|
|
2
|
+
import { AbstractAiProvider } from '../ai-provider.js';
|
|
3
|
+
import { type IAiRequest, type IAiResponse } from '../ai-request.js';
|
|
4
|
+
/**
|
|
5
|
+
* In-process generative provider backed by `node-llama-cpp` (a Node
|
|
6
|
+
* binding for llama.cpp). No HTTP. No daemon. The model is loaded
|
|
7
|
+
* once into process memory and reused across requests.
|
|
8
|
+
*
|
|
9
|
+
* Configuration (env or `IAiProviderConfig`):
|
|
10
|
+
* - `LLAMACPP_MODEL_PATH` — absolute or repo-relative path to a
|
|
11
|
+
* local `.gguf` file. If unset, the
|
|
12
|
+
* provider is `isReady() === false`.
|
|
13
|
+
* - `LLAMACPP_CONTEXT_SIZE` — context window in tokens (default 8192).
|
|
14
|
+
* - `LLAMACPP_GPU` — `auto` (default) | `metal` | `cuda` | `off`.
|
|
15
|
+
*
|
|
16
|
+
* The first `send()` call pays the model-load cost (typically 1–10 s
|
|
17
|
+
* for a 3B Q4 model on Apple Silicon). Subsequent calls reuse
|
|
18
|
+
* the same `LlamaModel` + `LlamaContext`. A fresh `LlamaChatSession`
|
|
19
|
+
* is created per request so context isn't leaked between unrelated
|
|
20
|
+
* tasks.
|
|
21
|
+
*
|
|
22
|
+
* Tests can inject a fake generator via `_overrideForTests` to avoid
|
|
23
|
+
* pulling in the native binding and a 2 GB model file.
|
|
24
|
+
*/
|
|
25
|
+
export declare class LlamaCppProvider extends AbstractAiProvider {
|
|
26
|
+
readonly id = "llamacpp";
|
|
27
|
+
readonly name = "llama.cpp (in-process)";
|
|
28
|
+
/** Test hook — bypasses the native binding when set. */
|
|
29
|
+
static _overrideForTests: ((request: IAiRequest, modelPath: string) => Promise<IAiResponse>) | null;
|
|
30
|
+
/**
|
|
31
|
+
* Reads the module-level cache to expose the active model path for
|
|
32
|
+
* tools that need it (mostly the disposer). Returns null when no
|
|
33
|
+
* model has been loaded in this process.
|
|
34
|
+
*/
|
|
35
|
+
static activeModelPath(): string | null;
|
|
36
|
+
isReady(): boolean;
|
|
37
|
+
send(request: IAiRequest): Promise<Result<IAiResponse, AppError>>;
|
|
38
|
+
private ensureLoaded;
|
|
39
|
+
}
|
|
40
|
+
/**
|
|
41
|
+
* Release the loaded llama.cpp model + context so the process can
|
|
42
|
+
* exit cleanly.
|
|
43
|
+
*
|
|
44
|
+
* Without this, the libc++ destructor for the Metal device list
|
|
45
|
+
* aborts on `exit()` with `ggml_metal_device_free` because the
|
|
46
|
+
* device list isn't empty — same shape of teardown crash as the
|
|
47
|
+
* ONNX mutex issue, different native library. Disposing in the
|
|
48
|
+
* order session → context → model → llama lets the destructors
|
|
49
|
+
* run while the JS runtime is still healthy.
|
|
50
|
+
*
|
|
51
|
+
* Safe to call multiple times. Safe to call when no model was
|
|
52
|
+
* loaded. Errors during dispose are swallowed (the alternative is
|
|
53
|
+
* the abort we're trying to prevent).
|
|
54
|
+
*/
|
|
55
|
+
export declare function disposeLlamaCppRuntime(): Promise<void>;
|
|
56
|
+
//# sourceMappingURL=llama-cpp-provider.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"llama-cpp-provider.d.ts","sourceRoot":"","sources":["../../src/llamacpp/llama-cpp-provider.ts"],"names":[],"mappings":"AAEA,OAAO,EAAsC,KAAK,QAAQ,EAAE,KAAK,MAAM,EAAE,MAAM,gBAAgB,CAAC;AAChG,OAAO,EAAE,kBAAkB,EAAE,MAAM,mBAAmB,CAAC;AACvD,OAAO,EAAkC,KAAK,UAAU,EAAE,KAAK,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAKrG;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,qBAAa,gBAAiB,SAAQ,kBAAkB;IACtD,QAAQ,CAAC,EAAE,cAAc;IACzB,QAAQ,CAAC,IAAI,4BAA4B;IAEzC,wDAAwD;IACxD,MAAM,CAAC,iBAAiB,EACpB,CAAC,CAAC,OAAO,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,KAAK,OAAO,CAAC,WAAW,CAAC,CAAC,GAClE,IAAI,CAAQ;IAEhB;;;;OAIG;IACH,MAAM,CAAC,eAAe,IAAI,MAAM,GAAG,IAAI;IAIvC,OAAO,IAAI,OAAO;IAIZ,IAAI,CAAC,OAAO,EAAE,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;YA4IzD,YAAY;CAwB3B;AAWD;;;;;;;;;;;;;;GAcG;AACH,wBAAsB,sBAAsB,IAAI,OAAO,CAAC,IAAI,CAAC,CAU5D"}
|
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
import { existsSync } from 'node:fs';
|
|
2
|
+
import * as nodePath from 'node:path';
|
|
3
|
+
import { AppErrorImpl, ERROR_CODES, err, ok } from '@shrkcrft/core';
|
|
4
|
+
import { AbstractAiProvider } from "../ai-provider.js";
|
|
5
|
+
import { AiMessageRole } from "../ai-request.js";
|
|
6
|
+
const DEFAULT_CONTEXT_SIZE = 8192;
|
|
7
|
+
const DEFAULT_MAX_TOKENS = 1024;
|
|
8
|
+
/**
|
|
9
|
+
* In-process generative provider backed by `node-llama-cpp` (a Node
|
|
10
|
+
* binding for llama.cpp). No HTTP. No daemon. The model is loaded
|
|
11
|
+
* once into process memory and reused across requests.
|
|
12
|
+
*
|
|
13
|
+
* Configuration (env or `IAiProviderConfig`):
|
|
14
|
+
* - `LLAMACPP_MODEL_PATH` — absolute or repo-relative path to a
|
|
15
|
+
* local `.gguf` file. If unset, the
|
|
16
|
+
* provider is `isReady() === false`.
|
|
17
|
+
* - `LLAMACPP_CONTEXT_SIZE` — context window in tokens (default 8192).
|
|
18
|
+
* - `LLAMACPP_GPU` — `auto` (default) | `metal` | `cuda` | `off`.
|
|
19
|
+
*
|
|
20
|
+
* The first `send()` call pays the model-load cost (typically 1–10 s
|
|
21
|
+
* for a 3B Q4 model on Apple Silicon). Subsequent calls reuse
|
|
22
|
+
* the same `LlamaModel` + `LlamaContext`. A fresh `LlamaChatSession`
|
|
23
|
+
* is created per request so context isn't leaked between unrelated
|
|
24
|
+
* tasks.
|
|
25
|
+
*
|
|
26
|
+
* Tests can inject a fake generator via `_overrideForTests` to avoid
|
|
27
|
+
* pulling in the native binding and a 2 GB model file.
|
|
28
|
+
*/
|
|
29
|
+
export class LlamaCppProvider extends AbstractAiProvider {
    id = 'llamacpp';
    name = 'llama.cpp (in-process)';
    /** Test hook — bypasses the native binding when set. */
    static _overrideForTests = null;
    /**
     * Reads the module-level cache to expose the active model path for
     * tools that need it (mostly the disposer). Returns null when no
     * model has been loaded in this process.
     */
    static activeModelPath() {
        return sharedLlamaState?.modelPath ?? null;
    }
    /**
     * Ready when the configured model (or `LLAMACPP_MODEL_PATH`) resolves
     * to a path that exists on disk. Does not load the model.
     */
    isReady() {
        return resolveModelPath(this.config.model) !== null;
    }
    /**
     * Runs one completion against the in-process model.
     *
     * Never throws: every failure is returned as `err(...)`. The chat
     * session and its context sequence are released whether the call
     * succeeds or fails, so a failed request cannot exhaust the sequence
     * pool for later requests.
     *
     * @param request Provider-neutral request. `onTokenStream`, when
     *   present, receives each text chunk as it is produced.
     * @returns `ok` with the generated text, or `err` describing the failure.
     */
    async send(request) {
        const modelPath = resolveModelPath(request.model ?? this.config.model);
        if (modelPath === null) {
            return err(new AppErrorImpl(ERROR_CODES.INVALID_INPUT, 'LLAMACPP_MODEL_PATH is not set or the file does not exist.', {
                suggestion: 'Set LLAMACPP_MODEL_PATH=/path/to/qwen2.5-coder-3b.gguf in .env, or pass --model <path> on the CLI.',
            }));
        }
        if (LlamaCppProvider._overrideForTests) {
            try {
                const value = await LlamaCppProvider._overrideForTests(request, modelPath);
                return ok(value);
            }
            catch (e) {
                const reason = e instanceof Error ? e.message : String(e);
                return err(new AppErrorImpl(ERROR_CODES.IO_ERROR, `Test override failed: ${reason}`, {
                    cause: e,
                }));
            }
        }
        // Declared outside the try so the finally block can release them
        // even when session construction or generation throws.
        let sequence;
        let session;
        try {
            const tf = (await import('node-llama-cpp'));
            const { LlamaChatSession } = tf;
            const { context } = await this.ensureLoaded(modelPath);
            sequence = context.getSequence();
            session = new LlamaChatSession({
                contextSequence: sequence,
                systemPrompt: collectSystemPrompt(request.messages),
            });
            // Prior assistant/user turns get fed into the session in order so
            // the model sees the conversation history. The trailing user turn
            // is what we ask `prompt()` to respond to.
            const turns = nonSystemTurns(request.messages);
            for (let i = 0; i < turns.length - 1; i += 1) {
                const turn = turns[i];
                if (turn.role === AiMessageRole.Assistant) {
                    // node-llama-cpp 3.x exposes session.addAssistantMessage in some
                    // versions; older versions don't. Best effort: skip silently.
                    const fn = session.addAssistantMessage;
                    if (typeof fn === 'function')
                        fn.call(session, turn.content);
                    continue;
                }
                // For user turns that aren't the trailing one, prime them so the
                // assistant response gets folded back into the context too.
                // NOTE(review): this also generates a one-token reply per primed
                // turn — confirm that is the intended way to replay history.
                await session.prompt(turn.content, {
                    maxTokens: 1,
                    stopOnAbortSignal: true,
                });
            }
            const lastUser = turns[turns.length - 1];
            const userPrompt = lastUser && lastUser.role === AiMessageRole.User ? lastUser.content : '';
            const maxTokens = request.maxTokens ?? DEFAULT_MAX_TOKENS;
            const wantsJson = !!request.responseFormat;
            // When the caller wants JSON, ask llama.cpp to enforce it at
            // sample time via a grammar. Best effort: if the grammar
            // constructor isn't available in this version we fall back to
            // plain prompting + trim.
            let grammar = undefined;
            if (wantsJson) {
                try {
                    const Ctor = tf.LlamaJsonSchemaGrammar;
                    // CRITICAL: pass the *same* Llama instance the model was
                    // loaded with. node-llama-cpp rejects mixing grammars from
                    // one instance with a session from another, and calling
                    // getLlama() again would leak a second native device.
                    const sharedLlama = sharedLlamaState?.llama;
                    if (Ctor && request.responseFormat?.schema && sharedLlama) {
                        grammar = new Ctor(sharedLlama, request.responseFormat.schema);
                    }
                }
                catch {
                    grammar = undefined;
                }
            }
            const start = Date.now();
            const onChunk = request.onTokenStream;
            const text = await session.prompt(userPrompt, {
                maxTokens,
                ...(request.temperature !== undefined ? { temperature: request.temperature } : {}),
                ...(wantsJson ? { trimWhitespaceSuffix: true } : {}),
                ...(grammar ? { grammar: grammar } : {}),
                ...(onChunk
                    ? {
                        onTextChunk: (chunk) => {
                            try {
                                onChunk(chunk);
                            }
                            catch {
                                // never let a callback failure break inference
                            }
                        },
                    }
                    : {}),
            });
            const elapsedMs = Date.now() - start;
            return ok({
                content: text,
                model: nodePath.basename(modelPath),
                finishReason: 'stop',
                usage: {
                    // node-llama-cpp does not surface input/output token counts in a
                    // stable v3 API path; we leave usage undefined and let callers
                    // approximate from char count if needed.
                },
                raw: { backend: 'node-llama-cpp', modelPath, elapsedMs },
            });
        }
        catch (e) {
            const reason = e instanceof Error ? e.message : String(e);
            return err(new AppErrorImpl(ERROR_CODES.IO_ERROR, `node-llama-cpp call failed: ${reason}`, {
                cause: e,
                suggestion: 'Verify LLAMACPP_MODEL_PATH points to a valid .gguf file readable by llama.cpp.',
            }));
        }
        finally {
            // Release the LlamaContext sequence so the next send() can take it.
            // Without this we hit "No sequences left" on a later call. Runs on
            // the failure path too. The LlamaModel + LlamaContext themselves
            // stay loaded across calls.
            await callMaybeDispose(session);
            await callMaybeDispose(sequence);
        }
    }
    /**
     * Returns the shared model + context for `modelPath`, loading them on
     * first use and replacing the previously loaded model when a different
     * path is requested.
     *
     * The context window comes from `LLAMACPP_CONTEXT_SIZE` when it is a
     * positive integer, otherwise from the built-in default.
     *
     * Rethrows any load failure after releasing whatever was created.
     */
    async ensureLoaded(modelPath) {
        // Cached at MODULE scope so the disposer can find it on process
        // exit. (Per-instance caching used to live here, but the disposer
        // doesn't know which provider instance to ask.)
        if (sharedLlamaState && sharedLlamaState.modelPath === modelPath) {
            return { model: sharedLlamaState.model, context: sharedLlamaState.context };
        }
        if (sharedLlamaState) {
            // Different model requested — tear down the old one before
            // loading a new one. Best-effort; failures are tolerated.
            await disposeLlamaCppRuntime();
        }
        const requestedContextSize = Number(process.env.LLAMACPP_CONTEXT_SIZE);
        const contextSize = Number.isInteger(requestedContextSize) && requestedContextSize > 0
            ? requestedContextSize
            : DEFAULT_CONTEXT_SIZE;
        const { getLlama } = (await import('node-llama-cpp'));
        const llama = await getLlama({
            gpu: resolveGpuChoice(this.config.baseUrl),
        });
        let model;
        try {
            model = await llama.loadModel({ modelPath });
            const context = await model.createContext({ contextSize });
            sharedLlamaState = { llama, model, context, modelPath };
            return { model, context };
        }
        catch (e) {
            // Nothing was cached, so the disposer would never see these;
            // release them here before reporting the failure.
            await callMaybeDispose(model);
            await callMaybeDispose(llama);
            throw e;
        }
    }
}
|
|
197
|
+
let sharedLlamaState = null;
|
|
198
|
+
/**
|
|
199
|
+
* Release the loaded llama.cpp model + context so the process can
|
|
200
|
+
* exit cleanly.
|
|
201
|
+
*
|
|
202
|
+
* Without this, the libc++ destructor for the Metal device list
|
|
203
|
+
* aborts on `exit()` with `ggml_metal_device_free` because the
|
|
204
|
+
* device list isn't empty — same shape of teardown crash as the
|
|
205
|
+
* ONNX mutex issue, different native library. Disposing in the
|
|
206
|
+
* order session → context → model → llama lets the destructors
|
|
207
|
+
* run while the JS runtime is still healthy.
|
|
208
|
+
*
|
|
209
|
+
* Safe to call multiple times. Safe to call when no model was
|
|
210
|
+
* loaded. Errors during dispose are swallowed (the alternative is
|
|
211
|
+
* the abort we're trying to prevent).
|
|
212
|
+
*/
|
|
213
|
+
export async function disposeLlamaCppRuntime() {
    // Detach the cache first so a repeated or overlapping call sees
    // nothing left to release.
    const released = sharedLlamaState;
    sharedLlamaState = null;
    if (!released) {
        return;
    }
    // Teardown order matters: the context depends on the model, and the
    // model depends on the Llama runtime, so release them in that order.
    const teardownOrder = [released.context, released.model, released.llama];
    for (const resource of teardownOrder) {
        await callMaybeDispose(resource);
    }
}
|
|
225
|
+
/**
 * Calls `target.dispose()` when `target` is an object that has one, and
 * swallows anything that goes wrong.
 *
 * A synchronous throw from `dispose()` is ignored; when `dispose()`
 * returns a thenable, its rejection is ignored as well. Non-objects and
 * objects without a `dispose` function are left untouched.
 */
async function callMaybeDispose(target) {
    const isObject = Boolean(target) && typeof target === 'object';
    if (!isObject) {
        return;
    }
    const candidate = target;
    if (typeof candidate.dispose !== 'function') {
        return;
    }
    try {
        const outcome = candidate.dispose();
        const isThenable = outcome && typeof outcome.then === 'function';
        if (isThenable) {
            await outcome.catch(() => undefined);
        }
    }
    catch {
        // ignore
    }
}
|
|
241
|
+
/**
 * Picks the model path to use and confirms it exists.
 *
 * A non-empty `explicit` value wins over `LLAMACPP_MODEL_PATH`. Absolute
 * paths are checked as given; relative paths are resolved against the
 * current working directory. Returns the usable path, or `null` when no
 * path is configured or nothing exists at it.
 */
function resolveModelPath(explicit) {
    let candidate = process.env.LLAMACPP_MODEL_PATH;
    if (explicit && explicit.length > 0) {
        candidate = explicit;
    }
    if (!candidate) {
        return null;
    }
    const location = nodePath.isAbsolute(candidate)
        ? candidate
        : nodePath.resolve(process.cwd(), candidate);
    if (!existsSync(location)) {
        return null;
    }
    return location;
}
|
|
252
|
+
/**
 * Maps `LLAMACPP_GPU` onto the value handed to `getLlama({ gpu })`.
 *
 * The variable is trimmed and lower-cased first. `metal` and `cuda` pass
 * through, `off` / `false` / `no` / `cpu` yield `false`, and anything
 * else — including an unset variable — yields `'auto'`. The `_baseUrl`
 * argument is accepted but not used.
 */
function resolveGpuChoice(_baseUrl) {
    const normalized = (process.env.LLAMACPP_GPU ?? 'auto').trim().toLowerCase();
    switch (normalized) {
        case 'metal':
            return 'metal';
        case 'cuda':
            return 'cuda';
        case 'off':
        case 'false':
        case 'no':
        case 'cpu':
            return false;
        default:
            return 'auto';
    }
}
|
|
262
|
+
/**
 * Joins the content of every system-role message with a blank line.
 * Yields an empty string when there are no system messages.
 */
function collectSystemPrompt(messages) {
    const systemTexts = [];
    for (const message of messages) {
        if (message.role === AiMessageRole.System) {
            systemTexts.push(message.content);
        }
    }
    return systemTexts.join('\n\n');
}
|
|
266
|
+
/**
 * Returns the conversation turns with system-role messages removed,
 * in their original order.
 */
function nonSystemTurns(messages) {
    const turns = [];
    for (const message of messages) {
        if (message.role === AiMessageRole.System) {
            continue;
        }
        turns.push(message);
    }
    return turns;
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { type AppError, type Result } from '@shrkcrft/core';
|
|
2
|
+
import { AbstractAiProvider } from '../ai-provider.js';
|
|
3
|
+
import { type IAiRequest, type IAiResponse } from '../ai-request.js';
|
|
4
|
+
/**
|
|
5
|
+
* HTTP adapter for a local Ollama instance (https://ollama.com).
|
|
6
|
+
*
|
|
7
|
+
* Unlike Gemini/Claude, Ollama is host-based and does not need an API
|
|
8
|
+
* key — `isReady()` is always true; the actual reachability check is
|
|
9
|
+
* deferred to `send()`. The host is picked from `OLLAMA_HOST` (or the
|
|
10
|
+
* provider config). Two forms are accepted:
|
|
11
|
+
* - A full URL, e.g. `OLLAMA_HOST=http://my-box:11434`.
|
|
12
|
+
* - A bare hostname (or IP) when paired with `OLLAMA_PORT`, e.g.
|
|
13
|
+
* `OLLAMA_HOST=my-box` + `OLLAMA_PORT=11434`. The URL is assembled
|
|
14
|
+
* as `http://<host>:<port>`.
|
|
15
|
+
* Falls back to `http://localhost:11434`. The default model comes from
|
|
16
|
+
* `OLLAMA_MODEL` and may be overridden per request.
|
|
17
|
+
*
|
|
18
|
+
* Wire format: `POST /api/chat` with `{model, messages, stream:false,
|
|
19
|
+
* format?, options}`. The provider-neutral `IAiMessage` roles map
|
|
20
|
+
* directly onto Ollama roles. When `responseFormat` is supplied we ask
|
|
21
|
+
* Ollama for structured output — newer servers accept a JSON-schema
|
|
22
|
+
* object as `format`, older servers fall back to `format: "json"`.
|
|
23
|
+
*/
|
|
24
|
+
export declare class OllamaProvider extends AbstractAiProvider {
|
|
25
|
+
readonly id = "ollama";
|
|
26
|
+
readonly name = "Ollama (local HTTP)";
|
|
27
|
+
isReady(): boolean;
|
|
28
|
+
/**
|
|
29
|
+
* One-shot preflight against `GET /api/tags`.
|
|
30
|
+
*
|
|
31
|
+
* Why this exists: Ollama is the one provider whose readiness is
|
|
32
|
+
* decoupled from env (the daemon may be down, the model may not be
|
|
33
|
+
* pulled). The two-stage planner calls this *before* stage 1 so it
|
|
34
|
+
* can fail with `ollama serve` / `ollama pull <model>` hints instead
|
|
35
|
+
* of a confusing network error mid-call.
|
|
36
|
+
*
|
|
37
|
+
* `requireModel` (optional) is checked against the server's tag list
|
|
38
|
+
* and reported separately so the caller can build a precise hint.
|
|
39
|
+
*/
|
|
40
|
+
healthCheck(requireModel?: string): Promise<Result<{
|
|
41
|
+
host: string;
|
|
42
|
+
models: string[];
|
|
43
|
+
modelPresent: boolean | null;
|
|
44
|
+
}, AppError>>;
|
|
45
|
+
send(request: IAiRequest): Promise<Result<IAiResponse, AppError>>;
|
|
46
|
+
}
|
|
47
|
+
//# sourceMappingURL=ollama-provider.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ollama-provider.d.ts","sourceRoot":"","sources":["../../src/ollama/ollama-provider.ts"],"names":[],"mappings":"AAAA,OAAO,EAAsC,KAAK,QAAQ,EAAE,KAAK,MAAM,EAAE,MAAM,gBAAgB,CAAC;AAChG,OAAO,EAAE,kBAAkB,EAAE,MAAM,mBAAmB,CAAC;AACvD,OAAO,EAAiB,KAAK,UAAU,EAAE,KAAK,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAMpF;;;;;;;;;;;;;;;;;;;GAmBG;AACH,qBAAa,cAAe,SAAQ,kBAAkB;IACpD,QAAQ,CAAC,EAAE,YAAY;IACvB,QAAQ,CAAC,IAAI,yBAAyB;IAEtC,OAAO,IAAI,OAAO;IAIlB;;;;;;;;;;;OAWG;IACG,WAAW,CACf,YAAY,CAAC,EAAE,MAAM,GACpB,OAAO,CAAC,MAAM,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,EAAE,CAAC;QAAC,YAAY,EAAE,OAAO,GAAG,IAAI,CAAA;KAAE,EAAE,QAAQ,CAAC,CAAC;IA+BxF,IAAI,CAAC,OAAO,EAAE,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;CAkExE"}
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
import { AppErrorImpl, ERROR_CODES, err, ok } from '@shrkcrft/core';
|
|
2
|
+
import { AbstractAiProvider } from "../ai-provider.js";
|
|
3
|
+
import { AiMessageRole } from "../ai-request.js";
|
|
4
|
+
// Port Ollama listens on when nothing else is configured.
const DEFAULT_OLLAMA_PORT = 11434;
// Model used when neither the request, the config nor OLLAMA_MODEL names one.
const DEFAULT_OLLAMA_MODEL = 'llama3.1';
// Base URL used when neither config nor OLLAMA_HOST / OLLAMA_PORT is set.
const DEFAULT_OLLAMA_HOST = `http://localhost:${DEFAULT_OLLAMA_PORT}`;
|
|
7
|
+
/**
 * HTTP adapter for a local Ollama instance (https://ollama.com).
 *
 * No API key is involved, so `isReady()` is unconditionally true;
 * whether the daemon is actually reachable is only discovered by
 * `healthCheck()` or `send()`. The base URL comes from the provider
 * config or, failing that, from `OLLAMA_HOST` / `OLLAMA_PORT` (see
 * `resolveBaseUrl`). The model is taken from the request, then the
 * provider config, then `OLLAMA_MODEL`, then the built-in default.
 *
 * Wire format: `POST /api/chat` with `{model, messages, stream:false,
 * format?, options}`. When `responseFormat` is supplied, `format` is
 * either the caller's JSON schema or the string `"json"`.
 */
export class OllamaProvider extends AbstractAiProvider {
    id = 'ollama';
    name = 'Ollama (local HTTP)';
    /** Always `true`: there is no credential to check for a local daemon. */
    isReady() {
        return true;
    }
    /**
     * One-shot preflight against `GET /api/tags`.
     *
     * Lets the caller fail early with an `ollama serve` /
     * `ollama pull <model>` hint instead of a network error mid-call.
     *
     * @param requireModel Optional model name to look up in the server's
     *   tag list. A name without a tag is treated as `<name>:latest`,
     *   which is how Ollama itself resolves untagged names.
     * @returns `ok({ host, models, modelPresent })`, where `modelPresent`
     *   is `null` when no `requireModel` was given, or `err(AppError)`
     *   when the server answers with a non-2xx status or is unreachable.
     */
    async healthCheck(requireModel) {
        const baseUrl = resolveBaseUrl(this.config.baseUrl);
        try {
            const res = await fetch(`${baseUrl}/api/tags`, { method: 'GET' });
            if (!res.ok) {
                return err(new AppErrorImpl(ERROR_CODES.IO_ERROR, `Ollama health-check failed at ${baseUrl}/api/tags (HTTP ${res.status})`, { suggestion: `Is OLLAMA_HOST correct? Currently ${baseUrl}.` }));
            }
            const json = (await res.json());
            const models = (json.models ?? []).map((m) => m.name ?? '').filter((n) => n.length > 0);
            const modelPresent = requireModel ? isModelListed(models, requireModel) : null;
            return ok({ host: baseUrl, models, modelPresent });
        }
        catch (e) {
            return err(new AppErrorImpl(ERROR_CODES.IO_ERROR, `Cannot reach Ollama at ${baseUrl}: ${describeThrown(e)}`, {
                cause: e,
                suggestion: `Start the daemon (\`ollama serve\`) or set OLLAMA_HOST to a reachable instance.`,
            }));
        }
    }
    /**
     * Send one non-streaming chat request to `POST /api/chat`.
     *
     * @param request Provider-neutral request; `model`, `maxTokens`
     *   (default 4096), `temperature` and `responseFormat` are honoured.
     * @returns `ok(response)` with content, model, finish reason, token
     *   usage and the raw payload, or `err(AppError)` on a non-2xx
     *   status or when the call itself fails.
     */
    async send(request) {
        const baseUrl = resolveBaseUrl(this.config.baseUrl);
        const model = request.model ?? this.config.model ?? process.env.OLLAMA_MODEL ?? DEFAULT_OLLAMA_MODEL;
        const maxTokens = request.maxTokens ?? 4096;
        const messages = request.messages.map((m) => ({
            role: roleFor(m.role),
            content: m.content,
        }));
        const body = {
            model,
            messages,
            stream: false,
            options: {
                num_predict: maxTokens,
                // Only forward a temperature the caller actually chose.
                ...(request.temperature !== undefined ? { temperature: request.temperature } : {}),
            },
        };
        const format = formatFor(request.responseFormat);
        if (format !== undefined)
            body.format = format;
        try {
            const res = await fetch(`${baseUrl}/api/chat`, {
                method: 'POST',
                headers: { 'content-type': 'application/json' },
                body: JSON.stringify(body),
            });
            if (!res.ok) {
                const text = await res.text();
                return err(new AppErrorImpl(ERROR_CODES.IO_ERROR, `Ollama API ${res.status}: ${text.slice(0, 500)}`, {
                    suggestion: `Check OLLAMA_HOST (currently ${baseUrl}) and that the model "${model}" is pulled (\`ollama pull ${model}\`).`,
                }));
            }
            const json = (await res.json());
            const content = json.message?.content ?? '';
            return ok({
                content,
                model: json.model ?? model,
                finishReason: json.done_reason,
                usage: {
                    inputTokens: json.prompt_eval_count,
                    outputTokens: json.eval_count,
                },
                raw: json,
            });
        }
        catch (e) {
            return err(new AppErrorImpl(ERROR_CODES.IO_ERROR, `Failed to call Ollama at ${baseUrl}: ${describeThrown(e)}`, {
                cause: e,
                suggestion: `Is Ollama running? Try \`ollama serve\` or set OLLAMA_HOST to a reachable instance.`,
            }));
        }
    }
}
/**
 * Tell whether `wanted` appears in the server's model list, treating a
 * name without a tag as `<name>:latest` on both sides of the comparison.
 */
function isModelListed(models, wanted) {
    const withDefaultTag = (name) => {
        // Only the last path segment can carry a tag; a colon earlier in
        // the name belongs to a registry host:port.
        const lastSegment = name.slice(name.lastIndexOf('/') + 1);
        return lastSegment.includes(':') ? name : `${name}:latest`;
    };
    const target = withDefaultTag(wanted);
    return models.some((listed) => withDefaultTag(listed) === target);
}
/** Render a caught value for an error message without assuming it is an `Error`. */
function describeThrown(thrown) {
    return thrown instanceof Error ? thrown.message : String(thrown);
}
|
|
117
|
+
function roleFor(role) {
|
|
118
|
+
if (role === AiMessageRole.System)
|
|
119
|
+
return 'system';
|
|
120
|
+
if (role === AiMessageRole.Assistant)
|
|
121
|
+
return 'assistant';
|
|
122
|
+
return 'user';
|
|
123
|
+
}
|
|
124
|
+
/**
 * Derive the `format` field of the chat request.
 *
 * @returns `undefined` when no response format was requested, the
 *   caller's schema for a `json_schema` request that carries one, and
 *   the string `'json'` in every other case.
 */
function formatFor(responseFormat) {
    if (!responseFormat) {
        return undefined;
    }
    const wantsSchema = responseFormat.type === 'json_schema';
    return wantsSchema && responseFormat.schema ? responseFormat.schema : 'json';
}
|
|
132
|
+
/** Drop a single trailing `/` from `url`, if there is one. */
function stripTrailingSlash(url) {
    return url.replace(/\/$/, '');
}
|
|
135
|
+
/**
|
|
136
|
+
* Resolve the Ollama base URL from config + env. Accepts:
|
|
137
|
+
* - An explicit base URL on the provider config (`baseUrl`).
|
|
138
|
+
* - `OLLAMA_HOST` as a full URL (`http://my-box:11434`).
|
|
139
|
+
* - `OLLAMA_HOST` as a bare host (`my-box`) paired with
|
|
140
|
+
* `OLLAMA_PORT` (default 11434 if only host is given).
|
|
141
|
+
* - Falls back to `http://localhost:11434`.
|
|
142
|
+
*
|
|
143
|
+
* Why split host/port: lets the user point at a remote Ollama with two
|
|
144
|
+
* dotenv entries instead of having to remember the URL form. Both
|
|
145
|
+
* styles coexist; if `OLLAMA_HOST` already contains a scheme we keep
|
|
146
|
+
* it verbatim and ignore `OLLAMA_PORT` (the URL is authoritative).
|
|
147
|
+
*/
|
|
148
|
+
function resolveBaseUrl(configBaseUrl) {
|
|
149
|
+
if (configBaseUrl && configBaseUrl.length > 0) {
|
|
150
|
+
return stripTrailingSlash(configBaseUrl);
|
|
151
|
+
}
|
|
152
|
+
const rawHost = (process.env.OLLAMA_HOST ?? '').trim();
|
|
153
|
+
const rawPort = (process.env.OLLAMA_PORT ?? '').trim();
|
|
154
|
+
if (rawHost.length === 0 && rawPort.length === 0) {
|
|
155
|
+
return DEFAULT_OLLAMA_HOST;
|
|
156
|
+
}
|
|
157
|
+
if (rawHost.length > 0 && /^https?:\/\//i.test(rawHost)) {
|
|
158
|
+
// Full URL form takes precedence — OLLAMA_PORT is intentionally
|
|
159
|
+
// ignored so users can't end up with two conflicting sources of
|
|
160
|
+
// truth.
|
|
161
|
+
return stripTrailingSlash(rawHost);
|
|
162
|
+
}
|
|
163
|
+
const host = rawHost.length > 0 ? rawHost : 'localhost';
|
|
164
|
+
const port = rawPort.length > 0 ? rawPort : String(DEFAULT_OLLAMA_PORT);
|
|
165
|
+
return `http://${host}:${port}`;
|
|
166
|
+
}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import { type AppError, type Result } from '@shrkcrft/core';
|
|
2
|
+
import type { IAiProvider } from '../ai-provider.js';
|
|
3
|
+
import { type IAiMessage } from '../ai-request.js';
|
|
4
|
+
/**
 * Names the stages of the multi-pass enhancement pipeline.
 *
 * The default stage list runs them in the order
 * `draft → critique → refine → polish`; a caller may supply its own
 * list to shorten, extend or reorder the flow.
 */
export declare enum EnhancementStageKind {
    /** First pass: write the initial brief. */
    Draft = "draft",
    /** Review pass: list problems in the current brief. */
    Critique = "critique",
    /** Rewrite pass: address the critique. */
    Refine = "refine",
    /** Final pass: tidy the brief for the consuming agent. */
    Polish = "polish"
}
|
|
17
|
+
/** Everything a stage receives when it builds its prompt. */
export interface IEnhancementStageInput {
    /** Deterministic ground truth assembled by the engine. */
    originalContext: string;
    /** The user's original task or question. */
    task: string;
    /** Text handed over from the preceding stage; empty for the first stage. */
    previous: string;
    /** Text of the most recent `critique` stage, when there is one. */
    lastCritique?: string;
}
|
|
27
|
+
/** One step of the enhancement pipeline. */
export interface IEnhancementStage {
    /** Which stage this is. */
    kind: EnhancementStageKind;
    /**
     * Produce the messages the LLM should receive for this stage.
     * A stage only builds prompts; the orchestrator owns the provider,
     * the retries and the bookkeeping.
     */
    buildMessages(input: IEnhancementStageInput): IAiMessage[];
}
|
|
35
|
+
/** Outcome of a single pipeline stage. */
export interface IEnhancementStageResult {
    /** Stage that produced this entry. */
    kind: EnhancementStageKind;
    /** Text recorded for the stage. */
    content: string;
    /** Model that served the stage. */
    model: string;
    /** Set when the stage failed and the previous-stage output was kept. */
    degraded?: boolean;
    /** Failure description accompanying a degraded stage. */
    errorMessage?: string;
    /** Token counts for the stage, when the provider reported them. */
    usage?: {
        inputTokens?: number;
        outputTokens?: number;
    };
}
|
|
47
|
+
/** Tuning knobs for one pipeline run. */
export interface IEnhancementPipelineOptions {
    /** Upper bound on the number of stages to run. Default: all stages. */
    maxPasses?: number;
    /** `maxTokens` sent with every stage request. Default: 4096. */
    maxTokensPerStage?: number;
    /** `temperature` sent with every stage request. Default: 0.2. */
    temperature?: number;
    /** Model override forwarded to the provider on every call. */
    model?: string;
    /**
     * Progress hook, invoked once per stage with the stage kind, whether
     * it succeeded, its 1-based position and the number of planned stages.
     */
    onStage?: (event: {
        kind: EnhancementStageKind;
        ok: boolean;
        pass: number;
        total: number;
    }) => void;
}
|
|
64
|
+
/** Result of a complete pipeline run. */
export interface IEnhancementPipelineRun {
    /**
     * Final enriched text. Always defined: when every stage failed it
     * is the `originalContext`.
     */
    finalOutput: string;
    /** Per-stage history, in execution order. */
    stages: IEnhancementStageResult[];
    /** Token usage summed over the stages that reported it. */
    totalUsage: {
        inputTokens: number;
        outputTokens: number;
    };
    /**
     * True when no provider was passed and the LLM was therefore never
     * called; `finalOutput` is then the deterministic seed unchanged.
     */
    deterministicFallback: boolean;
}
|
|
81
|
+
/**
 * Multi-pass refinement pipeline: the LLM drafts a brief from the
 * deterministic context, critiques it and rewrites it.
 *
 * Contract:
 *   - Without a provider the run returns `originalContext` unchanged
 *     and sets `deterministicFallback`.
 *   - With a provider each stage call is retried once; a stage that
 *     still fails is recorded as degraded and the last good text is
 *     carried forward, so failures surface in the stage history rather
 *     than as exceptions.
 *   - The stage list is supplied by the caller, e.g. a two-stage
 *     `[draft, polish]` fast path or custom critique prompts.
 *
 * Rationale for several passes instead of one rich prompt: small local
 * models respond better to "find the gaps in this draft" than to
 * "write the perfect brief in one shot".
 */
export declare class EnhancementPipeline {
    private readonly stages;
    /**
     * @param stages Stages to run, in order.
     */
    constructor(stages: ReadonlyArray<IEnhancementStage>);
    /**
     * Run the configured stages.
     *
     * @param input The user task and the deterministic context.
     * @param provider LLM provider, or `null` for the deterministic fallback.
     * @param options Optional limits, sampling settings and progress hook.
     * @returns The final text together with per-stage history and usage.
     */
    run(input: {
        task: string;
        originalContext: string;
    }, provider: IAiProvider | null, options?: IEnhancementPipelineOptions): Promise<Result<IEnhancementPipelineRun, AppError>>;
}
|
|
113
|
+
/**
 * Build the default stage list: draft, critique, refine, polish.
 *
 * Every stage places the deterministic repository context in the
 * system message and keeps its user message short, so the model sees
 * the same grounding on each pass.
 *
 * @returns A new array of stage instances in execution order.
 */
export declare function buildDefaultEnhancementStages(): IEnhancementStage[];
|
|
123
|
+
//# sourceMappingURL=enhancement-pipeline.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"enhancement-pipeline.d.ts","sourceRoot":"","sources":["../../src/pipeline/enhancement-pipeline.ts"],"names":[],"mappings":"AAAA,OAAO,EAAsC,KAAK,QAAQ,EAAE,KAAK,MAAM,EAAE,MAAM,gBAAgB,CAAC;AAChG,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,EAAiB,KAAK,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAElE;;;;;;GAMG;AACH,oBAAY,oBAAoB;IAC9B,KAAK,UAAU;IACf,QAAQ,aAAa;IACrB,MAAM,WAAW;IACjB,MAAM,WAAW;CAClB;AAED,MAAM,WAAW,sBAAsB;IACrC,8DAA8D;IAC9D,eAAe,EAAE,MAAM,CAAC;IACxB,yCAAyC;IACzC,IAAI,EAAE,MAAM,CAAC;IACb,+DAA+D;IAC/D,QAAQ,EAAE,MAAM,CAAC;IACjB,iEAAiE;IACjE,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB;AAED,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,oBAAoB,CAAC;IAC3B;;;OAGG;IACH,aAAa,CAAC,KAAK,EAAE,sBAAsB,GAAG,UAAU,EAAE,CAAC;CAC5D;AAED,MAAM,WAAW,uBAAuB;IACtC,IAAI,EAAE,oBAAoB,CAAC;IAC3B,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,uEAAuE;IACvE,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,KAAK,CAAC,EAAE;QAAE,WAAW,CAAC,EAAE,MAAM,CAAC;QAAC,YAAY,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;CACzD;AAED,MAAM,WAAW,2BAA2B;IAC1C,6EAA6E;IAC7E,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,4CAA4C;IAC5C,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,iEAAiE;IACjE,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,yEAAyE;IACzE,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,sDAAsD;IACtD,OAAO,CAAC,EAAE,CAAC,KAAK,EAAE;QAAE,IAAI,EAAE,oBAAoB,CAAC;QAAC,EAAE,EAAE,OAAO,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,KAAK,IAAI,CAAC;CACrG;AAED,MAAM,WAAW,uBAAuB;IACtC,uGAAuG;IACvG,WAAW,EAAE,MAAM,CAAC;IACpB,mCAAmC;IACnC,MAAM,EAAE,uBAAuB,EAAE,CAAC;IAClC,4EAA4E;IAC5E,UAAU,EAAE;QAAE,WAAW,EAAE,MAAM,CAAC;QAAC,YAAY,EAAE,MAAM,CAAA;KAAE,CAAC;IAC1D;;;;OAIG;IACH,qBAAqB,EAAE,OAAO,CAAC;CAChC;AAED;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,qBAAa,mBAAmB;IAC9B,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAmC;gBAE9C,MAAM,EAAE,aAAa,CAAC,iBAAiB,CAAC;IAI9C,GAAG,CACP,KAAK,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,eAAe,EAAE,MAAM,CAAA;KAAE,EAChD,QAAQ,EAAE,WAAW,GAAG,IAAI,EAC5B,OAAO,GAAE,2BAAgC,GACxC,OAAO,CAAC,MAAM,CAAC,uBAAuB,EAAE,QAAQ,CAAC,CAAC;CAmFtD;AAED;;;;;;;;GAQG;AACH,wBAAgB,6BAA6B,IAAI,iBAAiB,EAAE,CAOnE"}
|
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
import { AppErrorImpl, ERROR_CODES, err, ok } from '@shrkcrft/core';
|
|
2
|
+
import { AiMessageRole } from "../ai-request.js";
|
|
3
|
+
/**
 * Names the stages of the multi-pass enhancement pipeline.
 *
 * The default stage list runs them in the order
 * `draft → critique → refine → polish`; a caller may supply its own
 * list to shorten, extend or reorder the flow.
 */
export var EnhancementStageKind;
// Reuse an existing object if one is already bound, otherwise start fresh.
EnhancementStageKind || (EnhancementStageKind = {});
Object.assign(EnhancementStageKind, {
    Draft: 'draft',
    Critique: 'critique',
    Refine: 'refine',
    Polish: 'polish',
});
|
|
17
|
+
/**
|
|
18
|
+
* Multi-pass refinement pipeline that turns a deterministic brief into
|
|
19
|
+
* a denser, more agent-ready artefact by making the LLM critique and
|
|
20
|
+
* rewrite its own work.
|
|
21
|
+
*
|
|
22
|
+
* Design contract:
|
|
23
|
+
* - When no provider is supplied, the pipeline returns the
|
|
24
|
+
* `originalContext` unchanged and flags `deterministicFallback`.
|
|
25
|
+
* The deterministic engine remains the source of truth.
|
|
26
|
+
* - When a provider is supplied, every stage call is retried-once on
|
|
27
|
+
* failure; a permanently-failed stage degrades to the previous
|
|
28
|
+
* stage's output (the pipeline never throws and never produces
|
|
29
|
+
* less than the deterministic input).
|
|
30
|
+
* - Stages compose: a caller can pass a 2-stage `[draft, polish]`
|
|
31
|
+
* pipeline for fast paths, or extend with custom critique prompts
|
|
32
|
+
* for project-specific quality bars.
|
|
33
|
+
*
|
|
34
|
+
* Why a pipeline (vs. a single rich prompt): small local models behave
|
|
35
|
+
* dramatically better when asked to "find the gaps in this draft" than
|
|
36
|
+
* when asked to "write the perfect brief in one shot". The critique
|
|
37
|
+
* pass surfaces vague claims and missing evidence; the refine pass
|
|
38
|
+
* fixes them; the polish pass enforces Claude-agent ergonomics
|
|
39
|
+
* (file:line refs, explicit next commands, terse bullets).
|
|
40
|
+
*/
|
|
41
|
+
export class EnhancementPipeline {
|
|
42
|
+
stages;
|
|
43
|
+
constructor(stages) {
|
|
44
|
+
this.stages = stages;
|
|
45
|
+
}
|
|
46
|
+
async run(input, provider, options = {}) {
|
|
47
|
+
if (!provider) {
|
|
48
|
+
return ok({
|
|
49
|
+
finalOutput: input.originalContext,
|
|
50
|
+
stages: [],
|
|
51
|
+
totalUsage: { inputTokens: 0, outputTokens: 0 },
|
|
52
|
+
deterministicFallback: true,
|
|
53
|
+
});
|
|
54
|
+
}
|
|
55
|
+
const cap = options.maxPasses ?? this.stages.length;
|
|
56
|
+
const plan = this.stages.slice(0, Math.max(1, cap));
|
|
57
|
+
const stagesOut = [];
|
|
58
|
+
const totalUsage = { inputTokens: 0, outputTokens: 0 };
|
|
59
|
+
let previous = '';
|
|
60
|
+
let lastCritique;
|
|
61
|
+
let lastGood = input.originalContext;
|
|
62
|
+
for (let i = 0; i < plan.length; i += 1) {
|
|
63
|
+
const stage = plan[i];
|
|
64
|
+
const messages = stage.buildMessages({
|
|
65
|
+
originalContext: input.originalContext,
|
|
66
|
+
task: input.task,
|
|
67
|
+
previous,
|
|
68
|
+
lastCritique,
|
|
69
|
+
});
|
|
70
|
+
const stageResult = await callOnceWithRetry(provider, {
|
|
71
|
+
messages,
|
|
72
|
+
maxTokens: options.maxTokensPerStage ?? 4096,
|
|
73
|
+
temperature: options.temperature ?? 0.2,
|
|
74
|
+
...(options.model ? { model: options.model } : {}),
|
|
75
|
+
});
|
|
76
|
+
const onStage = options.onStage;
|
|
77
|
+
if (!stageResult.ok) {
|
|
78
|
+
stagesOut.push({
|
|
79
|
+
kind: stage.kind,
|
|
80
|
+
content: lastGood,
|
|
81
|
+
model: options.model ?? '',
|
|
82
|
+
degraded: true,
|
|
83
|
+
errorMessage: stageResult.error.message,
|
|
84
|
+
});
|
|
85
|
+
if (onStage)
|
|
86
|
+
onStage({ kind: stage.kind, ok: false, pass: i + 1, total: plan.length });
|
|
87
|
+
// Stage failed: keep last-good output but allow the pipeline to
|
|
88
|
+
// continue. A failed `critique` is recoverable (`refine` just
|
|
89
|
+
// gets no critique). A failed `refine` falls back to the prior
|
|
90
|
+
// draft. A failed `polish` returns the refined draft.
|
|
91
|
+
previous = lastGood;
|
|
92
|
+
continue;
|
|
93
|
+
}
|
|
94
|
+
const content = (stageResult.value.content ?? '').trim();
|
|
95
|
+
const usage = stageResult.value.usage ?? {};
|
|
96
|
+
if (typeof usage.inputTokens === 'number')
|
|
97
|
+
totalUsage.inputTokens += usage.inputTokens;
|
|
98
|
+
if (typeof usage.outputTokens === 'number')
|
|
99
|
+
totalUsage.outputTokens += usage.outputTokens;
|
|
100
|
+
stagesOut.push({
|
|
101
|
+
kind: stage.kind,
|
|
102
|
+
content,
|
|
103
|
+
model: stageResult.value.model,
|
|
104
|
+
...(usage.inputTokens || usage.outputTokens ? { usage } : {}),
|
|
105
|
+
});
|
|
106
|
+
if (stage.kind === EnhancementStageKind.Critique) {
|
|
107
|
+
lastCritique = content;
|
|
108
|
+
// Critique is not a candidate for `finalOutput` — keep the
|
|
109
|
+
// previous draft as the running best.
|
|
110
|
+
}
|
|
111
|
+
else {
|
|
112
|
+
previous = content;
|
|
113
|
+
lastGood = content;
|
|
114
|
+
}
|
|
115
|
+
if (onStage)
|
|
116
|
+
onStage({ kind: stage.kind, ok: true, pass: i + 1, total: plan.length });
|
|
117
|
+
}
|
|
118
|
+
return ok({
|
|
119
|
+
finalOutput: lastGood,
|
|
120
|
+
stages: stagesOut,
|
|
121
|
+
totalUsage,
|
|
122
|
+
deterministicFallback: false,
|
|
123
|
+
});
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
/**
|
|
127
|
+
* The default stage set for "make this brief more useful to the Claude
|
|
128
|
+
* agent". Tuned for small local models (Qwen2.5-Coder-3B, Llama-3.1-8B).
|
|
129
|
+
*
|
|
130
|
+
* Each stage's user message is intentionally short and concrete; the
|
|
131
|
+
* heavy lifting (the deterministic seed) lives in the system role
|
|
132
|
+
* and is reused verbatim across stages so the model never loses
|
|
133
|
+
* grounding.
|
|
134
|
+
*/
|
|
135
|
+
export function buildDefaultEnhancementStages() {
|
|
136
|
+
return [
|
|
137
|
+
new DraftStage(),
|
|
138
|
+
new CritiqueStage(),
|
|
139
|
+
new RefineStage(),
|
|
140
|
+
new PolishStage(),
|
|
141
|
+
];
|
|
142
|
+
}
|
|
143
|
+
class DraftStage {
|
|
144
|
+
kind = EnhancementStageKind.Draft;
|
|
145
|
+
buildMessages(input) {
|
|
146
|
+
return [
|
|
147
|
+
{
|
|
148
|
+
role: AiMessageRole.System,
|
|
149
|
+
content: [
|
|
150
|
+
'You are SharkCraft, a deterministic, local-first code-intelligence engine.',
|
|
151
|
+
'Your job is to write a concise, Claude-agent-ready brief for the supplied task.',
|
|
152
|
+
'Treat the repository context below as the ONLY ground truth. Do NOT invent file paths, symbols, or commands.',
|
|
153
|
+
'',
|
|
154
|
+
'## Repository context',
|
|
155
|
+
input.originalContext.trim(),
|
|
156
|
+
].join('\n'),
|
|
157
|
+
},
|
|
158
|
+
{
|
|
159
|
+
role: AiMessageRole.User,
|
|
160
|
+
content: [
|
|
161
|
+
`# Task`,
|
|
162
|
+
input.task.trim(),
|
|
163
|
+
'',
|
|
164
|
+
'# Write the draft brief',
|
|
165
|
+
'Sections, in order:',
|
|
166
|
+
'1. **Goal** — one sentence.',
|
|
167
|
+
'2. **Files to read** — bullet list, `path` (no line numbers, just path) with one-line rationale.',
|
|
168
|
+
'3. **Files likely to modify** — bullet list, same format.',
|
|
169
|
+
'4. **Implementation sketch** — 3–6 bullets, imperative.',
|
|
170
|
+
'5. **Risks / unknowns** — bullets; mark each "RISK" or "UNKNOWN".',
|
|
171
|
+
'6. **First commands** — fenced bash, one command per line.',
|
|
172
|
+
'',
|
|
173
|
+
'Be terse. Skip prose. Skip preambles. Skip "I will now…".',
|
|
174
|
+
].join('\n'),
|
|
175
|
+
},
|
|
176
|
+
];
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
class CritiqueStage {
|
|
180
|
+
kind = EnhancementStageKind.Critique;
|
|
181
|
+
buildMessages(input) {
|
|
182
|
+
return [
|
|
183
|
+
{
|
|
184
|
+
role: AiMessageRole.System,
|
|
185
|
+
content: [
|
|
186
|
+
'You are a code-review style critic for SharkCraft briefs.',
|
|
187
|
+
'Treat the repository context below as the ONLY ground truth.',
|
|
188
|
+
'',
|
|
189
|
+
'## Repository context',
|
|
190
|
+
input.originalContext.trim(),
|
|
191
|
+
].join('\n'),
|
|
192
|
+
},
|
|
193
|
+
{
|
|
194
|
+
role: AiMessageRole.User,
|
|
195
|
+
content: [
|
|
196
|
+
`# Original task`,
|
|
197
|
+
input.task.trim(),
|
|
198
|
+
'',
|
|
199
|
+
`# Draft brief to critique`,
|
|
200
|
+
input.previous.trim() || '(empty)',
|
|
201
|
+
'',
|
|
202
|
+
'# Critique',
|
|
203
|
+
'Find concrete issues. For each issue: one line, prefixed with one of:',
|
|
204
|
+
'- `GAP:` — something important the brief omits.',
|
|
205
|
+
'- `VAGUE:` — a claim that lacks an exact file path, symbol, or command.',
|
|
206
|
+
'- `WRONG:` — a claim that contradicts the repository context.',
|
|
207
|
+
'- `MISSING-EVIDENCE:` — a claim with no file:line or knowledge-entry id behind it.',
|
|
208
|
+
'',
|
|
209
|
+
'If the draft is already strong, output a single line: `OK`.',
|
|
210
|
+
'Do NOT rewrite the brief. Critique only.',
|
|
211
|
+
].join('\n'),
|
|
212
|
+
},
|
|
213
|
+
];
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
class RefineStage {
|
|
217
|
+
kind = EnhancementStageKind.Refine;
|
|
218
|
+
buildMessages(input) {
|
|
219
|
+
return [
|
|
220
|
+
{
|
|
221
|
+
role: AiMessageRole.System,
|
|
222
|
+
content: [
|
|
223
|
+
'You are SharkCraft. Rewrite the draft brief to address the critique, while staying strictly grounded in the repository context.',
|
|
224
|
+
'',
|
|
225
|
+
'## Repository context',
|
|
226
|
+
input.originalContext.trim(),
|
|
227
|
+
].join('\n'),
|
|
228
|
+
},
|
|
229
|
+
{
|
|
230
|
+
role: AiMessageRole.User,
|
|
231
|
+
content: [
|
|
232
|
+
`# Original task`,
|
|
233
|
+
input.task.trim(),
|
|
234
|
+
'',
|
|
235
|
+
`# Draft brief`,
|
|
236
|
+
input.previous.trim() || '(empty)',
|
|
237
|
+
'',
|
|
238
|
+
`# Critique to address`,
|
|
239
|
+
(input.lastCritique ?? 'OK').trim(),
|
|
240
|
+
'',
|
|
241
|
+
'# Rewrite the brief',
|
|
242
|
+
'Same section layout as the draft. Resolve every GAP/VAGUE/WRONG/MISSING-EVIDENCE line by adding an exact file path or removing the claim. Keep it terse.',
|
|
243
|
+
].join('\n'),
|
|
244
|
+
},
|
|
245
|
+
];
|
|
246
|
+
}
|
|
247
|
+
}
|
|
248
|
+
class PolishStage {
|
|
249
|
+
kind = EnhancementStageKind.Polish;
|
|
250
|
+
buildMessages(input) {
|
|
251
|
+
return [
|
|
252
|
+
{
|
|
253
|
+
role: AiMessageRole.System,
|
|
254
|
+
content: [
|
|
255
|
+
'You are SharkCraft. Final polish pass — improve readability for an AI coding agent (e.g. Claude Code) that will consume this brief.',
|
|
256
|
+
'Keep the meaning intact. Do not add new facts.',
|
|
257
|
+
'',
|
|
258
|
+
'## Repository context (reference only — do not extend)',
|
|
259
|
+
input.originalContext.trim(),
|
|
260
|
+
].join('\n'),
|
|
261
|
+
},
|
|
262
|
+
{
|
|
263
|
+
role: AiMessageRole.User,
|
|
264
|
+
content: [
|
|
265
|
+
`# Original task`,
|
|
266
|
+
input.task.trim(),
|
|
267
|
+
'',
|
|
268
|
+
`# Brief to polish`,
|
|
269
|
+
input.previous.trim() || '(empty)',
|
|
270
|
+
'',
|
|
271
|
+
'# Polish pass',
|
|
272
|
+
'Rules:',
|
|
273
|
+
'- Convert any `path` reference to `path:lineNumber` when a line number appears in the context (do not invent line numbers).',
|
|
274
|
+
'- Keep each bullet to one line.',
|
|
275
|
+
'- Promote any imperative verb to the start of the bullet (`Add`, `Wire`, `Replace`, …).',
|
|
276
|
+
'- Surface any RISK / UNKNOWN as a short, scannable bullet.',
|
|
277
|
+
'- Output the brief only — no meta commentary, no "Here is the polished version".',
|
|
278
|
+
].join('\n'),
|
|
279
|
+
},
|
|
280
|
+
];
|
|
281
|
+
}
|
|
282
|
+
}
|
|
283
|
+
async function callOnceWithRetry(provider, request) {
|
|
284
|
+
const first = await provider.send(request);
|
|
285
|
+
if (first.ok) {
|
|
286
|
+
return ok({ content: first.value.content, model: first.value.model, usage: first.value.usage });
|
|
287
|
+
}
|
|
288
|
+
// One retry — small local models routinely 500 on the first request
|
|
289
|
+
// after a daemon start. Idempotent reissue is safe.
|
|
290
|
+
const second = await provider.send(request);
|
|
291
|
+
if (second.ok) {
|
|
292
|
+
return ok({ content: second.value.content, model: second.value.model, usage: second.value.usage });
|
|
293
|
+
}
|
|
294
|
+
return err(new AppErrorImpl(ERROR_CODES.IO_ERROR, `Enhancement-pipeline stage failed twice: ${second.error.message}`, { cause: second.error }));
|
|
295
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import type { IAiProvider } from './ai-provider.js';
|
|
2
|
+
/** Provider selector: one of the concrete providers, or `'auto'` to let the resolver choose. */
export type AiProviderKind =
    | 'auto'
    | 'claude'
    | 'gemini'
    | 'ollama'
    | 'llamacpp';
|
|
3
|
+
/**
 * Resolve an AI provider by kind.
 *
 * Selection order:
 *   1. `kind` naming a concrete provider (`'llamacpp'`, `'ollama'`,
 *      `'claude'`, `'gemini'`; surrounding whitespace and case are
 *      ignored) is an explicit pick. The returned `provider` is `null`
 *      when that provider's `isReady()` is false.
 *   2. Otherwise (`'auto'`, `undefined`, or an unrecognised value) the
 *      `AI_PROVIDER` environment variable is consulted the same way.
 *   3. Otherwise the local-first readiness chain is walked
 *      (`llamacpp → ollama`) and the first ready provider is returned.
 *
 * Gemini and Claude are deliberately excluded from the automatic chain:
 * they are reachable only through an explicit `kind` or `AI_PROVIDER`,
 * so a hosted LLM is never contacted without being asked for.
 *
 * An unrecognised kind collapses to `'auto'`, so callers do not have to
 * validate user input themselves.
 *
 * @param kind Requested provider kind, typically raw user input.
 * @returns The kind that was resolved and the provider instance, or
 *   `null` when no ready provider was found.
 */
export declare function selectAiProvider(kind?: string): {
    requested: AiProviderKind;
    provider: IAiProvider | null;
};
|
|
28
|
+
//# sourceMappingURL=provider-resolver.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"provider-resolver.d.ts","sourceRoot":"","sources":["../src/provider-resolver.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAMpD,MAAM,MAAM,cAAc,GAAG,MAAM,GAAG,QAAQ,GAAG,QAAQ,GAAG,QAAQ,GAAG,UAAU,CAAC;AAElF;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAgB,gBAAgB,CAC9B,IAAI,CAAC,EAAE,MAAM,GACZ;IAAE,SAAS,EAAE,cAAc,CAAC;IAAC,QAAQ,EAAE,WAAW,GAAG,IAAI,CAAA;CAAE,CAmB7D"}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { ClaudeProvider } from "./claude/claude-provider.js";
|
|
2
|
+
import { GeminiProvider } from "./gemini/gemini-provider.js";
|
|
3
|
+
import { OllamaProvider } from "./ollama/ollama-provider.js";
|
|
4
|
+
import { LlamaCppProvider } from "./llamacpp/llama-cpp-provider.js";
|
|
5
|
+
/**
 * Factories for the providers that can be requested by name. Each entry
 * builds a fresh provider instance; nothing is constructed until the
 * matching kind is actually requested.
 */
const EXPLICIT_PROVIDER_FACTORIES = new Map([
    ['claude', () => new ClaudeProvider()],
    ['gemini', () => new GeminiProvider()],
    ['ollama', () => new OllamaProvider()],
    ['llamacpp', () => new LlamaCppProvider()],
]);
/**
 * Resolve an AI provider by kind.
 *
 * The selector is layered so callers can stay terse:
 *   - `selectAiProvider('llamacpp' | 'ollama' | 'claude' | 'gemini')`
 *     → explicit pick. The named provider is constructed and returned
 *     only when its `isReady()` is true; otherwise `provider` is `null`
 *     and `requested` still names the kind that was asked for, so the
 *     caller can report which provider was unavailable.
 *   - `selectAiProvider('auto')` (or `undefined`) → walk the local-first
 *     readiness chain: `llamacpp → ollama`. This is the default for
 *     SharkCraft: privacy + offline first, no surprise network calls
 *     to hosted APIs.
 *
 * Gemini and Claude are deliberately excluded from the `auto` chain.
 * They are still callable via explicit `--provider gemini` /
 * `--provider claude` (or `AI_PROVIDER=gemini` / `AI_PROVIDER=claude`)
 * for users who keep API keys around — but the system never reaches
 * out to a hosted LLM on its own.
 *
 * An unrecognised kind collapses to `'auto'` so the caller never has
 * to validate user input twice.
 *
 * @param kind Requested provider kind; may be omitted.
 * @returns `{ requested, provider }` where `requested` is the normalised
 *   kind and `provider` is a ready provider instance or `null`.
 */
export function selectAiProvider(kind) {
    const requested = normaliseKind(kind);
    const createProvider = EXPLICIT_PROVIDER_FACTORIES.get(requested);
    // Anything that is not an explicit provider name is the 'auto' case.
    if (createProvider === undefined) {
        return autoSelect();
    }
    const provider = createProvider();
    return { requested, provider: provider.isReady() ? provider : null };
}
|
|
46
|
+
/** Provider names that may be selected explicitly (argument or env var). */
const KNOWN_PROVIDER_KINDS = new Set(['claude', 'gemini', 'ollama', 'llamacpp']);
/**
 * Normalise a requested provider kind to one of the known provider names
 * or `'auto'`.
 *
 * Precedence:
 *   1. `kind`, when it is a string that (trimmed, lower-cased) names a
 *      known provider.
 *   2. The `AI_PROVIDER` environment variable, under the same matching.
 *   3. `'auto'`.
 *
 * Note that any `kind` that is not a known provider name — including the
 * literal `'auto'` — falls through to the `AI_PROVIDER` check.
 *
 * @param kind Raw user-supplied kind. Non-string values (`undefined`,
 *   `null`, numbers, …) are treated as "not specified" instead of throwing.
 * @returns `'claude' | 'gemini' | 'ollama' | 'llamacpp' | 'auto'`.
 */
function normaliseKind(kind) {
    // Guard on the type rather than `!== undefined`: `null` or a non-string
    // would otherwise crash on `.trim()`.
    if (typeof kind === 'string') {
        const explicit = kind.trim().toLowerCase();
        if (KNOWN_PROVIDER_KINDS.has(explicit)) {
            return explicit;
        }
    }
    const envCandidate = (process.env.AI_PROVIDER ?? '').trim().toLowerCase();
    return KNOWN_PROVIDER_KINDS.has(envCandidate) ? envCandidate : 'auto';
}
|
|
58
|
+
/**
 * Walk the chain returned by `defaultAutoChain()` in order and return the
 * first provider whose `isReady()` is true.
 *
 * A fresh provider instance is constructed for each chain entry as it is
 * reached; entries after the first ready provider are never constructed.
 *
 * @returns `{ requested: 'auto', provider }`, with `provider` set to
 *   `null` when no provider in the chain is ready.
 */
function autoSelect() {
    for (const kind of defaultAutoChain()) {
        let candidate = null;
        if (kind === 'llamacpp') {
            candidate = new LlamaCppProvider();
        }
        else if (kind === 'ollama') {
            candidate = new OllamaProvider();
        }
        // Chain entries with no matching branch leave `candidate` null and are skipped.
        if (candidate !== null && candidate.isReady()) {
            return { requested: 'auto', provider: candidate };
        }
    }
    return { requested: 'auto', provider: null };
}
|
|
73
|
+
/**
 * Ordered list of provider kinds tried by automatic selection.
 *
 * Only local providers appear here. Hosted providers (Gemini, Claude)
 * are intentionally absent — opting into a hosted API has to be explicit
 * via `--provider <name>` or `AI_PROVIDER=<name>`.
 *
 * @returns A new array on every call: `['llamacpp', 'ollama']`.
 */
function defaultAutoChain() {
    const localFirstChain = ['llamacpp', 'ollama'];
    return localFirstChain;
}
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@shrkcrft/ai",
|
|
3
|
-
"version": "0.1.0-alpha.
|
|
4
|
-
"description": "SharkCraft
|
|
3
|
+
"version": "0.1.0-alpha.12",
|
|
4
|
+
"description": "SharkCraft local LLM provider abstraction: Ollama (HTTP) + llama.cpp (in-process) + multi-pass enhancement pipeline.",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"author": "SharkCraft contributors",
|
|
7
7
|
"type": "module",
|
|
@@ -43,8 +43,9 @@
|
|
|
43
43
|
"typecheck": "tsc --noEmit -p tsconfig.json"
|
|
44
44
|
},
|
|
45
45
|
"dependencies": {
|
|
46
|
-
"@shrkcrft/core": "^0.1.0-alpha.
|
|
47
|
-
"@shrkcrft/context": "^0.1.0-alpha.
|
|
46
|
+
"@shrkcrft/core": "^0.1.0-alpha.12",
|
|
47
|
+
"@shrkcrft/context": "^0.1.0-alpha.12",
|
|
48
|
+
"node-llama-cpp": "^3.16.0"
|
|
48
49
|
},
|
|
49
50
|
"publishConfig": {
|
|
50
51
|
"access": "public"
|