@shipworthy/ai-sdk-llama-cpp 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -0
- package/LICENSE +21 -0
- package/README.md +274 -0
- package/dist/binding-bun.d.ts +7 -0
- package/dist/binding-bun.d.ts.map +1 -0
- package/dist/binding-bun.js +354 -0
- package/dist/binding-bun.js.map +1 -0
- package/dist/binding-node.d.ts +7 -0
- package/dist/binding-node.d.ts.map +1 -0
- package/dist/binding-node.js +59 -0
- package/dist/binding-node.js.map +1 -0
- package/dist/binding.d.ts +67 -0
- package/dist/binding.d.ts.map +1 -0
- package/dist/binding.js +105 -0
- package/dist/binding.js.map +1 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +8 -0
- package/dist/index.js.map +1 -0
- package/dist/llama-cpp-embedding-model.d.ts +28 -0
- package/dist/llama-cpp-embedding-model.d.ts.map +1 -0
- package/dist/llama-cpp-embedding-model.js +78 -0
- package/dist/llama-cpp-embedding-model.js.map +1 -0
- package/dist/llama-cpp-language-model.d.ts +55 -0
- package/dist/llama-cpp-language-model.d.ts.map +1 -0
- package/dist/llama-cpp-language-model.js +221 -0
- package/dist/llama-cpp-language-model.js.map +1 -0
- package/dist/llama-cpp-provider.d.ts +82 -0
- package/dist/llama-cpp-provider.d.ts.map +1 -0
- package/dist/llama-cpp-provider.js +71 -0
- package/dist/llama-cpp-provider.js.map +1 -0
- package/dist/native-binding.d.ts +51 -0
- package/dist/native-binding.d.ts.map +1 -0
- package/dist/native-binding.js +74 -0
- package/dist/native-binding.js.map +1 -0
- package/native/CMakeLists.txt +74 -0
- package/native/binding.cpp +522 -0
- package/native/llama-wrapper.cpp +519 -0
- package/native/llama-wrapper.h +131 -0
- package/package.json +79 -0
- package/scripts/postinstall.cjs +74 -0
package/CMakeLists.txt
ADDED
package/LICENSE
ADDED
MIT License

Copyright (c) 2025 Lars Grammel

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

package/README.md
ADDED
# ai-sdk-llama-cpp

> **Alpha Software** - This package is in early development. The API may change between versions without notice.

> **macOS Only** - This package currently only supports macOS with Apple Silicon or Intel processors.

A minimal [llama.cpp](https://github.com/ggerganov/llama.cpp) provider for the [Vercel AI SDK](https://sdk.vercel.ai/), implementing the `LanguageModelV3` interface.

This package loads llama.cpp directly into Node.js memory via native C++ bindings, enabling local LLM inference without requiring an external server.

## Features

- **Native Performance**: Direct C++ bindings using node-addon-api (N-API)
- **GPU Acceleration**: Automatic Metal support on macOS
- **Streaming & Non-streaming**: Full support for both `generateText` and `streamText`
- **Chat Templates**: Automatic or configurable chat template formatting (llama3, chatml, gemma, etc.)
- **ESM Only**: Modern ECMAScript modules, no CommonJS
- **GGUF Support**: Load any GGUF-format model

## Prerequisites

Before installing, ensure you have the following:

- **macOS** (Apple Silicon or Intel)
- **Node.js** >= 18.0.0
- **CMake** >= 3.15
- **Xcode Command Line Tools**

```bash
# Install Xcode Command Line Tools (includes Clang)
xcode-select --install

# Install CMake via Homebrew
brew install cmake
```

## Installation

```bash
npm install ai-sdk-llama-cpp
```

The installation will automatically:

1. Detect macOS and verify platform compatibility
2. Compile llama.cpp as a static library with Metal support
3. Build the native Node.js addon

> **Note**: Installation on Windows or Linux will fail with an error. Only macOS is supported.

## Usage

### Basic Example

```typescript
import { generateText } from "ai";
import { llamaCpp } from "ai-sdk-llama-cpp";

const model = llamaCpp({
  modelPath: "./models/llama-3.2-1b-instruct.Q4_K_M.gguf",
});

try {
  const { text } = await generateText({
    model,
    prompt: "Explain quantum computing in simple terms.",
  });

  console.log(text);
} finally {
  model.dispose();
}
```

### Streaming Example

```typescript
import { streamText } from "ai";
import { llamaCpp } from "ai-sdk-llama-cpp";

const model = llamaCpp({
  modelPath: "./models/llama-3.2-1b-instruct.Q4_K_M.gguf",
});

try {
  const { textStream } = await streamText({
    model,
    prompt: "Write a haiku about programming.",
  });

  for await (const chunk of textStream) {
    process.stdout.write(chunk);
  }
} finally {
  model.dispose();
}
```

### Embedding Example

```typescript
import { embed, embedMany } from "ai";
import { llamaCpp } from "ai-sdk-llama-cpp";

const model = llamaCpp.embedding({
  modelPath: "./models/nomic-embed-text-v1.5.Q4_K_M.gguf",
});

try {
  const { embedding } = await embed({
    model,
    value: "Hello, world!",
  });

  const { embeddings } = await embedMany({
    model,
    values: ["Hello, world!", "Hello, ▲!"],
  });
} finally {
  model.dispose();
}
```

### Configuration Options

```typescript
const model = llamaCpp({
  // Required: Path to the GGUF model file
  modelPath: "./models/your-model.gguf",

  // Optional: Maximum context size (default: 2048)
  contextSize: 4096,

  // Optional: Number of layers to offload to GPU
  // Default: 99 (all layers). Set to 0 to disable GPU.
  gpuLayers: 99,

  // Optional: Number of CPU threads (default: 4)
  threads: 8,

  // Optional: Enable verbose debug output from llama.cpp (default: false)
  debug: true,

  // Optional: Chat template to use for formatting messages
  // - "auto" (default): Use the template embedded in the GGUF model file
  // - Template name: Use a specific built-in template (e.g., "llama3", "chatml", "gemma")
  chatTemplate: "auto",
});
```

#### Chat Templates

The `chatTemplate` option controls how messages are formatted before being sent to the model. Available templates include:

- `chatml`, `llama2`, `llama2-sys`, `llama3`, `llama4`
- `mistral-v1`, `mistral-v3`, `mistral-v7`
- `phi3`, `phi4`, `gemma`, `falcon3`, `zephyr`
- `deepseek`, `deepseek2`, `deepseek3`, `command-r`
- And more (see llama.cpp documentation for the full list)
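
For example, to force one of the built-in templates instead of the template embedded in the GGUF file, a minimal sketch reusing the provider API shown above (the model path is a placeholder):

```typescript
import { generateText } from "ai";
import { llamaCpp } from "ai-sdk-llama-cpp";

// Use ChatML formatting regardless of which template the GGUF file embeds.
const model = llamaCpp({
  modelPath: "./models/your-model.gguf",
  chatTemplate: "chatml",
});

try {
  const { text } = await generateText({ model, prompt: "Hello!" });
  console.log(text);
} finally {
  model.dispose();
}
```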

### Generation Parameters

The standard AI SDK generation parameters are supported:

```typescript
try {
  const { text } = await generateText({
    model,
    prompt: "Hello!",
    maxTokens: 256, // Maximum tokens to generate
    temperature: 0.7, // Sampling temperature (0-2)
    topP: 0.9, // Nucleus sampling threshold
    topK: 40, // Top-k sampling
    stopSequences: ["\n"], // Stop generation at these sequences
  });
} finally {
  model.dispose();
}
```

## Model Downloads

You'll need to download GGUF-format models separately. Popular sources:

- [Hugging Face](https://huggingface.co/models?search=gguf) - Search for GGUF models
- [TheBloke's Models](https://huggingface.co/TheBloke) - Popular quantized models

Example download:

```bash
# Create models directory
mkdir -p models

# Download a model (example: Llama 3.2 1B)
wget -P models/ https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf
```

## API Reference

### `llamaCpp(config)`

Creates a new llama.cpp language model instance.

**Parameters:**

- `config.modelPath` (string, required): Path to the GGUF model file
- `config.contextSize` (number, optional): Maximum context size. Default: 2048
- `config.gpuLayers` (number, optional): GPU layers to offload. Default: 99
- `config.threads` (number, optional): CPU threads. Default: 4
- `config.debug` (boolean, optional): Enable verbose llama.cpp output. Default: false
- `config.chatTemplate` (string, optional): Chat template to use for formatting messages. Default: "auto"

**Returns:** `LlamaCppLanguageModel` - A language model compatible with the Vercel AI SDK
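
The embedding factory used in the Embedding Example above is created the same way; a minimal sketch (only `modelPath` is demonstrated for it in this README, so no other options are assumed):

```typescript
import { embed } from "ai";
import { llamaCpp } from "ai-sdk-llama-cpp";

// Embedding models come from the llamaCpp.embedding factory.
const embeddingModel = llamaCpp.embedding({
  modelPath: "./models/nomic-embed-text-v1.5.Q4_K_M.gguf",
});

try {
  const { embedding } = await embed({
    model: embeddingModel,
    value: "Hello, world!",
  });
  console.log(embedding.length);
} finally {
  embeddingModel.dispose();
}
```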

### `LlamaCppLanguageModel`

Implements the `LanguageModelV3` interface from `@ai-sdk/provider`.

**Methods:**

- `doGenerate(options)`: Non-streaming text generation
- `doStream(options)`: Streaming text generation
- `dispose()`: Unload the model and free GPU/CPU resources. **Always call this when done** to prevent memory leaks, especially when loading multiple models
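
For example, when loading several models in one process, disposing each model before loading the next keeps memory bounded; a minimal sketch using only the provider API documented above (the model paths are placeholders):

```typescript
import { generateText } from "ai";
import { llamaCpp } from "ai-sdk-llama-cpp";

const modelPaths = [
  "./models/llama-3.2-1b-instruct.Q4_K_M.gguf",
  "./models/another-model.Q4_K_M.gguf",
];

for (const modelPath of modelPaths) {
  const model = llamaCpp({ modelPath });
  try {
    const { text } = await generateText({ model, prompt: "Say hello." });
    console.log(`${modelPath}: ${text}`);
  } finally {
    // Free GPU/CPU resources before the next model is loaded.
    model.dispose();
  }
}
```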

## Limitations

This is a minimal implementation with the following limitations:

- **macOS only**: Windows and Linux are not supported
- **No tool/function calling**: Tool calls are not supported
- **No image inputs**: Only text prompts are supported
- **No JSON mode**: Structured output generation is not supported

## Development

### Building from Source

```bash
# Clone the repository
git clone https://github.com/lgrammel/ai-sdk-llama-cpp.git
cd ai-sdk-llama-cpp

# Initialize submodules
git submodule update --init --recursive

# Install dependencies
npm install

# Build the native addon and TypeScript
npm run build
```

### Scripts

- `npm run build` - Build everything (native + TypeScript)
- `npm run build:native` - Build only the native addon
- `npm run build:ts` - Build only TypeScript
- `npm run clean` - Remove build artifacts
- `npm run test` - Run tests in watch mode
- `npm run test:run` - Run all tests once
- `npm run test:unit` - Run unit tests
- `npm run test:integration` - Run integration tests
- `npm run test:e2e` - Run end-to-end tests
- `npm run test:coverage` - Run tests with coverage

## License

MIT

## Acknowledgments

- [llama.cpp](https://github.com/ggerganov/llama.cpp) - The underlying inference engine
- [Vercel AI SDK](https://sdk.vercel.ai/) - The AI SDK framework
- [node-addon-api](https://github.com/nodejs/node-addon-api) - N-API C++ wrapper

package/dist/binding-bun.d.ts
ADDED
import type { LoadModelOptions, GenerateOptions, GenerateResult } from './binding.js';
export declare function loadModel(options: LoadModelOptions): Promise<number>;
export declare function unloadModel(handle: number): boolean;
export declare function isModelLoaded(handle: number): boolean;
export declare function generate(handle: number, options: GenerateOptions): Promise<GenerateResult>;
export declare function generateStream(handle: number, options: GenerateOptions, onToken: (token: string) => void): Promise<GenerateResult>;
//# sourceMappingURL=binding-bun.d.ts.map
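
For orientation, a brief sketch of how these exported low-level functions compose; the import path, model path, and message shape are illustrative assumptions, and the provider's public entry point remains the `llamaCpp()` factory from the README:

```typescript
// Hypothetical deep import; these functions are declared in dist/binding-bun.d.ts.
import { loadModel, generate, unloadModel } from "./binding-bun.js";

// loadModel resolves to a numeric handle for the loaded GGUF model.
const handle = await loadModel({
  modelPath: "./models/llama-3.2-1b-instruct.Q4_K_M.gguf",
  contextSize: 2048,
});

try {
  // generate takes chat messages plus sampling options and resolves to a result
  // with text, token counts, and a finish reason.
  const result = await generate(handle, {
    messages: [{ role: "user", content: "Hello!" }],
    maxTokens: 64,
  });
  console.log(result.text, result.finishReason);
} finally {
  // unloadModel frees the native model associated with the handle.
  unloadModel(handle);
}
```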

package/dist/binding-bun.d.ts.map
ADDED
{"version":3,"file":"binding-bun.d.ts","sourceRoot":"","sources":["../src/binding-bun.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,gBAAgB,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AA2ItF,wBAAgB,SAAS,CAAC,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,MAAM,CAAC,CAqDpE;AAED,wBAAgB,WAAW,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAEnD;AAED,wBAAgB,aAAa,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAErD;AA6FD,wBAAgB,QAAQ,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,GAAG,OAAO,CAAC,cAAc,CAAC,CA2C1F;AAED,wBAAgB,cAAc,CAC5B,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,eAAe,EACxB,OAAO,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,IAAI,GAC/B,OAAO,CAAC,cAAc,CAAC,CA4DzB"}

package/dist/binding-bun.js
ADDED
import { dlopen, FFIType, suffix, ptr, CString } from "bun:ffi";
import { join, dirname } from "node:path";
import { fileURLToPath } from "node:url";
const __dirname = dirname(fileURLToPath(import.meta.url));
// Resolve the path to the shared library
const libPath = join(__dirname, "..", "build", "Release", `libllama_ffi.${suffix}`);
// Define FFI symbols for the C API
const lib = dlopen(libPath, {
  llama_load_model: {
    args: [FFIType.ptr],
    returns: FFIType.i32,
  },
  llama_unload_model: {
    args: [FFIType.i32],
    returns: FFIType.bool,
  },
  llama_is_model_loaded: {
    args: [FFIType.i32],
    returns: FFIType.bool,
  },
  llama_generate: {
    args: [FFIType.i32, FFIType.ptr],
    returns: FFIType.ptr,
  },
  llama_generate_stream: {
    args: [FFIType.i32, FFIType.ptr, FFIType.ptr, FFIType.ptr],
    returns: FFIType.ptr,
  },
  llama_free_result: {
    args: [FFIType.ptr],
    returns: FFIType.void,
  },
  llama_get_last_error: {
    args: [],
    returns: FFIType.ptr,
  },
  llama_clear_error: {
    args: [],
    returns: FFIType.void,
  },
});
// Workaround for Bun FFI toArrayBuffer bug: https://github.com/oven-sh/bun/issues/23656
// toArrayBuffer() always returns incorrect byteLength, so we use libc memcpy instead
const libc = dlopen(process.platform === "darwin" ? "libc.dylib" : "libc.so.6", {
  memcpy: {
    args: [FFIType.ptr, FFIType.ptr, FFIType.u64],
    returns: FFIType.ptr,
  },
});
/**
 * Copy memory from a native pointer to a JavaScript ArrayBuffer.
 * This is a workaround for the Bun FFI toArrayBuffer bug.
 * See: https://github.com/oven-sh/bun/issues/23656
 */
function copyFromCPtr(src, len) {
  const out = new Uint8Array(len >>> 0);
  libc.symbols.memcpy(ptr(out), src, len >>> 0);
  return out.buffer;
}
// Helper to encode string to null-terminated buffer
function encodeString(str) {
  const encoder = new TextEncoder();
  const encoded = encoder.encode(str);
  const buffer = new Uint8Array(encoded.length + 1);
  buffer.set(encoded);
  buffer[encoded.length] = 0;
  return buffer;
}
// Helper to read a null-terminated string from a pointer
function readCString(pointer) {
  const ptrValue = typeof pointer === "number" ? pointer : Number(pointer);
  if (ptrValue === 0)
    return null;
  return new CString(pointer).toString();
}
// Structure sizes and offsets (for 64-bit systems with proper alignment)
// llama_load_options_t layout:
//   const char* model_path;    // offset 0, size 8
//   int32_t gpu_layers;        // offset 8, size 4
//   int32_t context_size;      // offset 12, size 4
//   int32_t threads;           // offset 16, size 4
//   bool debug;                // offset 20, size 1
//   padding                    // offset 21, size 3
//   const char* chat_template; // offset 24, size 8
// Total: 32 bytes
const LOAD_OPTIONS_SIZE = 32;
// llama_generate_result_t layout:
//   char* text;                // offset 0, size 8
//   int32_t prompt_tokens;     // offset 8, size 4
//   int32_t completion_tokens; // offset 12, size 4
//   char* finish_reason;       // offset 16, size 8
//   char* error;               // offset 24, size 8
// Total: 32 bytes
const RESULT_TEXT_OFFSET = 0;
const RESULT_PROMPT_TOKENS_OFFSET = 8;
const RESULT_COMPLETION_TOKENS_OFFSET = 12;
const RESULT_FINISH_REASON_OFFSET = 16;
const RESULT_ERROR_OFFSET = 24;
// llama_chat_message_t layout:
//   const char* role;    // offset 0, size 8
//   const char* content; // offset 8, size 8
// Total: 16 bytes
const MESSAGE_SIZE = 16;
// llama_generate_options_t layout:
//   llama_chat_message_t* messages; // offset 0, size 8
//   size_t message_count;           // offset 8, size 8
//   int32_t max_tokens;             // offset 16, size 4
//   float temperature;              // offset 20, size 4
//   float top_p;                    // offset 24, size 4
//   int32_t top_k;                  // offset 28, size 4
//   const char** stop_sequences;    // offset 32, size 8
//   size_t stop_sequence_count;     // offset 40, size 8
// Total: 48 bytes
const GENERATE_OPTIONS_SIZE = 48;
// Keep references to buffers to prevent GC
const bufferRefs = new Map();
let nextRefId = 1;
function keepAlive(id, refs) {
  bufferRefs.set(id, refs);
}
function releaseRefs(id) {
  bufferRefs.delete(id);
}
export function loadModel(options) {
  return new Promise((resolve, reject) => {
    const refId = nextRefId++;
    const refs = [];
    try {
      // Create load options structure
      const optionsBuffer = new ArrayBuffer(LOAD_OPTIONS_SIZE);
      const optionsView = new DataView(optionsBuffer);
      // Encode strings
      const modelPathBuffer = encodeString(options.modelPath);
      refs.push(modelPathBuffer);
      const chatTemplateBuffer = encodeString(options.chatTemplate ?? "auto");
      refs.push(chatTemplateBuffer);
      const modelPathPtr = ptr(modelPathBuffer);
      const chatTemplatePtr = ptr(chatTemplateBuffer);
      // Set pointers and values
      optionsView.setBigUint64(0, BigInt(modelPathPtr), true); // model_path
      optionsView.setInt32(8, options.gpuLayers ?? 99, true); // gpu_layers
      optionsView.setInt32(12, options.contextSize ?? 2048, true); // context_size
      optionsView.setInt32(16, options.threads ?? 4, true); // threads
      optionsView.setUint8(20, options.debug ? 1 : 0); // debug
      optionsView.setBigUint64(24, BigInt(chatTemplatePtr), true); // chat_template
      refs.push(optionsBuffer);
      keepAlive(refId, refs);
      // Create a Uint8Array view that we keep a reference to
      const optionsArray = new Uint8Array(optionsBuffer);
      refs.push(optionsArray);
      // Call the native function
      const handle = lib.symbols.llama_load_model(ptr(optionsArray));
      releaseRefs(refId);
      if (handle < 0) {
        const errorPtr = lib.symbols.llama_get_last_error();
        const error = !isNullPtr(errorPtr) ? readCString(errorPtr) : "Failed to load model";
        lib.symbols.llama_clear_error();
        reject(new Error(error ?? "Failed to load model"));
      }
      else {
        resolve(handle);
      }
    }
    catch (error) {
      releaseRefs(refId);
      reject(error);
    }
  });
}
export function unloadModel(handle) {
  return lib.symbols.llama_unload_model(handle);
}
export function isModelLoaded(handle) {
  return lib.symbols.llama_is_model_loaded(handle);
}
function createGenerateOptions(options) {
  const refs = [];
  // Create message array
  const messagesBuffer = new ArrayBuffer(MESSAGE_SIZE * options.messages.length);
  const messagesView = new DataView(messagesBuffer);
  refs.push(messagesBuffer);
  for (let i = 0; i < options.messages.length; i++) {
    const roleBuffer = encodeString(options.messages[i].role);
    const contentBuffer = encodeString(options.messages[i].content);
    refs.push(roleBuffer, contentBuffer);
    const offset = i * MESSAGE_SIZE;
    messagesView.setBigUint64(offset, BigInt(ptr(roleBuffer)), true);
    messagesView.setBigUint64(offset + 8, BigInt(ptr(contentBuffer)), true);
  }
  // Create stop sequences array if provided
  let stopSeqPtr = 0n;
  if (options.stopSequences && options.stopSequences.length > 0) {
    const stopSeqPtrBuffer = new ArrayBuffer(8 * options.stopSequences.length);
    const stopSeqPtrView = new DataView(stopSeqPtrBuffer);
    refs.push(stopSeqPtrBuffer);
    for (let i = 0; i < options.stopSequences.length; i++) {
      const seqBuffer = encodeString(options.stopSequences[i]);
      refs.push(seqBuffer);
      stopSeqPtrView.setBigUint64(i * 8, BigInt(ptr(seqBuffer)), true);
    }
    stopSeqPtr = BigInt(ptr(new Uint8Array(stopSeqPtrBuffer)));
  }
  // Create generate options structure
  const optionsBuffer = new ArrayBuffer(GENERATE_OPTIONS_SIZE);
  const optionsView = new DataView(optionsBuffer);
  refs.push(optionsBuffer);
  optionsView.setBigUint64(0, BigInt(ptr(new Uint8Array(messagesBuffer))), true); // messages
  optionsView.setBigUint64(8, BigInt(options.messages.length), true); // message_count
  optionsView.setInt32(16, options.maxTokens ?? 256, true); // max_tokens
  optionsView.setFloat32(20, options.temperature ?? 0.7, true); // temperature
  optionsView.setFloat32(24, options.topP ?? 0.9, true); // top_p
  optionsView.setInt32(28, options.topK ?? 40, true); // top_k
  optionsView.setBigUint64(32, stopSeqPtr, true); // stop_sequences
  optionsView.setBigUint64(40, BigInt(options.stopSequences?.length ?? 0), true); // stop_sequence_count
  return { buffer: optionsBuffer, refs };
}
// Helper to check if a pointer is null (can be null, 0, or 0n)
function isNullPtr(ptr) {
  if (ptr === null)
    return true;
  if (typeof ptr === "number")
    return ptr === 0;
  if (typeof ptr === "bigint")
    return ptr === 0n;
  // For Pointer type, convert to number and check
  return Number(ptr) === 0;
}
function parseResult(resultPtr) {
  // Get the numeric value of the pointer for null check
  const ptrValue = typeof resultPtr === "number" ? resultPtr : Number(resultPtr);
  if (ptrValue === 0) {
    throw new Error("Generation failed: null result pointer");
  }
  const buffer = copyFromCPtr(resultPtr, 32);
  const view = new DataView(buffer);
  const textPtr = Number(view.getBigUint64(RESULT_TEXT_OFFSET, true));
  const promptTokens = view.getInt32(RESULT_PROMPT_TOKENS_OFFSET, true);
  const completionTokens = view.getInt32(RESULT_COMPLETION_TOKENS_OFFSET, true);
  const finishReasonPtr = Number(view.getBigUint64(RESULT_FINISH_REASON_OFFSET, true));
  const errorPtr = Number(view.getBigUint64(RESULT_ERROR_OFFSET, true));
  const text = textPtr ? readCString(textPtr) ?? "" : "";
  const finishReason = finishReasonPtr ? readCString(finishReasonPtr) ?? "error" : "error";
  const error = errorPtr ? readCString(errorPtr) : null;
  if (error) {
    throw new Error(error);
  }
  return {
    text,
    promptTokens,
    completionTokens,
    finishReason: finishReason,
  };
}
export function generate(handle, options) {
  return new Promise((resolve, reject) => {
    const refId = nextRefId++;
    try {
      const { buffer, refs } = createGenerateOptions(options);
      keepAlive(refId, refs);
      const resultPtr = lib.symbols.llama_generate(handle, ptr(new Uint8Array(buffer)));
      if (isNullPtr(resultPtr)) {
        releaseRefs(refId);
        const errorPtr = lib.symbols.llama_get_last_error();
        const error = !isNullPtr(errorPtr) ? readCString(errorPtr) : "Generation failed: null result";
        lib.symbols.llama_clear_error();
        reject(new Error(error ?? "Generation failed: null result"));
        return;
      }
      try {
        const result = parseResult(resultPtr);
        resolve(result);
      }
      catch (parseError) {
        // If parsing fails, check for library error before re-throwing
        const errorPtr = lib.symbols.llama_get_last_error();
        if (!isNullPtr(errorPtr)) {
          const libError = readCString(errorPtr);
          lib.symbols.llama_clear_error();
          if (libError) {
            reject(new Error(libError));
            return;
          }
        }
        throw parseError;
      }
      finally {
        lib.symbols.llama_free_result(resultPtr);
        releaseRefs(refId);
      }
    }
    catch (error) {
      releaseRefs(refId);
      reject(error);
    }
  });
}
export function generateStream(handle, options, onToken) {
  return new Promise((resolve, reject) => {
    const refId = nextRefId++;
    try {
      const { buffer, refs } = createGenerateOptions(options);
      keepAlive(refId, refs);
      // Collect tokens during generation
      // Note: Bun FFI callbacks are tricky, so we use a workaround
      // The C API will call the callback synchronously during generation
      // We store tokens and emit them after generation completes
      // For now, we use the non-streaming generate and emit all at once
      // A proper streaming implementation would require Bun's callback support
      const resultPtr = lib.symbols.llama_generate(handle, ptr(new Uint8Array(buffer)));
      if (isNullPtr(resultPtr)) {
        releaseRefs(refId);
        const errorPtr = lib.symbols.llama_get_last_error();
        const error = !isNullPtr(errorPtr) ? readCString(errorPtr) : "Generation failed: null result";
        lib.symbols.llama_clear_error();
        reject(new Error(error ?? "Generation failed: null result"));
        return;
      }
      try {
        const result = parseResult(resultPtr);
        // Emit the full text as pseudo-tokens (split on whitespace for now)
        // In a proper implementation, we'd use the streaming API with callbacks
        const words = result.text.split(/(\s+)/);
        for (const word of words) {
          if (word) {
            onToken(word);
          }
        }
        resolve(result);
      }
      catch (parseError) {
        // If parsing fails, check for library error before re-throwing
        const errorPtr = lib.symbols.llama_get_last_error();
        if (!isNullPtr(errorPtr)) {
          const libError = readCString(errorPtr);
          lib.symbols.llama_clear_error();
          if (libError) {
            reject(new Error(libError));
            return;
          }
        }
        throw parseError;
      }
      finally {
        lib.symbols.llama_free_result(resultPtr);
        releaseRefs(refId);
      }
    }
    catch (error) {
      releaseRefs(refId);
      reject(error);
    }
  });
}
//# sourceMappingURL=binding-bun.js.map