@shipworthy/ai-sdk-llama-cpp 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/CMakeLists.txt +6 -0
  2. package/LICENSE +21 -0
  3. package/README.md +274 -0
  4. package/dist/binding-bun.d.ts +7 -0
  5. package/dist/binding-bun.d.ts.map +1 -0
  6. package/dist/binding-bun.js +354 -0
  7. package/dist/binding-bun.js.map +1 -0
  8. package/dist/binding-node.d.ts +7 -0
  9. package/dist/binding-node.d.ts.map +1 -0
  10. package/dist/binding-node.js +59 -0
  11. package/dist/binding-node.js.map +1 -0
  12. package/dist/binding.d.ts +67 -0
  13. package/dist/binding.d.ts.map +1 -0
  14. package/dist/binding.js +105 -0
  15. package/dist/binding.js.map +1 -0
  16. package/dist/index.d.ts +5 -0
  17. package/dist/index.d.ts.map +1 -0
  18. package/dist/index.js +8 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/llama-cpp-embedding-model.d.ts +28 -0
  21. package/dist/llama-cpp-embedding-model.d.ts.map +1 -0
  22. package/dist/llama-cpp-embedding-model.js +78 -0
  23. package/dist/llama-cpp-embedding-model.js.map +1 -0
  24. package/dist/llama-cpp-language-model.d.ts +55 -0
  25. package/dist/llama-cpp-language-model.d.ts.map +1 -0
  26. package/dist/llama-cpp-language-model.js +221 -0
  27. package/dist/llama-cpp-language-model.js.map +1 -0
  28. package/dist/llama-cpp-provider.d.ts +82 -0
  29. package/dist/llama-cpp-provider.d.ts.map +1 -0
  30. package/dist/llama-cpp-provider.js +71 -0
  31. package/dist/llama-cpp-provider.js.map +1 -0
  32. package/dist/native-binding.d.ts +51 -0
  33. package/dist/native-binding.d.ts.map +1 -0
  34. package/dist/native-binding.js +74 -0
  35. package/dist/native-binding.js.map +1 -0
  36. package/native/CMakeLists.txt +74 -0
  37. package/native/binding.cpp +522 -0
  38. package/native/llama-wrapper.cpp +519 -0
  39. package/native/llama-wrapper.h +131 -0
  40. package/package.json +79 -0
  41. package/scripts/postinstall.cjs +74 -0
package/CMakeLists.txt ADDED
@@ -0,0 +1,6 @@
+ cmake_minimum_required(VERSION 3.15)
+ project(llama_binding)
+
+ # Include the native build
+ add_subdirectory(native)
+
package/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Lars Grammel
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,274 @@
+ # ai-sdk-llama-cpp
+
+ > **Alpha Software** - This package is in early development. The API may change between versions without notice.
+
+ > **macOS Only** - This package currently only supports macOS with Apple Silicon or Intel processors.
+
+ A minimal [llama.cpp](https://github.com/ggerganov/llama.cpp) provider for the [Vercel AI SDK](https://sdk.vercel.ai/), implementing the `LanguageModelV3` interface.
+
+ This package loads llama.cpp directly into Node.js memory via native C++ bindings, enabling local LLM inference without requiring an external server.
+
+ ## Features
+
+ - **Native Performance**: Direct C++ bindings using node-addon-api (N-API)
+ - **GPU Acceleration**: Automatic Metal support on macOS
+ - **Streaming & Non-streaming**: Full support for both `generateText` and `streamText`
+ - **Chat Templates**: Automatic or configurable chat template formatting (llama3, chatml, gemma, etc.)
+ - **ESM Only**: Modern ECMAScript modules, no CommonJS
+ - **GGUF Support**: Load any GGUF-format model
+
+ ## Prerequisites
+
+ Before installing, ensure you have the following:
+
+ - **macOS** (Apple Silicon or Intel)
+ - **Node.js** >= 18.0.0
+ - **CMake** >= 3.15
+ - **Xcode Command Line Tools**
+
+ ```bash
+ # Install Xcode Command Line Tools (includes Clang)
+ xcode-select --install
+
+ # Install CMake via Homebrew
+ brew install cmake
+ ```
+
+ ## Installation
+
+ ```bash
+ npm install ai-sdk-llama-cpp
+ ```
+
+ The installation will automatically:
+
+ 1. Detect macOS and verify platform compatibility
+ 2. Compile llama.cpp as a static library with Metal support
+ 3. Build the native Node.js addon
+
+ > **Note**: Installation on Windows or Linux will fail with an error. Only macOS is supported.
+
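For illustration, here is a minimal TypeScript sketch of the kind of platform gate that install step 1 implies. The actual check lives in `package/scripts/postinstall.cjs` (included in this release) and may differ in detail; this is an assumption, not the script's contents.

```typescript
// Hypothetical sketch only: refuse to proceed on unsupported platforms.
// The real logic is in scripts/postinstall.cjs and is not reproduced here.
if (process.platform !== "darwin") {
  console.error(
    "ai-sdk-llama-cpp currently supports macOS only (Apple Silicon or Intel).",
  );
  process.exit(1);
}
```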
+ ## Usage
+
+ ### Basic Example
+
+ ```typescript
+ import { generateText } from "ai";
+ import { llamaCpp } from "ai-sdk-llama-cpp";
+
+ const model = llamaCpp({
+   modelPath: "./models/llama-3.2-1b-instruct.Q4_K_M.gguf",
+ });
+
+ try {
+   const { text } = await generateText({
+     model,
+     prompt: "Explain quantum computing in simple terms.",
+   });
+
+   console.log(text);
+ } finally {
+   model.dispose();
+ }
+ ```
+
+ ### Streaming Example
+
+ ```typescript
+ import { streamText } from "ai";
+ import { llamaCpp } from "ai-sdk-llama-cpp";
+
+ const model = llamaCpp({
+   modelPath: "./models/llama-3.2-1b-instruct.Q4_K_M.gguf",
+ });
+
+ try {
+   const { textStream } = await streamText({
+     model,
+     prompt: "Write a haiku about programming.",
+   });
+
+   for await (const chunk of textStream) {
+     process.stdout.write(chunk);
+   }
+ } finally {
+   model.dispose();
+ }
+ ```
+
+ ### Embedding Example
+
+ ```typescript
+ import { embed, embedMany } from "ai";
+ import { llamaCpp } from "ai-sdk-llama-cpp";
+
+ const model = llamaCpp.embedding({
+   modelPath: "./models/nomic-embed-text-v1.5.Q4_K_M.gguf",
+ });
+
+ try {
+   const { embedding } = await embed({
+     model,
+     value: "Hello, world!",
+   });
+
+   const { embeddings } = await embedMany({
+     model,
+     values: ["Hello, world!", "Hello, ▲!"],
+   });
+ } finally {
+   model.dispose();
+ }
+ ```
+
+ ### Configuration Options
+
+ ```typescript
+ const model = llamaCpp({
+   // Required: Path to the GGUF model file
+   modelPath: "./models/your-model.gguf",
+
+   // Optional: Maximum context size (default: 2048)
+   contextSize: 4096,
+
+   // Optional: Number of layers to offload to GPU
+   // Default: 99 (all layers). Set to 0 to disable GPU.
+   gpuLayers: 99,
+
+   // Optional: Number of CPU threads (default: 4)
+   threads: 8,
+
+   // Optional: Enable verbose debug output from llama.cpp (default: false)
+   debug: true,
+
+   // Optional: Chat template to use for formatting messages
+   // - "auto" (default): Use the template embedded in the GGUF model file
+   // - Template name: Use a specific built-in template (e.g., "llama3", "chatml", "gemma")
+   chatTemplate: "auto",
+ });
+ ```
+
+ #### Chat Templates
+
+ The `chatTemplate` option controls how messages are formatted before being sent to the model. Available templates include:
+
+ - `chatml`, `llama2`, `llama2-sys`, `llama3`, `llama4`
+ - `mistral-v1`, `mistral-v3`, `mistral-v7`
+ - `phi3`, `phi4`, `gemma`, `falcon3`, `zephyr`
+ - `deepseek`, `deepseek2`, `deepseek3`, `command-r`
+ - And more (see llama.cpp documentation for the full list)
+
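To override the template embedded in the model rather than relying on `"auto"`, pass one of the names above explicitly. A brief sketch (the model path is illustrative):

```typescript
import { llamaCpp } from "ai-sdk-llama-cpp";

// Force the built-in llama3 template instead of the one embedded in the GGUF file.
const model = llamaCpp({
  modelPath: "./models/llama-3.2-1b-instruct.Q4_K_M.gguf", // illustrative path
  chatTemplate: "llama3",
});
```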
+ ### Generation Parameters
+
+ The standard AI SDK generation parameters are supported:
+
+ ```typescript
+ try {
+   const { text } = await generateText({
+     model,
+     prompt: "Hello!",
+     maxTokens: 256, // Maximum tokens to generate
+     temperature: 0.7, // Sampling temperature (0-2)
+     topP: 0.9, // Nucleus sampling threshold
+     topK: 40, // Top-k sampling
+     stopSequences: ["\n"], // Stop generation at these sequences
+   });
+ } finally {
+   model.dispose();
+ }
+ ```
+
+ ## Model Downloads
+
+ You'll need to download GGUF-format models separately. Popular sources:
+
+ - [Hugging Face](https://huggingface.co/models?search=gguf) - Search for GGUF models
+ - [TheBloke's Models](https://huggingface.co/TheBloke) - Popular quantized models
+
+ Example download:
+
+ ```bash
+ # Create models directory
+ mkdir -p models
+
+ # Download a model (example: Llama 3.2 1B)
+ wget -P models/ https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf
+ ```
+
+ ## API Reference
+
+ ### `llamaCpp(config)`
+
+ Creates a new llama.cpp language model instance.
+
+ **Parameters:**
+
+ - `config.modelPath` (string, required): Path to the GGUF model file
+ - `config.contextSize` (number, optional): Maximum context size. Default: 2048
+ - `config.gpuLayers` (number, optional): GPU layers to offload. Default: 99
+ - `config.threads` (number, optional): CPU threads. Default: 4
+ - `config.debug` (boolean, optional): Enable verbose llama.cpp output. Default: false
+ - `config.chatTemplate` (string, optional): Chat template to use for formatting messages. Default: "auto"
+
+ **Returns:** `LlamaCppLanguageModel` - A language model compatible with the Vercel AI SDK
+
+ ### `LlamaCppLanguageModel`
+
+ Implements the `LanguageModelV3` interface from `@ai-sdk/provider`.
+
+ **Methods:**
+
+ - `doGenerate(options)`: Non-streaming text generation
+ - `doStream(options)`: Streaming text generation
+ - `dispose()`: Unload the model and free GPU/CPU resources. **Always call this when done** to prevent memory leaks, especially when loading multiple models
+
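To make the `dispose()` contract above concrete, here is a sketch that loads both a chat model and an embedding model and releases each one even if generation throws. The model paths are illustrative placeholders.

```typescript
import { generateText, embed } from "ai";
import { llamaCpp } from "ai-sdk-llama-cpp";

const chatModel = llamaCpp({ modelPath: "./models/chat-model.gguf" }); // illustrative path
const embeddingModel = llamaCpp.embedding({ modelPath: "./models/embed-model.gguf" }); // illustrative path

try {
  const { text } = await generateText({ model: chatModel, prompt: "Hello!" });
  const { embedding } = await embed({ model: embeddingModel, value: text });
  console.log(text, embedding.length);
} finally {
  // Free native GPU/CPU resources for every model that was loaded.
  chatModel.dispose();
  embeddingModel.dispose();
}
```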
+ ## Limitations
+
+ This is a minimal implementation with the following limitations:
+
+ - **macOS only**: Windows and Linux are not supported
+ - **No tool/function calling**: Tool calls are not supported
+ - **No image inputs**: Only text prompts are supported
+ - **No JSON mode**: Structured output generation is not supported
+
+ ## Development
+
+ ### Building from Source
+
+ ```bash
+ # Clone the repository
+ git clone https://github.com/lgrammel/ai-sdk-llama-cpp.git
+ cd ai-sdk-llama-cpp
+
+ # Initialize submodules
+ git submodule update --init --recursive
+
+ # Install dependencies
+ npm install
+
+ # Build the native addon and TypeScript
+ npm run build
+ ```
+
+ ### Scripts
+
+ - `npm run build` - Build everything (native + TypeScript)
+ - `npm run build:native` - Build only the native addon
+ - `npm run build:ts` - Build only TypeScript
+ - `npm run clean` - Remove build artifacts
+ - `npm run test` - Run tests in watch mode
+ - `npm run test:run` - Run all tests once
+ - `npm run test:unit` - Run unit tests
+ - `npm run test:integration` - Run integration tests
+ - `npm run test:e2e` - Run end-to-end tests
+ - `npm run test:coverage` - Run tests with coverage
+
+ ## License
+
+ MIT
+
+ ## Acknowledgments
+
+ - [llama.cpp](https://github.com/ggerganov/llama.cpp) - The underlying inference engine
+ - [Vercel AI SDK](https://sdk.vercel.ai/) - The AI SDK framework
+ - [node-addon-api](https://github.com/nodejs/node-addon-api) - N-API C++ wrapper
package/dist/binding-bun.d.ts ADDED
@@ -0,0 +1,7 @@
+ import type { LoadModelOptions, GenerateOptions, GenerateResult } from './binding.js';
+ export declare function loadModel(options: LoadModelOptions): Promise<number>;
+ export declare function unloadModel(handle: number): boolean;
+ export declare function isModelLoaded(handle: number): boolean;
+ export declare function generate(handle: number, options: GenerateOptions): Promise<GenerateResult>;
+ export declare function generateStream(handle: number, options: GenerateOptions, onToken: (token: string) => void): Promise<GenerateResult>;
+ //# sourceMappingURL=binding-bun.d.ts.map
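For orientation, a sketch of how these low-level binding functions fit together; application code would normally go through the `llamaCpp` provider instead. The model path and option values are illustrative assumptions; the option and result shapes follow the compiled `binding-bun.js` shown below.

```typescript
import { loadModel, generate, unloadModel } from "./binding-bun.js";

// Illustrative use of the internal binding API (not the public provider API).
const handle = await loadModel({
  modelPath: "./models/example.gguf", // illustrative path
  contextSize: 2048,
});
try {
  const result = await generate(handle, {
    messages: [{ role: "user", content: "Hello!" }],
    maxTokens: 64,
  });
  console.log(result.text, result.finishReason);
} finally {
  unloadModel(handle);
}
```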
package/dist/binding-bun.d.ts.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"binding-bun.d.ts","sourceRoot":"","sources":["../src/binding-bun.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,gBAAgB,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AA2ItF,wBAAgB,SAAS,CAAC,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,MAAM,CAAC,CAqDpE;AAED,wBAAgB,WAAW,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAEnD;AAED,wBAAgB,aAAa,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAErD;AA6FD,wBAAgB,QAAQ,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,eAAe,GAAG,OAAO,CAAC,cAAc,CAAC,CA2C1F;AAED,wBAAgB,cAAc,CAC5B,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,eAAe,EACxB,OAAO,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,IAAI,GAC/B,OAAO,CAAC,cAAc,CAAC,CA4DzB"}
package/dist/binding-bun.js ADDED
@@ -0,0 +1,354 @@
+ import { dlopen, FFIType, suffix, ptr, CString } from "bun:ffi";
+ import { join, dirname } from "node:path";
+ import { fileURLToPath } from "node:url";
+ const __dirname = dirname(fileURLToPath(import.meta.url));
+ // Resolve the path to the shared library
+ const libPath = join(__dirname, "..", "build", "Release", `libllama_ffi.${suffix}`);
+ // Define FFI symbols for the C API
+ const lib = dlopen(libPath, {
+     llama_load_model: {
+         args: [FFIType.ptr],
+         returns: FFIType.i32,
+     },
+     llama_unload_model: {
+         args: [FFIType.i32],
+         returns: FFIType.bool,
+     },
+     llama_is_model_loaded: {
+         args: [FFIType.i32],
+         returns: FFIType.bool,
+     },
+     llama_generate: {
+         args: [FFIType.i32, FFIType.ptr],
+         returns: FFIType.ptr,
+     },
+     llama_generate_stream: {
+         args: [FFIType.i32, FFIType.ptr, FFIType.ptr, FFIType.ptr],
+         returns: FFIType.ptr,
+     },
+     llama_free_result: {
+         args: [FFIType.ptr],
+         returns: FFIType.void,
+     },
+     llama_get_last_error: {
+         args: [],
+         returns: FFIType.ptr,
+     },
+     llama_clear_error: {
+         args: [],
+         returns: FFIType.void,
+     },
+ });
+ // Workaround for Bun FFI toArrayBuffer bug: https://github.com/oven-sh/bun/issues/23656
+ // toArrayBuffer() always returns incorrect byteLength, so we use libc memcpy instead
+ const libc = dlopen(process.platform === "darwin" ? "libc.dylib" : "libc.so.6", {
+     memcpy: {
+         args: [FFIType.ptr, FFIType.ptr, FFIType.u64],
+         returns: FFIType.ptr,
+     },
+ });
+ /**
+  * Copy memory from a native pointer to a JavaScript ArrayBuffer.
+  * This is a workaround for the Bun FFI toArrayBuffer bug.
+  * See: https://github.com/oven-sh/bun/issues/23656
+  */
+ function copyFromCPtr(src, len) {
+     const out = new Uint8Array(len >>> 0);
+     libc.symbols.memcpy(ptr(out), src, len >>> 0);
+     return out.buffer;
+ }
+ // Helper to encode string to null-terminated buffer
+ function encodeString(str) {
+     const encoder = new TextEncoder();
+     const encoded = encoder.encode(str);
+     const buffer = new Uint8Array(encoded.length + 1);
+     buffer.set(encoded);
+     buffer[encoded.length] = 0;
+     return buffer;
+ }
+ // Helper to read a null-terminated string from a pointer
+ function readCString(pointer) {
+     const ptrValue = typeof pointer === "number" ? pointer : Number(pointer);
+     if (ptrValue === 0)
+         return null;
+     return new CString(pointer).toString();
+ }
+ // Structure sizes and offsets (for 64-bit systems with proper alignment)
+ // llama_load_options_t layout:
+ // const char* model_path; // offset 0, size 8
+ // int32_t gpu_layers; // offset 8, size 4
+ // int32_t context_size; // offset 12, size 4
+ // int32_t threads; // offset 16, size 4
+ // bool debug; // offset 20, size 1
+ // padding // offset 21, size 3
+ // const char* chat_template; // offset 24, size 8
+ // Total: 32 bytes
+ const LOAD_OPTIONS_SIZE = 32;
+ // llama_generate_result_t layout:
+ // char* text; // offset 0, size 8
+ // int32_t prompt_tokens; // offset 8, size 4
+ // int32_t completion_tokens; // offset 12, size 4
+ // char* finish_reason; // offset 16, size 8
+ // char* error; // offset 24, size 8
+ // Total: 32 bytes
+ const RESULT_TEXT_OFFSET = 0;
+ const RESULT_PROMPT_TOKENS_OFFSET = 8;
+ const RESULT_COMPLETION_TOKENS_OFFSET = 12;
+ const RESULT_FINISH_REASON_OFFSET = 16;
+ const RESULT_ERROR_OFFSET = 24;
+ // llama_chat_message_t layout:
+ // const char* role; // offset 0, size 8
+ // const char* content; // offset 8, size 8
+ // Total: 16 bytes
+ const MESSAGE_SIZE = 16;
+ // llama_generate_options_t layout:
+ // llama_chat_message_t* messages; // offset 0, size 8
+ // size_t message_count; // offset 8, size 8
+ // int32_t max_tokens; // offset 16, size 4
+ // float temperature; // offset 20, size 4
+ // float top_p; // offset 24, size 4
+ // int32_t top_k; // offset 28, size 4
+ // const char** stop_sequences; // offset 32, size 8
+ // size_t stop_sequence_count; // offset 40, size 8
+ // Total: 48 bytes
+ const GENERATE_OPTIONS_SIZE = 48;
+ // Keep references to buffers to prevent GC
+ const bufferRefs = new Map();
+ let nextRefId = 1;
+ function keepAlive(id, refs) {
+     bufferRefs.set(id, refs);
+ }
+ function releaseRefs(id) {
+     bufferRefs.delete(id);
+ }
+ export function loadModel(options) {
+     return new Promise((resolve, reject) => {
+         const refId = nextRefId++;
+         const refs = [];
+         try {
+             // Create load options structure
+             const optionsBuffer = new ArrayBuffer(LOAD_OPTIONS_SIZE);
+             const optionsView = new DataView(optionsBuffer);
+             // Encode strings
+             const modelPathBuffer = encodeString(options.modelPath);
+             refs.push(modelPathBuffer);
+             const chatTemplateBuffer = encodeString(options.chatTemplate ?? "auto");
+             refs.push(chatTemplateBuffer);
+             const modelPathPtr = ptr(modelPathBuffer);
+             const chatTemplatePtr = ptr(chatTemplateBuffer);
+             // Set pointers and values
+             optionsView.setBigUint64(0, BigInt(modelPathPtr), true); // model_path
+             optionsView.setInt32(8, options.gpuLayers ?? 99, true); // gpu_layers
+             optionsView.setInt32(12, options.contextSize ?? 2048, true); // context_size
+             optionsView.setInt32(16, options.threads ?? 4, true); // threads
+             optionsView.setUint8(20, options.debug ? 1 : 0); // debug
+             optionsView.setBigUint64(24, BigInt(chatTemplatePtr), true); // chat_template
+             refs.push(optionsBuffer);
+             keepAlive(refId, refs);
+             // Create a Uint8Array view that we keep a reference to
+             const optionsArray = new Uint8Array(optionsBuffer);
+             refs.push(optionsArray);
+             // Call the native function
+             const handle = lib.symbols.llama_load_model(ptr(optionsArray));
+             releaseRefs(refId);
+             if (handle < 0) {
+                 const errorPtr = lib.symbols.llama_get_last_error();
+                 const error = !isNullPtr(errorPtr) ? readCString(errorPtr) : "Failed to load model";
+                 lib.symbols.llama_clear_error();
+                 reject(new Error(error ?? "Failed to load model"));
+             }
+             else {
+                 resolve(handle);
+             }
+         }
+         catch (error) {
+             releaseRefs(refId);
+             reject(error);
+         }
+     });
+ }
+ export function unloadModel(handle) {
+     return lib.symbols.llama_unload_model(handle);
+ }
+ export function isModelLoaded(handle) {
+     return lib.symbols.llama_is_model_loaded(handle);
+ }
+ function createGenerateOptions(options) {
+     const refs = [];
+     // Create message array
+     const messagesBuffer = new ArrayBuffer(MESSAGE_SIZE * options.messages.length);
+     const messagesView = new DataView(messagesBuffer);
+     refs.push(messagesBuffer);
+     for (let i = 0; i < options.messages.length; i++) {
+         const roleBuffer = encodeString(options.messages[i].role);
+         const contentBuffer = encodeString(options.messages[i].content);
+         refs.push(roleBuffer, contentBuffer);
+         const offset = i * MESSAGE_SIZE;
+         messagesView.setBigUint64(offset, BigInt(ptr(roleBuffer)), true);
+         messagesView.setBigUint64(offset + 8, BigInt(ptr(contentBuffer)), true);
+     }
+     // Create stop sequences array if provided
+     let stopSeqPtr = 0n;
+     if (options.stopSequences && options.stopSequences.length > 0) {
+         const stopSeqPtrBuffer = new ArrayBuffer(8 * options.stopSequences.length);
+         const stopSeqPtrView = new DataView(stopSeqPtrBuffer);
+         refs.push(stopSeqPtrBuffer);
+         for (let i = 0; i < options.stopSequences.length; i++) {
+             const seqBuffer = encodeString(options.stopSequences[i]);
+             refs.push(seqBuffer);
+             stopSeqPtrView.setBigUint64(i * 8, BigInt(ptr(seqBuffer)), true);
+         }
+         stopSeqPtr = BigInt(ptr(new Uint8Array(stopSeqPtrBuffer)));
+     }
+     // Create generate options structure
+     const optionsBuffer = new ArrayBuffer(GENERATE_OPTIONS_SIZE);
+     const optionsView = new DataView(optionsBuffer);
+     refs.push(optionsBuffer);
+     optionsView.setBigUint64(0, BigInt(ptr(new Uint8Array(messagesBuffer))), true); // messages
+     optionsView.setBigUint64(8, BigInt(options.messages.length), true); // message_count
+     optionsView.setInt32(16, options.maxTokens ?? 256, true); // max_tokens
+     optionsView.setFloat32(20, options.temperature ?? 0.7, true); // temperature
+     optionsView.setFloat32(24, options.topP ?? 0.9, true); // top_p
+     optionsView.setInt32(28, options.topK ?? 40, true); // top_k
+     optionsView.setBigUint64(32, stopSeqPtr, true); // stop_sequences
+     optionsView.setBigUint64(40, BigInt(options.stopSequences?.length ?? 0), true); // stop_sequence_count
+     return { buffer: optionsBuffer, refs };
+ }
+ // Helper to check if a pointer is null (can be null, 0, or 0n)
+ function isNullPtr(ptr) {
+     if (ptr === null)
+         return true;
+     if (typeof ptr === "number")
+         return ptr === 0;
+     if (typeof ptr === "bigint")
+         return ptr === 0n;
+     // For Pointer type, convert to number and check
+     return Number(ptr) === 0;
+ }
+ function parseResult(resultPtr) {
+     // Get the numeric value of the pointer for null check
+     const ptrValue = typeof resultPtr === "number" ? resultPtr : Number(resultPtr);
+     if (ptrValue === 0) {
+         throw new Error("Generation failed: null result pointer");
+     }
+     const buffer = copyFromCPtr(resultPtr, 32);
+     const view = new DataView(buffer);
+     const textPtr = Number(view.getBigUint64(RESULT_TEXT_OFFSET, true));
+     const promptTokens = view.getInt32(RESULT_PROMPT_TOKENS_OFFSET, true);
+     const completionTokens = view.getInt32(RESULT_COMPLETION_TOKENS_OFFSET, true);
+     const finishReasonPtr = Number(view.getBigUint64(RESULT_FINISH_REASON_OFFSET, true));
+     const errorPtr = Number(view.getBigUint64(RESULT_ERROR_OFFSET, true));
+     const text = textPtr ? readCString(textPtr) ?? "" : "";
+     const finishReason = finishReasonPtr ? readCString(finishReasonPtr) ?? "error" : "error";
+     const error = errorPtr ? readCString(errorPtr) : null;
+     if (error) {
+         throw new Error(error);
+     }
+     return {
+         text,
+         promptTokens,
+         completionTokens,
+         finishReason: finishReason,
+     };
+ }
+ export function generate(handle, options) {
+     return new Promise((resolve, reject) => {
+         const refId = nextRefId++;
+         try {
+             const { buffer, refs } = createGenerateOptions(options);
+             keepAlive(refId, refs);
+             const resultPtr = lib.symbols.llama_generate(handle, ptr(new Uint8Array(buffer)));
+             if (isNullPtr(resultPtr)) {
+                 releaseRefs(refId);
+                 const errorPtr = lib.symbols.llama_get_last_error();
+                 const error = !isNullPtr(errorPtr) ? readCString(errorPtr) : "Generation failed: null result";
+                 lib.symbols.llama_clear_error();
+                 reject(new Error(error ?? "Generation failed: null result"));
+                 return;
+             }
+             try {
+                 const result = parseResult(resultPtr);
+                 resolve(result);
+             }
+             catch (parseError) {
+                 // If parsing fails, check for library error before re-throwing
+                 const errorPtr = lib.symbols.llama_get_last_error();
+                 if (!isNullPtr(errorPtr)) {
+                     const libError = readCString(errorPtr);
+                     lib.symbols.llama_clear_error();
+                     if (libError) {
+                         reject(new Error(libError));
+                         return;
+                     }
+                 }
+                 throw parseError;
+             }
+             finally {
+                 lib.symbols.llama_free_result(resultPtr);
+                 releaseRefs(refId);
+             }
+         }
+         catch (error) {
+             releaseRefs(refId);
+             reject(error);
+         }
+     });
+ }
+ export function generateStream(handle, options, onToken) {
+     return new Promise((resolve, reject) => {
+         const refId = nextRefId++;
+         try {
+             const { buffer, refs } = createGenerateOptions(options);
+             keepAlive(refId, refs);
+             // Collect tokens during generation
+             // Note: Bun FFI callbacks are tricky, so we use a workaround
+             // The C API will call the callback synchronously during generation
+             // We store tokens and emit them after generation completes
+             // For now, we use the non-streaming generate and emit all at once
+             // A proper streaming implementation would require Bun's callback support
+             const resultPtr = lib.symbols.llama_generate(handle, ptr(new Uint8Array(buffer)));
+             if (isNullPtr(resultPtr)) {
+                 releaseRefs(refId);
+                 const errorPtr = lib.symbols.llama_get_last_error();
+                 const error = !isNullPtr(errorPtr) ? readCString(errorPtr) : "Generation failed: null result";
+                 lib.symbols.llama_clear_error();
+                 reject(new Error(error ?? "Generation failed: null result"));
+                 return;
+             }
+             try {
+                 const result = parseResult(resultPtr);
+                 // Emit the full text in whitespace-delimited chunks (not true token-by-token streaming)
+                 // In a proper implementation, we'd use the streaming API with callbacks
+                 const words = result.text.split(/(\s+)/);
+                 for (const word of words) {
+                     if (word) {
+                         onToken(word);
+                     }
+                 }
+                 resolve(result);
+             }
+             catch (parseError) {
+                 // If parsing fails, check for library error before re-throwing
+                 const errorPtr = lib.symbols.llama_get_last_error();
+                 if (!isNullPtr(errorPtr)) {
+                     const libError = readCString(errorPtr);
+                     lib.symbols.llama_clear_error();
+                     if (libError) {
+                         reject(new Error(libError));
+                         return;
+                     }
+                 }
+                 throw parseError;
+             }
+             finally {
+                 lib.symbols.llama_free_result(resultPtr);
+                 releaseRefs(refId);
+             }
+         }
+         catch (error) {
+             releaseRefs(refId);
+             reject(error);
+         }
+     });
+ }
+ //# sourceMappingURL=binding-bun.js.map