@shipworthy/ai-sdk-llama-cpp 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/CMakeLists.txt +6 -0
  2. package/LICENSE +21 -0
  3. package/README.md +274 -0
  4. package/dist/binding-bun.d.ts +7 -0
  5. package/dist/binding-bun.d.ts.map +1 -0
  6. package/dist/binding-bun.js +354 -0
  7. package/dist/binding-bun.js.map +1 -0
  8. package/dist/binding-node.d.ts +7 -0
  9. package/dist/binding-node.d.ts.map +1 -0
  10. package/dist/binding-node.js +59 -0
  11. package/dist/binding-node.js.map +1 -0
  12. package/dist/binding.d.ts +67 -0
  13. package/dist/binding.d.ts.map +1 -0
  14. package/dist/binding.js +105 -0
  15. package/dist/binding.js.map +1 -0
  16. package/dist/index.d.ts +5 -0
  17. package/dist/index.d.ts.map +1 -0
  18. package/dist/index.js +8 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/llama-cpp-embedding-model.d.ts +28 -0
  21. package/dist/llama-cpp-embedding-model.d.ts.map +1 -0
  22. package/dist/llama-cpp-embedding-model.js +78 -0
  23. package/dist/llama-cpp-embedding-model.js.map +1 -0
  24. package/dist/llama-cpp-language-model.d.ts +55 -0
  25. package/dist/llama-cpp-language-model.d.ts.map +1 -0
  26. package/dist/llama-cpp-language-model.js +221 -0
  27. package/dist/llama-cpp-language-model.js.map +1 -0
  28. package/dist/llama-cpp-provider.d.ts +82 -0
  29. package/dist/llama-cpp-provider.d.ts.map +1 -0
  30. package/dist/llama-cpp-provider.js +71 -0
  31. package/dist/llama-cpp-provider.js.map +1 -0
  32. package/dist/native-binding.d.ts +51 -0
  33. package/dist/native-binding.d.ts.map +1 -0
  34. package/dist/native-binding.js +74 -0
  35. package/dist/native-binding.js.map +1 -0
  36. package/native/CMakeLists.txt +74 -0
  37. package/native/binding.cpp +522 -0
  38. package/native/llama-wrapper.cpp +519 -0
  39. package/native/llama-wrapper.h +131 -0
  40. package/package.json +79 -0
  41. package/scripts/postinstall.cjs +74 -0
package/native/binding.cpp
@@ -0,0 +1,522 @@
+ #include <napi.h>
+ #include "llama-wrapper.h"
+ #include <memory>
+ #include <unordered_map>
+ #include <mutex>
+ #include <atomic>
+
+ // Global state for managing models
+ static std::unordered_map<int, std::unique_ptr<llama_wrapper::LlamaModel>> g_models;
+ static std::mutex g_models_mutex;
+ static std::atomic<int> g_next_handle{1};
+
+ // ============================================================================
+ // Async Workers
+ // ============================================================================
+
+ class LoadModelWorker : public Napi::AsyncWorker {
+ public:
+   LoadModelWorker(
+     Napi::Function& callback,
+     const std::string& model_path,
+     int n_gpu_layers,
+     int n_ctx,
+     int n_threads,
+     bool debug,
+     const std::string& chat_template,
+     bool embedding
+   )
+     : Napi::AsyncWorker(callback)
+     , model_path_(model_path)
+     , n_gpu_layers_(n_gpu_layers)
+     , n_ctx_(n_ctx)
+     , n_threads_(n_threads)
+     , debug_(debug)
+     , chat_template_(chat_template)
+     , embedding_(embedding)
+     , handle_(-1)
+     , success_(false)
+   {}
+
+   void Execute() override {
+     auto model = std::make_unique<llama_wrapper::LlamaModel>();
+
+     llama_wrapper::ModelParams model_params;
+     model_params.model_path = model_path_;
+     model_params.n_gpu_layers = n_gpu_layers_;
+     model_params.debug = debug_;
+     model_params.chat_template = chat_template_;
+
+     if (!model->load(model_params)) {
+       SetError("Failed to load model from: " + model_path_);
+       return;
+     }
+
+     llama_wrapper::ContextParams ctx_params;
+     ctx_params.n_ctx = n_ctx_;
+     ctx_params.n_threads = n_threads_;
+     ctx_params.embedding = embedding_;
+
+     if (!model->create_context(ctx_params)) {
+       SetError("Failed to create context");
+       return;
+     }
+
+     handle_ = g_next_handle++;
+
+     {
+       std::lock_guard<std::mutex> lock(g_models_mutex);
+       g_models[handle_] = std::move(model);
+     }
+
+     success_ = true;
+   }
+
+   void OnOK() override {
+     Napi::HandleScope scope(Env());
+     Callback().Call({
+       Env().Null(),
+       Napi::Number::New(Env(), handle_)
+     });
+   }
+
+   void OnError(const Napi::Error& e) override {
+     Napi::HandleScope scope(Env());
+     Callback().Call({
+       Napi::String::New(Env(), e.Message()),
+       Env().Null()
+     });
+   }
+
+ private:
+   std::string model_path_;
+   int n_gpu_layers_;
+   int n_ctx_;
+   int n_threads_;
+   bool debug_;
+   std::string chat_template_;
+   bool embedding_;
+   int handle_;
+   bool success_;
+ };
+
+ class GenerateWorker : public Napi::AsyncWorker {
+ public:
+   GenerateWorker(
+     Napi::Function& callback,
+     int handle,
+     const std::vector<llama_wrapper::ChatMessage>& messages,
+     const llama_wrapper::GenerationParams& params
+   )
+     : Napi::AsyncWorker(callback)
+     , handle_(handle)
+     , messages_(messages)
+     , params_(params)
+   {}
+
+   void Execute() override {
+     llama_wrapper::LlamaModel* model = nullptr;
+
+     {
+       std::lock_guard<std::mutex> lock(g_models_mutex);
+       auto it = g_models.find(handle_);
+       if (it == g_models.end()) {
+         SetError("Invalid model handle");
+         return;
+       }
+       model = it->second.get();
+     }
+
+     result_ = model->generate(messages_, params_);
+   }
+
+   void OnOK() override {
+     Napi::HandleScope scope(Env());
+
+     Napi::Object result = Napi::Object::New(Env());
+     result.Set("text", Napi::String::New(Env(), result_.text));
+     result.Set("promptTokens", Napi::Number::New(Env(), result_.prompt_tokens));
+     result.Set("completionTokens", Napi::Number::New(Env(), result_.completion_tokens));
+     result.Set("finishReason", Napi::String::New(Env(), result_.finish_reason));
+
+     Callback().Call({Env().Null(), result});
+   }
+
+ private:
+   int handle_;
+   std::vector<llama_wrapper::ChatMessage> messages_;
+   llama_wrapper::GenerationParams params_;
+   llama_wrapper::GenerationResult result_;
+ };
+
+ // Thread-safe function context for streaming
+ class StreamContext {
+ public:
+   StreamContext(Napi::Env env, Napi::Function callback)
+     : callback_(Napi::Persistent(callback))
+     , env_(env)
+   {}
+
+   Napi::FunctionReference callback_;
+   Napi::Env env_;
+   llama_wrapper::GenerationResult result_;
+ };
+
+ void StreamCallJS(Napi::Env env, Napi::Function callback, StreamContext* context, const char* token) {
+   if (env != nullptr && callback != nullptr) {
+     if (token != nullptr) {
+       // Streaming token
+       callback.Call({
+         env.Null(),
+         Napi::String::New(env, "token"),
+         Napi::String::New(env, token)
+       });
+     }
+   }
+ }
+
+ class StreamGenerateWorker : public Napi::AsyncWorker {
+ public:
+   StreamGenerateWorker(
+     Napi::Function& callback,
+     int handle,
+     const std::vector<llama_wrapper::ChatMessage>& messages,
+     const llama_wrapper::GenerationParams& params,
+     Napi::Function& token_callback
+   )
+     : Napi::AsyncWorker(callback)
+     , handle_(handle)
+     , messages_(messages)
+     , params_(params)
+     , token_callback_(Napi::Persistent(token_callback))
+   {}
+
+   void Execute() override {
+     llama_wrapper::LlamaModel* model = nullptr;
+
+     {
+       std::lock_guard<std::mutex> lock(g_models_mutex);
+       auto it = g_models.find(handle_);
+       if (it == g_models.end()) {
+         SetError("Invalid model handle");
+         return;
+       }
+       model = it->second.get();
+     }
+
+     // Collect tokens during generation
+     result_ = model->generate_streaming(messages_, params_, [this](const std::string& token) {
+       std::lock_guard<std::mutex> lock(tokens_mutex_);
+       tokens_.push_back(token);
+       return true;
+     });
+   }
+
+   void OnOK() override {
+     Napi::HandleScope scope(Env());
+
+     // Call token callback for each collected token
+     for (const auto& token : tokens_) {
+       token_callback_.Call({
+         Napi::String::New(Env(), token)
+       });
+     }
+
+     // Final callback with result
+     Napi::Object result = Napi::Object::New(Env());
+     result.Set("text", Napi::String::New(Env(), result_.text));
+     result.Set("promptTokens", Napi::Number::New(Env(), result_.prompt_tokens));
+     result.Set("completionTokens", Napi::Number::New(Env(), result_.completion_tokens));
+     result.Set("finishReason", Napi::String::New(Env(), result_.finish_reason));
+
+     Callback().Call({Env().Null(), result});
+   }
+
+ private:
+   int handle_;
+   std::vector<llama_wrapper::ChatMessage> messages_;
+   llama_wrapper::GenerationParams params_;
+   llama_wrapper::GenerationResult result_;
+   Napi::FunctionReference token_callback_;
+   std::vector<std::string> tokens_;
+   std::mutex tokens_mutex_;
+ };
+
+ class EmbedWorker : public Napi::AsyncWorker {
+ public:
+   EmbedWorker(
+     Napi::Function& callback,
+     int handle,
+     const std::vector<std::string>& texts
+   )
+     : Napi::AsyncWorker(callback)
+     , handle_(handle)
+     , texts_(texts)
+   {}
+
+   void Execute() override {
+     llama_wrapper::LlamaModel* model = nullptr;
+
+     {
+       std::lock_guard<std::mutex> lock(g_models_mutex);
+       auto it = g_models.find(handle_);
+       if (it == g_models.end()) {
+         SetError("Invalid model handle");
+         return;
+       }
+       model = it->second.get();
+     }
+
+     result_ = model->embed(texts_);
+
+     if (result_.embeddings.empty() && !texts_.empty()) {
+       SetError("Failed to generate embeddings");
+       return;
+     }
+   }
+
+   void OnOK() override {
+     Napi::HandleScope scope(Env());
+
+     // Create embeddings array
+     Napi::Array embeddings_arr = Napi::Array::New(Env(), result_.embeddings.size());
+     for (size_t i = 0; i < result_.embeddings.size(); i++) {
+       const auto& emb = result_.embeddings[i];
+       Napi::Float32Array embedding = Napi::Float32Array::New(Env(), emb.size());
+       for (size_t j = 0; j < emb.size(); j++) {
+         embedding[j] = emb[j];
+       }
+       embeddings_arr.Set(i, embedding);
+     }
+
+     Napi::Object result = Napi::Object::New(Env());
+     result.Set("embeddings", embeddings_arr);
+     result.Set("totalTokens", Napi::Number::New(Env(), result_.total_tokens));
+
+     Callback().Call({Env().Null(), result});
+   }
+
+ private:
+   int handle_;
+   std::vector<std::string> texts_;
+   llama_wrapper::EmbeddingResult result_;
+ };
+
+ // ============================================================================
+ // N-API Functions
+ // ============================================================================
+
+ Napi::Value LoadModel(const Napi::CallbackInfo& info) {
+   Napi::Env env = info.Env();
+
+   if (info.Length() < 2 || !info[0].IsObject() || !info[1].IsFunction()) {
+     Napi::TypeError::New(env, "Expected (options, callback)").ThrowAsJavaScriptException();
+     return env.Null();
+   }
+
+   Napi::Object options = info[0].As<Napi::Object>();
+   Napi::Function callback = info[1].As<Napi::Function>();
+
+   std::string model_path = options.Get("modelPath").As<Napi::String>().Utf8Value();
+   int n_gpu_layers = options.Has("gpuLayers") ?
+     options.Get("gpuLayers").As<Napi::Number>().Int32Value() : 99;
+   int n_ctx = options.Has("contextSize") ?
+     options.Get("contextSize").As<Napi::Number>().Int32Value() : 2048;
+   int n_threads = options.Has("threads") ?
+     options.Get("threads").As<Napi::Number>().Int32Value() : 4;
+   bool debug = options.Has("debug") ?
+     options.Get("debug").As<Napi::Boolean>().Value() : false;
+   std::string chat_template = options.Has("chatTemplate") ?
+     options.Get("chatTemplate").As<Napi::String>().Utf8Value() : "auto";
+   bool embedding = options.Has("embedding") ?
+     options.Get("embedding").As<Napi::Boolean>().Value() : false;
+
+   auto worker = new LoadModelWorker(callback, model_path, n_gpu_layers, n_ctx, n_threads, debug, chat_template, embedding);
+   worker->Queue();
+
+   return env.Undefined();
+ }
+
+ Napi::Value UnloadModel(const Napi::CallbackInfo& info) {
+   Napi::Env env = info.Env();
+
+   if (info.Length() < 1 || !info[0].IsNumber()) {
+     Napi::TypeError::New(env, "Expected model handle").ThrowAsJavaScriptException();
+     return env.Null();
+   }
+
+   int handle = info[0].As<Napi::Number>().Int32Value();
+
+   {
+     std::lock_guard<std::mutex> lock(g_models_mutex);
+     auto it = g_models.find(handle);
+     if (it != g_models.end()) {
+       g_models.erase(it);
+     }
+   }
+
+   return Napi::Boolean::New(env, true);
+ }
+
+ // Helper function to parse messages array from JavaScript
+ std::vector<llama_wrapper::ChatMessage> ParseMessages(Napi::Array messages_arr) {
+   std::vector<llama_wrapper::ChatMessage> messages;
+   for (uint32_t i = 0; i < messages_arr.Length(); i++) {
+     Napi::Object msg_obj = messages_arr.Get(i).As<Napi::Object>();
+     llama_wrapper::ChatMessage msg;
+     msg.role = msg_obj.Get("role").As<Napi::String>().Utf8Value();
+     msg.content = msg_obj.Get("content").As<Napi::String>().Utf8Value();
+     messages.push_back(msg);
+   }
+   return messages;
+ }
+
+ Napi::Value Generate(const Napi::CallbackInfo& info) {
+   Napi::Env env = info.Env();
+
+   if (info.Length() < 3 || !info[0].IsNumber() || !info[1].IsObject() || !info[2].IsFunction()) {
+     Napi::TypeError::New(env, "Expected (handle, options, callback)").ThrowAsJavaScriptException();
+     return env.Null();
+   }
+
+   int handle = info[0].As<Napi::Number>().Int32Value();
+   Napi::Object options = info[1].As<Napi::Object>();
+   Napi::Function callback = info[2].As<Napi::Function>();
+
+   // Parse messages array
+   if (!options.Has("messages") || !options.Get("messages").IsArray()) {
+     Napi::TypeError::New(env, "Expected messages array in options").ThrowAsJavaScriptException();
+     return env.Null();
+   }
+   std::vector<llama_wrapper::ChatMessage> messages = ParseMessages(options.Get("messages").As<Napi::Array>());
+
+   llama_wrapper::GenerationParams params;
+   params.max_tokens = options.Has("maxTokens") ?
+     options.Get("maxTokens").As<Napi::Number>().Int32Value() : 256;
+   params.temperature = options.Has("temperature") ?
+     options.Get("temperature").As<Napi::Number>().FloatValue() : 0.7f;
+   params.top_p = options.Has("topP") ?
+     options.Get("topP").As<Napi::Number>().FloatValue() : 0.9f;
+   params.top_k = options.Has("topK") ?
+     options.Get("topK").As<Napi::Number>().Int32Value() : 40;
+
+   if (options.Has("stopSequences") && options.Get("stopSequences").IsArray()) {
+     Napi::Array stop_arr = options.Get("stopSequences").As<Napi::Array>();
+     for (uint32_t i = 0; i < stop_arr.Length(); i++) {
+       params.stop_sequences.push_back(stop_arr.Get(i).As<Napi::String>().Utf8Value());
+     }
+   }
+
+   auto worker = new GenerateWorker(callback, handle, messages, params);
+   worker->Queue();
+
+   return env.Undefined();
+ }
+
+ Napi::Value GenerateStream(const Napi::CallbackInfo& info) {
+   Napi::Env env = info.Env();
+
+   if (info.Length() < 4 || !info[0].IsNumber() || !info[1].IsObject() ||
+       !info[2].IsFunction() || !info[3].IsFunction()) {
+     Napi::TypeError::New(env, "Expected (handle, options, tokenCallback, doneCallback)").ThrowAsJavaScriptException();
+     return env.Null();
+   }
+
+   int handle = info[0].As<Napi::Number>().Int32Value();
+   Napi::Object options = info[1].As<Napi::Object>();
+   Napi::Function token_callback = info[2].As<Napi::Function>();
+   Napi::Function done_callback = info[3].As<Napi::Function>();
+
+   // Parse messages array
+   if (!options.Has("messages") || !options.Get("messages").IsArray()) {
+     Napi::TypeError::New(env, "Expected messages array in options").ThrowAsJavaScriptException();
+     return env.Null();
+   }
+   std::vector<llama_wrapper::ChatMessage> messages = ParseMessages(options.Get("messages").As<Napi::Array>());
+
+   llama_wrapper::GenerationParams params;
+   params.max_tokens = options.Has("maxTokens") ?
+     options.Get("maxTokens").As<Napi::Number>().Int32Value() : 256;
+   params.temperature = options.Has("temperature") ?
+     options.Get("temperature").As<Napi::Number>().FloatValue() : 0.7f;
+   params.top_p = options.Has("topP") ?
+     options.Get("topP").As<Napi::Number>().FloatValue() : 0.9f;
+   params.top_k = options.Has("topK") ?
+     options.Get("topK").As<Napi::Number>().Int32Value() : 40;
+
+   if (options.Has("stopSequences") && options.Get("stopSequences").IsArray()) {
+     Napi::Array stop_arr = options.Get("stopSequences").As<Napi::Array>();
+     for (uint32_t i = 0; i < stop_arr.Length(); i++) {
+       params.stop_sequences.push_back(stop_arr.Get(i).As<Napi::String>().Utf8Value());
+     }
+   }
+
+   auto worker = new StreamGenerateWorker(done_callback, handle, messages, params, token_callback);
+   worker->Queue();
+
+   return env.Undefined();
+ }
+
+ Napi::Value IsModelLoaded(const Napi::CallbackInfo& info) {
+   Napi::Env env = info.Env();
+
+   if (info.Length() < 1 || !info[0].IsNumber()) {
+     Napi::TypeError::New(env, "Expected model handle").ThrowAsJavaScriptException();
+     return env.Null();
+   }
+
+   int handle = info[0].As<Napi::Number>().Int32Value();
+
+   std::lock_guard<std::mutex> lock(g_models_mutex);
+   auto it = g_models.find(handle);
+   bool loaded = it != g_models.end() && it->second->is_loaded();
+
+   return Napi::Boolean::New(env, loaded);
+ }
+
+ Napi::Value Embed(const Napi::CallbackInfo& info) {
+   Napi::Env env = info.Env();
+
+   if (info.Length() < 3 || !info[0].IsNumber() || !info[1].IsObject() || !info[2].IsFunction()) {
+     Napi::TypeError::New(env, "Expected (handle, options, callback)").ThrowAsJavaScriptException();
+     return env.Null();
+   }
+
+   int handle = info[0].As<Napi::Number>().Int32Value();
+   Napi::Object options = info[1].As<Napi::Object>();
+   Napi::Function callback = info[2].As<Napi::Function>();
+
+   // Parse texts array
+   if (!options.Has("texts") || !options.Get("texts").IsArray()) {
+     Napi::TypeError::New(env, "Expected texts array in options").ThrowAsJavaScriptException();
+     return env.Null();
+   }
+
+   Napi::Array texts_arr = options.Get("texts").As<Napi::Array>();
+   std::vector<std::string> texts;
+   for (uint32_t i = 0; i < texts_arr.Length(); i++) {
+     texts.push_back(texts_arr.Get(i).As<Napi::String>().Utf8Value());
+   }
+
+   auto worker = new EmbedWorker(callback, handle, texts);
+   worker->Queue();
+
+   return env.Undefined();
+ }
+
+ // ============================================================================
+ // Module Initialization
+ // ============================================================================
+
+ Napi::Object Init(Napi::Env env, Napi::Object exports) {
+   exports.Set("loadModel", Napi::Function::New(env, LoadModel));
+   exports.Set("unloadModel", Napi::Function::New(env, UnloadModel));
+   exports.Set("generate", Napi::Function::New(env, Generate));
+   exports.Set("generateStream", Napi::Function::New(env, GenerateStream));
+   exports.Set("isModelLoaded", Napi::Function::New(env, IsModelLoaded));
+   exports.Set("embed", Napi::Function::New(env, Embed));
+   return exports;
+ }
+
+ NODE_API_MODULE(llama_binding, Init)
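For orientation, the sketch below shows how the functions registered in Init above could be driven directly from JavaScript. This is only a minimal illustration of the callback signatures visible in binding.cpp (loadModel(options, cb(err, handle)), generate(handle, options, cb(err, result)), unloadModel(handle)); in the published package the addon is normally reached through dist/binding-node.js and dist/binding.js rather than required by hand, and the addon path and model path used here are assumptions, not part of the documented API.

// sketch.js -- hypothetical direct use of the compiled addon (paths are assumptions)
const binding = require("./build/Release/llama_binding.node");

binding.loadModel(
  { modelPath: "./model.gguf", contextSize: 2048, gpuLayers: 99 },
  (err, handle) => {
    if (err) throw new Error(err); // OnError passes the message as a string

    binding.generate(
      handle,
      {
        messages: [{ role: "user", content: "Hello!" }],
        maxTokens: 64,
        temperature: 0.7,
      },
      (genErr, result) => {
        if (genErr) throw new Error(genErr);
        // result mirrors the fields set in GenerateWorker::OnOK
        console.log(result.text, result.finishReason, result.completionTokens);
        binding.unloadModel(handle);
      }
    );
  }
);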