npm - @shipworthy/ai-sdk-llama-cpp - Versions diffs - 0.2.3 → 0.2.4 - Mend

@shipworthy/ai-sdk-llama-cpp 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

package/dist/llama-cpp-embedding-model.d.ts +7 -0
package/dist/llama-cpp-embedding-model.d.ts.map +1 -1
package/dist/llama-cpp-embedding-model.js +12 -2
package/dist/llama-cpp-embedding-model.js.map +1 -1
package/dist/llama-cpp-language-model.d.ts +7 -0
package/dist/llama-cpp-language-model.d.ts.map +1 -1
package/dist/llama-cpp-language-model.js +12 -2
package/dist/llama-cpp-language-model.js.map +1 -1
package/dist/native-binding.d.ts +5 -0
package/dist/native-binding.d.ts.map +1 -1
package/dist/native-binding.js +3 -0
package/dist/native-binding.js.map +1 -1
package/native/binding.cpp +187 -183
package/native/llama-wrapper.cpp +185 -124
package/native/llama-wrapper.h +48 -48
package/package.json +1 -1

package/native/llama-wrapper.cpp (changed)

@@ -1,10 +1,12 @@
 #include "llama-wrapper.h"
 #include "llama.h"
 #include <algorithm>
+#include <cmath>
+#include <cstdio>
 #include <cstring>
 #include <stdexcept>
-#include <cstdio>
-#include <cmath>
 namespace llama_wrapper {
@@ -12,9 +14,9 @@ namespace llama_wrapper {
 static bool g_debug_mode = false;
 // Custom log callback that respects debug mode
-static void llama_log_callback(ggml_log_level level, const char* text, void* user_data) {
-    (void)level;
-    (void)user_data;
+static void llama_log_callback(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
     if (g_debug_mode) {
         fprintf(stderr, "%s", text);
     }
@@ -26,33 +28,33 @@ LlamaModel::~LlamaModel() {
     unload();
 }
-LlamaModel::LlamaModel(LlamaModel&& other) noexcept
-    : model_(other.model_)
-    , ctx_(other.ctx_)
-    , sampler_(other.sampler_)
-    , model_path_(std::move(other.model_path_))
-    , chat_template_(std::move(other.chat_template_)) {
-    other.model_ = nullptr;
-    other.ctx_ = nullptr;
+LlamaModel::LlamaModel(LlamaModel && other) noexcept :
+    model_(other.model_),
+    ctx_(other.ctx_),
+    sampler_(other.sampler_),
+    model_path_(std::move(other.model_path_)),
+    chat_template_(std::move(other.chat_template_)) {
+    other.model_   = nullptr;
+    other.ctx_     = nullptr;
     other.sampler_ = nullptr;
 }
-LlamaModel& LlamaModel::operator=(LlamaModel&& other) noexcept {
+LlamaModel & LlamaModel::operator=(LlamaModel && other) noexcept {
     if (this != &other) {
         unload();
-        model_ = other.model_;
-        ctx_ = other.ctx_;
-        sampler_ = other.sampler_;
-        model_path_ = std::move(other.model_path_);
+        model_         = other.model_;
+        ctx_           = other.ctx_;
+        sampler_       = other.sampler_;
+        model_path_    = std::move(other.model_path_);
         chat_template_ = std::move(other.chat_template_);
-        other.model_ = nullptr;
-        other.ctx_ = nullptr;
+        other.model_   = nullptr;
+        other.ctx_     = nullptr;
         other.sampler_ = nullptr;
     }
     return *this;
 }
-bool LlamaModel::load(const ModelParams& params) {
+bool LlamaModel::load(const ModelParams & params) {
     if (model_) {
         unload();
     }
@@ -66,9 +68,9 @@ bool LlamaModel::load(const ModelParams& params) {
     // Set up model parameters
     llama_model_params model_params = llama_model_default_params();
-    model_params.n_gpu_layers = params.n_gpu_layers;
-    model_params.use_mmap = params.use_mmap;
-    model_params.use_mlock = params.use_mlock;
+    model_params.n_gpu_layers       = params.n_gpu_layers;
+    model_params.use_mmap           = params.use_mmap;
+    model_params.use_mlock          = params.use_mlock;
     // Load the model
     model_ = llama_model_load_from_file(params.model_path.c_str(), model_params);
@@ -76,7 +78,7 @@ bool LlamaModel::load(const ModelParams& params) {
         return false;
     }
-    model_path_ = params.model_path;
+    model_path_    = params.model_path;
     chat_template_ = params.chat_template;
     return true;
 }
@@ -102,7 +104,7 @@ void LlamaModel::unload() {
     model_path_.clear();
 }
-bool LlamaModel::create_context(const ContextParams& params) {
+bool LlamaModel::create_context(const ContextParams & params) {
     if (!model_) {
         return false;
     }
@@ -113,21 +115,33 @@ bool LlamaModel::create_context(const ContextParams& params) {
     }
     llama_context_params ctx_params = llama_context_default_params();
-    ctx_params.n_ctx = params.n_ctx;
-    ctx_params.n_batch = params.n_batch;
-    ctx_params.n_threads = params.n_threads;
+    // Only override defaults if non-zero values are provided
+    if (params.n_ctx > 0) {
+        ctx_params.n_ctx = params.n_ctx;
+    }
+    if (params.n_batch > 0) {
+        ctx_params.n_batch = params.n_batch;
+    }
+    ctx_params.n_threads       = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads;
     if (params.embedding) {
-        ctx_params.embeddings = true;
+        ctx_params.embeddings   = true;
         ctx_params.pooling_type = LLAMA_POOLING_TYPE_MEAN;
+        // For embeddings, batch size must be at least as large as context size
+        // (see llama.cpp/examples/embedding/embedding.cpp)
+        if (ctx_params.n_batch < ctx_params.n_ctx) {
+            ctx_params.n_batch = ctx_params.n_ctx;
+        }
     }
     ctx_ = llama_init_from_model(model_, ctx_params);
     return ctx_ != nullptr;
 }
-void LlamaModel::normalize_embedding(float* embedding, int n_embd) {
+void LlamaModel::normalize_embedding(float * embedding, int n_embd) {
     float sum = 0.0f;
     for (int i = 0; i < n_embd; i++) {
         sum += embedding[i] * embedding[i];
@@ -140,7 +154,58 @@ void LlamaModel::normalize_embedding(float* embedding, int n_embd) {
     }
 }
-EmbeddingResult LlamaModel::embed(const std::vector<std::string>& texts) {
+std::vector<float> LlamaModel::embed_chunk(const std::vector<int32_t> & tokens,
+                                           int                          seq_id,
+                                           int                          n_embd,
+                                           int                          pooling_type) {
+    std::vector<float> embedding(n_embd, 0.0f);
+    if (tokens.empty()) {
+        return embedding;
+    }
+    // Clear the memory/KV cache
+    llama_memory_t mem = llama_get_memory(ctx_);
+    if (mem) {
+        llama_memory_clear(mem, true);
+    }
+    // Create batch with sequence ID
+    llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
+    for (size_t i = 0; i < tokens.size(); i++) {
+        batch.token[i]     = tokens[i];
+        batch.pos[i]       = i;
+        batch.n_seq_id[i]  = 1;
+        batch.seq_id[i][0] = seq_id;
+        batch.logits[i]    = true;  // We want embeddings for all tokens
+    }
+    batch.n_tokens = tokens.size();
+    // Decode to get embeddings
+    if (llama_decode(ctx_, batch) != 0) {
+        llama_batch_free(batch);
+        return embedding;  // Return zero embedding on failure
+    }
+    // Extract embedding based on pooling type
+    const float * embd = nullptr;
+    if (static_cast<enum llama_pooling_type>(pooling_type) == LLAMA_POOLING_TYPE_NONE) {
+        // Get embedding for last token
+        embd = llama_get_embeddings_ith(ctx_, tokens.size() - 1);
+    } else {
+        // Get pooled embedding for the sequence
+        embd = llama_get_embeddings_seq(ctx_, seq_id);
+    }
+    if (embd) {
+        std::copy(embd, embd + n_embd, embedding.begin());
+    }
+    llama_batch_free(batch);
+    return embedding;
+}
+EmbeddingResult LlamaModel::embed(const std::vector<std::string> & texts) {
     EmbeddingResult result;
     result.total_tokens = 0;
@@ -148,12 +213,17 @@ EmbeddingResult LlamaModel::embed(const std::vector<std::string>& texts) {
         return result;
     }
-    const int n_embd = llama_model_n_embd(model_);
+    const int                     n_embd       = llama_model_n_embd(model_);
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx_);
+    // Get context size for chunking
+    const int n_ctx   = llama_n_ctx(ctx_);
+    const int overlap = n_ctx / 10;  // 10% overlap between chunks
+    const int step    = n_ctx - overlap;
     // Process each text
     for (size_t seq_id = 0; seq_id < texts.size(); seq_id++) {
-        const std::string& text = texts[seq_id];
+        const std::string & text = texts[seq_id];
         // Tokenize the text
         std::vector<int32_t> tokens = tokenize(text, true);
@@ -165,63 +235,64 @@ EmbeddingResult LlamaModel::embed(const std::vector<std::string>& texts) {
             continue;
         }
-        // Clear the memory/KV cache
-        llama_memory_t mem = llama_get_memory(ctx_);
-        if (mem) {
-            llama_memory_clear(mem, true);
-        }
+        // Check if text fits in context (no chunking needed)
+        if (static_cast<int>(tokens.size()) <= n_ctx) {
+            // Process single chunk
+            std::vector<float> embedding = embed_chunk(tokens, seq_id, n_embd, pooling_type);
+            normalize_embedding(embedding.data(), n_embd);
+            result.embeddings.push_back(std::move(embedding));
+        } else {
+            // Text exceeds context size - split into overlapping chunks
+            std::vector<std::vector<float>> chunk_embeddings;
-        // Create batch with sequence ID
-        llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
-        for (size_t i = 0; i < tokens.size(); i++) {
-            batch.token[i] = tokens[i];
-            batch.pos[i] = i;
-            batch.n_seq_id[i] = 1;
-            batch.seq_id[i][0] = seq_id;
-            batch.logits[i] = true;  // We want embeddings for all tokens
-        }
-        batch.n_tokens = tokens.size();
+            for (size_t start = 0; start < tokens.size(); start += step) {
+                // Calculate chunk end position
+                size_t end = std::min(start + n_ctx, tokens.size());
-        // Decode to get embeddings
-        if (llama_decode(ctx_, batch) != 0) {
-            llama_batch_free(batch);
-            result.embeddings.push_back(std::vector<float>(n_embd, 0.0f));
-            continue;
-        }
+                // Extract chunk tokens
+                std::vector<int32_t> chunk_tokens(tokens.begin() + start, tokens.begin() + end);
-        // Extract embedding based on pooling type
-        const float* embd = nullptr;
-        if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
-            // Get embedding for last token
-            embd = llama_get_embeddings_ith(ctx_, tokens.size() - 1);
-        } else {
-            // Get pooled embedding for the sequence
-            embd = llama_get_embeddings_seq(ctx_, seq_id);
-        }
+                // Get embedding for this chunk
+                std::vector<float> chunk_emb = embed_chunk(chunk_tokens, seq_id, n_embd, pooling_type);
+                chunk_embeddings.push_back(std::move(chunk_emb));
-        if (embd) {
-            std::vector<float> embedding(n_embd);
-            std::copy(embd, embd + n_embd, embedding.begin());
-            // Normalize the embedding (L2 normalization)
-            normalize_embedding(embedding.data(), n_embd);
-            result.embeddings.push_back(std::move(embedding));
-        } else {
-            result.embeddings.push_back(std::vector<float>(n_embd, 0.0f));
-        }
+                // If this chunk reached the end, we're done
+                if (end == tokens.size()) {
+                    break;
+                }
+            }
-        llama_batch_free(batch);
+            // Mean-pool all chunk embeddings
+            std::vector<float> final_embedding(n_embd, 0.0f);
+            if (!chunk_embeddings.empty()) {
+                for (const auto & chunk_emb : chunk_embeddings) {
+                    for (int i = 0; i < n_embd; i++) {
+                        final_embedding[i] += chunk_emb[i];
+                    }
+                }
+                // Divide by number of chunks to get mean
+                float num_chunks = static_cast<float>(chunk_embeddings.size());
+                for (int i = 0; i < n_embd; i++) {
+                    final_embedding[i] /= num_chunks;
+                }
+            }
+            // Normalize the final averaged embedding
+            normalize_embedding(final_embedding.data(), n_embd);
+            result.embeddings.push_back(std::move(final_embedding));
+        }
     }
     return result;
 }
-std::string LlamaModel::apply_chat_template(const std::vector<ChatMessage>& messages) {
+std::string LlamaModel::apply_chat_template(const std::vector<ChatMessage> & messages) {
     if (!model_) {
         return "";
     }
     // Determine which template to use
-    const char* tmpl = nullptr;
+    const char * tmpl = nullptr;
     if (chat_template_ == "auto") {
         // Use the template embedded in the model
         tmpl = llama_model_chat_template(model_, nullptr);
@@ -233,22 +304,17 @@ std::string LlamaModel::apply_chat_template(const std::vector<ChatMessage>& mess
     // Convert messages to llama_chat_message format
     std::vector<llama_chat_message> chat_messages;
     chat_messages.reserve(messages.size());
-    for (const auto& msg : messages) {
+    for (const auto & msg : messages) {
         llama_chat_message chat_msg;
-        chat_msg.role = msg.role.c_str();
+        chat_msg.role    = msg.role.c_str();
         chat_msg.content = msg.content.c_str();
         chat_messages.push_back(chat_msg);
     }
     // First call to get required buffer size
-    int32_t result_size = llama_chat_apply_template(
-        tmpl,
-        chat_messages.data(),
-        chat_messages.size(),
-        true,  // add_ass: add assistant prompt
-        nullptr,
-        0
-    );
+    int32_t result_size = llama_chat_apply_template(tmpl, chat_messages.data(), chat_messages.size(),
+                                                    true,  // add_ass: add assistant prompt
+                                                    nullptr, 0);
     if (result_size < 0) {
         // Template not supported, return empty string
@@ -257,19 +323,12 @@ std::string LlamaModel::apply_chat_template(const std::vector<ChatMessage>& mess
     // Allocate buffer and apply template
     std::vector<char> buffer(result_size + 1);
-    llama_chat_apply_template(
-        tmpl,
-        chat_messages.data(),
-        chat_messages.size(),
-        true,
-        buffer.data(),
-        buffer.size()
-    );
+    llama_chat_apply_template(tmpl, chat_messages.data(), chat_messages.size(), true, buffer.data(), buffer.size());
     return std::string(buffer.data(), result_size);
 }
-void LlamaModel::create_sampler(const GenerationParams& params) {
+void LlamaModel::create_sampler(const GenerationParams & params) {
     if (sampler_) {
         llama_sampler_free(sampler_);
     }
@@ -281,14 +340,15 @@ void LlamaModel::create_sampler(const GenerationParams& params) {
     llama_sampler_chain_add(sampler_, llama_sampler_init_top_k(params.top_k));
     llama_sampler_chain_add(sampler_, llama_sampler_init_top_p(params.top_p, 1));
     llama_sampler_chain_add(sampler_, llama_sampler_init_temp(params.temperature));
-    llama_sampler_chain_add(sampler_, llama_sampler_init_dist(42)); // Random seed
+    llama_sampler_chain_add(sampler_, llama_sampler_init_dist(42));  // Random seed
 }
-std::vector<int32_t> LlamaModel::tokenize(const std::string& text, bool add_bos) {
-    const llama_vocab* vocab = llama_model_get_vocab(model_);
+std::vector<int32_t> LlamaModel::tokenize(const std::string & text, bool add_bos) {
+    const llama_vocab * vocab = llama_model_get_vocab(model_);
     // First, get the number of tokens needed
-    // When passing 0 for n_tokens_max, llama_tokenize returns negative of required size
+    // When passing 0 for n_tokens_max, llama_tokenize returns negative of
+    // required size
     int n_tokens = llama_tokenize(vocab, text.c_str(), text.length(), nullptr, 0, add_bos, true);
     if (n_tokens < 0) {
@@ -318,10 +378,10 @@ std::vector<int32_t> LlamaModel::tokenize(const std::string& text, bool add_bos)
 }
 std::string LlamaModel::detokenize(int32_t token) {
-    const llama_vocab* vocab = llama_model_get_vocab(model_);
+    const llama_vocab * vocab = llama_model_get_vocab(model_);
     char buf[256];
-    int n = llama_token_to_piece(vocab, token, buf, sizeof(buf), 0, true);
+    int  n = llama_token_to_piece(vocab, token, buf, sizeof(buf), 0, true);
     if (n < 0) {
         return "";
     }
@@ -329,11 +389,11 @@ std::string LlamaModel::detokenize(int32_t token) {
 }
 bool LlamaModel::is_eos_token(int32_t token) {
-    const llama_vocab* vocab = llama_model_get_vocab(model_);
+    const llama_vocab * vocab = llama_model_get_vocab(model_);
     return llama_vocab_is_eog(vocab, token);
 }
-GenerationResult LlamaModel::generate(const std::vector<ChatMessage>& messages, const GenerationParams& params) {
+GenerationResult LlamaModel::generate(const std::vector<ChatMessage> & messages, const GenerationParams & params) {
     GenerationResult result;
     result.finish_reason = "error";
@@ -349,8 +409,8 @@ GenerationResult LlamaModel::generate(const std::vector<ChatMessage>& messages,
     // Tokenize the prompt
     std::vector<int32_t> prompt_tokens = tokenize(prompt, true);
-    result.prompt_tokens = prompt_tokens.size();
-    result.completion_tokens = 0;
+    result.prompt_tokens               = prompt_tokens.size();
+    result.completion_tokens           = 0;
     // Clear the memory/KV cache
     llama_memory_t mem = llama_get_memory(ctx_);
@@ -370,7 +430,7 @@ GenerationResult LlamaModel::generate(const std::vector<ChatMessage>& messages,
     // Generate tokens
     std::string generated_text;
-    int n_cur = prompt_tokens.size();
+    int         n_cur = prompt_tokens.size();
     for (int i = 0; i < params.max_tokens; i++) {
         // Sample the next token
@@ -389,18 +449,20 @@ GenerationResult LlamaModel::generate(const std::vector<ChatMessage>& messages,
         // Check for stop sequences
         bool should_stop = false;
-        for (const auto& stop_seq : params.stop_sequences) {
+        for (const auto & stop_seq : params.stop_sequences) {
             if (generated_text.length() >= stop_seq.length()) {
                 if (generated_text.substr(generated_text.length() - stop_seq.length()) == stop_seq) {
                     // Remove the stop sequence from output
-                    generated_text = generated_text.substr(0, generated_text.length() - stop_seq.length());
-                    should_stop = true;
+                    generated_text       = generated_text.substr(0, generated_text.length() - stop_seq.length());
+                    should_stop          = true;
                     result.finish_reason = "stop";
                     break;
                 }
             }
         }
-        if (should_stop) break;
+        if (should_stop) {
+            break;
+        }
         // Prepare for next iteration
         batch = llama_batch_get_one(&new_token, 1);
@@ -420,11 +482,9 @@ GenerationResult LlamaModel::generate(const std::vector<ChatMessage>& messages,
     return result;
 }
-GenerationResult LlamaModel::generate_streaming(
-    const std::vector<ChatMessage>& messages,
-    const GenerationParams& params,
-    TokenCallback callback
-) {
+GenerationResult LlamaModel::generate_streaming(const std::vector<ChatMessage> & messages,
+                                                const GenerationParams &         params,
+                                                TokenCallback                    callback) {
     GenerationResult result;
     result.finish_reason = "error";
@@ -440,8 +500,8 @@ GenerationResult LlamaModel::generate_streaming(
     // Tokenize the prompt
     std::vector<int32_t> prompt_tokens = tokenize(prompt, true);
-    result.prompt_tokens = prompt_tokens.size();
-    result.completion_tokens = 0;
+    result.prompt_tokens               = prompt_tokens.size();
+    result.completion_tokens           = 0;
     // Clear the memory/KV cache
     llama_memory_t mem = llama_get_memory(ctx_);
@@ -461,7 +521,7 @@ GenerationResult LlamaModel::generate_streaming(
     // Generate tokens
     std::string generated_text;
-    int n_cur = prompt_tokens.size();
+    int         n_cur = prompt_tokens.size();
     for (int i = 0; i < params.max_tokens; i++) {
         // Sample the next token
@@ -486,16 +546,18 @@ GenerationResult LlamaModel::generate_streaming(
         // Check for stop sequences
         bool should_stop = false;
-        for (const auto& stop_seq : params.stop_sequences) {
+        for (const auto & stop_seq : params.stop_sequences) {
             if (generated_text.length() >= stop_seq.length()) {
                 if (generated_text.substr(generated_text.length() - stop_seq.length()) == stop_seq) {
-                    should_stop = true;
+                    should_stop          = true;
                     result.finish_reason = "stop";
                     break;
                 }
             }
         }
-        if (should_stop) break;
+        if (should_stop) {
+            break;
+        }
         // Prepare for next iteration
         batch = llama_batch_get_one(&new_token, 1);
@@ -515,5 +577,4 @@ GenerationResult LlamaModel::generate_streaming(
     return result;
 }
-} // namespace llama_wrapper
+}  // namespace llama_wrapper