npm - @shipworthy/ai-sdk-llama-cpp - Versions diffs - 0.2.2 → 0.2.4 - Mend

@shipworthy/ai-sdk-llama-cpp 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

package/dist/llama-cpp-embedding-model.d.ts +7 -0
package/dist/llama-cpp-embedding-model.d.ts.map +1 -1
package/dist/llama-cpp-embedding-model.js +12 -2
package/dist/llama-cpp-embedding-model.js.map +1 -1
package/dist/llama-cpp-language-model.d.ts +7 -0
package/dist/llama-cpp-language-model.d.ts.map +1 -1
package/dist/llama-cpp-language-model.js +12 -2
package/dist/llama-cpp-language-model.js.map +1 -1
package/dist/native-binding.d.ts +5 -0
package/dist/native-binding.d.ts.map +1 -1
package/dist/native-binding.js +3 -0
package/dist/native-binding.js.map +1 -1
package/native/CMakeLists.txt +2 -2
package/native/binding.cpp +187 -183
package/native/llama-wrapper.cpp +185 -124
package/native/llama-wrapper.h +48 -48
package/package.json +1 -1

package/native/llama-wrapper.h CHANGED Viewed

@@ -1,10 +1,10 @@
 #ifndef LLAMA_WRAPPER_H
 #define LLAMA_WRAPPER_H
-#include <string>
-#include <vector>
 #include <functional>
 #include <memory>
+#include <string>
+#include <vector>
 // Forward declarations for llama.cpp types
 struct llama_model;
@@ -15,10 +15,10 @@ namespace llama_wrapper {
 struct ModelParams {
     std::string model_path;
-    int n_gpu_layers = 99;  // Use GPU by default if available
-    bool use_mmap = true;
-    bool use_mlock = false;
-    bool debug = false;     // Show verbose llama.cpp output
+    int         n_gpu_layers  = 99;  // Use GPU by default if available
+    bool        use_mmap      = true;
+    bool        use_mlock     = false;
+    bool        debug         = false;   // Show verbose llama.cpp output
     std::string chat_template = "auto";  // "auto" uses template from model, or specify a built-in template
 };
@@ -28,51 +28,51 @@ struct ChatMessage {
 };
 struct ContextParams {
-    int n_ctx = 2048;      // Context size
-    int n_batch = 512;     // Batch size for prompt processing
-    int n_threads = 4;     // Number of threads
-    bool embedding = false; // Enable embedding mode with mean pooling
+    int  n_ctx     = 0;      // Context size
+    int  n_batch   = 0;      // Batch size for prompt processing
+    int  n_threads = 4;      // Number of threads
+    bool embedding = false;  // Enable embedding mode with mean pooling
 };
 struct GenerationParams {
-    int max_tokens = 256;
-    float temperature = 0.7f;
-    float top_p = 0.9f;
-    int top_k = 40;
-    float repeat_penalty = 1.1f;
+    int                      max_tokens     = 256;
+    float                    temperature    = 0.7f;
+    float                    top_p          = 0.9f;
+    int                      top_k          = 40;
+    float                    repeat_penalty = 1.1f;
     std::vector<std::string> stop_sequences;
 };
 struct GenerationResult {
     std::string text;
-    int prompt_tokens;
-    int completion_tokens;
+    int         prompt_tokens;
+    int         completion_tokens;
     std::string finish_reason;  // "stop", "length", or "error"
 };
 struct EmbeddingResult {
     std::vector<std::vector<float>> embeddings;  // One embedding vector per input text
-    int total_tokens;
+    int                             total_tokens;
 };
 // Token callback for streaming: returns false to stop generation
-using TokenCallback = std::function<bool(const std::string& token)>;
+using TokenCallback = std::function<bool(const std::string & token)>;
 class LlamaModel {
-public:
+  public:
     LlamaModel();
     ~LlamaModel();
     // Disable copy
-    LlamaModel(const LlamaModel&) = delete;
-    LlamaModel& operator=(const LlamaModel&) = delete;
+    LlamaModel(const LlamaModel &)             = delete;
+    LlamaModel & operator=(const LlamaModel &) = delete;
     // Enable move
-    LlamaModel(LlamaModel&& other) noexcept;
-    LlamaModel& operator=(LlamaModel&& other) noexcept;
+    LlamaModel(LlamaModel && other) noexcept;
+    LlamaModel & operator=(LlamaModel && other) noexcept;
     // Load a model from a GGUF file
-    bool load(const ModelParams& params);
+    bool load(const ModelParams & params);
     // Check if model is loaded
     bool is_loaded() const;
@@ -81,51 +81,51 @@ public:
     void unload();
     // Get the model path
-    const std::string& get_model_path() const { return model_path_; }
+    const std::string & get_model_path() const { return model_path_; }
     // Create a context for inference (or embeddings if params.embedding is true)
-    bool create_context(const ContextParams& params);
+    bool create_context(const ContextParams & params);
     // Apply chat template to messages and return formatted prompt
-    std::string apply_chat_template(const std::vector<ChatMessage>& messages);
+    std::string apply_chat_template(const std::vector<ChatMessage> & messages);
     // Generate text from messages (non-streaming)
-    GenerationResult generate(const std::vector<ChatMessage>& messages, const GenerationParams& params);
+    GenerationResult generate(const std::vector<ChatMessage> & messages, const GenerationParams & params);
     // Generate text from messages (streaming)
-    GenerationResult generate_streaming(
-        const std::vector<ChatMessage>& messages,
-        const GenerationParams& params,
-        TokenCallback callback
-    );
+    GenerationResult generate_streaming(const std::vector<ChatMessage> & messages,
+                                        const GenerationParams &         params,
+                                        TokenCallback                    callback);
     // Generate embeddings for multiple texts
-    EmbeddingResult embed(const std::vector<std::string>& texts);
+    EmbeddingResult embed(const std::vector<std::string> & texts);
-private:
-    llama_model* model_ = nullptr;
-    llama_context* ctx_ = nullptr;
-    llama_sampler* sampler_ = nullptr;
-    std::string model_path_;
-    std::string chat_template_;
+    // Tokenize a string (public for external access)
+    std::vector<int32_t> tokenize(const std::string & text, bool add_bos);
-    // Tokenize a string
-    std::vector<int32_t> tokenize(const std::string& text, bool add_bos);
+  private:
+    llama_model *   model_   = nullptr;
+    llama_context * ctx_     = nullptr;
+    llama_sampler * sampler_ = nullptr;
+    std::string     model_path_;
+    std::string     chat_template_;
     // Normalize an embedding vector (L2 normalization)
-    static void normalize_embedding(float* embedding, int n_embd);
+    static void normalize_embedding(float * embedding, int n_embd);
+    // Process a single chunk of tokens and return its embedding (unnormalized)
+    std::vector<float> embed_chunk(const std::vector<int32_t> & tokens, int seq_id, int n_embd, int pooling_type);
     // Detokenize a single token
     std::string detokenize(int32_t token);
     // Create sampler with given params
-    void create_sampler(const GenerationParams& params);
+    void create_sampler(const GenerationParams & params);
     // Check if token is end-of-sequence
     bool is_eos_token(int32_t token);
 };
-} // namespace llama_wrapper
-#endif // LLAMA_WRAPPER_H
+}  // namespace llama_wrapper
+#endif  // LLAMA_WRAPPER_H

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@shipworthy/ai-sdk-llama-cpp",
-  "version": "0.2.2",
+  "version": "0.2.4",
   "description": "A minimal llama.cpp provider for the Vercel AI SDK implementing LanguageModelV3 and EmbeddingModelV3",
   "type": "module",
   "main": "./dist/index.js",