@shipworthy/ai-sdk-llama-cpp 0.2.2 → 0.2.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
@@ -1,10 +1,10 @@
1
1
  #ifndef LLAMA_WRAPPER_H
2
2
  #define LLAMA_WRAPPER_H
3
3
 
4
- #include <string>
5
- #include <vector>
6
4
  #include <functional>
7
5
  #include <memory>
6
+ #include <string>
7
+ #include <vector>
8
8
 
9
9
  // Forward declarations for llama.cpp types
10
10
  struct llama_model;
@@ -15,10 +15,10 @@ namespace llama_wrapper {
15
15
 
16
16
  struct ModelParams {
17
17
  std::string model_path;
18
- int n_gpu_layers = 99; // Use GPU by default if available
19
- bool use_mmap = true;
20
- bool use_mlock = false;
21
- bool debug = false; // Show verbose llama.cpp output
18
+ int n_gpu_layers = 99; // Use GPU by default if available
19
+ bool use_mmap = true;
20
+ bool use_mlock = false;
21
+ bool debug = false; // Show verbose llama.cpp output
22
22
  std::string chat_template = "auto"; // "auto" uses template from model, or specify a built-in template
23
23
  };
24
24
 
@@ -28,51 +28,51 @@ struct ChatMessage {
28
28
  };
29
29
 
30
30
  struct ContextParams {
31
- int n_ctx = 2048; // Context size
32
- int n_batch = 512; // Batch size for prompt processing
33
- int n_threads = 4; // Number of threads
34
- bool embedding = false; // Enable embedding mode with mean pooling
31
+ int n_ctx = 0; // Context size
32
+ int n_batch = 0; // Batch size for prompt processing
33
+ int n_threads = 4; // Number of threads
34
+ bool embedding = false; // Enable embedding mode with mean pooling
35
35
  };
36
36
 
37
37
  struct GenerationParams {
38
- int max_tokens = 256;
39
- float temperature = 0.7f;
40
- float top_p = 0.9f;
41
- int top_k = 40;
42
- float repeat_penalty = 1.1f;
38
+ int max_tokens = 256;
39
+ float temperature = 0.7f;
40
+ float top_p = 0.9f;
41
+ int top_k = 40;
42
+ float repeat_penalty = 1.1f;
43
43
  std::vector<std::string> stop_sequences;
44
44
  };
45
45
 
46
46
  struct GenerationResult {
47
47
  std::string text;
48
- int prompt_tokens;
49
- int completion_tokens;
48
+ int prompt_tokens;
49
+ int completion_tokens;
50
50
  std::string finish_reason; // "stop", "length", or "error"
51
51
  };
52
52
 
53
53
  struct EmbeddingResult {
54
54
  std::vector<std::vector<float>> embeddings; // One embedding vector per input text
55
- int total_tokens;
55
+ int total_tokens;
56
56
  };
57
57
 
58
58
  // Token callback for streaming: returns false to stop generation
59
- using TokenCallback = std::function<bool(const std::string& token)>;
59
+ using TokenCallback = std::function<bool(const std::string & token)>;
60
60
 
61
61
  class LlamaModel {
62
- public:
62
+ public:
63
63
  LlamaModel();
64
64
  ~LlamaModel();
65
65
 
66
66
  // Disable copy
67
- LlamaModel(const LlamaModel&) = delete;
68
- LlamaModel& operator=(const LlamaModel&) = delete;
67
+ LlamaModel(const LlamaModel &) = delete;
68
+ LlamaModel & operator=(const LlamaModel &) = delete;
69
69
 
70
70
  // Enable move
71
- LlamaModel(LlamaModel&& other) noexcept;
72
- LlamaModel& operator=(LlamaModel&& other) noexcept;
71
+ LlamaModel(LlamaModel && other) noexcept;
72
+ LlamaModel & operator=(LlamaModel && other) noexcept;
73
73
 
74
74
  // Load a model from a GGUF file
75
- bool load(const ModelParams& params);
75
+ bool load(const ModelParams & params);
76
76
 
77
77
  // Check if model is loaded
78
78
  bool is_loaded() const;
@@ -81,51 +81,51 @@ public:
81
81
  void unload();
82
82
 
83
83
  // Get the model path
84
- const std::string& get_model_path() const { return model_path_; }
84
+ const std::string & get_model_path() const { return model_path_; }
85
85
 
86
86
  // Create a context for inference (or embeddings if params.embedding is true)
87
- bool create_context(const ContextParams& params);
87
+ bool create_context(const ContextParams & params);
88
88
 
89
89
  // Apply chat template to messages and return formatted prompt
90
- std::string apply_chat_template(const std::vector<ChatMessage>& messages);
90
+ std::string apply_chat_template(const std::vector<ChatMessage> & messages);
91
91
 
92
92
  // Generate text from messages (non-streaming)
93
- GenerationResult generate(const std::vector<ChatMessage>& messages, const GenerationParams& params);
93
+ GenerationResult generate(const std::vector<ChatMessage> & messages, const GenerationParams & params);
94
94
 
95
95
  // Generate text from messages (streaming)
96
- GenerationResult generate_streaming(
97
- const std::vector<ChatMessage>& messages,
98
- const GenerationParams& params,
99
- TokenCallback callback
100
- );
96
+ GenerationResult generate_streaming(const std::vector<ChatMessage> & messages,
97
+ const GenerationParams & params,
98
+ TokenCallback callback);
101
99
 
102
100
  // Generate embeddings for multiple texts
103
- EmbeddingResult embed(const std::vector<std::string>& texts);
101
+ EmbeddingResult embed(const std::vector<std::string> & texts);
104
102
 
105
- private:
106
- llama_model* model_ = nullptr;
107
- llama_context* ctx_ = nullptr;
108
- llama_sampler* sampler_ = nullptr;
109
- std::string model_path_;
110
- std::string chat_template_;
103
+ // Tokenize a string (public for external access)
104
+ std::vector<int32_t> tokenize(const std::string & text, bool add_bos);
111
105
 
112
- // Tokenize a string
113
- std::vector<int32_t> tokenize(const std::string& text, bool add_bos);
106
+ private:
107
+ llama_model * model_ = nullptr;
108
+ llama_context * ctx_ = nullptr;
109
+ llama_sampler * sampler_ = nullptr;
110
+ std::string model_path_;
111
+ std::string chat_template_;
114
112
 
115
113
  // Normalize an embedding vector (L2 normalization)
116
- static void normalize_embedding(float* embedding, int n_embd);
114
+ static void normalize_embedding(float * embedding, int n_embd);
115
+
116
+ // Process a single chunk of tokens and return its embedding (unnormalized)
117
+ std::vector<float> embed_chunk(const std::vector<int32_t> & tokens, int seq_id, int n_embd, int pooling_type);
117
118
 
118
119
  // Detokenize a single token
119
120
  std::string detokenize(int32_t token);
120
121
 
121
122
  // Create sampler with given params
122
- void create_sampler(const GenerationParams& params);
123
+ void create_sampler(const GenerationParams & params);
123
124
 
124
125
  // Check if token is end-of-sequence
125
126
  bool is_eos_token(int32_t token);
126
127
  };
127
128
 
128
- } // namespace llama_wrapper
129
-
130
- #endif // LLAMA_WRAPPER_H
129
+ } // namespace llama_wrapper
131
130
 
131
+ #endif // LLAMA_WRAPPER_H
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@shipworthy/ai-sdk-llama-cpp",
3
- "version": "0.2.2",
3
+ "version": "0.2.4",
4
4
  "description": "A minimal llama.cpp provider for the Vercel AI SDK implementing LanguageModelV3 and EmbeddingModelV3",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",