@shipworthy/ai-sdk-llama-cpp 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/llama-cpp-embedding-model.d.ts +7 -0
- package/dist/llama-cpp-embedding-model.d.ts.map +1 -1
- package/dist/llama-cpp-embedding-model.js +12 -2
- package/dist/llama-cpp-embedding-model.js.map +1 -1
- package/dist/llama-cpp-language-model.d.ts +7 -0
- package/dist/llama-cpp-language-model.d.ts.map +1 -1
- package/dist/llama-cpp-language-model.js +12 -2
- package/dist/llama-cpp-language-model.js.map +1 -1
- package/dist/native-binding.d.ts +5 -0
- package/dist/native-binding.d.ts.map +1 -1
- package/dist/native-binding.js +3 -0
- package/dist/native-binding.js.map +1 -1
- package/native/CMakeLists.txt +2 -2
- package/native/binding.cpp +187 -183
- package/native/llama-wrapper.cpp +185 -124
- package/native/llama-wrapper.h +48 -48
- package/package.json +1 -1
package/native/llama-wrapper.h
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
#ifndef LLAMA_WRAPPER_H
|
|
2
2
|
#define LLAMA_WRAPPER_H
|
|
3
3
|
|
|
4
|
-
#include <string>
|
|
5
|
-
#include <vector>
|
|
6
4
|
#include <functional>
|
|
7
5
|
#include <memory>
|
|
6
|
+
#include <string>
|
|
7
|
+
#include <vector>
|
|
8
8
|
|
|
9
9
|
// Forward declarations for llama.cpp types
|
|
10
10
|
struct llama_model;
|
|
@@ -15,10 +15,10 @@ namespace llama_wrapper {
|
|
|
15
15
|
|
|
16
16
|
struct ModelParams {
|
|
17
17
|
std::string model_path;
|
|
18
|
-
int
|
|
19
|
-
bool
|
|
20
|
-
bool
|
|
21
|
-
bool
|
|
18
|
+
int n_gpu_layers = 99; // Use GPU by default if available
|
|
19
|
+
bool use_mmap = true;
|
|
20
|
+
bool use_mlock = false;
|
|
21
|
+
bool debug = false; // Show verbose llama.cpp output
|
|
22
22
|
std::string chat_template = "auto"; // "auto" uses template from model, or specify a built-in template
|
|
23
23
|
};
|
|
24
24
|
|
|
@@ -28,51 +28,51 @@ struct ChatMessage {
|
|
|
28
28
|
};
|
|
29
29
|
|
|
30
30
|
struct ContextParams {
|
|
31
|
-
int
|
|
32
|
-
int
|
|
33
|
-
int
|
|
34
|
-
bool embedding = false;
|
|
31
|
+
int n_ctx = 0; // Context size
|
|
32
|
+
int n_batch = 0; // Batch size for prompt processing
|
|
33
|
+
int n_threads = 4; // Number of threads
|
|
34
|
+
bool embedding = false; // Enable embedding mode with mean pooling
|
|
35
35
|
};
|
|
36
36
|
|
|
37
37
|
struct GenerationParams {
|
|
38
|
-
int
|
|
39
|
-
float
|
|
40
|
-
float
|
|
41
|
-
int
|
|
42
|
-
float
|
|
38
|
+
int max_tokens = 256;
|
|
39
|
+
float temperature = 0.7f;
|
|
40
|
+
float top_p = 0.9f;
|
|
41
|
+
int top_k = 40;
|
|
42
|
+
float repeat_penalty = 1.1f;
|
|
43
43
|
std::vector<std::string> stop_sequences;
|
|
44
44
|
};
|
|
45
45
|
|
|
46
46
|
struct GenerationResult {
|
|
47
47
|
std::string text;
|
|
48
|
-
int
|
|
49
|
-
int
|
|
48
|
+
int prompt_tokens;
|
|
49
|
+
int completion_tokens;
|
|
50
50
|
std::string finish_reason; // "stop", "length", or "error"
|
|
51
51
|
};
|
|
52
52
|
|
|
53
53
|
struct EmbeddingResult {
|
|
54
54
|
std::vector<std::vector<float>> embeddings; // One embedding vector per input text
|
|
55
|
-
int
|
|
55
|
+
int total_tokens;
|
|
56
56
|
};
|
|
57
57
|
|
|
58
58
|
// Token callback for streaming: returns false to stop generation
|
|
59
|
-
using TokenCallback = std::function<bool(const std::string& token)>;
|
|
59
|
+
using TokenCallback = std::function<bool(const std::string & token)>;
|
|
60
60
|
|
|
61
61
|
class LlamaModel {
|
|
62
|
-
public:
|
|
62
|
+
public:
|
|
63
63
|
LlamaModel();
|
|
64
64
|
~LlamaModel();
|
|
65
65
|
|
|
66
66
|
// Disable copy
|
|
67
|
-
LlamaModel(const LlamaModel&) = delete;
|
|
68
|
-
LlamaModel& operator=(const LlamaModel&) = delete;
|
|
67
|
+
LlamaModel(const LlamaModel &) = delete;
|
|
68
|
+
LlamaModel & operator=(const LlamaModel &) = delete;
|
|
69
69
|
|
|
70
70
|
// Enable move
|
|
71
|
-
LlamaModel(LlamaModel&& other) noexcept;
|
|
72
|
-
LlamaModel& operator=(LlamaModel&& other) noexcept;
|
|
71
|
+
LlamaModel(LlamaModel && other) noexcept;
|
|
72
|
+
LlamaModel & operator=(LlamaModel && other) noexcept;
|
|
73
73
|
|
|
74
74
|
// Load a model from a GGUF file
|
|
75
|
-
bool load(const ModelParams& params);
|
|
75
|
+
bool load(const ModelParams & params);
|
|
76
76
|
|
|
77
77
|
// Check if model is loaded
|
|
78
78
|
bool is_loaded() const;
|
|
@@ -81,51 +81,51 @@ public:
|
|
|
81
81
|
void unload();
|
|
82
82
|
|
|
83
83
|
// Get the model path
|
|
84
|
-
const std::string& get_model_path() const { return model_path_; }
|
|
84
|
+
const std::string & get_model_path() const { return model_path_; }
|
|
85
85
|
|
|
86
86
|
// Create a context for inference (or embeddings if params.embedding is true)
|
|
87
|
-
bool create_context(const ContextParams& params);
|
|
87
|
+
bool create_context(const ContextParams & params);
|
|
88
88
|
|
|
89
89
|
// Apply chat template to messages and return formatted prompt
|
|
90
|
-
std::string apply_chat_template(const std::vector<ChatMessage>& messages);
|
|
90
|
+
std::string apply_chat_template(const std::vector<ChatMessage> & messages);
|
|
91
91
|
|
|
92
92
|
// Generate text from messages (non-streaming)
|
|
93
|
-
GenerationResult generate(const std::vector<ChatMessage>& messages, const GenerationParams& params);
|
|
93
|
+
GenerationResult generate(const std::vector<ChatMessage> & messages, const GenerationParams & params);
|
|
94
94
|
|
|
95
95
|
// Generate text from messages (streaming)
|
|
96
|
-
GenerationResult generate_streaming(
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
TokenCallback callback
|
|
100
|
-
);
|
|
96
|
+
GenerationResult generate_streaming(const std::vector<ChatMessage> & messages,
|
|
97
|
+
const GenerationParams & params,
|
|
98
|
+
TokenCallback callback);
|
|
101
99
|
|
|
102
100
|
// Generate embeddings for multiple texts
|
|
103
|
-
EmbeddingResult embed(const std::vector<std::string>& texts);
|
|
101
|
+
EmbeddingResult embed(const std::vector<std::string> & texts);
|
|
104
102
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
llama_context* ctx_ = nullptr;
|
|
108
|
-
llama_sampler* sampler_ = nullptr;
|
|
109
|
-
std::string model_path_;
|
|
110
|
-
std::string chat_template_;
|
|
103
|
+
// Tokenize a string (public for external access)
|
|
104
|
+
std::vector<int32_t> tokenize(const std::string & text, bool add_bos);
|
|
111
105
|
|
|
112
|
-
|
|
113
|
-
|
|
106
|
+
private:
|
|
107
|
+
llama_model * model_ = nullptr;
|
|
108
|
+
llama_context * ctx_ = nullptr;
|
|
109
|
+
llama_sampler * sampler_ = nullptr;
|
|
110
|
+
std::string model_path_;
|
|
111
|
+
std::string chat_template_;
|
|
114
112
|
|
|
115
113
|
// Normalize an embedding vector (L2 normalization)
|
|
116
|
-
static void normalize_embedding(float* embedding, int n_embd);
|
|
114
|
+
static void normalize_embedding(float * embedding, int n_embd);
|
|
115
|
+
|
|
116
|
+
// Process a single chunk of tokens and return its embedding (unnormalized)
|
|
117
|
+
std::vector<float> embed_chunk(const std::vector<int32_t> & tokens, int seq_id, int n_embd, int pooling_type);
|
|
117
118
|
|
|
118
119
|
// Detokenize a single token
|
|
119
120
|
std::string detokenize(int32_t token);
|
|
120
121
|
|
|
121
122
|
// Create sampler with given params
|
|
122
|
-
void create_sampler(const GenerationParams& params);
|
|
123
|
+
void create_sampler(const GenerationParams & params);
|
|
123
124
|
|
|
124
125
|
// Check if token is end-of-sequence
|
|
125
126
|
bool is_eos_token(int32_t token);
|
|
126
127
|
};
|
|
127
128
|
|
|
128
|
-
}
|
|
129
|
-
|
|
130
|
-
#endif // LLAMA_WRAPPER_H
|
|
129
|
+
} // namespace llama_wrapper
|
|
131
130
|
|
|
131
|
+
#endif // LLAMA_WRAPPER_H
|
package/package.json
CHANGED