cui-llama.rn 1.3.4 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. package/android/src/main/CMakeLists.txt +14 -8
  2. package/android/src/main/jni.cpp +38 -37
  3. package/cpp/common.cpp +50 -30
  4. package/cpp/common.h +32 -13
  5. package/cpp/ggml-alloc.c +0 -1
  6. package/cpp/ggml-backend-reg.cpp +79 -49
  7. package/cpp/ggml-backend.cpp +5 -2
  8. package/cpp/ggml-cpp.h +1 -0
  9. package/cpp/ggml-cpu-aarch64.cpp +57 -72
  10. package/cpp/ggml-cpu-quants.c +5 -1
  11. package/cpp/ggml-cpu.c +6 -6
  12. package/cpp/ggml-cpu.cpp +9 -0
  13. package/cpp/ggml-impl.h +11 -0
  14. package/cpp/ggml-metal.m +2 -2
  15. package/cpp/ggml.c +129 -1388
  16. package/cpp/ggml.h +29 -152
  17. package/cpp/gguf.cpp +1325 -0
  18. package/cpp/gguf.h +202 -0
  19. package/cpp/llama-adapter.cpp +346 -0
  20. package/cpp/llama-adapter.h +73 -0
  21. package/cpp/llama-arch.cpp +1434 -0
  22. package/cpp/llama-arch.h +395 -0
  23. package/cpp/llama-batch.cpp +368 -0
  24. package/cpp/llama-batch.h +88 -0
  25. package/cpp/llama-chat.cpp +567 -0
  26. package/cpp/llama-chat.h +51 -0
  27. package/cpp/llama-context.cpp +1771 -0
  28. package/cpp/llama-context.h +128 -0
  29. package/cpp/llama-cparams.cpp +1 -0
  30. package/cpp/llama-cparams.h +37 -0
  31. package/cpp/llama-cpp.h +30 -0
  32. package/cpp/llama-grammar.cpp +16 -15
  33. package/cpp/llama-grammar.h +5 -6
  34. package/cpp/llama-hparams.cpp +71 -0
  35. package/cpp/llama-hparams.h +140 -0
  36. package/cpp/llama-impl.cpp +167 -0
  37. package/cpp/llama-impl.h +16 -136
  38. package/cpp/llama-kv-cache.cpp +718 -0
  39. package/cpp/llama-kv-cache.h +218 -0
  40. package/cpp/llama-mmap.cpp +589 -0
  41. package/cpp/llama-mmap.h +67 -0
  42. package/cpp/llama-model-loader.cpp +1011 -0
  43. package/cpp/llama-model-loader.h +158 -0
  44. package/cpp/llama-model.cpp +2202 -0
  45. package/cpp/llama-model.h +391 -0
  46. package/cpp/llama-sampling.cpp +117 -4
  47. package/cpp/llama-vocab.cpp +26 -29
  48. package/cpp/llama-vocab.h +14 -2
  49. package/cpp/llama.cpp +8839 -19131
  50. package/cpp/llama.cpp.rej +23 -0
  51. package/cpp/llama.h +31 -9
  52. package/cpp/rn-llama.hpp +39 -37
  53. package/cpp/sgemm.cpp +1091 -378
  54. package/cpp/sgemm.h +2 -2
  55. package/cpp/unicode.cpp +6 -0
  56. package/package.json +1 -1
package/cpp/gguf.h ADDED
@@ -0,0 +1,202 @@
+ // This file contains functionality related to "GGUF" files, the binary file format used by ggml.
+ // GGUF files have the following structure:
+ //
+ // 1. File magic "GGUF" (4 bytes).
+ // 2. File version (uint32_t).
+ // 3. Number of ggml tensors in file (int64_t).
+ // 4. Number of key-value-pairs in file (int64_t).
+ // 5. For each KV pair:
+ //    1. The key (string).
+ //    2. The value type (lm_gguf_type).
+ //    3a. If the value type is LM_GGUF_TYPE_ARRAY:
+ //       1. The type of the array (lm_gguf_type).
+ //       2. The number of elements in the array (uint64_t).
+ //       3. The binary representation of each element in the array.
+ //    3b. Otherwise:
+ //       1. The binary representation of the value.
+ // 6. For each ggml tensor:
+ //    1. The tensor name (string).
+ //    2. The number of dimensions of the tensor (uint32_t).
+ //    3. For each dimension:
+ //       1. The size of the tensor in the dimension (int64_t).
+ //    4. The tensor data type (lm_ggml_type).
+ //    5. The tensor data offset in the tensor data binary blob (uint64_t).
+ // 7. The tensor data binary blob (optional, aligned).
+ //
+ // Strings are serialized as the string length (uint64_t) followed by the C string without the null terminator.
+ // All enums are stored as int32_t.
+ // All bool values are stored as int8_t.
+ // If the special key "general.alignment" (uint32_t) is defined it is used for alignment,
+ // otherwise LM_GGUF_DEFAULT_ALIGNMENT is used.
+ //
+ // Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
+
+ #pragma once
+
+ #include "ggml.h"
+
+ #include <stdbool.h>
+ #include <stdint.h>
+
+ #define LM_GGUF_MAGIC "GGUF"
+ #define LM_GGUF_VERSION 3
+
+ #define LM_GGUF_KEY_GENERAL_ALIGNMENT "general.alignment"
+
+ #define LM_GGUF_DEFAULT_ALIGNMENT 32
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ // types that can be stored as GGUF KV data
+ enum lm_gguf_type {
+     LM_GGUF_TYPE_UINT8 = 0,
+     LM_GGUF_TYPE_INT8 = 1,
+     LM_GGUF_TYPE_UINT16 = 2,
+     LM_GGUF_TYPE_INT16 = 3,
+     LM_GGUF_TYPE_UINT32 = 4,
+     LM_GGUF_TYPE_INT32 = 5,
+     LM_GGUF_TYPE_FLOAT32 = 6,
+     LM_GGUF_TYPE_BOOL = 7,
+     LM_GGUF_TYPE_STRING = 8,
+     LM_GGUF_TYPE_ARRAY = 9,
+     LM_GGUF_TYPE_UINT64 = 10,
+     LM_GGUF_TYPE_INT64 = 11,
+     LM_GGUF_TYPE_FLOAT64 = 12,
+     LM_GGUF_TYPE_COUNT, // marks the end of the enum
+ };
+
+ struct lm_gguf_context;
+
+ struct lm_gguf_init_params {
+     bool no_alloc;
+
+     // if not NULL, create a lm_ggml_context and allocate the tensor data in it
+     struct lm_ggml_context ** ctx;
+ };
+
+ LM_GGML_API struct lm_gguf_context * lm_gguf_init_empty(void);
+ LM_GGML_API struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gguf_init_params params);
+ //LM_GGML_API struct lm_gguf_context * lm_gguf_init_from_buffer(..);
+
+ LM_GGML_API void lm_gguf_free(struct lm_gguf_context * ctx);
+
+ LM_GGML_API const char * lm_gguf_type_name(enum lm_gguf_type type);
+
+ LM_GGML_API uint32_t lm_gguf_get_version (const struct lm_gguf_context * ctx);
+ LM_GGML_API size_t lm_gguf_get_alignment (const struct lm_gguf_context * ctx);
+ LM_GGML_API size_t lm_gguf_get_data_offset(const struct lm_gguf_context * ctx);
+
+ LM_GGML_API int64_t lm_gguf_get_n_kv(const struct lm_gguf_context * ctx);
+ LM_GGML_API int64_t lm_gguf_find_key(const struct lm_gguf_context * ctx, const char * key); // returns -1 if key is not found
+ LM_GGML_API const char * lm_gguf_get_key (const struct lm_gguf_context * ctx, int64_t key_id);
+
+ LM_GGML_API enum lm_gguf_type lm_gguf_get_kv_type (const struct lm_gguf_context * ctx, int64_t key_id);
+ LM_GGML_API enum lm_gguf_type lm_gguf_get_arr_type(const struct lm_gguf_context * ctx, int64_t key_id);
+
+ // will abort if the wrong type is used for the key
+ LM_GGML_API uint8_t lm_gguf_get_val_u8 (const struct lm_gguf_context * ctx, int64_t key_id);
+ LM_GGML_API int8_t lm_gguf_get_val_i8 (const struct lm_gguf_context * ctx, int64_t key_id);
+ LM_GGML_API uint16_t lm_gguf_get_val_u16 (const struct lm_gguf_context * ctx, int64_t key_id);
+ LM_GGML_API int16_t lm_gguf_get_val_i16 (const struct lm_gguf_context * ctx, int64_t key_id);
+ LM_GGML_API uint32_t lm_gguf_get_val_u32 (const struct lm_gguf_context * ctx, int64_t key_id);
+ LM_GGML_API int32_t lm_gguf_get_val_i32 (const struct lm_gguf_context * ctx, int64_t key_id);
+ LM_GGML_API float lm_gguf_get_val_f32 (const struct lm_gguf_context * ctx, int64_t key_id);
+ LM_GGML_API uint64_t lm_gguf_get_val_u64 (const struct lm_gguf_context * ctx, int64_t key_id);
+ LM_GGML_API int64_t lm_gguf_get_val_i64 (const struct lm_gguf_context * ctx, int64_t key_id);
+ LM_GGML_API double lm_gguf_get_val_f64 (const struct lm_gguf_context * ctx, int64_t key_id);
+ LM_GGML_API bool lm_gguf_get_val_bool(const struct lm_gguf_context * ctx, int64_t key_id);
+ LM_GGML_API const char * lm_gguf_get_val_str (const struct lm_gguf_context * ctx, int64_t key_id);
+ LM_GGML_API const void * lm_gguf_get_val_data(const struct lm_gguf_context * ctx, int64_t key_id);
+ LM_GGML_API size_t lm_gguf_get_arr_n (const struct lm_gguf_context * ctx, int64_t key_id);
+
+ // get raw pointer to the first element of the array with the given key_id
+ // for bool arrays, note that they are always stored as int8 on all platforms (usually this makes no difference)
+ LM_GGML_API const void * lm_gguf_get_arr_data(const struct lm_gguf_context * ctx, int64_t key_id);
+
+ // get ith C string from array with given key_id
+ LM_GGML_API const char * lm_gguf_get_arr_str (const struct lm_gguf_context * ctx, int64_t key_id, size_t i);
+
+ LM_GGML_API int64_t lm_gguf_get_n_tensors (const struct lm_gguf_context * ctx);
+ LM_GGML_API int64_t lm_gguf_find_tensor (const struct lm_gguf_context * ctx, const char * name); // returns -1 if the tensor is not found
+ LM_GGML_API size_t lm_gguf_get_tensor_offset(const struct lm_gguf_context * ctx, int64_t tensor_id);
+ LM_GGML_API const char * lm_gguf_get_tensor_name (const struct lm_gguf_context * ctx, int64_t tensor_id);
+ LM_GGML_API enum lm_ggml_type lm_gguf_get_tensor_type (const struct lm_gguf_context * ctx, int64_t tensor_id);
+ LM_GGML_API size_t lm_gguf_get_tensor_size (const struct lm_gguf_context * ctx, int64_t tensor_id);
+
+ // removes key if it exists, returns id that the key had prior to removal (-1 if it didn't exist)
+ LM_GGML_API int64_t lm_gguf_remove_key(struct lm_gguf_context * ctx, const char * key);
+
+ // overrides an existing KV pair or adds a new one, the new KV pair is always at the back
+ LM_GGML_API void lm_gguf_set_val_u8 (struct lm_gguf_context * ctx, const char * key, uint8_t val);
+ LM_GGML_API void lm_gguf_set_val_i8 (struct lm_gguf_context * ctx, const char * key, int8_t val);
+ LM_GGML_API void lm_gguf_set_val_u16 (struct lm_gguf_context * ctx, const char * key, uint16_t val);
+ LM_GGML_API void lm_gguf_set_val_i16 (struct lm_gguf_context * ctx, const char * key, int16_t val);
+ LM_GGML_API void lm_gguf_set_val_u32 (struct lm_gguf_context * ctx, const char * key, uint32_t val);
+ LM_GGML_API void lm_gguf_set_val_i32 (struct lm_gguf_context * ctx, const char * key, int32_t val);
+ LM_GGML_API void lm_gguf_set_val_f32 (struct lm_gguf_context * ctx, const char * key, float val);
+ LM_GGML_API void lm_gguf_set_val_u64 (struct lm_gguf_context * ctx, const char * key, uint64_t val);
+ LM_GGML_API void lm_gguf_set_val_i64 (struct lm_gguf_context * ctx, const char * key, int64_t val);
+ LM_GGML_API void lm_gguf_set_val_f64 (struct lm_gguf_context * ctx, const char * key, double val);
+ LM_GGML_API void lm_gguf_set_val_bool(struct lm_gguf_context * ctx, const char * key, bool val);
+ LM_GGML_API void lm_gguf_set_val_str (struct lm_gguf_context * ctx, const char * key, const char * val);
+
+ // creates a new array with n elements of the given type and copies the corresponding number of bytes from data
+ LM_GGML_API void lm_gguf_set_arr_data(struct lm_gguf_context * ctx, const char * key, enum lm_gguf_type type, const void * data, size_t n);
+
+ // creates a new array with n strings and copies the corresponding strings from data
+ LM_GGML_API void lm_gguf_set_arr_str (struct lm_gguf_context * ctx, const char * key, const char ** data, size_t n);
+
+ // set or add KV pairs from another context
+ LM_GGML_API void lm_gguf_set_kv(struct lm_gguf_context * ctx, const struct lm_gguf_context * src);
+
+ // add tensor to GGUF context, tensor name must be unique
+ LM_GGML_API void lm_gguf_add_tensor(struct lm_gguf_context * ctx, const struct lm_ggml_tensor * tensor);
+
+ // after changing a tensor's type, the offsets of all tensors with higher indices are immediately recalculated
+ // in such a way that the tensor data remains as one contiguous block (except for padding)
+ LM_GGML_API void lm_gguf_set_tensor_type(struct lm_gguf_context * ctx, const char * name, enum lm_ggml_type type);
+
+ // assumes that at least lm_gguf_get_tensor_size bytes can be read from data
+ LM_GGML_API void lm_gguf_set_tensor_data(struct lm_gguf_context * ctx, const char * name, const void * data);
+
+ // writing gguf files can be done in 3 ways:
+ //
+ // - write the entire lm_gguf_context to a binary file in a single pass:
+ //
+ //   lm_gguf_write_to_file(ctx, fname, /*only_meta =*/ false);
+ //
+ // - write only the meta data to a file, then re-open the file and append the tensor data:
+ //
+ //   lm_gguf_write_to_file(ctx, fname, /*only_meta =*/ true);
+ //   FILE * f = fopen(fname, "ab");
+ //   fwrite(f, ...); // write tensor data
+ //   fclose(f);
+ //
+ // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+ //
+ //   FILE * f = fopen(fname, "wb");
+ //   const size_t size_meta = lm_gguf_get_meta_size(ctx);
+ //   fseek(f, size_meta, SEEK_SET);
+ //   fwrite(f, ...); // write tensor data
+ //   void * data = malloc(size_meta);
+ //   lm_gguf_get_meta_data(ctx, data);
+ //   rewind(f);
+ //   fwrite(data, 1, data, f);
+ //   free(data);
+ //   fclose(f);
+ //
+
+ // write the entire context to a binary file
+ LM_GGML_API bool lm_gguf_write_to_file(const struct lm_gguf_context * ctx, const char * fname, bool only_meta);
+
+ // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+ LM_GGML_API size_t lm_gguf_get_meta_size(const struct lm_gguf_context * ctx);
+
+ // writes the meta data to pointer "data"
+ LM_GGML_API void lm_gguf_get_meta_data(const struct lm_gguf_context * ctx, void * data);
+
+ #ifdef __cplusplus
+ }
+ #endif
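
Editor's note: as a rough illustration of the read-side API declared above, the metadata of a GGUF file could be inspected as sketched below. This snippet is not part of the package diff; the helper name dump_gguf_info and the choice of the "general.architecture" key are examples only.

    #include "gguf.h"
    #include <stdio.h>

    // Sketch: open a GGUF file, print a few metadata fields, and free the context.
    static void dump_gguf_info(const char * fname) {
        struct lm_gguf_init_params params = {
            /*.no_alloc =*/ true,  // metadata only, do not allocate tensor data
            /*.ctx      =*/ NULL,  // no lm_ggml_context needed for inspection
        };

        struct lm_gguf_context * ctx = lm_gguf_init_from_file(fname, params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to read %s\n", fname);
            return;
        }

        printf("version  : %u\n",   lm_gguf_get_version(ctx));
        printf("tensors  : %lld\n", (long long) lm_gguf_get_n_tensors(ctx));
        printf("kv pairs : %lld\n", (long long) lm_gguf_get_n_kv(ctx));

        // lm_gguf_find_key returns -1 when the key is absent
        const int64_t key_id = lm_gguf_find_key(ctx, "general.architecture");
        if (key_id >= 0 && lm_gguf_get_kv_type(ctx, key_id) == LM_GGUF_TYPE_STRING) {
            printf("arch     : %s\n", lm_gguf_get_val_str(ctx, key_id));
        }

        lm_gguf_free(ctx);
    }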
package/cpp/llama-adapter.cpp ADDED
@@ -0,0 +1,346 @@
+ #include "llama-adapter.h"
+
+ #include "llama-model.h"
+
+ #include <algorithm>
+ #include <map>
+ #include <cassert>
+ #include <stdexcept>
+
+ // vec
+
+ struct lm_ggml_tensor * llama_control_vector::tensor_for(int il) const {
+     if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
+         return nullptr;
+     }
+
+     return tensors[il];
+ }
+
+ struct lm_ggml_tensor * llama_control_vector::apply_to(struct lm_ggml_context * ctx, struct lm_ggml_tensor * cur, int il) const {
+     lm_ggml_tensor * layer_dir = tensor_for(il);
+     if (layer_dir != nullptr) {
+         cur = lm_ggml_add(ctx, cur, layer_dir);
+     }
+
+     return cur;
+ }
+
+ static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+     const auto & hparams = model.hparams;
+
+     LM_GGML_ASSERT(cvec.tensors.empty());
+     LM_GGML_ASSERT(cvec.ctxs.empty());
+     LM_GGML_ASSERT(cvec.bufs.empty());
+
+     // create a context for each buffer type
+     std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
+     auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
+         auto it = ctx_map.find(buft);
+         if (it == ctx_map.end()) {
+             struct lm_ggml_init_params params = {
+                 /*.mem_size   =*/ hparams.n_layer*lm_ggml_tensor_overhead(),
+                 /*.mem_buffer =*/ NULL,
+                 /*.no_alloc   =*/ true,
+             };
+
+             lm_ggml_context * ctx = lm_ggml_init(params);
+             if (!ctx) {
+                 return nullptr;
+             }
+
+             ctx_map[buft] = ctx;
+             cvec.ctxs.emplace_back(ctx);
+
+             return ctx;
+         }
+
+         return it->second;
+     };
+
+     // make tensors
+     cvec.tensors.reserve(hparams.n_layer);
+     cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+     for (size_t il = 1; il < hparams.n_layer; il++) {
+         lm_ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
+         lm_ggml_context * ctx = ctx_for_buft(buft);
+         if (!ctx) {
+             LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
+             return false;
+         }
+         lm_ggml_tensor * tensor = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, hparams.n_embd);
+         cvec.tensors.push_back(tensor);
+     }
+
+     // allocate tensors / buffers and zero
+     cvec.bufs.reserve(ctx_map.size());
+     for (auto it : ctx_map) {
+         lm_ggml_backend_buffer_type_t buft = it.first;
+         lm_ggml_context * ctx = it.second;
+         lm_ggml_backend_buffer_t buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+         if (!buf) {
+             LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
+             return false;
+         }
+         lm_ggml_backend_buffer_clear(buf, 0);
+         cvec.bufs.emplace_back(buf);
+     }
+
+     return true;
+ }
+
+ int32_t llama_control_vector_apply(
+         struct llama_control_vector & cvec,
+         const llama_model & model,
+         const float * data,
+         size_t len,
+         int32_t n_embd,
+         int32_t il_start,
+         int32_t il_end) {
+     const auto & hparams = model.hparams;
+
+     if (data == nullptr) {
+         // disable the current control vector (but leave allocated for later)
+         cvec.layer_start = -1;
+         cvec.layer_end = -1;
+         return 0;
+     }
+
+     if (n_embd != (int) hparams.n_embd) {
+         LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
+         return 1;
+     }
+
+     if (cvec.tensors.empty()) {
+         if (!llama_control_vector_init(cvec, model)) {
+             return 1;
+         }
+     }
+
+     cvec.layer_start = il_start;
+     cvec.layer_end = il_end;
+
+     for (size_t il = 1; il < hparams.n_layer; il++) {
+         assert(cvec.tensors[il] != nullptr);
+
+         const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
+         if (off + n_embd <= len) {
+             lm_ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * lm_ggml_element_size(cvec.tensors[il]));
+         }
+     }
+
+     return 0;
+ }
+
+ // lora
+
+ llama_lora_weight * llama_lora_adapter::get_weight(struct lm_ggml_tensor * w) {
+     const std::string name(w->name);
+
+     const auto pos = ab_map.find(name);
+     if (pos != ab_map.end()) {
+         return &pos->second;
+     }
+
+     return nullptr;
+ }
+
+ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
+     delete adapter;
+ }
+
+ static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
+     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
+
+     lm_ggml_context * ctx_init;
+     struct lm_gguf_init_params meta_lm_gguf_params = {
+         /* .no_alloc = */ true,
+         /* .ctx      = */ &ctx_init,
+     };
+
+     lm_gguf_context_ptr ctx_gguf { lm_gguf_init_from_file(path_lora, meta_lm_gguf_params) };
+     if (!ctx_gguf) {
+         throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
+     }
+
+     lm_ggml_context_ptr ctx { ctx_init };
+
+     // check metadata
+     {
+         auto get_kv_str = [&](const std::string & key) -> std::string {
+             int id = lm_gguf_find_key(ctx_gguf.get(), key.c_str());
+             return id < 0 ? "" : std::string(lm_gguf_get_val_str(ctx_gguf.get(), id));
+         };
+         auto get_kv_f32 = [&](const std::string & key) -> float {
+             int id = lm_gguf_find_key(ctx_gguf.get(), key.c_str());
+             return id < 0 ? 0.0f : lm_gguf_get_val_f32(ctx_gguf.get(), id);
+         };
+         LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
+
+         auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
+         if (general_type != "adapter") {
+             throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
+         }
+
+         auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
+         auto general_arch = llm_arch_from_string(general_arch_str);
+         if (general_arch != model.arch) {
+             throw std::runtime_error("model arch and LoRA arch mismatch");
+         }
+
+         auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
+         if (adapter_type != "lora") {
+             throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
+         }
+
+         adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
+     }
+
+     int n_tensors = lm_gguf_get_n_tensors(ctx_gguf.get());
+
+     // contexts for each buffer type
+     std::map<lm_ggml_backend_buffer_type_t, lm_ggml_context *> ctx_map;
+     auto ctx_for_buft = [&](lm_ggml_backend_buffer_type_t buft) -> lm_ggml_context * {
+         auto it = ctx_map.find(buft);
+         if (it == ctx_map.end()) {
+             // add a new context
+             struct lm_ggml_init_params params = {
+                 /*.mem_size   =*/ n_tensors*lm_ggml_tensor_overhead(),
+                 /*.mem_buffer =*/ NULL,
+                 /*.no_alloc   =*/ true,
+             };
+             lm_ggml_context * buft_ctx = lm_ggml_init(params);
+             if (!buft_ctx) {
+                 return nullptr;
+             }
+             ctx_map[buft] = buft_ctx;
+             adapter.ctxs.emplace_back(buft_ctx);
+             return buft_ctx;
+         };
+         return it->second;
+     };
+
+     // bundle lora_a and lora_b into pairs
+     std::map<std::string, llama_lora_weight> ab_map;
+     auto str_endswith = [](const std::string & str, const std::string & suffix) {
+         return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+     };
+
+     for (lm_ggml_tensor * cur = lm_ggml_get_first_tensor(ctx.get()); cur; cur = lm_ggml_get_next_tensor(ctx.get(), cur)) {
+         std::string name(cur->name);
+         if (str_endswith(name, ".lora_a")) {
+             replace_all(name, ".lora_a", "");
+             if (ab_map.find(name) == ab_map.end()) {
+                 ab_map[name] = llama_lora_weight(cur, nullptr);
+             } else {
+                 ab_map[name].a = cur;
+             }
+         } else if (str_endswith(name, ".lora_b")) {
+             replace_all(name, ".lora_b", "");
+             if (ab_map.find(name) == ab_map.end()) {
+                 ab_map[name] = llama_lora_weight(nullptr, cur);
+             } else {
+                 ab_map[name].b = cur;
+             }
+         } else if (str_endswith(name, "_norm.weight")) {
+             // TODO: add support for norm vector
+             // for now, we don't really care because most adapters still work fine without it
+             continue;
+         } else {
+             throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
+         }
+     }
+
+     // add tensors
+     for (auto & it : ab_map) {
+         const std::string & name = it.first;
+         llama_lora_weight & w = it.second;
+         bool is_token_embd = str_endswith(name, "token_embd.weight");
+
+         if (!w.a || !w.b) {
+             throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
+         }
+
+         // device buft and device ctx
+         auto * model_tensor = llama_model_get_tensor(model, name.c_str());
+         if (!model_tensor) {
+             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
+         }
+
+         struct lm_ggml_context * dev_ctx = ctx_for_buft(lm_ggml_backend_buffer_get_type(model_tensor->buffer));
+         // validate tensor shape
+         if (is_token_embd) {
+             // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
+             if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
+                 throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+             }
+         } else {
+             if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+                 throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+             }
+             if (w.a->ne[1] != w.b->ne[0]) {
+                 throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+             }
+         }
+
+         // save tensor to adapter
+         struct lm_ggml_tensor * tensor_a = lm_ggml_dup_tensor(dev_ctx, w.a);
+         struct lm_ggml_tensor * tensor_b = lm_ggml_dup_tensor(dev_ctx, w.b);
+         lm_ggml_set_name(tensor_a, w.a->name);
+         lm_ggml_set_name(tensor_b, w.b->name);
+         adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
+     }
+
+     // allocate tensors / buffers and zero
+     {
+         adapter.ctxs.reserve(ctx_map.size());
+         adapter.bufs.reserve(ctx_map.size());
+         for (auto & it : ctx_map) {
+             lm_ggml_backend_buffer_type_t buft = it.first;
+             lm_ggml_context * ctx_dev = it.second;
+             lm_ggml_backend_buffer_ptr buf { lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
+             if (!buf) {
+                 throw std::runtime_error("failed to allocate buffer for lora adapter\n");
+             }
+             LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf.get()), lm_ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
+             adapter.bufs.emplace_back(std::move(buf));
+         }
+     }
+
+     // set tensor data
+     {
+         llama_file lm_gguf_file(path_lora, "rb");
+         std::vector<uint8_t> read_buf;
+         auto set_tensor = [&](struct lm_ggml_tensor * orig, struct lm_ggml_tensor * dev) {
+             size_t offs = lm_gguf_get_data_offset(ctx_gguf.get()) + lm_gguf_get_tensor_offset(ctx_gguf.get(), lm_gguf_find_tensor(ctx_gguf.get(), orig->name));
+             size_t size = lm_ggml_nbytes(orig);
+             read_buf.resize(size);
+             lm_gguf_file.seek(offs, SEEK_SET);
+             lm_gguf_file.read_raw(read_buf.data(), size);
+             lm_ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
+         };
+         for (auto & it : adapter.ab_map) {
+             auto orig = ab_map[it.first];
+             auto dev = it.second;
+             set_tensor(orig.a, dev.a);
+             set_tensor(orig.b, dev.b);
+         }
+     }
+
+     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
+ }
+
+ struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
+     struct llama_lora_adapter * adapter = new llama_lora_adapter();
+
+     try {
+         llama_lora_adapter_init_impl(*model, path_lora, *adapter);
+         return adapter;
+     } catch (const std::exception & err) {
+         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
+
+         delete adapter;
+     }
+
+     return nullptr;
+ }
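
Editor's note: the adapter loader above is exposed through llama_lora_adapter_init / llama_lora_adapter_free, which are assumed here to be declared in the public llama.h as in prior releases. A minimal calling-pattern sketch (not part of the diff; the helper name try_load_adapter is ours) for a model that has already been loaded:

    #include "llama.h"

    // Sketch: load a LoRA adapter for an already-loaded model.
    // llama_lora_adapter_init logs and returns nullptr on failure
    // (unreadable GGUF file, architecture mismatch, missing lora_a/lora_b pair, ...).
    static bool try_load_adapter(struct llama_model * model, const char * path_lora) {
        struct llama_lora_adapter * adapter = llama_lora_adapter_init(model, path_lora);
        if (adapter == nullptr) {
            return false;
        }

        // ... attach the adapter to a llama_context via the corresponding llama.h call,
        // run inference, then release it once it is no longer needed:
        llama_lora_adapter_free(adapter);
        return true;
    }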
package/cpp/llama-adapter.h ADDED
@@ -0,0 +1,73 @@
+ #pragma once
+
+ #include "llama-impl.h"
+ #include "llama-hparams.h"
+
+ #include "ggml-cpp.h"
+
+ #include <unordered_map>
+ #include <vector>
+
+ //
+ // llama_adapter_cvec
+ //
+
+ // TODO: rename to llama_adapter_cvec
+ struct llama_control_vector {
+     std::vector<lm_ggml_context_ptr> ctxs;
+     std::vector<lm_ggml_backend_buffer_ptr> bufs;
+
+     std::vector<struct lm_ggml_tensor *> tensors; // per layer
+
+     int32_t layer_start = -1;
+     int32_t layer_end = -1;
+
+     struct lm_ggml_tensor * tensor_for(int il) const;
+
+     struct lm_ggml_tensor * apply_to(struct lm_ggml_context * ctx, struct lm_ggml_tensor * cur, int il) const;
+ };
+
+ int32_t llama_control_vector_apply(
+         struct llama_control_vector & cvec,
+         const llama_model & model,
+         const float * data,
+         size_t len,
+         int32_t n_embd,
+         int32_t il_start,
+         int32_t il_end);
+
+ //
+ // llama_adapter_lora
+ //
+
+ // TODO: rename to llama_adapter_lora_weight
+ struct llama_lora_weight {
+     struct lm_ggml_tensor * a = nullptr;
+     struct lm_ggml_tensor * b = nullptr;
+
+     // get actual scale based on rank and alpha
+     float get_scale(float alpha, float adapter_scale) {
+         const float rank = (float) b->ne[0];
+         const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
+         return scale;
+     }
+
+     llama_lora_weight() = default;
+     llama_lora_weight(struct lm_ggml_tensor * a, struct lm_ggml_tensor * b) : a(a), b(b) {}
+ };
+
+ // TODO: rename to llama_adapter_lora
+ struct llama_lora_adapter {
+     // map tensor name to lora_a_b
+     std::unordered_map<std::string, struct llama_lora_weight> ab_map;
+
+     std::vector<lm_ggml_context_ptr> ctxs;
+     std::vector<lm_ggml_backend_buffer_ptr> bufs;
+
+     float alpha;
+
+     llama_lora_adapter() = default;
+     ~llama_lora_adapter() = default;
+
+     llama_lora_weight * get_weight(struct lm_ggml_tensor * w);
+ };
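
Editor's note: llama_lora_weight::get_scale above applies the usual LoRA scaling scale = adapter_scale * alpha / rank, with the rank taken from lora_b (b->ne[0]). A worked example with made-up numbers:

    // alpha = 16 (read from the adapter's GGUF metadata), rank = 8 (b->ne[0]),
    // user-requested adapter_scale = 1.0f:
    //     scale = 1.0f * 16.0f / 8.0f = 2.0f
    // when the adapter stores no alpha (alpha == 0), adapter_scale is used unchanged.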