llama_cpp 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +121 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +451 -101
- data/ext/llama_cpp/src/ggml-cuda.h +0 -4
- data/ext/llama_cpp/src/ggml-metal.m +3 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +11 -7
- data/ext/llama_cpp/src/ggml.c +690 -1512
- data/ext/llama_cpp/src/ggml.h +88 -62
- data/ext/llama_cpp/src/llama.cpp +103 -39
- data/ext/llama_cpp/src/llama.h +15 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +15 -12
- data/sig/llama_cpp.rbs +19 -1
- metadata +3 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -201,6 +201,8 @@
 #define GGML_MAX_NAME 48
 #define GGML_DEFAULT_N_THREADS 4
 
+#define GGML_UNUSED(x) (void)(x)
+
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
@@ -209,6 +211,30 @@
         } \
     } while (0)
 
+// used to copy the number of elements and stride in bytes of tensors into local variables.
+// main purpose is to reduce code duplication and improve readability.
+//
+// example:
+//
+//    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+//    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb);
+//
+#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
+    const type prefix##0 = (pointer)->array[0]; \
+    GGML_UNUSED(prefix##0);
+#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
+    const type prefix##1 = (pointer)->array[1]; \
+    GGML_UNUSED(prefix##1);
+#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
+    const type prefix##2 = (pointer)->array[2]; \
+    GGML_UNUSED(prefix##2);
+#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
+    const type prefix##3 = (pointer)->array[3]; \
+    GGML_UNUSED(prefix##3);
+
 #ifdef __cplusplus
 extern "C" {
 #endif
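
Note: the GGML_TENSOR_LOCALS macros above expand to four const locals per invocation (e.g. ne10..ne13), each wrapped in GGML_UNUSED so unused dimensions do not trigger warnings. A minimal sketch of how a caller can use them; the helper name print_tensor_layout is illustrative and not part of ggml:

    #include <stdint.h>
    #include <stdio.h>
    #include "ggml.h"

    // Prints the shape and the first-dimension stride of a tensor using the
    // locals generated by GGML_TENSOR_LOCALS (ne00..ne03 and nb00..nb03).
    static void print_tensor_layout(const struct ggml_tensor * src0) {
        GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); // ne00, ne01, ne02, ne03
        GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb); // nb00, nb01, nb02, nb03

        printf("shape: %lld x %lld x %lld x %lld, nb0: %zu bytes\n",
               (long long) ne00, (long long) ne01, (long long) ne02, (long long) ne03, nb00);
    }
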
@@ -224,8 +250,8 @@ extern "C" {
     GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
     GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
 
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y,
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y,
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
 
     struct ggml_object;
     struct ggml_context;
@@ -295,12 +321,15 @@
         GGML_OP_SUM,
         GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
+        GGML_OP_ARGMAX,
         GGML_OP_REPEAT,
         GGML_OP_REPEAT_BACK,
         GGML_OP_ABS,
         GGML_OP_SGN,
         GGML_OP_NEG,
        GGML_OP_STEP,
+        GGML_OP_TANH,
+        GGML_OP_ELU,
         GGML_OP_RELU,
         GGML_OP_GELU,
         GGML_OP_GELU_QUICK,
@@ -332,9 +361,8 @@
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
         GGML_OP_CLAMP,
-        GGML_OP_CONV_1D_S1_PH,
-        GGML_OP_CONV_1D_S2_PH,
-        GGML_OP_CONV_2D_SK_P0,
+        GGML_OP_CONV_1D,
+        GGML_OP_CONV_2D,
 
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
@@ -444,6 +472,9 @@
 
 
     // compute types
+
+    // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
+    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
     enum ggml_task_type {
         GGML_TASK_INIT = 0,
         GGML_TASK_COMPUTE,
@@ -687,6 +718,11 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // argmax along rows
+    GGML_API struct ggml_tensor * ggml_argmax(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // if a is the same shape as b, and a is not parameter, return a
     // otherwise, return a new tensor: repeat(a) to fit in b
     GGML_API struct ggml_tensor * ggml_repeat(
@@ -731,6 +767,22 @@
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_tanh(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_tanh_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_elu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_elu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     GGML_API struct ggml_tensor * ggml_relu(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
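
The new unary ops (ggml_argmax, ggml_tanh, ggml_elu and their _inplace variants) plug into the usual graph-building flow. A minimal sketch, assuming the ggml_graph_compute_with_ctx helper from the PR 1995 refactor is present in this bundled ggml; older trees would call the previous graph-compute entry point instead:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // small scratch context; no_alloc = false so tensor data lives in this buffer
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16 * 1024 * 1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        for (int i = 0; i < 4; ++i) {
            ggml_set_f32_1d(x, i, (float) i - 1.5f); // -1.5, -0.5, 0.5, 1.5
        }

        struct ggml_tensor * y = ggml_tanh(ctx, x); // element-wise tanh
        struct ggml_tensor * z = ggml_elu(ctx, x);  // element-wise ELU

        struct ggml_cgraph gf = ggml_build_forward(y);
        ggml_build_forward_expand(&gf, z);
        ggml_graph_compute_with_ctx(ctx, &gf, 1); // assumed helper, see note above

        for (int i = 0; i < 4; ++i) {
            printf("tanh = %8.4f  elu = %8.4f\n", ggml_get_f32_1d(y, i), ggml_get_f32_1d(z, i));
        }

        ggml_free(ctx);
        return 0;
    }
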
@@ -1081,58 +1133,33 @@
             float                 min,
             float                 max);
 
-
-    // GGML_API struct ggml_tensor * ggml_conv_1d(
-    //        struct ggml_context * ctx,
-    //        struct ggml_tensor  * a,
-    //        struct ggml_tensor  * b,
-    //        int                   s0
-    //        int                   p0,
-    //        int                   d0);
-    //
-    // GGML_API struct ggml_tensor * ggml_conv_2d(
-    //        struct ggml_context * ctx,
-    //        struct ggml_tensor  * a,
-    //        struct ggml_tensor  * b,
-    //        int                   s0,
-    //        int                   s1,
-    //        int                   p0,
-    //        int                   p1,
-    //        int                   d0,
-    //        int                   d1);
-
-    // padding = half
-    // TODO: we don't support extra parameters for now
-    //       that's why we are hard-coding the stride, padding, and dilation
-    //       not great ..
-    // example:
-    // a:      3   80  768    1
-    // b:   3000   80    1    1
-    // res: 3000  768    1    1
-    // used in whisper
-    GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
+    GGML_API struct ggml_tensor * ggml_conv_1d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b
+            struct ggml_tensor  * b,
+            int                   s0,  // stride
+            int                   p0,  // padding
+            int                   d0); // dilation
 
-
-    GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
+    GGML_API struct ggml_tensor * ggml_conv_2d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b
+            struct ggml_tensor  * b,
+            int                   s0,
+            int                   s1,
+            int                   p0,
+            int                   p1,
+            int                   d0,
+            int                   d1);
 
-    //
-    //
-
-    // example:
-    // a:     16   16    3  768
-    // b:   1024 1024    3    1
-    // res:   64   64  768    1
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+    // conv_1d with padding = half
+    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b
+            struct ggml_tensor  * b,
+            int                   s,
+            int                   d);
 
     GGML_API struct ggml_tensor * ggml_flash_attn(
             struct ggml_context * ctx,
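
With the generalized ggml_conv_1d/ggml_conv_2d, the stride, padding, and dilation that the removed _s1_ph/_s2_ph/_sk_p0 helpers hard-coded are now explicit arguments; the old ggml_conv_1d_s1_ph(ctx, a, b) corresponds to ggml_conv_1d(ctx, a, b, 1, a->ne[0]/2, 1), as the new ggml_conv_1d_ph comment states. A graph-construction-only sketch using the whisper-style shapes from the removed comment; build_conv1d_node is an illustrative helper, and the context passed in should be created with no_alloc = false and a few tens of MB of memory:

    #include "ggml.h"

    // a: kernel  [K, C_in, C_out] -> 3 x 80 x 768 (F16, as in the whisper example)
    // b: signal  [L, C_in]        -> 3000 x 80    (F32)
    // Returns the un-computed convolution node; the result has shape 3000 x 768.
    static struct ggml_tensor * build_conv1d_node(struct ggml_context * ctx) {
        struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F16,    3,  80, 768);
        struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3000,  80);

        return ggml_conv_1d(ctx, a, b,
                            /*s0 =*/ 1,                    // stride
                            /*p0 =*/ (int) (a->ne[0] / 2), // "padding = half"
                            /*d0 =*/ 1);                   // dilation
    }
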
@@ -1488,25 +1515,24 @@
     //
 
 #ifdef __cplusplus
-
+    // restrict not standard in C++
 #define GGML_RESTRICT
 #else
 #define GGML_RESTRICT restrict
 #endif
-    typedef void (*
-    typedef void (*
-    typedef void (*
+    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
+    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
 
     typedef struct {
-
-
-
-
-
-
-
-
-    quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+        ggml_to_float_t   to_float;
+        ggml_from_float_t from_float;
+        ggml_from_float_t from_float_reference;
+        ggml_vec_dot_t    vec_dot;
+        enum ggml_type    vec_dot_type;
+    } ggml_type_traits_t;
+
+    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
 
 #ifdef __cplusplus
 }
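
llama.cpp uses the traits table above to pick a dequantization routine at runtime (see the qtype.to_float(...) calls in the llama.cpp hunks below). A standalone sketch of the same pattern; the zero-filled Q4_0 buffer is only a placeholder for real quantized data:

    #include <stdio.h>
    #include <stdlib.h>
    #include "ggml.h"

    int main(void) {
        // ggml_init also sets up the internal FP16 conversion tables used by to_float
        struct ggml_init_params params = { /*.mem_size =*/ 1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true };
        struct ggml_context * ctx = ggml_init(params);

        const enum ggml_type type = GGML_TYPE_Q4_0;
        const int n = 64; // element count; must be a multiple of the block size

        ggml_type_traits_t traits = ggml_internal_get_type_traits(type);
        if (traits.to_float == NULL) {
            fprintf(stderr, "no dequantization available for %s\n", ggml_type_name(type));
            return 1;
        }

        const size_t nbytes = (size_t) n / ggml_blck_size(type) * ggml_type_size(type);
        void  * q = calloc(1, nbytes);          // placeholder quantized block(s)
        float * f = calloc(n, sizeof(float));

        traits.to_float(q, f, n);               // all-zero input dequantizes to zeros
        printf("f[0] = %f\n", f[0]);

        free(q);
        free(f);
        ggml_free(ctx);
        return 0;
    }
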
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -66,6 +66,7 @@ enum e_model {
     MODEL_65B,
 };
 
+static const size_t kB = 1024;
 static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
@@ -129,6 +130,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
     return k_sizes;
 }
 
+// amount of VRAM needed per batch size to hold temporary results
+// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
+{
+    static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,   512ull * kB },
+        { MODEL_7B,   512ull * kB },
+        { MODEL_13B,  640ull * kB },
+        { MODEL_30B,  768ull * kB },
+        { MODEL_65B, 1536ull * kB },
+    };
+    return k_sizes;
+}
+
+// amount of VRAM needed per batch size and context to hold temporary results
+// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
+{
+    static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,  128ull },
+        { MODEL_7B,  128ull },
+        { MODEL_13B, 160ull },
+        { MODEL_30B, 208ull },
+        { MODEL_65B, 416ull },
+    };
+    return k_sizes;
+}
+
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     uint32_t n_vocab = 32000;
@@ -165,8 +194,8 @@ struct llama_layer {
 };
 
 struct llama_kv_cache {
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
+    struct ggml_tensor * k = NULL;
+    struct ggml_tensor * v = NULL;
 
     struct ggml_context * ctx = NULL;
 
@@ -253,7 +282,13 @@ struct llama_model {
 
 struct llama_context {
     llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-
+#ifdef GGML_USE_METAL
+    ~llama_context() {
+        if (ctx_metal) {
+            ggml_metal_free(ctx_metal);
+        }
+    }
+#endif
     std::mt19937 rng;
 
     bool has_evaluated_once = false;
@@ -446,9 +481,7 @@ struct llama_file_loader {
             std::string word = file.read_string(len);
 
             float score = 0.0f;
-
-            file.read_raw(&score, sizeof(score));
-            }
+            file.read_raw(&score, sizeof(score));
 
             vocab.token_to_id[word] = i;
 
@@ -1112,14 +1145,18 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
         ggml_cuda_set_scratch_size(0); // disable scratch
     } else {
-
+        const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
+        const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
+        vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
         ggml_cuda_set_scratch_size(vram_scratch);
         if (n_gpu_layers > 0) {
-            fprintf(stderr, "%s: allocating batch_size x
-                    __func__,
+            fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
+                    __func__, vram_scratch_base / kB, vram_scratch_per_context,
+                    (vram_scratch + MB - 1) / MB); // round up
         }
     }
 #endif // GGML_USE_CUBLAS
+
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
@@ -1128,6 +1165,10 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
     }
     size_t vram_kv_cache = 0;
+
+#ifdef GGML_USE_CUBLAS
+    const int max_backend_supported_layers = hparams.n_layer + 3;
+    const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
     if (n_gpu_layers > (int) hparams.n_layer + 1) {
        if (low_vram) {
             fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
@@ -1144,14 +1185,18 @@ static void llama_model_load_internal(
             vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
         }
     }
-
+#elif defined(GGML_USE_CLBLAST)
+    const int max_backend_supported_layers = hparams.n_layer + 1;
+    const int max_offloadable_layers = hparams.n_layer + 1;
+#endif // GGML_USE_CUBLAS
+
     fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
-            __func__, std::min(n_gpu_layers, max_offloadable_layers),
+            __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
     fprintf(stderr, "%s: total VRAM used: %zu MB\n",
             __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
     (void) n_gpu_layers;
-#endif
+#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
 }
 
     // populate `tensors_by_name`
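
For reference, the scratch formula introduced above, vram_scratch = n_batch * (base + n_ctx * per_context), works out to 384 MB for a 7B model at the defaults n_batch = 512 and n_ctx = 2048; a back-of-the-envelope check:

    #include <stdio.h>

    int main(void) {
        const size_t kB = 1024, MB = 1024 * 1024;
        const size_t n_batch = 512, n_ctx = 2048;
        const size_t base    = 512 * kB; // VRAM_REQ_SCRATCH_BASE for MODEL_7B
        const size_t per_ctx = 128;      // VRAM_REQ_SCRATCH_PER_CONTEXT for MODEL_7B

        const size_t vram_scratch = n_batch * (base + n_ctx * per_ctx);
        printf("%zu MB\n", (vram_scratch + MB - 1) / MB); // prints 384
        return 0;
    }
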
@@ -1860,10 +1905,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, ...)
         return;
     }
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     llama_sample_softmax(ctx, candidates);
 
+    const int64_t t_start_sample_us = ggml_time_us();
+
     // Compute the cumulative probabilities
     float cum_sum = 0.0f;
     size_t last_idx = candidates->size;
@@ -1892,9 +1937,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, ...)
         return;
     }
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     llama_sample_softmax(nullptr, candidates);
+    const int64_t t_start_sample_us = ggml_time_us();
 
     // Compute the first and second derivatives
     std::vector<float> first_derivatives(candidates->size - 1);
@@ -1946,11 +1990,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, ...)
         return;
     }
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     // Compute the softmax of logits and calculate entropy
     llama_sample_softmax(nullptr, candidates);
 
+    const int64_t t_start_sample_us = ggml_time_us();
+
     float entropy = 0.0f;
     for (size_t i = 0; i < candidates->size; ++i) {
         entropy += -candidates->data[i].p * logf(candidates->data[i].p);
@@ -2119,13 +2163,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, ...)
 
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-        ctx->n_sample++;
     }
     return X;
 }
 
 llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
-    assert(ctx);
     int64_t t_start_sample_us;
     t_start_sample_us = ggml_time_us();
 
@@ -2140,13 +2182,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, ...)
         candidates->size = 1;
     }
 
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+
     // Normalize the probabilities of the remaining words
     llama_sample_softmax(ctx, candidates);
 
     // Sample the next word X from the remaining words
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
     llama_token X = llama_sample_token(ctx, candidates);
     t_start_sample_us = ggml_time_us();
 
@@ -2214,10 +2257,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, ...)
     }
     float * f32_output = (float *) output.addr;
 
-
+    ggml_type_traits_t qtype;
     if (ggml_is_quantized(tensor.type)) {
-        qtype =
-        if (qtype.
+        qtype = ggml_internal_get_type_traits(tensor.type);
+        if (qtype.to_float == NULL) {
             throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
         }
     } else if (tensor.type != GGML_TYPE_F16) {
@@ -2228,7 +2271,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, ...)
     if (tensor.type == GGML_TYPE_F16) {
         ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
     } else if (ggml_is_quantized(tensor.type)) {
-        qtype.
+        qtype.to_float(tensor.data, f32_output, nelements);
     } else {
         LLAMA_ASSERT(false); // unreachable
     }
@@ -2253,7 +2296,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, ...)
         if (typ == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
         } else {
-            qtype.
+            qtype.to_float(inbuf, outbuf, nels);
         }
     };
     workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
@@ -3219,7 +3262,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     return nread;
 }
 
-bool
+static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
     llama_file file(path_session, "rb");
 
     // sanity checks
@@ -3273,6 +3316,15 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_session, ...)
     return true;
 }
 
+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    try {
+        return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+    } catch (const std::exception & err) {
+        fprintf(stderr, "error loading session file: %s\n", err.what());
+        return false;
+    }
+}
+
 bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
     llama_file file(path_session, "wb");
 
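
Because the loader is now wrapped in a try/catch, a malformed or truncated session file surfaces as a false return instead of an uncaught exception. A minimal sketch of the calling pattern; the 1024-token capacity and the helper name restore_session are arbitrary:

    #include <stdio.h>
    #include "llama.h"

    // Try to restore a saved session into an existing context.
    // Returns the number of tokens recovered, or 0 on failure.
    static size_t restore_session(struct llama_context * ctx, const char * path) {
        llama_token tokens[1024]; // arbitrary capacity for this sketch
        size_t n_tokens = 0;

        if (!llama_load_session_file(ctx, path, tokens, 1024, &n_tokens)) {
            fprintf(stderr, "could not load session from %s\n", path);
            return 0;
        }
        return n_tokens;
    }
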
@@ -3428,23 +3480,35 @@ llama_token llama_token_nl() {
     return 13;
 }
 
+struct llama_timings llama_get_timings(struct llama_context * ctx) {
+    struct llama_timings result = {
+        /*.t_start_ms  =*/ 1e-3 * ctx->t_start_us,
+        /*.t_end_ms    =*/ 1.00 * ggml_time_ms(),
+        /*.t_load_ms   =*/ 1e-3 * ctx->t_load_us,
+        /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
+        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
+        /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,
 
-
-
+        /*.n_sample =*/ std::max(1, ctx->n_sample),
+        /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+        /*.n_eval   =*/ std::max(1, ctx->n_eval),
+    };
 
-
-
-
+    return result;
+}
+
+void llama_print_timings(struct llama_context * ctx) {
+    const llama_timings timings = llama_get_timings(ctx);
 
     fprintf(stderr, "\n");
-    fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__,
+    fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, timings.t_load_ms);
     fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__,
+            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
     fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__,
+            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
     fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__,
-    fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (
+            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
+    fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -134,6 +134,20 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;
 
+    // performance timing information
+    struct llama_timings {
+        double t_start_ms;
+        double t_end_ms;
+        double t_load_ms;
+        double t_sample_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_sample;
+        int32_t n_p_eval;
+        int32_t n_eval;
+    };
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
@@ -331,6 +345,7 @@ extern "C" {
     LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
     // Performance information
+    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
     LLAMA_API void llama_reset_timings(struct llama_context * ctx);
 
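
The new llama_get_timings accessor returns the same numbers that llama_print_timings writes to stderr, so callers can log or aggregate them programmatically. A minimal sketch against an already-initialized context; the helper name report_eval_speed is illustrative:

    #include <stdio.h>
    #include "llama.h"

    static void report_eval_speed(struct llama_context * ctx) {
        const struct llama_timings t = llama_get_timings(ctx);

        // n_eval is clamped to at least 1 by llama_get_timings, so the division is safe
        const double ms_per_token = t.t_eval_ms / t.n_eval;
        printf("eval: %d tokens, %.2f ms/token, %.2f tokens/s\n",
               t.n_eval, ms_per_token, 1e3 / ms_per_token);
    }
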
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.1'
+  VERSION = '0.3.2'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-481f793'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -16,8 +16,22 @@ module LLaMACpp
   # @param prompt [String] The prompt to start generation with.
   # @param n_predict [Integer] The number of tokens to predict.
   # @param n_threads [Integer] The number of threads.
+  # @param n_keep [Integer] The number of tokens to keep in the context.
+  # @param n_batch [Integer] The number of tokens to process in a batch.
+  # @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
+  # @param repeat_penalty [Float] The repetition penalty.
+  # @param frequency [Float] The frequency penalty.
+  # @param presence [Float] The presence penalty.
+  # @param top_k [Integer] The number of tokens to consider for top-k sampling.
+  # @param top_p [Float] The probability threshold for nucleus sampling.
+  # @param tfs_z [Float] The z parameter for tail-free sampling.
+  # @param typical_p [Float] The probability for typical sampling.
+  # @param temperature [Float] The temperature for temperature sampling.
   # @return [String]
-  def generate(context, prompt,
+  def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+               n_predict: 128, n_threads: 1, n_keep: 10, n_batch: 512, repeat_last_n: 64,
+               repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
+               top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
     raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
     raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
 
@@ -31,19 +45,8 @@ module LLaMACpp
 
     embd = []
     n_consumed = 0
-    n_keep = 10
     n_past = 0
     n_remain = n_predict
-    repeat_last_n = 64
-    repeat_penalty = 1.1
-    frequency = 0.0
-    presence = 0.0
-    top_k = 40
-    top_p = 0.95
-    tfs_z = 1.0
-    typical_p = 1.0
-    temperature = 0.8
-    n_batch = 512
     n_vocab = context.n_vocab
     output = []
 
data/sig/llama_cpp.rbs
CHANGED
@@ -28,7 +28,10 @@ module LLaMACpp
 
   def self?.init_backend: (?numa: bool) -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
-  def self?.generate: (::LLaMACpp::Context, String,
+  def self?.generate: (::LLaMACpp::Context, String,
+                       ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
+                       ?repeat_last_n: Integer, ?repeat_penalty: Float, ?frequency: Float, ?presence: Float,
+                       ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
@@ -67,6 +70,20 @@ module LLaMACpp
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
   end
 
+  class Timings
+    public
+
+    def t_start_ms: () -> Float
+    def t_end_ms: () -> Float
+    def t_load_ms: () -> Float
+    def t_sample_ms: () -> Float
+    def t_p_eval_ms: () -> Float
+    def t_eval_ms: () -> Float
+    def n_sample: () -> Integer
+    def n_p_eval: () -> Integer
+    def n_eval: () -> Integer
+  end
+
   class Context
     public
 
@@ -80,6 +97,7 @@ module LLaMACpp
     def n_embd: () -> Integer
     def n_vocab: () -> Integer
     def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+    def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
     def token_to_str: (Integer) -> String
|