llama_cpp 0.3.1 → 0.3.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +121 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +451 -101
- data/ext/llama_cpp/src/ggml-cuda.h +0 -4
- data/ext/llama_cpp/src/ggml-metal.m +3 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +11 -7
- data/ext/llama_cpp/src/ggml.c +690 -1512
- data/ext/llama_cpp/src/ggml.h +88 -62
- data/ext/llama_cpp/src/llama.cpp +103 -39
- data/ext/llama_cpp/src/llama.h +15 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +15 -12
- data/sig/llama_cpp.rbs +19 -1
- metadata +3 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -201,6 +201,8 @@
 #define GGML_MAX_NAME 48
 #define GGML_DEFAULT_N_THREADS 4
 
+#define GGML_UNUSED(x) (void)(x)
+
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
@@ -209,6 +211,30 @@
         } \
     } while (0)
 
+// used to copy the number of elements and stride in bytes of tensors into local variables.
+// main purpose is to reduce code duplication and improve readability.
+//
+// example:
+//
+//    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+//    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb);
+//
+#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
+    const type prefix##0 = (pointer)->array[0]; \
+    GGML_UNUSED(prefix##0);
+#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_1    (type, prefix, pointer, array) \
+    const type prefix##1 = (pointer)->array[1]; \
+    GGML_UNUSED(prefix##1);
+#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_2    (type, prefix, pointer, array) \
+    const type prefix##2 = (pointer)->array[2]; \
+    GGML_UNUSED(prefix##2);
+#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_3  (type, prefix, pointer, array) \
+    const type prefix##3 = (pointer)->array[3]; \
+    GGML_UNUSED(prefix##3);
+
 #ifdef __cplusplus
 extern "C" {
 #endif
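A minimal sketch of how the new GGML_TENSOR_LOCALS macro is typically used inside an op implementation; the function and tensor names are illustrative and not part of the diff:

    // Pull the shape (ne) and strides (nb) of src0 into locals ne00..ne03 / nb00..nb03.
    // GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) expands to four lines of the form
    //     const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
    static void example_op(const struct ggml_tensor * src0) {
        GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); // ne00, ne01, ne02, ne03
        GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb); // nb00, nb01, nb02, nb03
        // ... use ne00 / nb00 etc. without repeating src0->ne[...] everywhere
    }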
@@ -224,8 +250,8 @@ extern "C" {
     GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
     GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
 
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y,
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y,
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
 
     struct ggml_object;
     struct ggml_context;
@@ -295,12 +321,15 @@
         GGML_OP_SUM,
         GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
+        GGML_OP_ARGMAX,
         GGML_OP_REPEAT,
         GGML_OP_REPEAT_BACK,
         GGML_OP_ABS,
         GGML_OP_SGN,
         GGML_OP_NEG,
         GGML_OP_STEP,
+        GGML_OP_TANH,
+        GGML_OP_ELU,
         GGML_OP_RELU,
         GGML_OP_GELU,
         GGML_OP_GELU_QUICK,
@@ -332,9 +361,8 @@ extern "C" {
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
         GGML_OP_CLAMP,
-        GGML_OP_CONV_1D_S1_PH,
-        GGML_OP_CONV_1D_S2_PH,
-        GGML_OP_CONV_2D_SK_P0,
+        GGML_OP_CONV_1D,
+        GGML_OP_CONV_2D,
 
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
@@ -444,6 +472,9 @@ extern "C" {
 
 
     // compute types
+
+    // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
+    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
     enum ggml_task_type {
         GGML_TASK_INIT = 0,
         GGML_TASK_COMPUTE,
@@ -687,6 +718,11 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // argmax along rows
+    GGML_API struct ggml_tensor * ggml_argmax(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // if a is the same shape as b, and a is not parameter, return a
     // otherwise, return a new tensor: repeat(a) to fit in b
     GGML_API struct ggml_tensor * ggml_repeat(
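A small usage sketch for the new ggml_argmax operator, assuming a valid ggml context already exists; the shape and the helper name are illustrative, and only graph construction is shown:

    #include "ggml.h"

    // Build an argmax node over a small F32 matrix: 8 values per row, 4 rows.
    // The result tensor holds one argmax index per row.
    static struct ggml_tensor * argmax_example(struct ggml_context * ctx) {
        struct ggml_tensor * logits = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
        return ggml_argmax(ctx, logits);
    }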
@@ -731,6 +767,22 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_tanh(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_tanh_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_elu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_elu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     GGML_API struct ggml_tensor * ggml_relu(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
@@ -1081,58 +1133,33 @@ extern "C" {
             float                 min,
             float                 max);
 
-
-    // GGML_API struct ggml_tensor * ggml_conv_1d(
-    //        struct ggml_context * ctx,
-    //        struct ggml_tensor  * a,
-    //        struct ggml_tensor  * b,
-    //        int                   s0
-    //        int                   p0,
-    //        int                   d0);
-    //
-    // GGML_API struct ggml_tensor * ggml_conv_2d(
-    //        struct ggml_context * ctx,
-    //        struct ggml_tensor  * a,
-    //        struct ggml_tensor  * b,
-    //        int                   s0,
-    //        int                   s1,
-    //        int                   p0,
-    //        int                   p1,
-    //        int                   d0,
-    //        int                   d1);
-
-    // padding = half
-    // TODO: we don't support extra parameters for now
-    //       that's why we are hard-coding the stride, padding, and dilation
-    //       not great ..
-    // example:
-    // a: 3 80 768 1
-    // b: 3000 80 1 1
-    // res: 3000 768 1 1
-    // used in whisper
-    GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
+    GGML_API struct ggml_tensor * ggml_conv_1d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b
+            struct ggml_tensor  * b,
+            int                   s0,  // stride
+            int                   p0,  // padding
+            int                   d0); // dilation
 
-
-    GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
+    GGML_API struct ggml_tensor * ggml_conv_2d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b
+            struct ggml_tensor  * b,
+            int                   s0,
+            int                   s1,
+            int                   p0,
+            int                   p1,
+            int                   d0,
+            int                   d1);
 
-    //
-    //
-
-    // example:
-    // a: 16 16 3 768
-    // b: 1024 1024 3 1
-    // res: 64 64 768 1
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+    // conv_1d with padding = half
+    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b
+            struct ggml_tensor  * b,
+            int                   s,
+            int                   d);
 
     GGML_API struct ggml_tensor * ggml_flash_attn(
             struct ggml_context * ctx,
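A hedged sketch of how a call to the removed hard-coded helper maps onto the new generalized signature; the wrapper name is illustrative, and `a` is assumed to be the convolution kernel:

    #include "ggml.h"

    // Old: ggml_conv_1d_s1_ph(ctx, a, b)  (stride 1, "half" padding, dilation 1).
    // New: the same operation spelled out with explicit stride/padding/dilation,
    // i.e. the documented alias ggml_conv_1d_ph(ctx, a, b, 1, 1).
    static struct ggml_tensor * conv_1d_same(struct ggml_context * ctx,
                                             struct ggml_tensor  * a,   // kernel
                                             struct ggml_tensor  * b) { // data
        return ggml_conv_1d(ctx, a, b, /*s0 =*/ 1, /*p0 =*/ a->ne[0] / 2, /*d0 =*/ 1);
    }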
@@ -1488,25 +1515,24 @@ extern "C" {
     //
 
 #ifdef  __cplusplus
-
+// restrict not standard in C++
 #define GGML_RESTRICT
 #else
 #define GGML_RESTRICT restrict
 #endif
-    typedef void (*
-    typedef void (*
-    typedef void (*
+    typedef void (*ggml_to_float_t)  (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
 
     typedef struct {
-
-
-
-
-
-
-
-
-    quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+        ggml_to_float_t   to_float;
+        ggml_from_float_t from_float;
+        ggml_from_float_t from_float_reference;
+        ggml_vec_dot_t    vec_dot;
+        enum ggml_type    vec_dot_type;
+    } ggml_type_traits_t;
+
+    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
 
 #ifdef  __cplusplus
 }
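A hedged sketch of how the new per-type traits table can be used to dequantize a row of values, mirroring the check that llama.cpp now performs in llama_convert_tensor_internal; the helper name is illustrative:

    #include <stdbool.h>
    #include "ggml.h"

    // Dequantize n elements from src (a quantized buffer) into dst (floats).
    // Returns false when the type has no dequantizer registered.
    static bool dequantize_row(enum ggml_type type, const void * src, float * dst, int n) {
        ggml_type_traits_t traits = ggml_internal_get_type_traits(type);
        if (traits.to_float == NULL) {
            return false;
        }
        traits.to_float(src, dst, n);
        return true;
    }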
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -66,6 +66,7 @@ enum e_model {
     MODEL_65B,
 };
 
+static const size_t kB = 1024;
 static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
@@ -129,6 +130,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
     return k_sizes;
 }
 
+// amount of VRAM needed per batch size to hold temporary results
+// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
+{
+    static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,   512ull * kB },
+        { MODEL_7B,   512ull * kB },
+        { MODEL_13B,  640ull * kB },
+        { MODEL_30B,  768ull * kB },
+        { MODEL_65B, 1536ull * kB },
+    };
+    return k_sizes;
+}
+
+// amount of VRAM needed per batch size and context to hold temporary results
+// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
+{
+    static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,  128ull },
+        { MODEL_7B,  128ull },
+        { MODEL_13B, 160ull },
+        { MODEL_30B, 208ull },
+        { MODEL_65B, 416ull },
+    };
+    return k_sizes;
+}
+
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     uint32_t n_vocab = 32000;
@@ -165,8 +194,8 @@ struct llama_layer {
 };
 
 struct llama_kv_cache {
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
+    struct ggml_tensor * k = NULL;
+    struct ggml_tensor * v = NULL;
 
     struct ggml_context * ctx = NULL;
 
@@ -253,7 +282,13 @@ struct llama_model {
 
 struct llama_context {
     llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-
+#ifdef GGML_USE_METAL
+    ~llama_context() {
+        if (ctx_metal) {
+            ggml_metal_free(ctx_metal);
+        }
+    }
+#endif
     std::mt19937 rng;
 
     bool has_evaluated_once = false;
@@ -446,9 +481,7 @@ struct llama_file_loader {
             std::string word = file.read_string(len);
 
             float score = 0.0f;
-
-                file.read_raw(&score, sizeof(score));
-            }
+            file.read_raw(&score, sizeof(score));
 
             vocab.token_to_id[word] = i;
 
@@ -1112,14 +1145,18 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
         ggml_cuda_set_scratch_size(0); // disable scratch
     } else {
-
+        const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
+        const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
+        vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
         ggml_cuda_set_scratch_size(vram_scratch);
         if (n_gpu_layers > 0) {
-            fprintf(stderr, "%s: allocating batch_size x
-                    __func__,
+            fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
+                    __func__, vram_scratch_base / kB, vram_scratch_per_context,
+                    (vram_scratch + MB - 1) / MB); // round up
         }
     }
 #endif // GGML_USE_CUBLAS
+
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
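As a worked example of the new scratch-size formula (base and per-context values taken from the VRAM_REQ tables above; n_batch = 512 and n_ctx = 2048 are illustrative), for MODEL_7B:

    vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context)
                 = 512 * (512 KiB + 2048 * 128 B)
                 = 512 * (524288 B + 262144 B)
                 = 402653184 B ≈ 384 MB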
@@ -1128,6 +1165,10 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
     }
     size_t vram_kv_cache = 0;
+
+#ifdef GGML_USE_CUBLAS
+    const int max_backend_supported_layers = hparams.n_layer + 3;
+    const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
     if (n_gpu_layers > (int) hparams.n_layer + 1) {
         if (low_vram) {
             fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
@@ -1144,14 +1185,18 @@ static void llama_model_load_internal(
             vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
         }
     }
-
+#elif defined(GGML_USE_CLBLAST)
+    const int max_backend_supported_layers = hparams.n_layer + 1;
+    const int max_offloadable_layers = hparams.n_layer + 1;
+#endif // GGML_USE_CUBLAS
+
     fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
-            __func__, std::min(n_gpu_layers, max_offloadable_layers),
+            __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
     fprintf(stderr, "%s: total VRAM used: %zu MB\n",
             __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
     (void) n_gpu_layers;
-#endif
+#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
 }
 
     // populate `tensors_by_name`
@@ -1860,10 +1905,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
         return;
     }
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     llama_sample_softmax(ctx, candidates);
 
+    const int64_t t_start_sample_us = ggml_time_us();
+
     // Compute the cumulative probabilities
     float cum_sum = 0.0f;
     size_t last_idx = candidates->size;
@@ -1892,9 +1937,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
         return;
     }
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     llama_sample_softmax(nullptr, candidates);
+    const int64_t t_start_sample_us = ggml_time_us();
 
     // Compute the first and second derivatives
     std::vector<float> first_derivatives(candidates->size - 1);
@@ -1946,11 +1990,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
         return;
     }
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     // Compute the softmax of logits and calculate entropy
     llama_sample_softmax(nullptr, candidates);
 
+    const int64_t t_start_sample_us = ggml_time_us();
+
     float entropy = 0.0f;
     for (size_t i = 0; i < candidates->size; ++i) {
         entropy += -candidates->data[i].p * logf(candidates->data[i].p);
@@ -2119,13 +2163,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
 
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-        ctx->n_sample++;
     }
     return X;
 }
 
 llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
-    assert(ctx);
     int64_t t_start_sample_us;
     t_start_sample_us = ggml_time_us();
 
@@ -2140,13 +2182,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
         candidates->size = 1;
     }
 
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+
     // Normalize the probabilities of the remaining words
     llama_sample_softmax(ctx, candidates);
 
     // Sample the next word X from the remaining words
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
     llama_token X = llama_sample_token(ctx, candidates);
     t_start_sample_us = ggml_time_us();
 
@@ -2214,10 +2257,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
     }
     float * f32_output = (float *) output.addr;
 
-
+    ggml_type_traits_t qtype;
     if (ggml_is_quantized(tensor.type)) {
-        qtype =
-        if (qtype.
+        qtype = ggml_internal_get_type_traits(tensor.type);
+        if (qtype.to_float == NULL) {
             throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
         }
     } else if (tensor.type != GGML_TYPE_F16) {
@@ -2228,7 +2271,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
     if (tensor.type == GGML_TYPE_F16) {
         ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
     } else if (ggml_is_quantized(tensor.type)) {
-        qtype.
+        qtype.to_float(tensor.data, f32_output, nelements);
     } else {
         LLAMA_ASSERT(false); // unreachable
     }
@@ -2253,7 +2296,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
         if (typ == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
         } else {
-            qtype.
+            qtype.to_float(inbuf, outbuf, nels);
         }
     };
     workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
@@ -3219,7 +3262,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     return nread;
 }
 
-bool
+static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
     llama_file file(path_session, "rb");
 
     // sanity checks
@@ -3273,6 +3316,15 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
     return true;
 }
 
+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    try {
+        return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+    } catch (const std::exception & err) {
+        fprintf(stderr, "error loading session file: %s\n", err.what());
+        return false;
+    }
+}
+
 bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
     llama_file file(path_session, "wb");
 
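A hedged usage sketch for the now exception-safe public wrapper; the context, the token buffer, and the file name are assumed to exist and are illustrative:

    #include <stdio.h>
    #include "llama.h"

    // Restore a previously saved session into an existing context.
    // Returns the number of tokens loaded, or 0 on failure (the wrapper now
    // catches internal exceptions and returns false instead of throwing).
    static size_t try_load_session(struct llama_context * ctx, llama_token * tokens, size_t capacity) {
        size_t n_loaded = 0;
        if (!llama_load_session_file(ctx, "session.bin", tokens, capacity, &n_loaded)) {
            fprintf(stderr, "failed to load session\n");
            return 0;
        }
        return n_loaded;
    }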
@@ -3428,23 +3480,35 @@ llama_token llama_token_nl() {
     return 13;
 }
 
+struct llama_timings llama_get_timings(struct llama_context * ctx) {
+    struct llama_timings result = {
+        /*.t_start_ms  =*/ 1e-3 * ctx->t_start_us,
+        /*.t_end_ms    =*/ 1.00 * ggml_time_ms(),
+        /*.t_load_ms   =*/ 1e-3 * ctx->t_load_us,
+        /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
+        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
+        /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,
 
-
-
+        /*.n_sample =*/ std::max(1, ctx->n_sample),
+        /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+        /*.n_eval   =*/ std::max(1, ctx->n_eval),
+    };
 
-
-
-
+    return result;
+}
+
+void llama_print_timings(struct llama_context * ctx) {
+    const llama_timings timings = llama_get_timings(ctx);
 
     fprintf(stderr, "\n");
-    fprintf(stderr, "%s: load time = %8.2f ms\n", __func__,
+    fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
     fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__,
+            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
     fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__,
+            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
     fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__,
-    fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (
+            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
+    fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -134,6 +134,20 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;
 
+    // performance timing information
+    struct llama_timings {
+        double t_start_ms;
+        double t_end_ms;
+        double t_load_ms;
+        double t_sample_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_sample;
+        int32_t n_p_eval;
+        int32_t n_eval;
+    };
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
@@ -331,6 +345,7 @@ extern "C" {
     LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
     // Performance information
+    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
     LLAMA_API void llama_reset_timings(struct llama_context * ctx);
 
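A hedged sketch of reading the new timing struct directly instead of relying on llama_print_timings; the reporting helper and its output format are illustrative:

    #include <stdio.h>
    #include "llama.h"

    // ctx is assumed to be a valid llama_context after some evaluations.
    static void report_eval_speed(struct llama_context * ctx) {
        const struct llama_timings t = llama_get_timings(ctx);
        // n_eval is clamped to at least 1 by llama_get_timings, so no division by zero
        fprintf(stderr, "eval: %.2f ms per token over %d tokens\n", t.t_eval_ms / t.n_eval, t.n_eval);
    }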
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.1'
+  VERSION = '0.3.2'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-481f793'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -16,8 +16,22 @@ module LLaMACpp
   # @param prompt [String] The prompt to start generation with.
   # @param n_predict [Integer] The number of tokens to predict.
   # @param n_threads [Integer] The number of threads.
+  # @param n_keep [Integer] The number of tokens to keep in the context.
+  # @param n_batch [Integer] The number of tokens to process in a batch.
+  # @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
+  # @param repeat_penalty [Float] The repetition penalty.
+  # @param frequency [Float] The frequency penalty.
+  # @param presence [Float] The presence penalty.
+  # @param top_k [Integer] The number of tokens to consider for top-k sampling.
+  # @param top_p [Float] The probability threshold for nucleus sampling.
+  # @param tfs_z [Float] The z parameter for tail-free sampling.
+  # @param typical_p [Float] The probability for typical sampling.
+  # @param temperature [Float] The temperature for temperature sampling.
   # @return [String]
-  def generate(context, prompt,
+  def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+               n_predict: 128, n_threads: 1, n_keep: 10, n_batch: 512, repeat_last_n: 64,
+               repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
+               top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
     raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
     raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
 
@@ -31,19 +45,8 @@ module LLaMACpp
 
     embd = []
     n_consumed = 0
-    n_keep = 10
     n_past = 0
     n_remain = n_predict
-    repeat_last_n = 64
-    repeat_penalty = 1.1
-    frequency = 0.0
-    presence = 0.0
-    top_k = 40
-    top_p = 0.95
-    tfs_z = 1.0
-    typical_p = 1.0
-    temperature = 0.8
-    n_batch = 512
     n_vocab = context.n_vocab
     output = []
 
data/sig/llama_cpp.rbs
CHANGED
@@ -28,7 +28,10 @@ module LLaMACpp
 
   def self?.init_backend: (?numa: bool) -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
-  def self?.generate: (::LLaMACpp::Context, String,
+  def self?.generate: (::LLaMACpp::Context, String,
+                       ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
+                       ?repeat_last_n: Integer, ?repeat_penalty: Float, ?frequency: Float, ?presence: Float,
+                       ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
@@ -67,6 +70,20 @@ module LLaMACpp
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
   end
 
+  class Timings
+    public
+
+    def t_start_ms: () -> Float
+    def t_end_ms: () -> Float
+    def t_load_ms: () -> Float
+    def t_sample_ms: () -> Float
+    def t_p_eval_ms: () -> Float
+    def t_eval_ms: () -> Float
+    def n_sample: () -> Integer
+    def n_p_eval: () -> Integer
+    def n_eval: () -> Integer
+  end
+
   class Context
     public
 
@@ -80,6 +97,7 @@ module LLaMACpp
     def n_embd: () -> Integer
     def n_vocab: () -> Integer
     def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+    def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
     def token_to_str: (Integer) -> String