llama_cpp 0.3.1 → 0.3.2

@@ -201,6 +201,8 @@
  #define GGML_MAX_NAME 48
  #define GGML_DEFAULT_N_THREADS 4

+ #define GGML_UNUSED(x) (void)(x)
+
  #define GGML_ASSERT(x) \
  do { \
  if (!(x)) { \
@@ -209,6 +211,30 @@
  } \
  } while (0)

+ // used to copy the number of elements and stride in bytes of tensors into local variables.
+ // main purpose is to reduce code duplication and improve readability.
+ //
+ // example:
+ //
+ // GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+ // GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
+ //
+ #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
+ const type prefix##0 = (pointer)->array[0]; \
+ GGML_UNUSED(prefix##0);
+ #define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
+ GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
+ const type prefix##1 = (pointer)->array[1]; \
+ GGML_UNUSED(prefix##1);
+ #define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
+ GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
+ const type prefix##2 = (pointer)->array[2]; \
+ GGML_UNUSED(prefix##2);
+ #define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
+ GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
+ const type prefix##3 = (pointer)->array[3]; \
+ GGML_UNUSED(prefix##3);
+
  #ifdef __cplusplus
  extern "C" {
  #endif
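
For reference, expanding the example from the comment by hand shows what the new helper produces; the names below come only from the macro definitions in this hunk:

    /* GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) expands to: */
    const int64_t ne10 = (src1)->ne[0]; GGML_UNUSED(ne10);
    const int64_t ne11 = (src1)->ne[1]; GGML_UNUSED(ne11);
    const int64_t ne12 = (src1)->ne[2]; GGML_UNUSED(ne12);
    const int64_t ne13 = (src1)->ne[3]; GGML_UNUSED(ne13);
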
@@ -224,8 +250,8 @@ extern "C" {
  GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
  GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);

- GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
- GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
+ GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);

  struct ggml_object;
  struct ggml_context;
@@ -295,12 +321,15 @@ extern "C" {
  GGML_OP_SUM,
  GGML_OP_SUM_ROWS,
  GGML_OP_MEAN,
+ GGML_OP_ARGMAX,
  GGML_OP_REPEAT,
  GGML_OP_REPEAT_BACK,
  GGML_OP_ABS,
  GGML_OP_SGN,
  GGML_OP_NEG,
  GGML_OP_STEP,
+ GGML_OP_TANH,
+ GGML_OP_ELU,
  GGML_OP_RELU,
  GGML_OP_GELU,
  GGML_OP_GELU_QUICK,
@@ -332,9 +361,8 @@ extern "C" {
  GGML_OP_ROPE_BACK,
  GGML_OP_ALIBI,
  GGML_OP_CLAMP,
- GGML_OP_CONV_1D_S1_PH,
- GGML_OP_CONV_1D_S2_PH,
- GGML_OP_CONV_2D_SK_P0,
+ GGML_OP_CONV_1D,
+ GGML_OP_CONV_2D,

  GGML_OP_FLASH_ATTN,
  GGML_OP_FLASH_FF,
@@ -444,6 +472,9 @@ extern "C" {


  // compute types
+
+ // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
+ // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
  enum ggml_task_type {
  GGML_TASK_INIT = 0,
  GGML_TASK_COMPUTE,
@@ -687,6 +718,11 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ // argmax along rows
+ GGML_API struct ggml_tensor * ggml_argmax(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // if a is the same shape as b, and a is not parameter, return a
  // otherwise, return a new tensor: repeat(a) to fit in b
  GGML_API struct ggml_tensor * ggml_repeat(
@@ -731,6 +767,22 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_tanh(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_tanh_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_elu(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_elu_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_relu(
  struct ggml_context * ctx,
  struct ggml_tensor * a);
@@ -1081,58 +1133,33 @@ extern "C" {
  float min,
  float max);

- // TODO: implement general-purpose convolutions
- // GGML_API struct ggml_tensor * ggml_conv_1d(
- // struct ggml_context * ctx,
- // struct ggml_tensor * a,
- // struct ggml_tensor * b,
- // int s0
- // int p0,
- // int d0);
- //
- // GGML_API struct ggml_tensor * ggml_conv_2d(
- // struct ggml_context * ctx,
- // struct ggml_tensor * a,
- // struct ggml_tensor * b,
- // int s0,
- // int s1,
- // int p0,
- // int p1,
- // int d0,
- // int d1);
-
- // padding = half
- // TODO: we don't support extra parameters for now
- // that's why we are hard-coding the stride, padding, and dilation
- // not great ..
- // example:
- // a: 3 80 768 1
- // b: 3000 80 1 1
- // res: 3000 768 1 1
- // used in whisper
- GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
+ GGML_API struct ggml_tensor * ggml_conv_1d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b);
+ struct ggml_tensor * b,
+ int s0, // stride
+ int p0, // padding
+ int d0); // dilation

- // used in whisper
- GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
+ GGML_API struct ggml_tensor * ggml_conv_2d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b);
+ struct ggml_tensor * b,
+ int s0,
+ int s1,
+ int p0,
+ int p1,
+ int d0,
+ int d1);

- // kernel size is a->ne[0] x a->ne[1]
- // stride is equal to kernel size
- // padding is zero
- // example:
- // a: 16 16 3 768
- // b: 1024 1024 3 1
- // res: 64 64 768 1
- // used in sam
- GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+ // conv_1d with padding = half
+ // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+ GGML_API struct ggml_tensor* ggml_conv_1d_ph(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b);
+ struct ggml_tensor * b,
+ int s,
+ int d);

  GGML_API struct ggml_tensor * ggml_flash_attn(
  struct ggml_context * ctx,
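
The hard-coded ggml_conv_1d_s1_ph / ggml_conv_1d_s2_ph / ggml_conv_2d_sk_p0 helpers are gone; stride, padding and dilation are now explicit arguments. A rough migration sketch for the old whisper-style call (kernel tensor a, data tensor b), based on the half-padding alias documented above:

    /* before: cur = ggml_conv_1d_s1_ph(ctx, a, b);   (stride 1, half padding) */
    struct ggml_tensor * cur = ggml_conv_1d(ctx, a, b, /*s0*/ 1, /*p0*/ a->ne[0]/2, /*d0*/ 1);

    /* or, per the alias comment, keep the half-padding shorthand: */
    cur = ggml_conv_1d_ph(ctx, a, b, /*s*/ 1, /*d*/ 1);
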
@@ -1488,25 +1515,24 @@ extern "C" {
  //

  #ifdef __cplusplus
- // restrict not standard in C++
+ // restrict not standard in C++
  #define GGML_RESTRICT
  #else
  #define GGML_RESTRICT restrict
  #endif
- typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- typedef void (*quantize_row_q_t) (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- typedef void (*vec_dot_q_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+ typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+ typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+ typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);

  typedef struct {
- dequantize_row_q_t dequantize_row_q;
- quantize_row_q_t quantize_row_q;
- quantize_row_q_t quantize_row_q_reference;
- quantize_row_q_t quantize_row_q_dot;
- vec_dot_q_t vec_dot_q;
- enum ggml_type vec_dot_type;
- } quantize_fns_t;
-
- quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+ ggml_to_float_t to_float;
+ ggml_from_float_t from_float;
+ ggml_from_float_t from_float_reference;
+ ggml_vec_dot_t vec_dot;
+ enum ggml_type vec_dot_type;
+ } ggml_type_traits_t;
+
+ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);

  #ifdef __cplusplus
  }
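
The quantize_fns_t table is replaced by a per-type ggml_type_traits_t, and the old dequantize_row_q / quantize_row_q / vec_dot_q members map onto to_float / from_float / vec_dot. A rough caller-side sketch (the helper name dequantize_row below is illustrative, not part of ggml):

    /* dequantize n elements of quantized data `src` into `dst` (sketch) */
    static void dequantize_row(enum ggml_type type, const void * src, float * dst, int n) {
        ggml_type_traits_t traits = ggml_internal_get_type_traits(type);
        if (traits.to_float != NULL) {
            traits.to_float(src, dst, n);
        }
    }
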
@@ -66,6 +66,7 @@ enum e_model {
  MODEL_65B,
  };

+ static const size_t kB = 1024;
  static const size_t MB = 1024*1024;

  // computed for n_ctx == 2048
@@ -129,6 +130,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  return k_sizes;
  }

+ // amount of VRAM needed per batch size to hold temporary results
+ // the values for 3b and 65b are not derived from testing but instead chosen conservatively
+ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
+ {
+ static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 512ull * kB },
+ { MODEL_7B, 512ull * kB },
+ { MODEL_13B, 640ull * kB },
+ { MODEL_30B, 768ull * kB },
+ { MODEL_65B, 1536ull * kB },
+ };
+ return k_sizes;
+ }
+
+ // amount of VRAM needed per batch size and context to hold temporary results
+ // the values for 3b and 65b are not derived from testing but instead chosen conservatively
+ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
+ {
+ static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 128ull },
+ { MODEL_7B, 128ull },
+ { MODEL_13B, 160ull },
+ { MODEL_30B, 208ull },
+ { MODEL_65B, 416ull },
+ };
+ return k_sizes;
+ }
+
  // default hparams (LLaMA 7B)
  struct llama_hparams {
  uint32_t n_vocab = 32000;
@@ -165,8 +194,8 @@ struct llama_layer {
  };

  struct llama_kv_cache {
- struct ggml_tensor * k;
- struct ggml_tensor * v;
+ struct ggml_tensor * k = NULL;
+ struct ggml_tensor * v = NULL;

  struct ggml_context * ctx = NULL;

@@ -253,7 +282,13 @@ struct llama_model {

  struct llama_context {
  llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-
+ #ifdef GGML_USE_METAL
+ ~llama_context() {
+ if (ctx_metal) {
+ ggml_metal_free(ctx_metal);
+ }
+ }
+ #endif
  std::mt19937 rng;

  bool has_evaluated_once = false;
@@ -446,9 +481,7 @@ struct llama_file_loader {
  std::string word = file.read_string(len);

  float score = 0.0f;
- if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) {
- file.read_raw(&score, sizeof(score));
- }
+ file.read_raw(&score, sizeof(score));

  vocab.token_to_id[word] = i;

@@ -1112,14 +1145,18 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
  ggml_cuda_set_scratch_size(0); // disable scratch
  } else {
- vram_scratch = n_batch * MB;
+ const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
+ const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
+ vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
  ggml_cuda_set_scratch_size(vram_scratch);
  if (n_gpu_layers > 0) {
- fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
- __func__, vram_scratch / MB);
+ fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
+ __func__, vram_scratch_base / kB, vram_scratch_per_context,
+ (vram_scratch + MB - 1) / MB); // round up
  }
  }
  #endif // GGML_USE_CUBLAS
+
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

@@ -1128,6 +1165,10 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
  }
  size_t vram_kv_cache = 0;
+
+ #ifdef GGML_USE_CUBLAS
+ const int max_backend_supported_layers = hparams.n_layer + 3;
+ const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
  if (n_gpu_layers > (int) hparams.n_layer + 1) {
  if (low_vram) {
  fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
@@ -1144,14 +1185,18 @@ static void llama_model_load_internal(
  vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
  }
  }
- const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+ #elif defined(GGML_USE_CLBLAST)
+ const int max_backend_supported_layers = hparams.n_layer + 1;
+ const int max_offloadable_layers = hparams.n_layer + 1;
+ #endif // GGML_USE_CUBLAS
+
  fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
- __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
+ __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
  fprintf(stderr, "%s: total VRAM used: %zu MB\n",
  __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
  #else
  (void) n_gpu_layers;
- #endif
+ #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  }

  // populate `tensors_by_name`
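
To make the new scratch-buffer sizing above concrete, here is the arithmetic for a 7B model, using the table values from this diff and an assumed n_batch of 512 and n_ctx of 2048 (the old code would have reserved a flat n_batch * MB = 512 MB instead):

    /* illustrative only: 7B values from VRAM_REQ_SCRATCH_* above; n_batch/n_ctx are assumptions */
    const size_t base    = 512 * 1024;                    /* 512 kB per batch element              */
    const size_t per_ctx = 128;                           /* 128 B per batch element per ctx token */
    const size_t scratch = 512 * (base + 2048 * per_ctx); /* 512 * 786432 B = 384 MB               */
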
@@ -1860,10 +1905,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
  return;
  }

- const int64_t t_start_sample_us = ggml_time_us();
-
  llama_sample_softmax(ctx, candidates);

+ const int64_t t_start_sample_us = ggml_time_us();
+
  // Compute the cumulative probabilities
  float cum_sum = 0.0f;
  size_t last_idx = candidates->size;
@@ -1892,9 +1937,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
  return;
  }

- const int64_t t_start_sample_us = ggml_time_us();
-
  llama_sample_softmax(nullptr, candidates);
+ const int64_t t_start_sample_us = ggml_time_us();

  // Compute the first and second derivatives
  std::vector<float> first_derivatives(candidates->size - 1);
@@ -1946,11 +1990,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
  return;
  }

- const int64_t t_start_sample_us = ggml_time_us();
-
  // Compute the softmax of logits and calculate entropy
  llama_sample_softmax(nullptr, candidates);

+ const int64_t t_start_sample_us = ggml_time_us();
+
  float entropy = 0.0f;
  for (size_t i = 0; i < candidates->size; ++i) {
  entropy += -candidates->data[i].p * logf(candidates->data[i].p);
@@ -2119,13 +2163,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_

  if (ctx) {
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- ctx->n_sample++;
  }
  return X;
  }

  llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
- assert(ctx);
  int64_t t_start_sample_us;
  t_start_sample_us = ggml_time_us();

@@ -2140,13 +2182,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
  candidates->size = 1;
  }

+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+
  // Normalize the probabilities of the remaining words
  llama_sample_softmax(ctx, candidates);

  // Sample the next word X from the remaining words
- if (ctx) {
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- }
  llama_token X = llama_sample_token(ctx, candidates);
  t_start_sample_us = ggml_time_us();

@@ -2214,10 +2257,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
  }
  float * f32_output = (float *) output.addr;

- quantize_fns_t qtype;
+ ggml_type_traits_t qtype;
  if (ggml_is_quantized(tensor.type)) {
- qtype = ggml_internal_get_quantize_fn(tensor.type);
- if (qtype.dequantize_row_q == NULL) {
+ qtype = ggml_internal_get_type_traits(tensor.type);
+ if (qtype.to_float == NULL) {
  throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
  }
  } else if (tensor.type != GGML_TYPE_F16) {
@@ -2228,7 +2271,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
  if (tensor.type == GGML_TYPE_F16) {
  ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
  } else if (ggml_is_quantized(tensor.type)) {
- qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+ qtype.to_float(tensor.data, f32_output, nelements);
  } else {
  LLAMA_ASSERT(false); // unreachable
  }
@@ -2253,7 +2296,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
  if (typ == GGML_TYPE_F16) {
  ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
  } else {
- qtype.dequantize_row_q(inbuf, outbuf, nels);
+ qtype.to_float(inbuf, outbuf, nels);
  }
  };
  workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
@@ -3219,7 +3262,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  return nread;
  }

- bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  llama_file file(path_session, "rb");

  // sanity checks
@@ -3273,6 +3316,15 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
  return true;
  }

+ bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ try {
+ return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+ } catch (const std::exception & err) {
+ fprintf(stderr, "error loading session file: %s\n", err.what());
+ return false;
+ }
+ }
+
  bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
  llama_file file(path_session, "wb");

@@ -3428,23 +3480,35 @@ llama_token llama_token_nl() {
  return 13;
  }

+ struct llama_timings llama_get_timings(struct llama_context * ctx) {
+ struct llama_timings result = {
+ /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
+ /*.t_end_ms =*/ 1.00 * ggml_time_ms(),
+ /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
+ /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
+ /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
+ /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,

- void llama_print_timings(struct llama_context * ctx) {
- const int64_t t_end_us = ggml_time_us();
+ /*.n_sample =*/ std::max(1, ctx->n_sample),
+ /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+ /*.n_eval =*/ std::max(1, ctx->n_eval),
+ };

- const int32_t n_sample = std::max(1, ctx->n_sample);
- const int32_t n_eval = std::max(1, ctx->n_eval);
- const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
+ return result;
+ }
+
+ void llama_print_timings(struct llama_context * ctx) {
+ const llama_timings timings = llama_get_timings(ctx);

  fprintf(stderr, "\n");
- fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
+ fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
  fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
+ __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
  fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
+ __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
  fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
- fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
+ __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
+ fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
  }

  void llama_reset_timings(struct llama_context * ctx) {
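
Callers that previously had to parse the llama_print_timings output can now read the counters directly through the new accessor. A minimal sketch, assuming an already initialized llama_context * ctx (the report_timings helper is illustrative):

    #include <stdio.h>
    #include "llama.h"

    static void report_timings(struct llama_context * ctx) {
        const struct llama_timings t = llama_get_timings(ctx);
        fprintf(stderr, "sample: %d runs in %.2f ms (%.2f ms per token)\n",
                t.n_sample, t.t_sample_ms, t.t_sample_ms / t.n_sample);
        fprintf(stderr, "eval:   %d runs in %.2f ms, total %.2f ms\n",
                t.n_eval, t.t_eval_ms, t.t_end_ms - t.t_start_ms);
    }
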
@@ -134,6 +134,20 @@ extern "C" {
  bool quantize_output_tensor; // quantize output.weight
  } llama_model_quantize_params;

+ // performance timing information
+ struct llama_timings {
+ double t_start_ms;
+ double t_end_ms;
+ double t_load_ms;
+ double t_sample_ms;
+ double t_p_eval_ms;
+ double t_eval_ms;
+
+ int32_t n_sample;
+ int32_t n_p_eval;
+ int32_t n_eval;
+ };
+
  LLAMA_API struct llama_context_params llama_context_default_params();
  LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();

@@ -331,6 +345,7 @@ extern "C" {
  LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);

  // Performance information
+ LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
  LLAMA_API void llama_reset_timings(struct llama_context * ctx);

@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.3.1'
+ VERSION = '0.3.2'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-b8c8dda'
+ LLAMA_CPP_VERSION = 'master-481f793'
  end
data/lib/llama_cpp.rb CHANGED
@@ -16,8 +16,22 @@ module LLaMACpp
  # @param prompt [String] The prompt to start generation with.
  # @param n_predict [Integer] The number of tokens to predict.
  # @param n_threads [Integer] The number of threads.
+ # @param n_keep [Integer] The number of tokens to keep in the context.
+ # @param n_batch [Integer] The number of tokens to process in a batch.
+ # @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
+ # @param repeat_penalty [Float] The repetition penalty.
+ # @param frequency [Float] The frequency penalty.
+ # @param presence [Float] The presence penalty.
+ # @param top_k [Integer] The number of tokens to consider for top-k sampling.
+ # @param top_p [Float] The probability threshold for nucleus sampling.
+ # @param tfs_z [Float] The z parameter for tail-free sampling.
+ # @param typical_p [Float] The probability for typical sampling.
+ # @param temperature [Float] The temperature for temperature sampling.
  # @return [String]
- def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+ def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+ n_predict: 128, n_threads: 1, n_keep: 10, n_batch: 512, repeat_last_n: 64,
+ repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
+ top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

@@ -31,19 +45,8 @@ module LLaMACpp

  embd = []
  n_consumed = 0
- n_keep = 10
  n_past = 0
  n_remain = n_predict
- repeat_last_n = 64
- repeat_penalty = 1.1
- frequency = 0.0
- presence = 0.0
- top_k = 40
- top_p = 0.95
- tfs_z = 1.0
- typical_p = 1.0
- temperature = 0.8
- n_batch = 512
  n_vocab = context.n_vocab
  output = []

data/sig/llama_cpp.rbs CHANGED
@@ -28,7 +28,10 @@ module LLaMACpp

  def self?.init_backend: (?numa: bool) -> void
  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
- def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
+ def self?.generate: (::LLaMACpp::Context, String,
+ ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
+ ?repeat_last_n: Integer, ?repeat_penalty: Float, ?frequency: Float, ?presence: Float,
+ ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
  def self?.print_system_info: () -> void
  def self?.token_bos: () -> Integer
  def self?.token_eos: () -> Integer
@@ -67,6 +70,20 @@ module LLaMACpp
  def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
  end

+ class Timings
+ public
+
+ def t_start_ms: () -> Float
+ def t_end_ms: () -> Float
+ def t_load_ms: () -> Float
+ def t_sample_ms: () -> Float
+ def t_p_eval_ms: () -> Float
+ def t_eval_ms: () -> Float
+ def n_sample: () -> Integer
+ def n_p_eval: () -> Integer
+ def n_eval: () -> Integer
+ end
+
  class Context
  public

@@ -80,6 +97,7 @@ module LLaMACpp
  def n_embd: () -> Integer
  def n_vocab: () -> Integer
  def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+ def timings: () -> ::LLaMACpp::Timings
  def print_timings: () -> void
  def reset_timings: () -> void
  def token_to_str: (Integer) -> String