llama_cpp 0.3.1 → 0.3.2

This diff shows the changes between publicly released versions of the package as they appear in its public registry. It is provided for informational purposes only.
@@ -201,6 +201,8 @@
  #define GGML_MAX_NAME 48
  #define GGML_DEFAULT_N_THREADS 4
 
+ #define GGML_UNUSED(x) (void)(x)
+
  #define GGML_ASSERT(x) \
  do { \
  if (!(x)) { \
@@ -209,6 +211,30 @@
  } \
  } while (0)
 
+ // used to copy the number of elements and stride in bytes of tensors into local variables.
+ // main purpose is to reduce code duplication and improve readability.
+ //
+ // example:
+ //
+ // GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+ // GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
+ //
+ #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
+ const type prefix##0 = (pointer)->array[0]; \
+ GGML_UNUSED(prefix##0);
+ #define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
+ GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
+ const type prefix##1 = (pointer)->array[1]; \
+ GGML_UNUSED(prefix##1);
+ #define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
+ GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
+ const type prefix##2 = (pointer)->array[2]; \
+ GGML_UNUSED(prefix##2);
+ #define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
+ GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
+ const type prefix##3 = (pointer)->array[3]; \
+ GGML_UNUSED(prefix##3);
+
  #ifdef __cplusplus
  extern "C" {
  #endif
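The new GGML_TENSOR_LOCALS family is easiest to read by using it once. A minimal sketch of how an op kernel might pull a tensor's dimensions and strides into locals; the helper function and variable names are hypothetical, only the macros and the ne/nb arrays of struct ggml_tensor come from ggml.h:

    #include <stdio.h>
    #include "ggml.h"

    // Expands to ne10..ne13 and nb10..nb13 locals; the built-in GGML_UNUSED casts
    // keep -Wunused-variable quiet when a kernel only needs some of them.
    static void print_shape(const struct ggml_tensor * src1) {
        GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
        GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb);
        printf("ne = %lld x %lld x %lld x %lld, row stride = %zu bytes\n",
               (long long) ne10, (long long) ne11, (long long) ne12, (long long) ne13, nb11);
    }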
@@ -224,8 +250,8 @@ extern "C" {
  GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
  GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
 
- GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
- GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
+ GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
 
  struct ggml_object;
  struct ggml_context;
@@ -295,12 +321,15 @@ extern "C" {
  GGML_OP_SUM,
  GGML_OP_SUM_ROWS,
  GGML_OP_MEAN,
+ GGML_OP_ARGMAX,
  GGML_OP_REPEAT,
  GGML_OP_REPEAT_BACK,
  GGML_OP_ABS,
  GGML_OP_SGN,
  GGML_OP_NEG,
  GGML_OP_STEP,
+ GGML_OP_TANH,
+ GGML_OP_ELU,
  GGML_OP_RELU,
  GGML_OP_GELU,
  GGML_OP_GELU_QUICK,
@@ -332,9 +361,8 @@ extern "C" {
  GGML_OP_ROPE_BACK,
  GGML_OP_ALIBI,
  GGML_OP_CLAMP,
- GGML_OP_CONV_1D_S1_PH,
- GGML_OP_CONV_1D_S2_PH,
- GGML_OP_CONV_2D_SK_P0,
+ GGML_OP_CONV_1D,
+ GGML_OP_CONV_2D,
 
  GGML_OP_FLASH_ATTN,
  GGML_OP_FLASH_FF,
@@ -444,6 +472,9 @@ extern "C" {
 
 
  // compute types
+
+ // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
+ // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
  enum ggml_task_type {
  GGML_TASK_INIT = 0,
  GGML_TASK_COMPUTE,
@@ -687,6 +718,11 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
+ // argmax along rows
+ GGML_API struct ggml_tensor * ggml_argmax(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // if a is the same shape as b, and a is not parameter, return a
  // otherwise, return a new tensor: repeat(a) to fit in b
  GGML_API struct ggml_tensor * ggml_repeat(
@@ -731,6 +767,22 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);
 
+ GGML_API struct ggml_tensor * ggml_tanh(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_tanh_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_elu(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_elu_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_relu(
  struct ggml_context * ctx,
  struct ggml_tensor * a);
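A rough sketch of how the new unary ops and ggml_argmax slot into graph construction, in the same style as the existing ggml_relu/ggml_gelu calls; ctx and x stand for an already-created ggml_context and an F32 input tensor and are not part of this diff:

    struct ggml_tensor * t = ggml_tanh(ctx, x);         // element-wise tanh (GGML_OP_TANH)
    struct ggml_tensor * e = ggml_elu_inplace(ctx, t);  // ELU, reusing t's buffer (GGML_OP_ELU)
    struct ggml_tensor * a = ggml_argmax(ctx, e);       // argmax along rows (GGML_OP_ARGMAX)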
@@ -1081,58 +1133,33 @@ extern "C" {
  float min,
  float max);
 
- // TODO: implement general-purpose convolutions
- // GGML_API struct ggml_tensor * ggml_conv_1d(
- // struct ggml_context * ctx,
- // struct ggml_tensor * a,
- // struct ggml_tensor * b,
- // int s0
- // int p0,
- // int d0);
- //
- // GGML_API struct ggml_tensor * ggml_conv_2d(
- // struct ggml_context * ctx,
- // struct ggml_tensor * a,
- // struct ggml_tensor * b,
- // int s0,
- // int s1,
- // int p0,
- // int p1,
- // int d0,
- // int d1);
-
- // padding = half
- // TODO: we don't support extra parameters for now
- // that's why we are hard-coding the stride, padding, and dilation
- // not great ..
- // example:
- // a: 3 80 768 1
- // b: 3000 80 1 1
- // res: 3000 768 1 1
- // used in whisper
- GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
+ GGML_API struct ggml_tensor * ggml_conv_1d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b);
+ struct ggml_tensor * b,
+ int s0, // stride
+ int p0, // padding
+ int d0); // dilation
 
- // used in whisper
- GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
+ GGML_API struct ggml_tensor * ggml_conv_2d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b);
+ struct ggml_tensor * b,
+ int s0,
+ int s1,
+ int p0,
+ int p1,
+ int d0,
+ int d1);
 
- // kernel size is a->ne[0] x a->ne[1]
- // stride is equal to kernel size
- // padding is zero
- // example:
- // a: 16 16 3 768
- // b: 1024 1024 3 1
- // res: 64 64 768 1
- // used in sam
- GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+ // conv_1d with padding = half
+ // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+ GGML_API struct ggml_tensor* ggml_conv_1d_ph(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b);
+ struct ggml_tensor * b,
+ int s,
+ int d);
 
  GGML_API struct ggml_tensor * ggml_flash_attn(
  struct ggml_context * ctx,
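The hard-coded ggml_conv_1d_s1_ph/ggml_conv_1d_s2_ph/ggml_conv_2d_sk_p0 variants are replaced by general convolutions with explicit stride, padding, and dilation. A sketch of migrating a caller, with hypothetical kernel (conv_w) and input (inp) tensors:

    // old: cur = ggml_conv_1d_s1_ph(ctx, conv_w, inp);   // stride 1, "half" padding
    struct ggml_tensor * cur = ggml_conv_1d(ctx, conv_w, inp,
                                            /*s0=*/1, /*p0=*/conv_w->ne[0]/2, /*d0=*/1);
    // or use the wrapper, which fills in p0 = a->ne[0]/2:
    struct ggml_tensor * cur2 = ggml_conv_1d_ph(ctx, conv_w, inp, /*s=*/1, /*d=*/1);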
@@ -1488,25 +1515,24 @@ extern "C" {
  //
 
  #ifdef __cplusplus
- // restrict not standard in C++
+ // restrict not standard in C++
  #define GGML_RESTRICT
  #else
  #define GGML_RESTRICT restrict
  #endif
- typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- typedef void (*quantize_row_q_t) (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- typedef void (*vec_dot_q_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+ typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+ typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+ typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
 
  typedef struct {
- dequantize_row_q_t dequantize_row_q;
- quantize_row_q_t quantize_row_q;
- quantize_row_q_t quantize_row_q_reference;
- quantize_row_q_t quantize_row_q_dot;
- vec_dot_q_t vec_dot_q;
- enum ggml_type vec_dot_type;
- } quantize_fns_t;
-
- quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+ ggml_to_float_t to_float;
+ ggml_from_float_t from_float;
+ ggml_from_float_t from_float_reference;
+ ggml_vec_dot_t vec_dot;
+ enum ggml_type vec_dot_type;
+ } ggml_type_traits_t;
+
+ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
 
  #ifdef __cplusplus
  }
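The quantize_fns_t table is renamed to ggml_type_traits_t, with dequantize_row_q becoming to_float (llama.cpp is updated to the new names further down in this diff). A minimal sketch of the new lookup; the buffer names are hypothetical:

    ggml_type_traits_t traits = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
    if (traits.to_float != NULL) {
        traits.to_float(q_data, f32_out, n_elements);   // dequantize n_elements values
    }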
@@ -66,6 +66,7 @@ enum e_model {
  MODEL_65B,
  };
 
+ static const size_t kB = 1024;
  static const size_t MB = 1024*1024;
 
  // computed for n_ctx == 2048
@@ -129,6 +130,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  return k_sizes;
  }
 
+ // amount of VRAM needed per batch size to hold temporary results
+ // the values for 3b and 65b are not derived from testing but instead chosen conservatively
+ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
+ {
+ static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 512ull * kB },
+ { MODEL_7B, 512ull * kB },
+ { MODEL_13B, 640ull * kB },
+ { MODEL_30B, 768ull * kB },
+ { MODEL_65B, 1536ull * kB },
+ };
+ return k_sizes;
+ }
+
+ // amount of VRAM needed per batch size and context to hold temporary results
+ // the values for 3b and 65b are not derived from testing but instead chosen conservatively
+ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
+ {
+ static std::map<e_model, size_t> k_sizes = {
+ { MODEL_3B, 128ull },
+ { MODEL_7B, 128ull },
+ { MODEL_13B, 160ull },
+ { MODEL_30B, 208ull },
+ { MODEL_65B, 416ull },
+ };
+ return k_sizes;
+ }
+
  // default hparams (LLaMA 7B)
  struct llama_hparams {
  uint32_t n_vocab = 32000;
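These tables feed the new scratch-buffer sizing in llama_model_load_internal (shown further down): vram_scratch = n_batch * (base + n_ctx * per_context). A worked example using the 7B entries above; n_batch = 512 and n_ctx = 2048 are example settings, not values from this diff:

    const size_t kB = 1024, MB = 1024*1024;
    const size_t n_batch = 512, n_ctx = 2048;
    const size_t vram_scratch = n_batch * (512*kB + n_ctx * 128);   // 402653184 bytes
    // (vram_scratch + MB - 1) / MB == 384, i.e. 384 MB, versus the previous
    // flat n_batch * MB == 512 MB for the same settings.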
@@ -165,8 +194,8 @@ struct llama_layer {
  };
 
  struct llama_kv_cache {
- struct ggml_tensor * k;
- struct ggml_tensor * v;
+ struct ggml_tensor * k = NULL;
+ struct ggml_tensor * v = NULL;
 
  struct ggml_context * ctx = NULL;
 
@@ -253,7 +282,13 @@ struct llama_model {
 
  struct llama_context {
  llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-
+ #ifdef GGML_USE_METAL
+ ~llama_context() {
+ if (ctx_metal) {
+ ggml_metal_free(ctx_metal);
+ }
+ }
+ #endif
  std::mt19937 rng;
 
  bool has_evaluated_once = false;
@@ -446,9 +481,7 @@ struct llama_file_loader {
  std::string word = file.read_string(len);
 
  float score = 0.0f;
- if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) {
- file.read_raw(&score, sizeof(score));
- }
+ file.read_raw(&score, sizeof(score));
 
  vocab.token_to_id[word] = i;
 
@@ -1112,14 +1145,18 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
  ggml_cuda_set_scratch_size(0); // disable scratch
  } else {
- vram_scratch = n_batch * MB;
+ const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
+ const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
+ vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
  ggml_cuda_set_scratch_size(vram_scratch);
  if (n_gpu_layers > 0) {
- fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
- __func__, vram_scratch / MB);
+ fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
+ __func__, vram_scratch_base / kB, vram_scratch_per_context,
+ (vram_scratch + MB - 1) / MB); // round up
  }
  }
  #endif // GGML_USE_CUBLAS
+
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
@@ -1128,6 +1165,10 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
  }
  size_t vram_kv_cache = 0;
+
+ #ifdef GGML_USE_CUBLAS
+ const int max_backend_supported_layers = hparams.n_layer + 3;
+ const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
  if (n_gpu_layers > (int) hparams.n_layer + 1) {
  if (low_vram) {
  fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
@@ -1144,14 +1185,18 @@ static void llama_model_load_internal(
  vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
  }
  }
- const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+ #elif defined(GGML_USE_CLBLAST)
+ const int max_backend_supported_layers = hparams.n_layer + 1;
+ const int max_offloadable_layers = hparams.n_layer + 1;
+ #endif // GGML_USE_CUBLAS
+
  fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
- __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
+ __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
  fprintf(stderr, "%s: total VRAM used: %zu MB\n",
  __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
  #else
  (void) n_gpu_layers;
- #endif
+ #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  }
 
  // populate `tensors_by_name`
@@ -1860,10 +1905,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
  return;
  }
 
- const int64_t t_start_sample_us = ggml_time_us();
-
  llama_sample_softmax(ctx, candidates);
 
+ const int64_t t_start_sample_us = ggml_time_us();
+
  // Compute the cumulative probabilities
  float cum_sum = 0.0f;
  size_t last_idx = candidates->size;
@@ -1892,9 +1937,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
  return;
  }
 
- const int64_t t_start_sample_us = ggml_time_us();
-
  llama_sample_softmax(nullptr, candidates);
+ const int64_t t_start_sample_us = ggml_time_us();
 
  // Compute the first and second derivatives
  std::vector<float> first_derivatives(candidates->size - 1);
@@ -1946,11 +1990,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
  return;
  }
 
- const int64_t t_start_sample_us = ggml_time_us();
-
  // Compute the softmax of logits and calculate entropy
  llama_sample_softmax(nullptr, candidates);
 
+ const int64_t t_start_sample_us = ggml_time_us();
+
  float entropy = 0.0f;
  for (size_t i = 0; i < candidates->size; ++i) {
  entropy += -candidates->data[i].p * logf(candidates->data[i].p);
@@ -2119,13 +2163,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
 
  if (ctx) {
  ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- ctx->n_sample++;
  }
  return X;
  }
 
  llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
- assert(ctx);
  int64_t t_start_sample_us;
  t_start_sample_us = ggml_time_us();
 
@@ -2140,13 +2182,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
  candidates->size = 1;
  }
 
+ if (ctx) {
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+
  // Normalize the probabilities of the remaining words
  llama_sample_softmax(ctx, candidates);
 
  // Sample the next word X from the remaining words
- if (ctx) {
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
- }
  llama_token X = llama_sample_token(ctx, candidates);
  t_start_sample_us = ggml_time_us();
 
@@ -2214,10 +2257,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
  }
  float * f32_output = (float *) output.addr;
 
- quantize_fns_t qtype;
+ ggml_type_traits_t qtype;
  if (ggml_is_quantized(tensor.type)) {
- qtype = ggml_internal_get_quantize_fn(tensor.type);
- if (qtype.dequantize_row_q == NULL) {
+ qtype = ggml_internal_get_type_traits(tensor.type);
+ if (qtype.to_float == NULL) {
  throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
  }
  } else if (tensor.type != GGML_TYPE_F16) {
@@ -2228,7 +2271,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
  if (tensor.type == GGML_TYPE_F16) {
  ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
  } else if (ggml_is_quantized(tensor.type)) {
- qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+ qtype.to_float(tensor.data, f32_output, nelements);
  } else {
  LLAMA_ASSERT(false); // unreachable
  }
@@ -2253,7 +2296,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
  if (typ == GGML_TYPE_F16) {
  ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
  } else {
- qtype.dequantize_row_q(inbuf, outbuf, nels);
+ qtype.to_float(inbuf, outbuf, nels);
  }
  };
  workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
@@ -3219,7 +3262,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  return nread;
  }
 
- bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  llama_file file(path_session, "rb");
 
  // sanity checks
@@ -3273,6 +3316,15 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
  return true;
  }
 
+ bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+ try {
+ return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+ } catch (const std::exception & err) {
+ fprintf(stderr, "error loading session file: %s\n", err.what());
+ return false;
+ }
+ }
+
  bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
  llama_file file(path_session, "wb");
 
@@ -3428,23 +3480,35 @@ llama_token llama_token_nl() {
  return 13;
  }
 
+ struct llama_timings llama_get_timings(struct llama_context * ctx) {
+ struct llama_timings result = {
+ /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
+ /*.t_end_ms =*/ 1.00 * ggml_time_ms(),
+ /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
+ /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
+ /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
+ /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
 
- void llama_print_timings(struct llama_context * ctx) {
- const int64_t t_end_us = ggml_time_us();
+ /*.n_sample =*/ std::max(1, ctx->n_sample),
+ /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+ /*.n_eval =*/ std::max(1, ctx->n_eval),
+ };
 
- const int32_t n_sample = std::max(1, ctx->n_sample);
- const int32_t n_eval = std::max(1, ctx->n_eval);
- const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
+ return result;
+ }
+
+ void llama_print_timings(struct llama_context * ctx) {
+ const llama_timings timings = llama_get_timings(ctx);
 
  fprintf(stderr, "\n");
- fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
+ fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
  fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
+ __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
  fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
+ __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
  fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
- fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
+ __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
+ fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
  }
 
  void llama_reset_timings(struct llama_context * ctx) {
@@ -134,6 +134,20 @@ extern "C" {
  bool quantize_output_tensor; // quantize output.weight
  } llama_model_quantize_params;
 
+ // performance timing information
+ struct llama_timings {
+ double t_start_ms;
+ double t_end_ms;
+ double t_load_ms;
+ double t_sample_ms;
+ double t_p_eval_ms;
+ double t_eval_ms;
+
+ int32_t n_sample;
+ int32_t n_p_eval;
+ int32_t n_eval;
+ };
+
  LLAMA_API struct llama_context_params llama_context_default_params();
  LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
@@ -331,6 +345,7 @@ extern "C" {
  LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
  // Performance information
+ LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
  LLAMA_API void llama_reset_timings(struct llama_context * ctx);
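llama_get_timings exposes the numbers that llama_print_timings writes to stderr. A minimal sketch of reading them from C; ctx is assumed to be an already-created llama_context pointer:

    struct llama_timings t = llama_get_timings(ctx);
    fprintf(stderr, "eval: %.2f ms over %d runs (%.2f ms per token)\n",
            t.t_eval_ms, t.n_eval, t.t_eval_ms / t.n_eval);
    llama_reset_timings(ctx);   // start a fresh measurement window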
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.3.1'
+ VERSION = '0.3.2'
 
  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-b8c8dda'
+ LLAMA_CPP_VERSION = 'master-481f793'
  end
data/lib/llama_cpp.rb CHANGED
@@ -16,8 +16,22 @@ module LLaMACpp
  # @param prompt [String] The prompt to start generation with.
  # @param n_predict [Integer] The number of tokens to predict.
  # @param n_threads [Integer] The number of threads.
+ # @param n_keep [Integer] The number of tokens to keep in the context.
+ # @param n_batch [Integer] The number of tokens to process in a batch.
+ # @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
+ # @param repeat_penalty [Float] The repetition penalty.
+ # @param frequency [Float] The frequency penalty.
+ # @param presence [Float] The presence penalty.
+ # @param top_k [Integer] The number of tokens to consider for top-k sampling.
+ # @param top_p [Float] The probability threshold for nucleus sampling.
+ # @param tfs_z [Float] The z parameter for tail-free sampling.
+ # @param typical_p [Float] The probability for typical sampling.
+ # @param temperature [Float] The temperature for temperature sampling.
  # @return [String]
- def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+ def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+ n_predict: 128, n_threads: 1, n_keep: 10, n_batch: 512, repeat_last_n: 64,
+ repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
+ top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
 
@@ -31,19 +45,8 @@ module LLaMACpp
 
  embd = []
  n_consumed = 0
- n_keep = 10
  n_past = 0
  n_remain = n_predict
- repeat_last_n = 64
- repeat_penalty = 1.1
- frequency = 0.0
- presence = 0.0
- top_k = 40
- top_p = 0.95
- tfs_z = 1.0
- typical_p = 1.0
- temperature = 0.8
- n_batch = 512
  n_vocab = context.n_vocab
  output = []
 
data/sig/llama_cpp.rbs CHANGED
@@ -28,7 +28,10 @@ module LLaMACpp
 
  def self?.init_backend: (?numa: bool) -> void
  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
- def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
+ def self?.generate: (::LLaMACpp::Context, String,
+ ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
+ ?repeat_last_n: Integer, ?repeat_penalty: Float, ?frequency: Float, ?presence: Float,
+ ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
  def self?.print_system_info: () -> void
  def self?.token_bos: () -> Integer
  def self?.token_eos: () -> Integer
@@ -67,6 +70,20 @@ module LLaMACpp
  def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
  end
 
+ class Timings
+ public
+
+ def t_start_ms: () -> Float
+ def t_end_ms: () -> Float
+ def t_load_ms: () -> Float
+ def t_sample_ms: () -> Float
+ def t_p_eval_ms: () -> Float
+ def t_eval_ms: () -> Float
+ def n_sample: () -> Integer
+ def n_p_eval: () -> Integer
+ def n_eval: () -> Integer
+ end
+
  class Context
  public
 
@@ -80,6 +97,7 @@ module LLaMACpp
  def n_embd: () -> Integer
  def n_vocab: () -> Integer
  def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+ def timings: () -> ::LLaMACpp::Timings
  def print_timings: () -> void
  def reset_timings: () -> void
  def token_to_str: (Integer) -> String