llama_cpp 0.6.0 → 0.7.0

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -1,6 +1,8 @@
1
1
  #define LLAMA_API_INTERNAL
2
2
  #include "llama.h"
3
3
 
4
+ #include "unicode.h"
5
+
4
6
  #include "ggml.h"
5
7
 
6
8
  #include "ggml-alloc.h"
@@ -123,6 +125,27 @@ static void replace_all(std::string & s, const std::string & search, const std::
123
125
  }
124
126
  s = std::move(result);
125
127
  }
128
+
129
+ static bool is_float_close(float a, float b, float abs_tol) {
130
+ // Check for non-negative tolerance
131
+ if (abs_tol < 0.0) {
132
+ throw std::invalid_argument("Tolerance must be non-negative");
133
+ }
134
+
135
+ // Exact equality check
136
+ if (a == b) {
137
+ return true;
138
+ }
139
+
140
+ // Check for infinities
141
+ if (std::isinf(a) || std::isinf(b)) {
142
+ return false;
143
+ }
144
+
145
+ // Regular comparison using the provided absolute tolerance
146
+ return std::fabs(b - a) <= abs_tol;
147
+ }
148
+
126
149
  #ifdef GGML_USE_CPU_HBM
127
150
  #include <hbwmalloc.h>
128
151
  #endif
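
Note: the is_float_close helper added above is used further down to compare floating-point hyperparameters with an absolute tolerance instead of memcmp. A minimal standalone sketch of its behaviour (the function body is copied from the hunk above so the sketch compiles on its own; the EPSILON value mirrors the one used later in llama_hparams::operator!=):

    #include <cassert>
    #include <cmath>
    #include <stdexcept>

    static bool is_float_close(float a, float b, float abs_tol) {
        if (abs_tol < 0.0) {
            throw std::invalid_argument("Tolerance must be non-negative");
        }
        if (a == b) {
            return true;                    // exact match (also covers equal infinities)
        }
        if (std::isinf(a) || std::isinf(b)) {
            return false;                   // a finite value is never close to infinity
        }
        return std::fabs(b - a) <= abs_tol; // absolute-tolerance comparison
    }

    int main() {
        const float EPSILON = 1e-9;
        assert( is_float_close(1e-5f, 1e-5f + 1e-10f, EPSILON)); // difference below tolerance
        assert(!is_float_close(1e-5f, 2e-5f,          EPSILON)); // difference above tolerance
        assert(!is_float_close(INFINITY, 1e30f,       EPSILON)); // infinity vs. finite value
        return 0;
    }
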
@@ -163,6 +186,7 @@ enum llm_arch {
163
186
  LLM_ARCH_GPTNEOX,
164
187
  LLM_ARCH_MPT,
165
188
  LLM_ARCH_STARCODER,
189
+ LLM_ARCH_REFACT,
166
190
  LLM_ARCH_UNKNOWN,
167
191
  };
168
192
 
@@ -175,6 +199,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
175
199
  { LLM_ARCH_MPT, "mpt" },
176
200
  { LLM_ARCH_BAICHUAN, "baichuan" },
177
201
  { LLM_ARCH_STARCODER, "starcoder" },
202
+ { LLM_ARCH_REFACT, "refact" },
178
203
  };
179
204
 
180
205
  enum llm_kv {
@@ -395,6 +420,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
395
420
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
396
421
  },
397
422
  },
423
+ {
424
+ LLM_ARCH_REFACT,
425
+ {
426
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
427
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
428
+ { LLM_TENSOR_OUTPUT, "output" },
429
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
430
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
431
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
432
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
433
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
434
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
435
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
436
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
437
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
438
+ },
439
+ },
398
440
  {
399
441
  LLM_ARCH_UNKNOWN,
400
442
  {
@@ -943,7 +985,24 @@ struct llama_hparams {
943
985
  float rope_freq_scale_train;
944
986
 
945
987
  bool operator!=(const llama_hparams & other) const {
946
- return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
988
+ if (this->vocab_only != other.vocab_only) return true;
989
+ if (this->n_vocab != other.n_vocab) return true;
990
+ if (this->n_ctx_train != other.n_ctx_train) return true;
991
+ if (this->n_embd != other.n_embd) return true;
992
+ if (this->n_head != other.n_head) return true;
993
+ if (this->n_head_kv != other.n_head_kv) return true;
994
+ if (this->n_layer != other.n_layer) return true;
995
+ if (this->n_rot != other.n_rot) return true;
996
+ if (this->n_ff != other.n_ff) return true;
997
+
998
+ const float EPSILON = 1e-9;
999
+
1000
+ if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
1001
+ if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
1002
+ if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
1003
+ if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
1004
+
1005
+ return false;
947
1006
  }
948
1007
 
949
1008
  uint32_t n_gqa() const {
@@ -1071,6 +1130,10 @@ struct llama_vocab {
1071
1130
  id special_pad_id = -1;
1072
1131
 
1073
1132
  id linefeed_id = 13;
1133
+ id special_prefix_id = 32007;
1134
+ id special_middle_id = 32009;
1135
+ id special_suffix_id = 32008;
1136
+ id special_eot_id = 32010;
1074
1137
 
1075
1138
  int find_bpe_rank(std::string token_left, std::string token_right) const {
1076
1139
  replace_all(token_left, " ", "\u0120");
@@ -1272,8 +1335,8 @@ static bool llama_kv_cache_init(
1272
1335
  // find an empty slot of size "n_tokens" in the cache
1273
1336
  // updates the cache head
1274
1337
  static bool llama_kv_cache_find_slot(
1275
- struct llama_kv_cache & cache,
1276
- const struct llama_batch & batch) {
1338
+ struct llama_kv_cache & cache,
1339
+ const struct llama_batch & batch) {
1277
1340
  const uint32_t n_ctx = cache.size;
1278
1341
  const uint32_t n_tokens = batch.n_tokens;
1279
1342
 
@@ -1341,10 +1404,13 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0,
1341
1404
  }
1342
1405
 
1343
1406
  static void llama_kv_cache_seq_rm(
1344
- struct llama_kv_cache & cache,
1345
- llama_seq_id seq_id,
1346
- llama_pos p0,
1347
- llama_pos p1) {
1407
+ struct llama_kv_cache & cache,
1408
+ llama_seq_id seq_id,
1409
+ llama_pos p0,
1410
+ llama_pos p1) {
1411
+ if (p0 < 0) p0 = 0;
1412
+ if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1413
+
1348
1414
  for (uint32_t i = 0; i < cache.size; ++i) {
1349
1415
  if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1350
1416
  cache.cells[i].seq_id.erase(seq_id);
@@ -1356,11 +1422,14 @@ static void llama_kv_cache_seq_rm(
1356
1422
  }
1357
1423
 
1358
1424
  static void llama_kv_cache_seq_cp(
1359
- struct llama_kv_cache & cache,
1360
- llama_seq_id seq_id_src,
1361
- llama_seq_id seq_id_dst,
1362
- llama_pos p0,
1363
- llama_pos p1) {
1425
+ struct llama_kv_cache & cache,
1426
+ llama_seq_id seq_id_src,
1427
+ llama_seq_id seq_id_dst,
1428
+ llama_pos p0,
1429
+ llama_pos p1) {
1430
+ if (p0 < 0) p0 = 0;
1431
+ if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1432
+
1364
1433
  for (uint32_t i = 0; i < cache.size; ++i) {
1365
1434
  if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1366
1435
  cache.cells[i].seq_id.insert(seq_id_dst);
@@ -1378,11 +1447,14 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
1378
1447
  }
1379
1448
 
1380
1449
  static void llama_kv_cache_seq_shift(
1381
- struct llama_kv_cache & cache,
1382
- llama_seq_id seq_id,
1383
- llama_pos p0,
1384
- llama_pos p1,
1385
- llama_pos delta) {
1450
+ struct llama_kv_cache & cache,
1451
+ llama_seq_id seq_id,
1452
+ llama_pos p0,
1453
+ llama_pos p1,
1454
+ llama_pos delta) {
1455
+ if (p0 < 0) p0 = 0;
1456
+ if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1457
+
1386
1458
  for (uint32_t i = 0; i < cache.size; ++i) {
1387
1459
  if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1388
1460
  cache.cells[i].pos += delta;
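
Note: llama_kv_cache_seq_rm, llama_kv_cache_seq_cp and llama_kv_cache_seq_shift now clamp negative positions, so -1 can be passed to mean "from the start" or "to the end" of a sequence. A hedged sketch of how the corresponding public wrappers in llama.h might be called, assuming they keep the (ctx, seq_id, p0, p1[, delta]) shape and that ctx is a llama_context that has already decoded tokens into sequence 0:

    #include "llama.h"

    static void prune_and_fork(struct llama_context * ctx) {
        // Drop everything in sequence 0 from position 32 to the end of the sequence.
        llama_kv_cache_seq_rm(ctx, /*seq_id=*/0, /*p0=*/32, /*p1=*/-1);

        // Copy the whole remaining range of sequence 0 into sequence 1.
        llama_kv_cache_seq_cp(ctx, /*seq_id_src=*/0, /*seq_id_dst=*/1, /*p0=*/-1, /*p1=*/-1);

        // Shift every cell of sequence 1 forward by 8 positions.
        llama_kv_cache_seq_shift(ctx, /*seq_id=*/1, /*p0=*/-1, /*p1=*/-1, /*delta=*/8);
    }
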
@@ -1907,6 +1979,14 @@ static void llm_load_hparams(
1907
1979
  default: model.type = e_model::MODEL_UNKNOWN;
1908
1980
  }
1909
1981
  } break;
1982
+ case LLM_ARCH_REFACT:
1983
+ {
1984
+ GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
1985
+ switch (hparams.n_layer) {
1986
+ case 32: model.type = e_model::MODEL_1B; break;
1987
+ default: model.type = e_model::MODEL_UNKNOWN;
1988
+ }
1989
+ } break;
1910
1990
  default: (void)0;
1911
1991
  }
1912
1992
 
@@ -1971,6 +2051,7 @@ static void llm_load_vocab(
1971
2051
 
1972
2052
  for (int i = 0; i < n_merges; i++) {
1973
2053
  const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
2054
+ GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
1974
2055
 
1975
2056
  std::string first;
1976
2057
  std::string second;
@@ -2005,6 +2086,7 @@ static void llm_load_vocab(
2005
2086
 
2006
2087
  for (uint32_t i = 0; i < n_vocab; i++) {
2007
2088
  std::string word = gguf_get_arr_str(ctx, token_idx, i);
2089
+ GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
2008
2090
 
2009
2091
  vocab.token_to_id[word] = i;
2010
2092
 
@@ -2013,12 +2095,13 @@ static void llm_load_vocab(
2013
2095
  token_data.score = scores ? scores[i] : 0.0f;
2014
2096
  token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
2015
2097
  }
2098
+ GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
2016
2099
 
2017
2100
  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
2018
2101
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
2019
2102
  vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
2020
2103
  } else {
2021
- vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
2104
+ vocab.linefeed_id = llama_tokenize_internal(vocab, "\u010A", false)[0];
2022
2105
  }
2023
2106
 
2024
2107
  // special tokens
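
Note: the BPE path now looks up "\u010A" rather than a raw "\n", because under the GPT-2 byte-to-unicode mapping every non-printable byte is shifted into the range starting at U+0100, and 0x0A lands on U+010A. A small illustrative re-derivation of that mapping (the helper name here is made up; the crate's real implementation is the bytes_to_unicode_bpe function pulled in via the new unicode.h):

    #include <cstdint>
    #include <cstdio>

    // Maps a raw byte to the codepoint a GPT-2 style BPE vocabulary stores it as.
    static int gpt2_byte_to_codepoint(uint8_t b) {
        auto printable = [](uint8_t c) {
            return (c >= 0x21 && c <= 0x7E) || (c >= 0xA1 && c <= 0xAC) || (c >= 0xAE);
        };
        if (printable(b)) {
            return b;              // printable bytes keep their own codepoint
        }
        int offset = 0;            // rank of b among the non-printable bytes
        for (int c = 0; c < b; ++c) {
            if (!printable((uint8_t) c)) {
                ++offset;
            }
        }
        return 0x100 + offset;     // shifted into U+0100 and up
    }

    int main() {
        std::printf("0x0A ('\\n') -> U+%04X\n", gpt2_byte_to_codepoint(0x0A)); // U+010A
        std::printf("0x20 (' ')  -> U+%04X\n", gpt2_byte_to_codepoint(0x20)); // U+0120 'Ġ'
        return 0;
    }
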
@@ -2141,6 +2224,7 @@ static void llm_load_tensors(
2141
2224
  const auto tn = LLM_TN(model.arch);
2142
2225
  switch (model.arch) {
2143
2226
  case LLM_ARCH_LLAMA:
2227
+ case LLM_ARCH_REFACT:
2144
2228
  {
2145
2229
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2146
2230
 
@@ -3334,6 +3418,353 @@ static struct ggml_cgraph * llm_build_baichaun(
3334
3418
  return gf;
3335
3419
  }
3336
3420
 
3421
+ static struct ggml_cgraph * llm_build_refact(
3422
+ llama_context & lctx,
3423
+ const llama_batch & batch) {
3424
+ const auto & model = lctx.model;
3425
+ const auto & hparams = model.hparams;
3426
+ const auto & cparams = lctx.cparams;
3427
+
3428
+ const auto & kv_self = lctx.kv_self;
3429
+
3430
+ GGML_ASSERT(!!kv_self.ctx);
3431
+
3432
+ const int64_t n_embd = hparams.n_embd;
3433
+ const int64_t n_layer = hparams.n_layer;
3434
+ const int64_t n_ctx = cparams.n_ctx;
3435
+ const int64_t n_head = hparams.n_head;
3436
+ const int64_t n_head_kv = hparams.n_head_kv;
3437
+ const int64_t n_embd_head = hparams.n_embd_head();
3438
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
3439
+
3440
+ const float norm_rms_eps = hparams.f_norm_rms_eps;
3441
+
3442
+ const int n_gpu_layers = model.n_gpu_layers;
3443
+
3444
+ const int32_t n_tokens = batch.n_tokens;
3445
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3446
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3447
+
3448
+ // printf("n_kv = %d\n", n_kv);
3449
+
3450
+ auto & buf_compute = lctx.buf_compute;
3451
+
3452
+ struct ggml_init_params params = {
3453
+ /*.mem_size =*/ buf_compute.size,
3454
+ /*.mem_buffer =*/ buf_compute.data,
3455
+ /*.no_alloc =*/ false,
3456
+ };
3457
+
3458
+ params.no_alloc = true;
3459
+
3460
+ struct ggml_context * ctx0 = ggml_init(params);
3461
+
3462
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
3463
+
3464
+ struct ggml_tensor * cur;
3465
+ struct ggml_tensor * inpL;
3466
+
3467
+ if (batch.token) {
3468
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3469
+
3470
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
3471
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3472
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
3473
+ }
3474
+ ggml_set_name(inp_tokens, "inp_tokens");
3475
+
3476
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
3477
+ } else {
3478
+ #ifdef GGML_USE_MPI
3479
+ GGML_ASSERT(false && "not implemented");
3480
+ #endif
3481
+
3482
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
3483
+
3484
+ ggml_allocr_alloc(lctx.alloc, inpL);
3485
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3486
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
3487
+ }
3488
+ }
3489
+
3490
+ const int i_gpu_start = n_layer - n_gpu_layers;
3491
+ (void) i_gpu_start;
3492
+
3493
+ // offload functions set the tensor output backend to GPU
3494
+ // tensors are GPU-accelerated if any input or the output has been offloaded
3495
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
3496
+ offload_func_t offload_func_kq = llama_nop;
3497
+ offload_func_t offload_func_v = llama_nop;
3498
+
3499
+ #ifdef GGML_USE_CUBLAS
3500
+ if (n_gpu_layers > n_layer) {
3501
+ offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
3502
+ }
3503
+ if (n_gpu_layers > n_layer + 1) {
3504
+ offload_func_v = ggml_cuda_assign_buffers_no_alloc;
3505
+ }
3506
+ if (n_gpu_layers > n_layer + 2) {
3507
+ offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
3508
+ }
3509
+ #endif // GGML_USE_CUBLAS
3510
+
3511
+ // KQ_scale
3512
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3513
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3514
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
3515
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3516
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
3517
+ }
3518
+
3519
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3520
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3521
+ offload_func_kq(KQ_mask);
3522
+ ggml_set_name(KQ_mask, "KQ_mask");
3523
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3524
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3525
+ float * data = (float *) KQ_mask->data;
3526
+ memset(data, 0, ggml_nbytes(KQ_mask));
3527
+
3528
+ for (int h = 0; h < 1; ++h) {
3529
+ for (int j = 0; j < n_tokens; ++j) {
3530
+ const llama_pos pos = batch.pos[j];
3531
+ const llama_seq_id seq_id = batch.seq_id[j];
3532
+
3533
+ for (int i = 0; i < n_kv; ++i) {
3534
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3535
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3536
+ }
3537
+ }
3538
+ }
3539
+ }
3540
+ }
3541
+
3542
+ for (int il = 0; il < n_layer; ++il) {
3543
+ ggml_format_name(inpL, "layer_inp_%d", il);
3544
+
3545
+ offload_func_t offload_func = llama_nop;
3546
+
3547
+ #ifdef GGML_USE_CUBLAS
3548
+ if (il >= i_gpu_start) {
3549
+ offload_func = ggml_cuda_assign_buffers_no_alloc;
3550
+ }
3551
+ #endif // GGML_USE_CUBLAS
3552
+
3553
+ struct ggml_tensor * inpSA = inpL;
3554
+
3555
+ // norm
3556
+ {
3557
+ cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
3558
+ offload_func(cur);
3559
+ ggml_set_name(cur, "rms_norm_0");
3560
+
3561
+ // cur = cur*attn_norm(broadcasted)
3562
+ cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
3563
+ offload_func(cur);
3564
+ ggml_set_name(cur, "attention_norm_0");
3565
+ }
3566
+
3567
+ // self-attention
3568
+ {
3569
+ // compute Q and K
3570
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
3571
+ offload_func_kq(tmpk);
3572
+ ggml_set_name(tmpk, "tmpk");
3573
+
3574
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
3575
+ offload_func_kq(tmpq);
3576
+ ggml_set_name(tmpq, "tmpq");
3577
+
3578
+ struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens);
3579
+ offload_func_kq(Kcur);
3580
+ ggml_set_name(Kcur, "Kcur");
3581
+
3582
+ struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
3583
+ offload_func_kq(Qcur);
3584
+ ggml_set_name(Qcur, "Qcur");
3585
+
3586
+ // store key and value to memory
3587
+ {
3588
+ // compute the transposed [n_tokens, n_embd] V matrix
3589
+
3590
+ struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
3591
+ offload_func_v(tmpv);
3592
+ ggml_set_name(tmpv, "tmpv");
3593
+
3594
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
3595
+ offload_func_v(Vcur);
3596
+ ggml_set_name(Vcur, "Vcur");
3597
+
3598
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3599
+ offload_func_kq(k);
3600
+ ggml_set_name(k, "k");
3601
+
3602
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
3603
+ ( n_ctx)*ggml_element_size(kv_self.v),
3604
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
3605
+ offload_func_v(v);
3606
+ ggml_set_name(v, "v");
3607
+
3608
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
3609
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
3610
+ }
3611
+
3612
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
3613
+ offload_func_kq(Q);
3614
+ ggml_set_name(Q, "Q");
3615
+
3616
+ struct ggml_tensor * K =
3617
+ ggml_view_3d(ctx0, kv_self.k,
3618
+ n_embd_head, n_kv, n_head_kv,
3619
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3620
+ ggml_element_size(kv_self.k)*n_embd_head,
3621
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
3622
+ offload_func_kq(K);
3623
+ ggml_set_name(K, "K");
3624
+
3625
+ // K * Q
3626
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
3627
+ offload_func_kq(KQ);
3628
+ ggml_set_name(KQ, "KQ");
3629
+
3630
+ // KQ_scaled = KQ / sqrt(n_embd_head)
3631
+ // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
3632
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
3633
+ offload_func_kq(KQ_scaled);
3634
+ ggml_set_name(KQ_scaled, "KQ_scaled");
3635
+
3636
+ // KQ_masked = mask_past(KQ_scaled)
3637
+ struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
3638
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
3639
+
3640
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
3641
+ offload_func_kq(KQ_masked);
3642
+ ggml_set_name(KQ_masked, "KQ_masked");
3643
+
3644
+ // KQ = soft_max(KQ_masked)
3645
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
3646
+ offload_func_v(KQ_soft_max);
3647
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
3648
+
3649
+ // split cached V into n_head heads
3650
+ struct ggml_tensor * V =
3651
+ ggml_view_3d(ctx0, kv_self.v,
3652
+ n_kv, n_embd_head, n_head_kv,
3653
+ ggml_element_size(kv_self.v)*n_ctx,
3654
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3655
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
3656
+ offload_func_v(V);
3657
+ ggml_set_name(V, "V");
3658
+
3659
+ #if 1
3660
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
3661
+ offload_func_v(KQV);
3662
+ ggml_set_name(KQV, "KQV");
3663
+ #else
3664
+ // make V contiguous in memory to speed up the matmul, however we waste time on the copy
3665
+ // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
3666
+ // is there a better way?
3667
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
3668
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
3669
+ #endif
3670
+
3671
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
3672
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3673
+ offload_func_v(KQV_merged);
3674
+ ggml_set_name(KQV_merged, "KQV_merged");
3675
+
3676
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
3677
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3678
+ offload_func_v(cur);
3679
+ ggml_set_name(cur, "KQV_merged_contiguous");
3680
+
3681
+ // projection (no bias)
3682
+ cur = ggml_mul_mat(ctx0,
3683
+ model.layers[il].wo,
3684
+ cur);
3685
+ offload_func(cur);
3686
+ ggml_set_name(cur, "result_wo");
3687
+ }
3688
+
3689
+ struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
3690
+ offload_func(inpFF);
3691
+ ggml_set_name(inpFF, "inpFF");
3692
+
3693
+ // feed-forward network
3694
+ {
3695
+ // norm
3696
+ {
3697
+ cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
3698
+ offload_func(cur);
3699
+ ggml_set_name(cur, "rms_norm_1");
3700
+
3701
+ // cur = cur*ffn_norm(broadcasted)
3702
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
3703
+ offload_func(cur);
3704
+ ggml_set_name(cur, "ffn_norm");
3705
+ }
3706
+
3707
+ struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
3708
+ model.layers[il].w3,
3709
+ cur);
3710
+ offload_func(tmp);
3711
+ ggml_set_name(tmp, "result_w3");
3712
+
3713
+ cur = ggml_mul_mat(ctx0,
3714
+ model.layers[il].w1,
3715
+ cur);
3716
+ offload_func(cur);
3717
+ ggml_set_name(cur, "result_w1");
3718
+
3719
+ // SILU activation
3720
+ cur = ggml_silu(ctx0, cur);
3721
+ offload_func(cur);
3722
+ ggml_set_name(cur, "silu");
3723
+
3724
+ cur = ggml_mul(ctx0, cur, tmp);
3725
+ offload_func(cur);
3726
+ ggml_set_name(cur, "silu_x_result_w3");
3727
+
3728
+ cur = ggml_mul_mat(ctx0,
3729
+ model.layers[il].w2,
3730
+ cur);
3731
+ offload_func(cur);
3732
+ ggml_set_name(cur, "result_w2");
3733
+ }
3734
+
3735
+ cur = ggml_add(ctx0, cur, inpFF);
3736
+ offload_func(cur);
3737
+ ggml_set_name(cur, "inpFF_+_result_w2");
3738
+
3739
+ // input for next layer
3740
+ inpL = cur;
3741
+ }
3742
+
3743
+ cur = inpL;
3744
+
3745
+ // norm
3746
+ {
3747
+ cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
3748
+ offload_func_nr(cur);
3749
+ ggml_set_name(cur, "rms_norm_2");
3750
+
3751
+ // cur = cur*norm(broadcasted)
3752
+ cur = ggml_mul(ctx0, cur, model.output_norm);
3753
+ // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
3754
+ ggml_set_name(cur, "result_norm");
3755
+ }
3756
+
3757
+ // lm_head
3758
+ cur = ggml_mul_mat(ctx0, model.output, cur);
3759
+ ggml_set_name(cur, "result_output");
3760
+
3761
+ ggml_build_forward_expand(gf, cur);
3762
+
3763
+ ggml_free(ctx0);
3764
+
3765
+ return gf;
3766
+ }
3767
+
3337
3768
  static struct ggml_cgraph * llm_build_falcon(
3338
3769
  llama_context & lctx,
3339
3770
  const llama_batch & batch) {
@@ -3974,6 +4405,10 @@ static struct ggml_cgraph * llama_build_graph(
3974
4405
  {
3975
4406
  result = llm_build_starcoder(lctx, batch);
3976
4407
  } break;
4408
+ case LLM_ARCH_REFACT:
4409
+ {
4410
+ result = llm_build_refact(lctx, batch);
4411
+ } break;
3977
4412
  default:
3978
4413
  GGML_ASSERT(false);
3979
4414
  }
@@ -4107,7 +4542,8 @@ static int llama_decode_internal(
4107
4542
  // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
4108
4543
  const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
4109
4544
  model.arch == LLM_ARCH_BAICHUAN ||
4110
- model.arch == LLM_ARCH_FALCON;
4545
+ model.arch == LLM_ARCH_FALCON ||
4546
+ model.arch == LLM_ARCH_REFACT;
4111
4547
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
4112
4548
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
4113
4549
  n_threads = 1;
@@ -4227,18 +4663,41 @@ static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
4227
4663
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
4228
4664
  }
4229
4665
 
4230
- static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
4666
+ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
4667
+ return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
4668
+ }
4669
+
4670
+ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
4231
4671
  GGML_ASSERT(llama_is_byte_token(vocab, id));
4232
4672
  const auto& token_data = vocab.id_to_token.at(id);
4233
- auto buf = token_data.text.substr(3, 2);
4234
- return strtol(buf.c_str(), NULL, 16);
4673
+ switch (llama_vocab_get_type(vocab)) {
4674
+ case LLAMA_VOCAB_TYPE_SPM: {
4675
+ auto buf = token_data.text.substr(3, 2);
4676
+ return strtol(buf.c_str(), NULL, 16);
4677
+ }
4678
+ case LLAMA_VOCAB_TYPE_BPE: {
4679
+ GGML_ASSERT(false);
4680
+ return unicode_to_bytes_bpe(token_data.text);
4681
+ }
4682
+ default:
4683
+ GGML_ASSERT(false);
4684
+ }
4235
4685
  }
4236
4686
 
4237
4687
  static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
4238
- char buf[7];
4239
- int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
4240
- GGML_ASSERT(0 <= result && result < 7);
4241
- return vocab.token_to_id.at(buf);
4688
+ switch (llama_vocab_get_type(vocab)) {
4689
+ case LLAMA_VOCAB_TYPE_SPM: {
4690
+ char buf[7];
4691
+ int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
4692
+ GGML_ASSERT(0 <= result && result < 7);
4693
+ return vocab.token_to_id.at(buf);
4694
+ }
4695
+ case LLAMA_VOCAB_TYPE_BPE: {
4696
+ return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
4697
+ }
4698
+ default:
4699
+ GGML_ASSERT(false);
4700
+ }
4242
4701
  }
4243
4702
 
4244
4703
  static void llama_escape_whitespace(std::string & text) {
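
Note: llama_token_to_byte / llama_byte_to_token now branch on the vocabulary type; the SPM branch keeps the "<0xNN>" literal-token convention, while the BPE branch goes through the byte-to-unicode mapping described earlier. A tiny standalone round-trip of the SPM text format, using the same snprintf/strtol pattern as the code above:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <string>

    int main() {
        const uint8_t ch = 0x0A;                           // a raw newline byte
        char buf[7];
        const int n = std::snprintf(buf, sizeof(buf), "<0x%02X>", ch);
        assert(n == 6);                                    // produces "<0x0A>"
        const std::string text = buf;
        const uint8_t back = (uint8_t) std::strtol(text.substr(3, 2).c_str(), NULL, 16);
        assert(back == ch);                                // hex pair parses back to the byte
        std::printf("%s -> 0x%02X\n", text.c_str(), back);
        return 0;
    }
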
@@ -4518,15 +4977,9 @@ struct llm_tokenizer_bpe {
4518
4977
  std::string byte_str(1, *j);
4519
4978
  auto token_multibyte = vocab.token_to_id.find(byte_str);
4520
4979
  if (token_multibyte == vocab.token_to_id.end()) {
4521
- try {
4522
- llama_token token_byte = llama_byte_to_token(vocab, *j);
4523
- output.push_back(token_byte);
4524
- } catch (const std::out_of_range & err) {
4525
- fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
4526
- }
4527
- } else {
4528
- output.push_back((*token_multibyte).second);
4980
+ throw std::runtime_error("ERROR: byte not found in vocab");
4529
4981
  }
4982
+ output.push_back((*token_multibyte).second);
4530
4983
  }
4531
4984
  } else {
4532
4985
  output.push_back((*token).second);
@@ -4563,23 +5016,144 @@ private:
4563
5016
  work_queue.push(bigram);
4564
5017
  }
4565
5018
 
4566
- // probably not 100% correct
4567
- static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
4568
- std::vector<std::string> words;
5019
+ std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
5020
+ std::vector<std::string> bpe_words;
5021
+ std::vector<std::string> bpe_encoded_words;
5022
+
5023
+ std::string token = "";
5024
+ // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
5025
+ bool collecting_numeric = false;
5026
+ bool collecting_letter = false;
5027
+ bool collecting_special = false;
5028
+ bool collecting_whitespace_lookahead = false;
5029
+ bool collecting = false;
5030
+
5031
+ std::vector<std::string> text_utf;
5032
+ text_utf.reserve(text.size());
5033
+ bpe_words.reserve(text.size());
5034
+ bpe_encoded_words.reserve(text.size());
5035
+
5036
+ auto cps = codepoints_from_utf8(text);
5037
+ for (size_t i = 0; i < cps.size(); ++i)
5038
+ text_utf.emplace_back(codepoint_to_utf8(cps[i]));
5039
+
5040
+ for (int i = 0; i < (int)text_utf.size(); i++) {
5041
+ const std::string & utf_char = text_utf[i];
5042
+ bool split_condition = false;
5043
+ // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
5044
+ int bytes_remain = text_utf.size() - i;
5045
+ // forward backward lookups
5046
+ const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
5047
+ const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
5048
+
5049
+ // handling contractions
5050
+ if (!split_condition && bytes_remain >= 2) {
5051
+ // 's|'t|'m|'d
5052
+ if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
5053
+ split_condition = true;
5054
+ }
5055
+ if (split_condition) {
5056
+ if (token.size()) {
5057
+ bpe_words.emplace_back(token); // push previous content as token
5058
+ }
5059
+ token = utf_char + utf_char_next;
5060
+ bpe_words.emplace_back(token);
5061
+ token = "";
5062
+ i++;
5063
+ continue;
5064
+ }
5065
+ }
5066
+ if (!split_condition && bytes_remain >= 3) {
5067
+ // 're|'ve|'ll
5068
+ if (utf_char == "\'" && (
5069
+ (utf_char_next == "r" || utf_char_next_next == "e") ||
5070
+ (utf_char_next == "v" || utf_char_next_next == "e") ||
5071
+ (utf_char_next == "l" || utf_char_next_next == "l"))
5072
+ ) {
5073
+ split_condition = true;
5074
+ }
5075
+ if (split_condition) {
5076
+ // current token + next token can be defined
5077
+ if (token.size()) {
5078
+ bpe_words.emplace_back(token); // push previous content as token
5079
+ }
5080
+ token = utf_char + utf_char_next + utf_char_next_next;
5081
+ bpe_words.emplace_back(token); // the contraction
5082
+ token = "";
5083
+ i += 2;
5084
+ continue;
5085
+ }
5086
+ }
5087
+
5088
+ if (!split_condition && !collecting) {
5089
+ if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
5090
+ collecting_letter = true;
5091
+ collecting = true;
5092
+ }
5093
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
5094
+ collecting_numeric = true;
5095
+ collecting = true;
5096
+ }
5097
+ else if (
5098
+ ((codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
5099
+ (!token.size() && utf_char == " " && codepoint_type(utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
5100
+ ) {
5101
+ collecting_special = true;
5102
+ collecting = true;
5103
+ }
5104
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
5105
+ collecting_whitespace_lookahead = true;
5106
+ collecting = true;
5107
+ }
5108
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
5109
+ split_condition = true;
5110
+ }
5111
+ }
5112
+ else if (!split_condition && collecting) {
5113
+ if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) {
5114
+ split_condition = true;
5115
+ }
5116
+ else if (collecting_numeric && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
5117
+ split_condition = true;
5118
+ }
5119
+ else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
5120
+ split_condition = true;
5121
+ }
5122
+ else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
5123
+ split_condition = true;
5124
+ }
5125
+ }
4569
5126
 
4570
- // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
4571
- const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
4572
- const std::regex re(pattern);
5127
+ if (utf_char_next == "") {
5128
+ split_condition = true; // final
5129
+ token += utf_char;
5130
+ }
4573
5131
 
4574
- auto words_begin = std::sregex_iterator(text.begin(), text.end(), re);
4575
- auto words_end = std::sregex_iterator();
4576
- auto n_words = std::distance(words_begin, words_end);
4577
- words.reserve(n_words);
4578
- for (auto it = words_begin; it != words_end; ++it) {
4579
- words.push_back(it->str());
5132
+ if (split_condition) {
5133
+ if (token.size()) {
5134
+ bpe_words.emplace_back(token);
5135
+ }
5136
+ token = utf_char;
5137
+ collecting = false;
5138
+ collecting_letter = false;
5139
+ collecting_numeric = false;
5140
+ collecting_special = false;
5141
+ collecting_whitespace_lookahead = false;
5142
+ }
5143
+ else {
5144
+ token += utf_char;
5145
+ }
5146
+ }
5147
+
5148
+ for (std::string & word : bpe_words) {
5149
+ std::string encoded_token = "";
5150
+ for (char & c : word) {
5151
+ encoded_token += bytes_to_unicode_bpe(c);
5152
+ }
5153
+ bpe_encoded_words.emplace_back(encoded_token);
4580
5154
  }
4581
- return words;
4582
5155
 
5156
+ return bpe_encoded_words;
4583
5157
  }
4584
5158
 
4585
5159
  const llama_vocab & vocab;
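
Note: the rewritten bpe_gpt2_preprocess replaces the std::regex-based splitter with a hand-rolled scanner over UTF-8 codepoints, then remaps every byte of every piece through the byte-to-unicode table. A hedged walk-through of what it produces for a simple ASCII input (traced by hand against the code above; spaces become U+0120 'Ġ' after encoding):

    // Input:               "It's 3 cats"
    // After splitting:     "It"  "'s"  " 3"  " cats"
    // After byte encoding: "It"  "'s"  "Ġ3"  "Ġcats"
    //
    // Contractions, a leading space plus digits, and a leading space plus letters
    // each end up in their own piece, matching the GPT-2 regex quoted in the comment.
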
@@ -6022,7 +6596,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
6022
6596
  nthread = std::thread::hardware_concurrency();
6023
6597
  }
6024
6598
 
6025
- llama_model_loader ml(fname_inp, /*use_mmap*/ false);
6599
+ // mmap consistently increases speed Linux, and also increases speed on Windows with
6600
+ // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
6601
+ #if defined(__linux__) || defined(_WIN32)
6602
+ constexpr bool use_mmap = true;
6603
+ #else
6604
+ constexpr bool use_mmap = false;
6605
+ #endif
6606
+
6607
+ llama_model_loader ml(fname_inp, use_mmap);
6608
+ if (ml.use_mmap) {
6609
+ ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
6610
+ }
6026
6611
 
6027
6612
  llama_model model;
6028
6613
  llm_load_arch(ml, model);
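
Note: the quantizer now memory-maps the input file on Linux and Windows (with prefetch disabled) instead of always streaming tensor data through a scratch buffer. The public entry point is unchanged; a hedged usage sketch, assuming the llama_model_quantize / llama_model_quantize_default_params API exported from llama.h at this version:

    #include "llama.h"
    #include <cstdio>

    int main(int argc, char ** argv) {
        if (argc < 3) {
            std::fprintf(stderr, "usage: %s <input.gguf> <output.gguf>\n", argv[0]);
            return 1;
        }

        llama_backend_init(/*numa=*/false);

        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M;  // target quantization type
        params.nthread = 0;                          // 0 -> hardware_concurrency()

        // On Linux/Windows the loader will now mmap <input.gguf> internally.
        const int rc = llama_model_quantize(argv[1], argv[2], &params);

        llama_backend_free();
        return rc;
    }
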
@@ -6100,10 +6685,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
6100
6685
 
6101
6686
  const std::string name = ggml_get_name(tensor);
6102
6687
 
6103
- if (read_data.size() < ggml_nbytes(tensor)) {
6104
- read_data.resize(ggml_nbytes(tensor));
6688
+ if (!ml.use_mmap) {
6689
+ if (read_data.size() < ggml_nbytes(tensor)) {
6690
+ read_data.resize(ggml_nbytes(tensor));
6691
+ }
6692
+ tensor->data = read_data.data();
6105
6693
  }
6106
- tensor->data = read_data.data();
6107
6694
  ml.load_data_for(tensor);
6108
6695
 
6109
6696
  LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
@@ -6738,13 +7325,14 @@ struct llama_context * llama_new_context_with_model(
6738
7325
 
6739
7326
  #ifdef GGML_USE_METAL
6740
7327
  if (model->n_gpu_layers > 0) {
7328
+ ggml_metal_log_set_callback(llama_log_callback_default, NULL);
7329
+
6741
7330
  ctx->ctx_metal = ggml_metal_init(1);
6742
7331
  if (!ctx->ctx_metal) {
6743
7332
  LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
6744
7333
  llama_free(ctx);
6745
7334
  return NULL;
6746
7335
  }
6747
- ggml_metal_log_set_callback(llama_log_callback_default, NULL);
6748
7336
  //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
6749
7337
  //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6750
7338
  }
@@ -6872,6 +7460,10 @@ int llama_n_embd(const struct llama_model * model) {
6872
7460
  return model->hparams.n_embd;
6873
7461
  }
6874
7462
 
7463
+ float llama_rope_freq_scale_train(const struct llama_model * model) {
7464
+ return model->hparams.rope_freq_scale_train;
7465
+ }
7466
+
6875
7467
  int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
6876
7468
  return snprintf(buf, buf_size, "%s %s %s",
6877
7469
  llama_model_arch_name(model->arch).c_str(),
@@ -7039,16 +7631,6 @@ struct llama_data_file_context : llama_data_context {
7039
7631
  *
7040
7632
  */
7041
7633
  static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
7042
- // TODO: does not support multi-sequence states
7043
- {
7044
- const auto & kv_self = ctx->kv_self;
7045
- for (uint32_t i = 0; i < kv_self.head; ++i) {
7046
- GGML_ASSERT(kv_self.cells[i].pos == (int32_t) i);
7047
- GGML_ASSERT(kv_self.cells[i].seq_id.size() == 1);
7048
- GGML_ASSERT(kv_self.cells[i].has_seq_id(0));
7049
- }
7050
- }
7051
-
7052
7634
  // copy rng
7053
7635
  {
7054
7636
  std::stringstream rng_ss;
@@ -7101,36 +7683,38 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
7101
7683
  const auto & hparams = ctx->model.hparams;
7102
7684
  const auto & cparams = ctx->cparams;
7103
7685
 
7104
- const int n_layer = hparams.n_layer;
7105
- const int n_embd = hparams.n_embd_gqa();
7106
- const int n_ctx = cparams.n_ctx;
7686
+ const auto n_layer = hparams.n_layer;
7687
+ const auto n_embd = hparams.n_embd_gqa();
7688
+ const auto n_ctx = cparams.n_ctx;
7107
7689
 
7108
- const size_t kv_size = kv_self.buf.size;
7109
- const int kv_ntok = kv_self.head;
7690
+ const size_t kv_buf_size = kv_self.buf.size;
7691
+ const uint32_t kv_head = kv_self.head;
7692
+ const uint32_t kv_size = kv_self.size;
7110
7693
 
7111
- data_ctx->write(&kv_size, sizeof(kv_size));
7112
- data_ctx->write(&kv_ntok, sizeof(kv_ntok));
7694
+ data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
7695
+ data_ctx->write(&kv_head, sizeof(kv_head));
7696
+ data_ctx->write(&kv_size, sizeof(kv_size));
7113
7697
 
7114
- if (kv_size) {
7698
+ if (kv_buf_size) {
7115
7699
  const size_t elt_size = ggml_element_size(kv_self.k);
7116
7700
 
7117
7701
  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
7118
7702
  ggml_cgraph gf{};
7119
7703
 
7120
- ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
7704
+ ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
7121
7705
  std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
7122
7706
  kout3d->data = kout3d_data.data();
7123
7707
 
7124
- ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
7708
+ ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
7125
7709
  std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
7126
7710
  vout3d->data = vout3d_data.data();
7127
7711
 
7128
7712
  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
7129
- n_embd, kv_ntok, n_layer,
7713
+ n_embd, kv_head, n_layer,
7130
7714
  elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
7131
7715
 
7132
7716
  ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
7133
- kv_ntok, n_embd, n_layer,
7717
+ kv_head, n_embd, n_layer,
7134
7718
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
7135
7719
 
7136
7720
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
@@ -7144,6 +7728,20 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
7144
7728
  data_ctx->write(kout3d_data.data(), kout3d_data.size());
7145
7729
  data_ctx->write(vout3d_data.data(), vout3d_data.size());
7146
7730
  }
7731
+
7732
+ for (uint32_t i = 0; i < kv_size; ++i) {
7733
+ const auto & cell = kv_self.cells[i];
7734
+
7735
+ const llama_pos pos = cell.pos;
7736
+ const size_t seq_id_size = cell.seq_id.size();
7737
+
7738
+ data_ctx->write(&pos, sizeof(pos));
7739
+ data_ctx->write(&seq_id_size, sizeof(seq_id_size));
7740
+
7741
+ for (auto seq_id : cell.seq_id) {
7742
+ data_ctx->write(&seq_id, sizeof(seq_id));
7743
+ }
7744
+ }
7147
7745
  }
7148
7746
  }
7149
7747
 
@@ -7215,34 +7813,36 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
7215
7813
  const int n_embd = hparams.n_embd_gqa();
7216
7814
  const int n_ctx = cparams.n_ctx;
7217
7815
 
7218
- size_t kv_size;
7219
- int kv_ntok;
7816
+ size_t kv_buf_size;
7817
+ uint32_t kv_head;
7818
+ uint32_t kv_size;
7220
7819
 
7221
- memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
7222
- memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);
7820
+ memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
7821
+ memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
7822
+ memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
7223
7823
 
7224
- if (kv_size) {
7225
- GGML_ASSERT(kv_self.buf.size == kv_size);
7824
+ if (kv_buf_size) {
7825
+ GGML_ASSERT(kv_self.buf.size == kv_buf_size);
7226
7826
 
7227
7827
  const size_t elt_size = ggml_element_size(kv_self.k);
7228
7828
 
7229
7829
  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
7230
7830
  ggml_cgraph gf{};
7231
7831
 
7232
- ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
7832
+ ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
7233
7833
  kin3d->data = (void *) inp;
7234
7834
  inp += ggml_nbytes(kin3d);
7235
7835
 
7236
- ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
7836
+ ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
7237
7837
  vin3d->data = (void *) inp;
7238
7838
  inp += ggml_nbytes(vin3d);
7239
7839
 
7240
7840
  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
7241
- n_embd, kv_ntok, n_layer,
7841
+ n_embd, kv_head, n_layer,
7242
7842
  elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
7243
7843
 
7244
7844
  ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
7245
- kv_ntok, n_embd, n_layer,
7845
+ kv_head, n_embd, n_layer,
7246
7846
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
7247
7847
 
7248
7848
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
@@ -7252,8 +7852,27 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
7252
7852
  ggml_free(cpy_ctx);
7253
7853
  }
7254
7854
 
7255
- ctx->kv_self.head = kv_ntok;
7855
+ ctx->kv_self.head = kv_head;
7256
7856
  ctx->kv_self.size = kv_size;
7857
+
7858
+ ctx->kv_self.cells.resize(kv_size);
7859
+
7860
+ for (uint32_t i = 0; i < kv_size; ++i) {
7861
+ llama_pos pos;
7862
+ size_t seq_id_size;
7863
+
7864
+ memcpy(&pos, inp, sizeof(pos)); inp += sizeof(pos);
7865
+ memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size);
7866
+
7867
+ ctx->kv_self.cells[i].pos = pos;
7868
+
7869
+ llama_seq_id seq_id;
7870
+
7871
+ for (size_t j = 0; j < seq_id_size; ++j) {
7872
+ memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id);
7873
+ ctx->kv_self.cells[i].seq_id.insert(seq_id);
7874
+ }
7875
+ }
7257
7876
  }
7258
7877
 
7259
7878
  const size_t nread = inp - src;
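
Note: the session-state format now stores kv_buf_size, kv_head and kv_size plus the position and seq_id set of every cell, and llama_set_state_data rebuilds the cell metadata on load, so multi-sequence KV-cache contents survive a save/restore. A hedged round-trip sketch using the public state API (llama_get_state_size / llama_copy_state_data / llama_set_state_data), assuming ctx already holds decoded tokens:

    #include "llama.h"
    #include <cstdint>
    #include <vector>

    // Snapshot the full context state (rng, logits, embeddings, KV cache,
    // including the newly serialized cell positions and seq_ids).
    static std::vector<uint8_t> save_state(struct llama_context * ctx) {
        std::vector<uint8_t> buf(llama_get_state_size(ctx));
        const size_t written = llama_copy_state_data(ctx, buf.data());
        buf.resize(written);          // the size bound may exceed what was actually written
        return buf;
    }

    static void restore_state(struct llama_context * ctx, std::vector<uint8_t> & buf) {
        llama_set_state_data(ctx, buf.data());
    }
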
@@ -7471,6 +8090,22 @@ llama_token llama_token_eos(const struct llama_context * ctx) {
7471
8090
  llama_token llama_token_nl(const struct llama_context * ctx) {
7472
8091
  return ctx->model.vocab.linefeed_id;
7473
8092
  }
8093
+ llama_token llama_token_prefix(const struct llama_context * ctx) {
8094
+ return ctx->model.vocab.special_prefix_id;
8095
+ }
8096
+
8097
+ llama_token llama_token_middle(const struct llama_context * ctx) {
8098
+ return ctx->model.vocab.special_middle_id;
8099
+ }
8100
+
8101
+ llama_token llama_token_suffix(const struct llama_context * ctx) {
8102
+ return ctx->model.vocab.special_suffix_id;
8103
+ }
8104
+
8105
+ llama_token llama_token_eot(const struct llama_context * ctx) {
8106
+ return ctx->model.vocab.special_eot_id;
8107
+ }
8108
+
7474
8109
 
7475
8110
  int llama_tokenize(
7476
8111
  const struct llama_model * model,
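
Note: the four accessors added above expose Refact's fill-in-the-middle special tokens (default ids 32007-32010, as set on llama_vocab earlier in this diff). A hedged sketch of assembling a FIM prompt with them, assuming the usual prefix-suffix-middle ordering and that prefix/suffix have already been tokenized:

    #include "llama.h"
    #include <vector>

    // Builds <PRE> prefix <SUF> suffix <MID>; the model is then expected to
    // generate the missing middle and stop once llama_token_eot(ctx) is sampled.
    static std::vector<llama_token> build_fim_prompt(
            struct llama_context * ctx,
            const std::vector<llama_token> & prefix,
            const std::vector<llama_token> & suffix) {
        std::vector<llama_token> prompt;
        prompt.push_back(llama_token_prefix(ctx));
        prompt.insert(prompt.end(), prefix.begin(), prefix.end());
        prompt.push_back(llama_token_suffix(ctx));
        prompt.insert(prompt.end(), suffix.begin(), suffix.end());
        prompt.push_back(llama_token_middle(ctx));
        return prompt;
    }
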
@@ -7493,35 +8128,66 @@ int llama_tokenize(
7493
8128
  return res.size();
7494
8129
  }
7495
8130
 
8131
+ static std::string llama_decode_text(const std::string & text) {
8132
+ std::string decoded_text;
8133
+ auto unicode_sequences = codepoints_from_utf8(text);
8134
+ for (auto& unicode_sequence : unicode_sequences) {
8135
+ decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence));
8136
+ }
8137
+
8138
+ return decoded_text;
8139
+ }
8140
+
7496
8141
  // does not write null-terminator to buf
7497
8142
  int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
7498
8143
  if (0 <= token && token < llama_n_vocab(model)) {
7499
- if (llama_is_normal_token(model->vocab, token)) {
7500
- std::string result = model->vocab.id_to_token[token].text;
7501
- if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
8144
+ switch (llama_vocab_get_type(model->vocab)) {
8145
+ case LLAMA_VOCAB_TYPE_SPM: {
8146
+ if (llama_is_normal_token(model->vocab, token)) {
8147
+ std::string result = model->vocab.id_to_token[token].text;
7502
8148
  llama_unescape_whitespace(result);
8149
+ if (length < (int) result.length()) {
8150
+ return -result.length();
8151
+ }
8152
+ memcpy(buf, result.c_str(), result.length());
8153
+ return result.length();
8154
+ } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
8155
+ if (length < 3) {
8156
+ return -3;
8157
+ }
8158
+ memcpy(buf, "\xe2\x96\x85", 3);
8159
+ return 3;
8160
+ } else if (llama_is_control_token(model->vocab, token)) {
8161
+ ;
8162
+ } else if (llama_is_byte_token(model->vocab, token)) {
8163
+ if (length < 1) {
8164
+ return -1;
8165
+ }
8166
+ buf[0] = llama_token_to_byte(model->vocab, token);
8167
+ return 1;
8168
+ } else {
8169
+ GGML_ASSERT(false);
7503
8170
  }
7504
- if (length < (int) result.length()) {
7505
- return -result.length();
7506
- }
7507
- memcpy(buf, result.c_str(), result.length());
7508
- return result.length();
7509
- } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
7510
- if (length < 3) {
7511
- return -3;
7512
- }
7513
- buf[0] = '\xe2';
7514
- buf[1] = '\x96';
7515
- buf[2] = '\x85';
7516
- return 3;
7517
- } else if (llama_is_control_token(model->vocab, token)) {
7518
- // do nothing
7519
- } else if (llama_is_byte_token(model->vocab, token)) {
7520
- if (length < 1) {
7521
- return -1;
8171
+ break;
8172
+ }
8173
+ case LLAMA_VOCAB_TYPE_BPE: {
8174
+ if (llama_is_normal_token(model->vocab, token)) {
8175
+ std::string result = model->vocab.id_to_token[token].text;
8176
+ result = llama_decode_text(result);
8177
+ if (length < (int) result.length()) {
8178
+ return -result.length();
8179
+ }
8180
+ memcpy(buf, result.c_str(), result.length());
8181
+ return result.length();
8182
+ } else if (llama_is_control_token(model->vocab, token)) {
8183
+ ;
8184
+ } else {
8185
+ GGML_ASSERT(false);
7522
8186
  }
7523
- buf[0] = llama_token_to_byte(model->vocab, token);
7524
- return 1;
8187
+ break;
8188
+ }
8189
+ default:
8190
+ GGML_ASSERT(false);
7525
8191
  }
7526
8192
  }
7527
8193
  return 0;
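
Note: llama_token_to_piece keeps its contract of returning the negative required length when the buffer is too small; only the per-vocabulary handling changed. A hedged helper that grows the buffer on demand, using the signature shown in this hunk:

    #include "llama.h"
    #include <string>

    // Converts one token to text, retrying with a larger buffer if the first
    // call reports (as a negative value) how many bytes are actually needed.
    static std::string token_to_piece(const struct llama_model * model, llama_token token) {
        std::string piece(8, '\0');
        int n = llama_token_to_piece(model, token, &piece[0], (int) piece.size());
        if (n < 0) {
            piece.resize(-n);
            n = llama_token_to_piece(model, token, &piece[0], (int) piece.size());
        }
        piece.resize(n > 0 ? n : 0);         // control tokens come back as 0 bytes
        return piece;
    }
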
@@ -7548,14 +8214,14 @@ void llama_print_timings(struct llama_context * ctx) {
7548
8214
  const llama_timings timings = llama_get_timings(ctx);
7549
8215
 
7550
8216
  LLAMA_LOG_INFO("\n");
7551
- LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
7552
- LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
8217
+ LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
8218
+ LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
7553
8219
  __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
7554
- LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
8220
+ LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
7555
8221
  __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
7556
- LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
8222
+ LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
7557
8223
  __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
7558
- LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
8224
+ LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
7559
8225
  }
7560
8226
 
7561
8227
  void llama_reset_timings(struct llama_context * ctx) {