llama_cpp 0.6.0 → 0.7.0

@@ -1,6 +1,8 @@
1
1
  #define LLAMA_API_INTERNAL
2
2
  #include "llama.h"
3
3
 
4
+ #include "unicode.h"
5
+
4
6
  #include "ggml.h"
5
7
 
6
8
  #include "ggml-alloc.h"
@@ -123,6 +125,27 @@ static void replace_all(std::string & s, const std::string & search, const std::
123
125
  }
124
126
  s = std::move(result);
125
127
  }
128
+
129
+ static bool is_float_close(float a, float b, float abs_tol) {
130
+ // Check for non-negative tolerance
131
+ if (abs_tol < 0.0) {
132
+ throw std::invalid_argument("Tolerance must be non-negative");
133
+ }
134
+
135
+ // Exact equality check
136
+ if (a == b) {
137
+ return true;
138
+ }
139
+
140
+ // Check for infinities
141
+ if (std::isinf(a) || std::isinf(b)) {
142
+ return false;
143
+ }
144
+
145
+ // Regular comparison using the provided absolute tolerance
146
+ return std::fabs(b - a) <= abs_tol;
147
+ }
148
+
126
149
  #ifdef GGML_USE_CPU_HBM
127
150
  #include <hbwmalloc.h>
128
151
  #endif
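
The new is_float_close helper gives the hyperparameter comparison later in this diff an absolute-tolerance float check instead of bitwise equality. A standalone usage sketch (the function body is copied verbatim from the hunk above; the sample values are illustrative):

#include <cmath>
#include <cstdio>
#include <stdexcept>

// Copied from the hunk above: absolute-tolerance float comparison.
static bool is_float_close(float a, float b, float abs_tol) {
    if (abs_tol < 0.0) {
        throw std::invalid_argument("Tolerance must be non-negative");
    }
    if (a == b) {
        return true;                    // exact match, also covers +0.0f == -0.0f
    }
    if (std::isinf(a) || std::isinf(b)) {
        return false;                   // infinities only match exactly
    }
    return std::fabs(b - a) <= abs_tol;
}

int main() {
    printf("%d\n", is_float_close(1e-5f, 1.1e-5f, 1e-9f));     // 0: difference ~1e-6 exceeds the tolerance
    printf("%d\n", is_float_close(1e-5f, 1.1e-5f, 1e-5f));     // 1: within tolerance
    printf("%d\n", is_float_close(INFINITY, INFINITY, 1e-9f)); // 1: caught by the a == b branch
}
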
@@ -163,6 +186,7 @@ enum llm_arch {
163
186
  LLM_ARCH_GPTNEOX,
164
187
  LLM_ARCH_MPT,
165
188
  LLM_ARCH_STARCODER,
189
+ LLM_ARCH_REFACT,
166
190
  LLM_ARCH_UNKNOWN,
167
191
  };
168
192
 
@@ -175,6 +199,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
175
199
  { LLM_ARCH_MPT, "mpt" },
176
200
  { LLM_ARCH_BAICHUAN, "baichuan" },
177
201
  { LLM_ARCH_STARCODER, "starcoder" },
202
+ { LLM_ARCH_REFACT, "refact" },
178
203
  };
179
204
 
180
205
  enum llm_kv {
@@ -395,6 +420,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
395
420
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
396
421
  },
397
422
  },
423
+ {
424
+ LLM_ARCH_REFACT,
425
+ {
426
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
427
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
428
+ { LLM_TENSOR_OUTPUT, "output" },
429
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
430
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
431
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
432
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
433
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
434
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
435
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
436
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
437
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
438
+ },
439
+ },
398
440
  {
399
441
  LLM_ARCH_UNKNOWN,
400
442
  {
@@ -943,7 +985,24 @@ struct llama_hparams {
943
985
  float rope_freq_scale_train;
944
986
 
945
987
  bool operator!=(const llama_hparams & other) const {
946
- return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
988
+ if (this->vocab_only != other.vocab_only) return true;
989
+ if (this->n_vocab != other.n_vocab) return true;
990
+ if (this->n_ctx_train != other.n_ctx_train) return true;
991
+ if (this->n_embd != other.n_embd) return true;
992
+ if (this->n_head != other.n_head) return true;
993
+ if (this->n_head_kv != other.n_head_kv) return true;
994
+ if (this->n_layer != other.n_layer) return true;
995
+ if (this->n_rot != other.n_rot) return true;
996
+ if (this->n_ff != other.n_ff) return true;
997
+
998
+ const float EPSILON = 1e-9;
999
+
1000
+ if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
1001
+ if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
1002
+ if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
1003
+ if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
1004
+
1005
+ return false;
947
1006
  }
948
1007
 
949
1008
  uint32_t n_gqa() const {
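
This hunk is why is_float_close was added: llama_hparams::operator!= no longer memcmp()s the whole struct, but compares the integer fields exactly and the float fields with an absolute tolerance (EPSILON = 1e-9). A small standalone illustration of why bytewise comparison is unreliable for such structs; the params type here is made up for the demo, it is not llama_hparams:

#include <cstdio>
#include <cstring>

struct params {
    bool  flag;   // followed by padding bytes that memcmp would also compare
    float eps;
};

int main() {
    params a{true,  0.0f};
    params b{true, -0.0f};
    // Value-wise the two structs are equal (0.0f == -0.0f is true), but the sign bit of
    // -0.0f makes the object representations differ, so memcmp reports a mismatch;
    // uninitialized padding bytes can trigger the same false positive.
    printf("field-wise equal: %d\n", a.flag == b.flag && a.eps == b.eps);  // 1
    printf("memcmp equal:     %d\n", memcmp(&a, &b, sizeof(params)) == 0); // 0
}
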
@@ -1071,6 +1130,10 @@ struct llama_vocab {
1071
1130
  id special_pad_id = -1;
1072
1131
 
1073
1132
  id linefeed_id = 13;
1133
+ id special_prefix_id = 32007;
1134
+ id special_middle_id = 32009;
1135
+ id special_suffix_id = 32008;
1136
+ id special_eot_id = 32010;
1074
1137
 
1075
1138
  int find_bpe_rank(std::string token_left, std::string token_right) const {
1076
1139
  replace_all(token_left, " ", "\u0120");
@@ -1272,8 +1335,8 @@ static bool llama_kv_cache_init(
1272
1335
  // find an empty slot of size "n_tokens" in the cache
1273
1336
  // updates the cache head
1274
1337
  static bool llama_kv_cache_find_slot(
1275
- struct llama_kv_cache & cache,
1276
- const struct llama_batch & batch) {
1338
+ struct llama_kv_cache & cache,
1339
+ const struct llama_batch & batch) {
1277
1340
  const uint32_t n_ctx = cache.size;
1278
1341
  const uint32_t n_tokens = batch.n_tokens;
1279
1342
 
@@ -1341,10 +1404,13 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0,
1341
1404
  }
1342
1405
 
1343
1406
  static void llama_kv_cache_seq_rm(
1344
- struct llama_kv_cache & cache,
1345
- llama_seq_id seq_id,
1346
- llama_pos p0,
1347
- llama_pos p1) {
1407
+ struct llama_kv_cache & cache,
1408
+ llama_seq_id seq_id,
1409
+ llama_pos p0,
1410
+ llama_pos p1) {
1411
+ if (p0 < 0) p0 = 0;
1412
+ if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1413
+
1348
1414
  for (uint32_t i = 0; i < cache.size; ++i) {
1349
1415
  if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1350
1416
  cache.cells[i].seq_id.erase(seq_id);
@@ -1356,11 +1422,14 @@ static void llama_kv_cache_seq_rm(
1356
1422
  }
1357
1423
 
1358
1424
  static void llama_kv_cache_seq_cp(
1359
- struct llama_kv_cache & cache,
1360
- llama_seq_id seq_id_src,
1361
- llama_seq_id seq_id_dst,
1362
- llama_pos p0,
1363
- llama_pos p1) {
1425
+ struct llama_kv_cache & cache,
1426
+ llama_seq_id seq_id_src,
1427
+ llama_seq_id seq_id_dst,
1428
+ llama_pos p0,
1429
+ llama_pos p1) {
1430
+ if (p0 < 0) p0 = 0;
1431
+ if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1432
+
1364
1433
  for (uint32_t i = 0; i < cache.size; ++i) {
1365
1434
  if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1366
1435
  cache.cells[i].seq_id.insert(seq_id_dst);
@@ -1378,11 +1447,14 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
1378
1447
  }
1379
1448
 
1380
1449
  static void llama_kv_cache_seq_shift(
1381
- struct llama_kv_cache & cache,
1382
- llama_seq_id seq_id,
1383
- llama_pos p0,
1384
- llama_pos p1,
1385
- llama_pos delta) {
1450
+ struct llama_kv_cache & cache,
1451
+ llama_seq_id seq_id,
1452
+ llama_pos p0,
1453
+ llama_pos p1,
1454
+ llama_pos delta) {
1455
+ if (p0 < 0) p0 = 0;
1456
+ if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1457
+
1386
1458
  for (uint32_t i = 0; i < cache.size; ++i) {
1387
1459
  if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1388
1460
  cache.cells[i].pos += delta;
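
llama_kv_cache_seq_rm, _seq_cp and _seq_shift now clamp negative bounds, so a caller can pass p0 < 0 and/or p1 < 0 to mean "from the start of the sequence" and "to the end of the sequence". A standalone sketch of that convention with a mock cell array (cell and seq_rm here are illustrative stand-ins, not the real llama_kv_cache types):

#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

using pos_t = int32_t;

struct cell { pos_t pos; bool alive; };

// Mirrors the clamping added in the hunks above: negative bounds select an open-ended range.
static void seq_rm(std::vector<cell> & cells, pos_t p0, pos_t p1) {
    if (p0 < 0) p0 = 0;
    if (p1 < 0) p1 = std::numeric_limits<pos_t>::max();
    for (auto & c : cells) {
        if (c.pos >= p0 && c.pos < p1) {
            c.alive = false;
        }
    }
}

int main() {
    std::vector<cell> cells = { {0, true}, {1, true}, {2, true}, {3, true} };
    seq_rm(cells, 2, -1); // p1 = -1: everything from position 2 onwards
    for (const auto & c : cells) {
        printf("pos %d alive %d\n", c.pos, c.alive); // positions 0 and 1 stay, 2 and 3 are removed
    }
}
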
@@ -1907,6 +1979,14 @@ static void llm_load_hparams(
1907
1979
  default: model.type = e_model::MODEL_UNKNOWN;
1908
1980
  }
1909
1981
  } break;
1982
+ case LLM_ARCH_REFACT:
1983
+ {
1984
+ GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
1985
+ switch (hparams.n_layer) {
1986
+ case 32: model.type = e_model::MODEL_1B; break;
1987
+ default: model.type = e_model::MODEL_UNKNOWN;
1988
+ }
1989
+ } break;
1910
1990
  default: (void)0;
1911
1991
  }
1912
1992
 
@@ -1971,6 +2051,7 @@ static void llm_load_vocab(
1971
2051
 
1972
2052
  for (int i = 0; i < n_merges; i++) {
1973
2053
  const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
2054
+ GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
1974
2055
 
1975
2056
  std::string first;
1976
2057
  std::string second;
@@ -2005,6 +2086,7 @@ static void llm_load_vocab(
2005
2086
 
2006
2087
  for (uint32_t i = 0; i < n_vocab; i++) {
2007
2088
  std::string word = gguf_get_arr_str(ctx, token_idx, i);
2089
+ GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
2008
2090
 
2009
2091
  vocab.token_to_id[word] = i;
2010
2092
 
@@ -2013,12 +2095,13 @@ static void llm_load_vocab(
2013
2095
  token_data.score = scores ? scores[i] : 0.0f;
2014
2096
  token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
2015
2097
  }
2098
+ GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
2016
2099
 
2017
2100
  // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
2018
2101
  if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
2019
2102
  vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
2020
2103
  } else {
2021
- vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
2104
+ vocab.linefeed_id = llama_tokenize_internal(vocab, "\u010A", false)[0];
2022
2105
  }
2023
2106
 
2024
2107
  // special tokens
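
For BPE vocabularies the linefeed token is now resolved by tokenizing "\u010A" instead of a raw "\n". The reason is the GPT-2 byte-to-unicode trick: BPE vocab entries store raw bytes remapped to printable codepoints, and byte 0x0A lands on U+010A. The in-tree mapping lives in the new bytes_to_unicode_bpe / unicode_to_bytes_bpe helpers; the sketch below rebuilds the classic GPT-2 table on the assumption that those helpers follow the reference encoder.py mapping:

#include <cstdio>
#include <map>

// GPT-2 byte -> codepoint table: printable latin-1 bytes map to themselves,
// every other byte is assigned 256, 257, ... in increasing byte order.
static std::map<int, int> bytes_to_unicode() {
    std::map<int, int> m;
    auto printable = [](int b) {
        return (b >= '!' && b <= '~') || (b >= 0xA1 && b <= 0xAC) || (b >= 0xAE && b <= 0xFF);
    };
    int n = 0;
    for (int b = 0; b < 256; ++b) {
        m[b] = printable(b) ? b : 256 + n++;
    }
    return m;
}

int main() {
    auto m = bytes_to_unicode();
    printf("0x0A -> U+%04X\n", m[0x0A]); // U+010A: the "\u010A" used for the linefeed lookup
    printf("0x20 -> U+%04X\n", m[0x20]); // U+0120: the "\u0120" space marker used in find_bpe_rank
}
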
@@ -2141,6 +2224,7 @@ static void llm_load_tensors(
2141
2224
  const auto tn = LLM_TN(model.arch);
2142
2225
  switch (model.arch) {
2143
2226
  case LLM_ARCH_LLAMA:
2227
+ case LLM_ARCH_REFACT:
2144
2228
  {
2145
2229
  model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2146
2230
 
@@ -3334,6 +3418,353 @@ static struct ggml_cgraph * llm_build_baichaun(
3334
3418
  return gf;
3335
3419
  }
3336
3420
 
3421
+ static struct ggml_cgraph * llm_build_refact(
3422
+ llama_context & lctx,
3423
+ const llama_batch & batch) {
3424
+ const auto & model = lctx.model;
3425
+ const auto & hparams = model.hparams;
3426
+ const auto & cparams = lctx.cparams;
3427
+
3428
+ const auto & kv_self = lctx.kv_self;
3429
+
3430
+ GGML_ASSERT(!!kv_self.ctx);
3431
+
3432
+ const int64_t n_embd = hparams.n_embd;
3433
+ const int64_t n_layer = hparams.n_layer;
3434
+ const int64_t n_ctx = cparams.n_ctx;
3435
+ const int64_t n_head = hparams.n_head;
3436
+ const int64_t n_head_kv = hparams.n_head_kv;
3437
+ const int64_t n_embd_head = hparams.n_embd_head();
3438
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
3439
+
3440
+ const float norm_rms_eps = hparams.f_norm_rms_eps;
3441
+
3442
+ const int n_gpu_layers = model.n_gpu_layers;
3443
+
3444
+ const int32_t n_tokens = batch.n_tokens;
3445
+ const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
3446
+ const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
3447
+
3448
+ // printf("n_kv = %d\n", n_kv);
3449
+
3450
+ auto & buf_compute = lctx.buf_compute;
3451
+
3452
+ struct ggml_init_params params = {
3453
+ /*.mem_size =*/ buf_compute.size,
3454
+ /*.mem_buffer =*/ buf_compute.data,
3455
+ /*.no_alloc =*/ false,
3456
+ };
3457
+
3458
+ params.no_alloc = true;
3459
+
3460
+ struct ggml_context * ctx0 = ggml_init(params);
3461
+
3462
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
3463
+
3464
+ struct ggml_tensor * cur;
3465
+ struct ggml_tensor * inpL;
3466
+
3467
+ if (batch.token) {
3468
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
3469
+
3470
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
3471
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3472
+ memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
3473
+ }
3474
+ ggml_set_name(inp_tokens, "inp_tokens");
3475
+
3476
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
3477
+ } else {
3478
+ #ifdef GGML_USE_MPI
3479
+ GGML_ASSERT(false && "not implemented");
3480
+ #endif
3481
+
3482
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
3483
+
3484
+ ggml_allocr_alloc(lctx.alloc, inpL);
3485
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3486
+ memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
3487
+ }
3488
+ }
3489
+
3490
+ const int i_gpu_start = n_layer - n_gpu_layers;
3491
+ (void) i_gpu_start;
3492
+
3493
+ // offload functions set the tensor output backend to GPU
3494
+ // tensors are GPU-accelerated if any input or the output has been offloaded
3495
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
3496
+ offload_func_t offload_func_kq = llama_nop;
3497
+ offload_func_t offload_func_v = llama_nop;
3498
+
3499
+ #ifdef GGML_USE_CUBLAS
3500
+ if (n_gpu_layers > n_layer) {
3501
+ offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
3502
+ }
3503
+ if (n_gpu_layers > n_layer + 1) {
3504
+ offload_func_v = ggml_cuda_assign_buffers_no_alloc;
3505
+ }
3506
+ if (n_gpu_layers > n_layer + 2) {
3507
+ offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
3508
+ }
3509
+ #endif // GGML_USE_CUBLAS
3510
+
3511
+ // KQ_scale
3512
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
3513
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
3514
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
3515
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3516
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
3517
+ }
3518
+
3519
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
3520
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
3521
+ offload_func_kq(KQ_mask);
3522
+ ggml_set_name(KQ_mask, "KQ_mask");
3523
+ ggml_allocr_alloc(lctx.alloc, KQ_mask);
3524
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
3525
+ float * data = (float *) KQ_mask->data;
3526
+ memset(data, 0, ggml_nbytes(KQ_mask));
3527
+
3528
+ for (int h = 0; h < 1; ++h) {
3529
+ for (int j = 0; j < n_tokens; ++j) {
3530
+ const llama_pos pos = batch.pos[j];
3531
+ const llama_seq_id seq_id = batch.seq_id[j];
3532
+
3533
+ for (int i = 0; i < n_kv; ++i) {
3534
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
3535
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
3536
+ }
3537
+ }
3538
+ }
3539
+ }
3540
+ }
3541
+
3542
+ for (int il = 0; il < n_layer; ++il) {
3543
+ ggml_format_name(inpL, "layer_inp_%d", il);
3544
+
3545
+ offload_func_t offload_func = llama_nop;
3546
+
3547
+ #ifdef GGML_USE_CUBLAS
3548
+ if (il >= i_gpu_start) {
3549
+ offload_func = ggml_cuda_assign_buffers_no_alloc;
3550
+ }
3551
+ #endif // GGML_USE_CUBLAS
3552
+
3553
+ struct ggml_tensor * inpSA = inpL;
3554
+
3555
+ // norm
3556
+ {
3557
+ cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
3558
+ offload_func(cur);
3559
+ ggml_set_name(cur, "rms_norm_0");
3560
+
3561
+ // cur = cur*attn_norm(broadcasted)
3562
+ cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
3563
+ offload_func(cur);
3564
+ ggml_set_name(cur, "attention_norm_0");
3565
+ }
3566
+
3567
+ // self-attention
3568
+ {
3569
+ // compute Q and K
3570
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
3571
+ offload_func_kq(tmpk);
3572
+ ggml_set_name(tmpk, "tmpk");
3573
+
3574
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
3575
+ offload_func_kq(tmpq);
3576
+ ggml_set_name(tmpq, "tmpq");
3577
+
3578
+ struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens);
3579
+ offload_func_kq(Kcur);
3580
+ ggml_set_name(Kcur, "Kcur");
3581
+
3582
+ struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
3583
+ offload_func_kq(Qcur);
3584
+ ggml_set_name(Qcur, "Qcur");
3585
+
3586
+ // store key and value to memory
3587
+ {
3588
+ // compute the transposed [n_tokens, n_embd] V matrix
3589
+
3590
+ struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
3591
+ offload_func_v(tmpv);
3592
+ ggml_set_name(tmpv, "tmpv");
3593
+
3594
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
3595
+ offload_func_v(Vcur);
3596
+ ggml_set_name(Vcur, "Vcur");
3597
+
3598
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
3599
+ offload_func_kq(k);
3600
+ ggml_set_name(k, "k");
3601
+
3602
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
3603
+ ( n_ctx)*ggml_element_size(kv_self.v),
3604
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
3605
+ offload_func_v(v);
3606
+ ggml_set_name(v, "v");
3607
+
3608
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
3609
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
3610
+ }
3611
+
3612
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
3613
+ offload_func_kq(Q);
3614
+ ggml_set_name(Q, "Q");
3615
+
3616
+ struct ggml_tensor * K =
3617
+ ggml_view_3d(ctx0, kv_self.k,
3618
+ n_embd_head, n_kv, n_head_kv,
3619
+ ggml_element_size(kv_self.k)*n_embd_gqa,
3620
+ ggml_element_size(kv_self.k)*n_embd_head,
3621
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
3622
+ offload_func_kq(K);
3623
+ ggml_set_name(K, "K");
3624
+
3625
+ // K * Q
3626
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
3627
+ offload_func_kq(KQ);
3628
+ ggml_set_name(KQ, "KQ");
3629
+
3630
+ // KQ_scaled = KQ / sqrt(n_embd_head)
3631
+ // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
3632
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
3633
+ offload_func_kq(KQ_scaled);
3634
+ ggml_set_name(KQ_scaled, "KQ_scaled");
3635
+
3636
+ // KQ_masked = mask_past(KQ_scaled)
3637
+ struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
3638
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
3639
+
3640
+ struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
3641
+ offload_func_kq(KQ_masked);
3642
+ ggml_set_name(KQ_masked, "KQ_masked");
3643
+
3644
+ // KQ = soft_max(KQ_masked)
3645
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
3646
+ offload_func_v(KQ_soft_max);
3647
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
3648
+
3649
+ // split cached V into n_head heads
3650
+ struct ggml_tensor * V =
3651
+ ggml_view_3d(ctx0, kv_self.v,
3652
+ n_kv, n_embd_head, n_head_kv,
3653
+ ggml_element_size(kv_self.v)*n_ctx,
3654
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
3655
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
3656
+ offload_func_v(V);
3657
+ ggml_set_name(V, "V");
3658
+
3659
+ #if 1
3660
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
3661
+ offload_func_v(KQV);
3662
+ ggml_set_name(KQV, "KQV");
3663
+ #else
3664
+ // make V contiguous in memory to speed up the matmul, however we waste time on the copy
3665
+ // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
3666
+ // is there a better way?
3667
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
3668
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
3669
+ #endif
3670
+
3671
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
3672
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
3673
+ offload_func_v(KQV_merged);
3674
+ ggml_set_name(KQV_merged, "KQV_merged");
3675
+
3676
+ // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
3677
+ cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
3678
+ offload_func_v(cur);
3679
+ ggml_set_name(cur, "KQV_merged_contiguous");
3680
+
3681
+ // projection (no bias)
3682
+ cur = ggml_mul_mat(ctx0,
3683
+ model.layers[il].wo,
3684
+ cur);
3685
+ offload_func(cur);
3686
+ ggml_set_name(cur, "result_wo");
3687
+ }
3688
+
3689
+ struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
3690
+ offload_func(inpFF);
3691
+ ggml_set_name(inpFF, "inpFF");
3692
+
3693
+ // feed-forward network
3694
+ {
3695
+ // norm
3696
+ {
3697
+ cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
3698
+ offload_func(cur);
3699
+ ggml_set_name(cur, "rms_norm_1");
3700
+
3701
+ // cur = cur*ffn_norm(broadcasted)
3702
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
3703
+ offload_func(cur);
3704
+ ggml_set_name(cur, "ffn_norm");
3705
+ }
3706
+
3707
+ struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
3708
+ model.layers[il].w3,
3709
+ cur);
3710
+ offload_func(tmp);
3711
+ ggml_set_name(tmp, "result_w3");
3712
+
3713
+ cur = ggml_mul_mat(ctx0,
3714
+ model.layers[il].w1,
3715
+ cur);
3716
+ offload_func(cur);
3717
+ ggml_set_name(cur, "result_w1");
3718
+
3719
+ // SILU activation
3720
+ cur = ggml_silu(ctx0, cur);
3721
+ offload_func(cur);
3722
+ ggml_set_name(cur, "silu");
3723
+
3724
+ cur = ggml_mul(ctx0, cur, tmp);
3725
+ offload_func(cur);
3726
+ ggml_set_name(cur, "silu_x_result_w3");
3727
+
3728
+ cur = ggml_mul_mat(ctx0,
3729
+ model.layers[il].w2,
3730
+ cur);
3731
+ offload_func(cur);
3732
+ ggml_set_name(cur, "result_w2");
3733
+ }
3734
+
3735
+ cur = ggml_add(ctx0, cur, inpFF);
3736
+ offload_func(cur);
3737
+ ggml_set_name(cur, "inpFF_+_result_w2");
3738
+
3739
+ // input for next layer
3740
+ inpL = cur;
3741
+ }
3742
+
3743
+ cur = inpL;
3744
+
3745
+ // norm
3746
+ {
3747
+ cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
3748
+ offload_func_nr(cur);
3749
+ ggml_set_name(cur, "rms_norm_2");
3750
+
3751
+ // cur = cur*norm(broadcasted)
3752
+ cur = ggml_mul(ctx0, cur, model.output_norm);
3753
+ // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
3754
+ ggml_set_name(cur, "result_norm");
3755
+ }
3756
+
3757
+ // lm_head
3758
+ cur = ggml_mul_mat(ctx0, model.output, cur);
3759
+ ggml_set_name(cur, "result_output");
3760
+
3761
+ ggml_build_forward_expand(gf, cur);
3762
+
3763
+ ggml_free(ctx0);
3764
+
3765
+ return gf;
3766
+ }
3767
+
3337
3768
  static struct ggml_cgraph * llm_build_falcon(
3338
3769
  llama_context & lctx,
3339
3770
  const llama_batch & batch) {
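
llm_build_refact follows the LLaMA-style graph (RMSNorm, grouped-query attention, SwiGLU feed-forward) but skips RoPE and instead biases the attention scores with ALiBi: ggml_alibi(ctx0, KQ_scaled, 0, n_head, 8) above. As a rough sketch of what that max_bias of 8 turns into, here is the per-head slope computation for the power-of-two head-count case (ggml has extra handling for other head counts; n_head = 32 below is just an illustrative value):

#include <cmath>
#include <cstdio>

// ALiBi: head h gets a linear slope m_h, and m_h * (key position) is added to the
// attention scores before the softmax (equivalent to m_h * (key_pos - query_pos)
// up to a per-row constant, which the softmax ignores).
int main() {
    const int   n_head   = 32;   // illustrative head count
    const float max_bias = 8.0f; // the value passed to ggml_alibi in llm_build_refact

    const float m0 = powf(2.0f, -max_bias / n_head); // power-of-two case: m_h = 2^(-(h+1)*max_bias/n_head)
    for (int h = 0; h < 4; ++h) {
        printf("head %d slope %.6f\n", h, powf(m0, float(h + 1)));
    }
}
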
@@ -3974,6 +4405,10 @@ static struct ggml_cgraph * llama_build_graph(
3974
4405
  {
3975
4406
  result = llm_build_starcoder(lctx, batch);
3976
4407
  } break;
4408
+ case LLM_ARCH_REFACT:
4409
+ {
4410
+ result = llm_build_refact(lctx, batch);
4411
+ } break;
3977
4412
  default:
3978
4413
  GGML_ASSERT(false);
3979
4414
  }
@@ -4107,7 +4542,8 @@ static int llama_decode_internal(
4107
4542
  // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
4108
4543
  const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
4109
4544
  model.arch == LLM_ARCH_BAICHUAN ||
4110
- model.arch == LLM_ARCH_FALCON;
4545
+ model.arch == LLM_ARCH_FALCON ||
4546
+ model.arch == LLM_ARCH_REFACT;
4111
4547
  const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
4112
4548
  if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
4113
4549
  n_threads = 1;
@@ -4227,18 +4663,41 @@ static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
4227
4663
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
4228
4664
  }
4229
4665
 
4230
- static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
4666
+ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
4667
+ return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
4668
+ }
4669
+
4670
+ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
4231
4671
  GGML_ASSERT(llama_is_byte_token(vocab, id));
4232
4672
  const auto& token_data = vocab.id_to_token.at(id);
4233
- auto buf = token_data.text.substr(3, 2);
4234
- return strtol(buf.c_str(), NULL, 16);
4673
+ switch (llama_vocab_get_type(vocab)) {
4674
+ case LLAMA_VOCAB_TYPE_SPM: {
4675
+ auto buf = token_data.text.substr(3, 2);
4676
+ return strtol(buf.c_str(), NULL, 16);
4677
+ }
4678
+ case LLAMA_VOCAB_TYPE_BPE: {
4679
+ GGML_ASSERT(false);
4680
+ return unicode_to_bytes_bpe(token_data.text);
4681
+ }
4682
+ default:
4683
+ GGML_ASSERT(false);
4684
+ }
4235
4685
  }
4236
4686
 
4237
4687
  static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
4238
- char buf[7];
4239
- int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
4240
- GGML_ASSERT(0 <= result && result < 7);
4241
- return vocab.token_to_id.at(buf);
4688
+ switch (llama_vocab_get_type(vocab)) {
4689
+ case LLAMA_VOCAB_TYPE_SPM: {
4690
+ char buf[7];
4691
+ int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
4692
+ GGML_ASSERT(0 <= result && result < 7);
4693
+ return vocab.token_to_id.at(buf);
4694
+ }
4695
+ case LLAMA_VOCAB_TYPE_BPE: {
4696
+ return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
4697
+ }
4698
+ default:
4699
+ GGML_ASSERT(false);
4700
+ }
4242
4701
  }
4243
4702
 
4244
4703
  static void llama_escape_whitespace(std::string & text) {
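
llama_token_to_byte and llama_byte_to_token now dispatch on the vocab type: the SPM path keeps the literal "<0xXX>" byte pieces, while the BPE path goes through the unicode.h byte mapping. A tiny standalone round trip of the SPM convention, using the same snprintf / substr(3, 2) / strtol trick as the code above:

#include <cstdio>
#include <cstdlib>
#include <string>

int main() {
    unsigned char ch = 0x0A;                         // a raw byte, e.g. '\n'

    char piece[7];
    snprintf(piece, sizeof(piece), "<0x%02X>", ch);  // byte -> SPM token text, "<0x0A>"

    std::string text = piece;
    long back = strtol(text.substr(3, 2).c_str(), NULL, 16); // token text -> byte value

    printf("%s -> %ld\n", piece, back);              // <0x0A> -> 10
}
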
@@ -4518,15 +4977,9 @@ struct llm_tokenizer_bpe {
4518
4977
  std::string byte_str(1, *j);
4519
4978
  auto token_multibyte = vocab.token_to_id.find(byte_str);
4520
4979
  if (token_multibyte == vocab.token_to_id.end()) {
4521
- try {
4522
- llama_token token_byte = llama_byte_to_token(vocab, *j);
4523
- output.push_back(token_byte);
4524
- } catch (const std::out_of_range & err) {
4525
- fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
4526
- }
4527
- } else {
4528
- output.push_back((*token_multibyte).second);
4980
+ throw std::runtime_error("ERROR: byte not found in vocab");
4529
4981
  }
4982
+ output.push_back((*token_multibyte).second);
4530
4983
  }
4531
4984
  } else {
4532
4985
  output.push_back((*token).second);
@@ -4563,23 +5016,144 @@ private:
4563
5016
  work_queue.push(bigram);
4564
5017
  }
4565
5018
 
4566
- // probably not 100% correct
4567
- static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
4568
- std::vector<std::string> words;
5019
+ std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
5020
+ std::vector<std::string> bpe_words;
5021
+ std::vector<std::string> bpe_encoded_words;
5022
+
5023
+ std::string token = "";
5024
+ // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
5025
+ bool collecting_numeric = false;
5026
+ bool collecting_letter = false;
5027
+ bool collecting_special = false;
5028
+ bool collecting_whitespace_lookahead = false;
5029
+ bool collecting = false;
5030
+
5031
+ std::vector<std::string> text_utf;
5032
+ text_utf.reserve(text.size());
5033
+ bpe_words.reserve(text.size());
5034
+ bpe_encoded_words.reserve(text.size());
5035
+
5036
+ auto cps = codepoints_from_utf8(text);
5037
+ for (size_t i = 0; i < cps.size(); ++i)
5038
+ text_utf.emplace_back(codepoint_to_utf8(cps[i]));
5039
+
5040
+ for (int i = 0; i < (int)text_utf.size(); i++) {
5041
+ const std::string & utf_char = text_utf[i];
5042
+ bool split_condition = false;
5043
+ // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
5044
+ int bytes_remain = text_utf.size() - i;
5045
+ // forward backward lookups
5046
+ const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
5047
+ const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
5048
+
5049
+ // handling contractions
5050
+ if (!split_condition && bytes_remain >= 2) {
5051
+ // 's|'t|'m|'d
5052
+ if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
5053
+ split_condition = true;
5054
+ }
5055
+ if (split_condition) {
5056
+ if (token.size()) {
5057
+ bpe_words.emplace_back(token); // push previous content as token
5058
+ }
5059
+ token = utf_char + utf_char_next;
5060
+ bpe_words.emplace_back(token);
5061
+ token = "";
5062
+ i++;
5063
+ continue;
5064
+ }
5065
+ }
5066
+ if (!split_condition && bytes_remain >= 3) {
5067
+ // 're|'ve|'ll
5068
+ if (utf_char == "\'" && (
5069
+ (utf_char_next == "r" || utf_char_next_next == "e") ||
5070
+ (utf_char_next == "v" || utf_char_next_next == "e") ||
5071
+ (utf_char_next == "l" || utf_char_next_next == "l"))
5072
+ ) {
5073
+ split_condition = true;
5074
+ }
5075
+ if (split_condition) {
5076
+ // current token + next token can be defined
5077
+ if (token.size()) {
5078
+ bpe_words.emplace_back(token); // push previous content as token
5079
+ }
5080
+ token = utf_char + utf_char_next + utf_char_next_next;
5081
+ bpe_words.emplace_back(token); // the contraction
5082
+ token = "";
5083
+ i += 2;
5084
+ continue;
5085
+ }
5086
+ }
5087
+
5088
+ if (!split_condition && !collecting) {
5089
+ if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
5090
+ collecting_letter = true;
5091
+ collecting = true;
5092
+ }
5093
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
5094
+ collecting_numeric = true;
5095
+ collecting = true;
5096
+ }
5097
+ else if (
5098
+ ((codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
5099
+ (!token.size() && utf_char == " " && codepoint_type(utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
5100
+ ) {
5101
+ collecting_special = true;
5102
+ collecting = true;
5103
+ }
5104
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
5105
+ collecting_whitespace_lookahead = true;
5106
+ collecting = true;
5107
+ }
5108
+ else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
5109
+ split_condition = true;
5110
+ }
5111
+ }
5112
+ else if (!split_condition && collecting) {
5113
+ if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) {
5114
+ split_condition = true;
5115
+ }
5116
+ else if (collecting_numeric && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
5117
+ split_condition = true;
5118
+ }
5119
+ else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
5120
+ split_condition = true;
5121
+ }
5122
+ else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
5123
+ split_condition = true;
5124
+ }
5125
+ }
4569
5126
 
4570
- // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
4571
- const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
4572
- const std::regex re(pattern);
5127
+ if (utf_char_next == "") {
5128
+ split_condition = true; // final
5129
+ token += utf_char;
5130
+ }
4573
5131
 
4574
- auto words_begin = std::sregex_iterator(text.begin(), text.end(), re);
4575
- auto words_end = std::sregex_iterator();
4576
- auto n_words = std::distance(words_begin, words_end);
4577
- words.reserve(n_words);
4578
- for (auto it = words_begin; it != words_end; ++it) {
4579
- words.push_back(it->str());
5132
+ if (split_condition) {
5133
+ if (token.size()) {
5134
+ bpe_words.emplace_back(token);
5135
+ }
5136
+ token = utf_char;
5137
+ collecting = false;
5138
+ collecting_letter = false;
5139
+ collecting_numeric = false;
5140
+ collecting_special = false;
5141
+ collecting_whitespace_lookahead = false;
5142
+ }
5143
+ else {
5144
+ token += utf_char;
5145
+ }
5146
+ }
5147
+
5148
+ for (std::string & word : bpe_words) {
5149
+ std::string encoded_token = "";
5150
+ for (char & c : word) {
5151
+ encoded_token += bytes_to_unicode_bpe(c);
5152
+ }
5153
+ bpe_encoded_words.emplace_back(encoded_token);
4580
5154
  }
4581
- return words;
4582
5155
 
5156
+ return bpe_encoded_words;
4583
5157
  }
4584
5158
 
4585
5159
  const llama_vocab & vocab;
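
The std::regex-based splitter is replaced by a hand-written scanner driven by codepoint_type(), so that the \p{L} / \p{N} categories of the GPT-2 pattern also work for non-ASCII text, and the resulting pieces are then byte-encoded with bytes_to_unicode_bpe. For plain ASCII input the intended splits are the same as those of the old pattern that this hunk removes; the demo below reuses that removed pattern to show what the pre-tokenizer produces (the sample sentence is illustrative):

#include <cstdio>
#include <regex>
#include <string>

int main() {
    // ASCII approximation of the GPT-2 pattern, as used before this change:
    const std::string pattern =
        R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
    const std::regex re(pattern);

    const std::string text = "It's 2023, hello  world!";
    for (auto it = std::sregex_iterator(text.begin(), text.end(), re); it != std::sregex_iterator(); ++it) {
        printf("[%s]\n", it->str().c_str());
    }
    // Expected pieces: [It] ['s] [ 2023] [,] [ hello] [ ] [ world] [!]
}
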
@@ -6022,7 +6596,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
6022
6596
  nthread = std::thread::hardware_concurrency();
6023
6597
  }
6024
6598
 
6025
- llama_model_loader ml(fname_inp, /*use_mmap*/ false);
6599
+ // mmap consistently increases speed Linux, and also increases speed on Windows with
6600
+ // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
6601
+ #if defined(__linux__) || defined(_WIN32)
6602
+ constexpr bool use_mmap = true;
6603
+ #else
6604
+ constexpr bool use_mmap = false;
6605
+ #endif
6606
+
6607
+ llama_model_loader ml(fname_inp, use_mmap);
6608
+ if (ml.use_mmap) {
6609
+ ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
6610
+ }
6026
6611
 
6027
6612
  llama_model model;
6028
6613
  llm_load_arch(ml, model);
@@ -6100,10 +6685,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
6100
6685
 
6101
6686
  const std::string name = ggml_get_name(tensor);
6102
6687
 
6103
- if (read_data.size() < ggml_nbytes(tensor)) {
6104
- read_data.resize(ggml_nbytes(tensor));
6688
+ if (!ml.use_mmap) {
6689
+ if (read_data.size() < ggml_nbytes(tensor)) {
6690
+ read_data.resize(ggml_nbytes(tensor));
6691
+ }
6692
+ tensor->data = read_data.data();
6105
6693
  }
6106
- tensor->data = read_data.data();
6107
6694
  ml.load_data_for(tensor);
6108
6695
 
6109
6696
  LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
@@ -6738,13 +7325,14 @@ struct llama_context * llama_new_context_with_model(
6738
7325
 
6739
7326
  #ifdef GGML_USE_METAL
6740
7327
  if (model->n_gpu_layers > 0) {
7328
+ ggml_metal_log_set_callback(llama_log_callback_default, NULL);
7329
+
6741
7330
  ctx->ctx_metal = ggml_metal_init(1);
6742
7331
  if (!ctx->ctx_metal) {
6743
7332
  LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
6744
7333
  llama_free(ctx);
6745
7334
  return NULL;
6746
7335
  }
6747
- ggml_metal_log_set_callback(llama_log_callback_default, NULL);
6748
7336
  //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
6749
7337
  //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
6750
7338
  }
@@ -6872,6 +7460,10 @@ int llama_n_embd(const struct llama_model * model) {
6872
7460
  return model->hparams.n_embd;
6873
7461
  }
6874
7462
 
7463
+ float llama_rope_freq_scale_train(const struct llama_model * model) {
7464
+ return model->hparams.rope_freq_scale_train;
7465
+ }
7466
+
6875
7467
  int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
6876
7468
  return snprintf(buf, buf_size, "%s %s %s",
6877
7469
  llama_model_arch_name(model->arch).c_str(),
@@ -7039,16 +7631,6 @@ struct llama_data_file_context : llama_data_context {
7039
7631
  *
7040
7632
  */
7041
7633
  static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
7042
- // TODO: does not support multi-sequence states
7043
- {
7044
- const auto & kv_self = ctx->kv_self;
7045
- for (uint32_t i = 0; i < kv_self.head; ++i) {
7046
- GGML_ASSERT(kv_self.cells[i].pos == (int32_t) i);
7047
- GGML_ASSERT(kv_self.cells[i].seq_id.size() == 1);
7048
- GGML_ASSERT(kv_self.cells[i].has_seq_id(0));
7049
- }
7050
- }
7051
-
7052
7634
  // copy rng
7053
7635
  {
7054
7636
  std::stringstream rng_ss;
@@ -7101,36 +7683,38 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
7101
7683
  const auto & hparams = ctx->model.hparams;
7102
7684
  const auto & cparams = ctx->cparams;
7103
7685
 
7104
- const int n_layer = hparams.n_layer;
7105
- const int n_embd = hparams.n_embd_gqa();
7106
- const int n_ctx = cparams.n_ctx;
7686
+ const auto n_layer = hparams.n_layer;
7687
+ const auto n_embd = hparams.n_embd_gqa();
7688
+ const auto n_ctx = cparams.n_ctx;
7107
7689
 
7108
- const size_t kv_size = kv_self.buf.size;
7109
- const int kv_ntok = kv_self.head;
7690
+ const size_t kv_buf_size = kv_self.buf.size;
7691
+ const uint32_t kv_head = kv_self.head;
7692
+ const uint32_t kv_size = kv_self.size;
7110
7693
 
7111
- data_ctx->write(&kv_size, sizeof(kv_size));
7112
- data_ctx->write(&kv_ntok, sizeof(kv_ntok));
7694
+ data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
7695
+ data_ctx->write(&kv_head, sizeof(kv_head));
7696
+ data_ctx->write(&kv_size, sizeof(kv_size));
7113
7697
 
7114
- if (kv_size) {
7698
+ if (kv_buf_size) {
7115
7699
  const size_t elt_size = ggml_element_size(kv_self.k);
7116
7700
 
7117
7701
  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
7118
7702
  ggml_cgraph gf{};
7119
7703
 
7120
- ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
7704
+ ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
7121
7705
  std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
7122
7706
  kout3d->data = kout3d_data.data();
7123
7707
 
7124
- ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
7708
+ ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
7125
7709
  std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
7126
7710
  vout3d->data = vout3d_data.data();
7127
7711
 
7128
7712
  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
7129
- n_embd, kv_ntok, n_layer,
7713
+ n_embd, kv_head, n_layer,
7130
7714
  elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
7131
7715
 
7132
7716
  ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
7133
- kv_ntok, n_embd, n_layer,
7717
+ kv_head, n_embd, n_layer,
7134
7718
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
7135
7719
 
7136
7720
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
@@ -7144,6 +7728,20 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
7144
7728
  data_ctx->write(kout3d_data.data(), kout3d_data.size());
7145
7729
  data_ctx->write(vout3d_data.data(), vout3d_data.size());
7146
7730
  }
7731
+
7732
+ for (uint32_t i = 0; i < kv_size; ++i) {
7733
+ const auto & cell = kv_self.cells[i];
7734
+
7735
+ const llama_pos pos = cell.pos;
7736
+ const size_t seq_id_size = cell.seq_id.size();
7737
+
7738
+ data_ctx->write(&pos, sizeof(pos));
7739
+ data_ctx->write(&seq_id_size, sizeof(seq_id_size));
7740
+
7741
+ for (auto seq_id : cell.seq_id) {
7742
+ data_ctx->write(&seq_id, sizeof(seq_id));
7743
+ }
7744
+ }
7147
7745
  }
7148
7746
  }
7149
7747
 
@@ -7215,34 +7813,36 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
7215
7813
  const int n_embd = hparams.n_embd_gqa();
7216
7814
  const int n_ctx = cparams.n_ctx;
7217
7815
 
7218
- size_t kv_size;
7219
- int kv_ntok;
7816
+ size_t kv_buf_size;
7817
+ uint32_t kv_head;
7818
+ uint32_t kv_size;
7220
7819
 
7221
- memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
7222
- memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);
7820
+ memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
7821
+ memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
7822
+ memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
7223
7823
 
7224
- if (kv_size) {
7225
- GGML_ASSERT(kv_self.buf.size == kv_size);
7824
+ if (kv_buf_size) {
7825
+ GGML_ASSERT(kv_self.buf.size == kv_buf_size);
7226
7826
 
7227
7827
  const size_t elt_size = ggml_element_size(kv_self.k);
7228
7828
 
7229
7829
  ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
7230
7830
  ggml_cgraph gf{};
7231
7831
 
7232
- ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
7832
+ ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
7233
7833
  kin3d->data = (void *) inp;
7234
7834
  inp += ggml_nbytes(kin3d);
7235
7835
 
7236
- ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
7836
+ ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
7237
7837
  vin3d->data = (void *) inp;
7238
7838
  inp += ggml_nbytes(vin3d);
7239
7839
 
7240
7840
  ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
7241
- n_embd, kv_ntok, n_layer,
7841
+ n_embd, kv_head, n_layer,
7242
7842
  elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
7243
7843
 
7244
7844
  ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
7245
- kv_ntok, n_embd, n_layer,
7845
+ kv_head, n_embd, n_layer,
7246
7846
  elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
7247
7847
 
7248
7848
  ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
@@ -7252,8 +7852,27 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
7252
7852
  ggml_free(cpy_ctx);
7253
7853
  }
7254
7854
 
7255
- ctx->kv_self.head = kv_ntok;
7855
+ ctx->kv_self.head = kv_head;
7256
7856
  ctx->kv_self.size = kv_size;
7857
+
7858
+ ctx->kv_self.cells.resize(kv_size);
7859
+
7860
+ for (uint32_t i = 0; i < kv_size; ++i) {
7861
+ llama_pos pos;
7862
+ size_t seq_id_size;
7863
+
7864
+ memcpy(&pos, inp, sizeof(pos)); inp += sizeof(pos);
7865
+ memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size);
7866
+
7867
+ ctx->kv_self.cells[i].pos = pos;
7868
+
7869
+ llama_seq_id seq_id;
7870
+
7871
+ for (size_t j = 0; j < seq_id_size; ++j) {
7872
+ memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id);
7873
+ ctx->kv_self.cells[i].seq_id.insert(seq_id);
7874
+ }
7875
+ }
7257
7876
  }
7258
7877
 
7259
7878
  const size_t nread = inp - src;
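
Session save/restore now serializes the per-cell KV metadata (the cell position plus its set of sequence ids) instead of asserting a single linear sequence, which is why the old TODO block with the three GGML_ASSERTs was dropped. A standalone sketch of the per-cell wire format used by the writer and reader hunks above; the cell type and write_bytes sink are illustrative stand-ins, not the real llama_kv_cell / llama_data_context:

#include <cstdint>
#include <cstdio>
#include <set>
#include <vector>

using pos_t    = int32_t;
using seq_id_t = int32_t;

struct cell { pos_t pos; std::set<seq_id_t> seq_id; };

// Append raw bytes to the state blob, as llama_data_context::write does.
static void write_bytes(std::vector<uint8_t> & out, const void * p, size_t n) {
    const uint8_t * b = (const uint8_t *) p;
    out.insert(out.end(), b, b + n);
}

int main() {
    std::vector<cell> cells = { {5, {0}}, {6, {0, 1}} };
    std::vector<uint8_t> blob;

    // per cell: position, number of sequence ids, then each id (mirrors the writer hunk);
    // llama_set_state_data reads the fields back in exactly the same order
    for (const auto & c : cells) {
        const size_t n_seq = c.seq_id.size();
        write_bytes(blob, &c.pos, sizeof(c.pos));
        write_bytes(blob, &n_seq, sizeof(n_seq));
        for (seq_id_t id : c.seq_id) {
            write_bytes(blob, &id, sizeof(id));
        }
    }
    printf("serialized %zu bytes for %zu cells\n", blob.size(), cells.size());
}
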
@@ -7471,6 +8090,22 @@ llama_token llama_token_eos(const struct llama_context * ctx) {
7471
8090
  llama_token llama_token_nl(const struct llama_context * ctx) {
7472
8091
  return ctx->model.vocab.linefeed_id;
7473
8092
  }
8093
+ llama_token llama_token_prefix(const struct llama_context * ctx) {
8094
+ return ctx->model.vocab.special_prefix_id;
8095
+ }
8096
+
8097
+ llama_token llama_token_middle(const struct llama_context * ctx) {
8098
+ return ctx->model.vocab.special_middle_id;
8099
+ }
8100
+
8101
+ llama_token llama_token_suffix(const struct llama_context * ctx) {
8102
+ return ctx->model.vocab.special_suffix_id;
8103
+ }
8104
+
8105
+ llama_token llama_token_eot(const struct llama_context * ctx) {
8106
+ return ctx->model.vocab.special_eot_id;
8107
+ }
8108
+
7474
8109
 
7475
8110
  int llama_tokenize(
7476
8111
  const struct llama_model * model,
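
The four new accessors (llama_token_prefix, llama_token_middle, llama_token_suffix, llama_token_eot) expose the fill-in-the-middle special tokens added to llama_vocab earlier in this diff, defaulting to ids 32007-32010. A minimal sketch of how an infill prompt can be laid out with them; the prefix/suffix/middle ordering shown is the common CodeLlama-style layout and the stand-in token id vectors are made up, so treat this as an illustration and check the format your model expects:

#include <cstdio>
#include <vector>

// Default FIM ids from this release; in real code obtain them via
// llama_token_prefix(ctx), llama_token_suffix(ctx), llama_token_middle(ctx), llama_token_eot(ctx).
const int TOK_PREFIX = 32007;
const int TOK_SUFFIX = 32008;
const int TOK_MIDDLE = 32009;
const int TOK_EOT    = 32010; // the model is expected to emit this when the middle part is done

int main() {
    // Stand-ins for what llama_tokenize would return for the code before and after the gap.
    std::vector<int> prefix_toks = { 101, 102, 103 };
    std::vector<int> suffix_toks = { 201, 202 };

    // Common infill layout: <PRE> prefix <SUF> suffix <MID>, then sample the middle.
    std::vector<int> prompt;
    prompt.push_back(TOK_PREFIX);
    prompt.insert(prompt.end(), prefix_toks.begin(), prefix_toks.end());
    prompt.push_back(TOK_SUFFIX);
    prompt.insert(prompt.end(), suffix_toks.begin(), suffix_toks.end());
    prompt.push_back(TOK_MIDDLE);

    for (int t : prompt) printf("%d ", t); // 32007 101 102 103 32008 201 202 32009
    printf("\n");
}
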
@@ -7493,35 +8128,66 @@ int llama_tokenize(
7493
8128
  return res.size();
7494
8129
  }
7495
8130
 
8131
+ static std::string llama_decode_text(const std::string & text) {
8132
+ std::string decoded_text;
8133
+ auto unicode_sequences = codepoints_from_utf8(text);
8134
+ for (auto& unicode_sequence : unicode_sequences) {
8135
+ decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence));
8136
+ }
8137
+
8138
+ return decoded_text;
8139
+ }
8140
+
7496
8141
  // does not write null-terminator to buf
7497
8142
  int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
7498
8143
  if (0 <= token && token < llama_n_vocab(model)) {
7499
- if (llama_is_normal_token(model->vocab, token)) {
7500
- std::string result = model->vocab.id_to_token[token].text;
7501
- if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
8144
+ switch (llama_vocab_get_type(model->vocab)) {
8145
+ case LLAMA_VOCAB_TYPE_SPM: {
8146
+ if (llama_is_normal_token(model->vocab, token)) {
8147
+ std::string result = model->vocab.id_to_token[token].text;
7502
8148
  llama_unescape_whitespace(result);
8149
+ if (length < (int) result.length()) {
8150
+ return -result.length();
8151
+ }
8152
+ memcpy(buf, result.c_str(), result.length());
8153
+ return result.length();
8154
+ } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
8155
+ if (length < 3) {
8156
+ return -3;
8157
+ }
8158
+ memcpy(buf, "\xe2\x96\x85", 3);
8159
+ return 3;
8160
+ } else if (llama_is_control_token(model->vocab, token)) {
8161
+ ;
8162
+ } else if (llama_is_byte_token(model->vocab, token)) {
8163
+ if (length < 1) {
8164
+ return -1;
8165
+ }
8166
+ buf[0] = llama_token_to_byte(model->vocab, token);
8167
+ return 1;
8168
+ } else {
8169
+ GGML_ASSERT(false);
7503
8170
  }
7504
- if (length < (int) result.length()) {
7505
- return -result.length();
7506
- }
7507
- memcpy(buf, result.c_str(), result.length());
7508
- return result.length();
7509
- } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
7510
- if (length < 3) {
7511
- return -3;
7512
- }
7513
- buf[0] = '\xe2';
7514
- buf[1] = '\x96';
7515
- buf[2] = '\x85';
7516
- return 3;
7517
- } else if (llama_is_control_token(model->vocab, token)) {
7518
- // do nothing
7519
- } else if (llama_is_byte_token(model->vocab, token)) {
7520
- if (length < 1) {
7521
- return -1;
8171
+ break;
8172
+ }
8173
+ case LLAMA_VOCAB_TYPE_BPE: {
8174
+ if (llama_is_normal_token(model->vocab, token)) {
8175
+ std::string result = model->vocab.id_to_token[token].text;
8176
+ result = llama_decode_text(result);
8177
+ if (length < (int) result.length()) {
8178
+ return -result.length();
8179
+ }
8180
+ memcpy(buf, result.c_str(), result.length());
8181
+ return result.length();
8182
+ } else if (llama_is_control_token(model->vocab, token)) {
8183
+ ;
8184
+ } else {
8185
+ GGML_ASSERT(false);
7522
8186
  }
7523
- buf[0] = llama_token_to_byte(model->vocab, token);
7524
- return 1;
8187
+ break;
8188
+ }
8189
+ default:
8190
+ GGML_ASSERT(false);
7525
8191
  }
7526
8192
  }
7527
8193
  return 0;
@@ -7548,14 +8214,14 @@ void llama_print_timings(struct llama_context * ctx) {
7548
8214
  const llama_timings timings = llama_get_timings(ctx);
7549
8215
 
7550
8216
  LLAMA_LOG_INFO("\n");
7551
- LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
7552
- LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
8217
+ LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
8218
+ LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
7553
8219
  __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
7554
- LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
8220
+ LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
7555
8221
  __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
7556
- LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
8222
+ LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
7557
8223
  __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
7558
- LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
8224
+ LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
7559
8225
  }
7560
8226
 
7561
8227
  void llama_reset_timings(struct llama_context * ctx) {