llama_cpp 0.10.2 → 0.10.3

@@ -198,6 +198,7 @@ enum llm_arch {
  LLM_ARCH_STABLELM,
  LLM_ARCH_QWEN,
  LLM_ARCH_PHI2,
+ LLM_ARCH_PLAMO,
  LLM_ARCH_UNKNOWN,
  };
 
@@ -216,6 +217,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
  { LLM_ARCH_STABLELM, "stablelm" },
  { LLM_ARCH_QWEN, "qwen" },
  { LLM_ARCH_PHI2, "phi2" },
+ { LLM_ARCH_PLAMO, "plamo" },
  };
 
  enum llm_kv {
@@ -352,6 +354,7 @@ enum llm_tensor {
  LLM_TENSOR_FFN_GATE,
  LLM_TENSOR_FFN_DOWN,
  LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_ACT,
  LLM_TENSOR_FFN_DOWN_EXP,
  LLM_TENSOR_FFN_GATE_EXP,
  LLM_TENSOR_FFN_UP_EXP,
@@ -420,6 +423,15 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  LLM_ARCH_GPT2,
  {
  { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_POS_EMBD, "position_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  },
  },
  {
@@ -471,6 +483,7 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
  },
  },
  {
@@ -567,6 +580,24 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_PLAMO,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
 
  {
  LLM_ARCH_UNKNOWN,
@@ -778,7 +809,7 @@ struct llama_file {
  throw std::runtime_error(format("read error: %s", strerror(errno)));
  }
  if (ret != 1) {
- throw std::runtime_error(std::string("unexpectedly reached end of file"));
+ throw std::runtime_error("unexpectedly reached end of file");
  }
  }
 
@@ -931,22 +962,22 @@ struct llama_mmap {
  #elif defined(_WIN32)
  static constexpr bool SUPPORTED = true;
 
- llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
- (void) numa;
+ llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) {
+ GGML_UNUSED(numa);
 
  size = file->size;
 
  HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
 
  HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
- DWORD error = GetLastError();
 
  if (hMapping == NULL) {
+ DWORD error = GetLastError();
  throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
  }
 
  addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
- error = GetLastError();
+ DWORD error = GetLastError();
  CloseHandle(hMapping);
 
  if (addr == NULL) {
@@ -954,7 +985,7 @@ struct llama_mmap {
  }
 
  #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
- if (prefetch) {
+ if (prefetch > 0) {
  // Advise the kernel to preload the mapped memory
  WIN32_MEMORY_RANGE_ENTRY range;
  range.VirtualAddress = addr;
@@ -977,26 +1008,26 @@ struct llama_mmap {
 
  ~llama_mmap() {
  if (!UnmapViewOfFile(addr)) {
- fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
+ LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n",
  llama_format_win_err(GetLastError()).c_str());
  }
  }
  #else
  static constexpr bool SUPPORTED = false;
 
- llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
- (void) file;
- (void) prefetch;
- (void) numa;
+ llama_mmap(struct llama_file * file, size_t prefetch = -1, bool numa = false) {
+ GGML_UNUSED(file);
+ GGML_UNUSED(prefetch);
+ GGML_UNUSED(numa);
 
- throw std::runtime_error(std::string("mmap not supported"));
+ throw std::runtime_error("mmap not supported");
  }
 
- void unmap(size_t offset, size_t len) {
- (void) offset;
- (void) len;
+ void unmap_fragment(size_t first, size_t last) {
+ GGML_UNUSED(first);
+ GGML_UNUSED(last);
 
- throw std::runtime_error(std::string("mmap not supported"));
+ throw std::runtime_error("mmap not supported");
  }
  #endif
  };
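
Note on the llama_mmap change above: the constructor's bool prefetch flag becomes a byte count, so a caller can request no prefetch hint (0), a partial one, or the whole mapping (the (size_t) -1 default), and the old unmap(offset, len) helper is renamed unmap_fragment(first, last). A minimal sketch of the call sites this implies, assuming the internal llama_file/llama_mmap helpers shown in this diff ("model.gguf" is a placeholder path):

    // hypothetical internal usage, not a public API
    llama_file file("model.gguf", "rb");
    llama_mmap map_all (&file);                            // default: hint the whole mapping
    llama_mmap map_none(&file, /*prefetch =*/ 0);          // skip the prefetch hint entirely
    llama_mmap map_some(&file, /*prefetch =*/ 16u << 20);  // hint roughly the first 16 MiB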
@@ -1172,21 +1203,27 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
  }
 
  static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
+ ggml_backend_buffer_type_t buft = nullptr;
+
  #ifdef GGML_USE_METAL
  if (n_gpu_layers > 0) {
- return ggml_backend_metal_buffer_type();
+ buft = ggml_backend_metal_buffer_type();
  }
  #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
  if (n_gpu_layers > 0) {
- return ggml_backend_cuda_buffer_type(0);
+ buft = ggml_backend_cuda_buffer_type(0);
  }
  #elif defined(GGML_USE_CUBLAS)
- return ggml_backend_cuda_host_buffer_type();
+ buft = ggml_backend_cuda_host_buffer_type();
  #elif defined(GGML_USE_CPU_HBM)
- return ggml_backend_cpu_hbm_buffer_type();
+ buft = ggml_backend_cpu_hbm_buffer_type();
  #endif
 
- return ggml_backend_cpu_buffer_type();
+ if (buft == nullptr) {
+ buft = ggml_backend_cpu_buffer_type();
+ }
+
+ return buft;
 
  GGML_UNUSED(n_gpu_layers);
  }
@@ -1223,6 +1260,10 @@ enum e_model {
  MODEL_40B,
  MODEL_65B,
  MODEL_70B,
+ MODEL_SMALL,
+ MODEL_MEDIUM,
+ MODEL_LARGE,
+ MODEL_XL,
  };
 
  static const size_t kiB = 1024;
@@ -1254,6 +1295,7 @@ struct llama_hparams {
  float f_clamp_kqv;
  float f_max_alibi_bias;
 
+
  bool operator!=(const llama_hparams & other) const {
  if (this->vocab_only != other.vocab_only) return true;
  if (this->n_vocab != other.n_vocab) return true;
@@ -1270,7 +1312,7 @@ struct llama_hparams {
  if (this->rope_finetuned != other.rope_finetuned) return true;
  if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
 
- const float EPSILON = 1e-9;
+ const float EPSILON = 1e-9f;
 
  if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
  if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
@@ -1357,6 +1399,7 @@ struct llama_layer {
  // ff bias
  struct ggml_tensor * ffn_down_b; // b2
  struct ggml_tensor * ffn_up_b; // b3
+ struct ggml_tensor * ffn_act;
  };
 
  struct llama_kv_cell {
@@ -2517,18 +2560,22 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 
  static const char * llama_model_type_name(e_model type) {
  switch (type) {
- case MODEL_1B: return "1B";
- case MODEL_3B: return "3B";
- case MODEL_7B: return "7B";
- case MODEL_8B: return "8B";
- case MODEL_13B: return "13B";
- case MODEL_15B: return "15B";
- case MODEL_30B: return "30B";
- case MODEL_34B: return "34B";
- case MODEL_40B: return "40B";
- case MODEL_65B: return "65B";
- case MODEL_70B: return "70B";
- default: return "?B";
+ case MODEL_1B: return "1B";
+ case MODEL_3B: return "3B";
+ case MODEL_7B: return "7B";
+ case MODEL_8B: return "8B";
+ case MODEL_13B: return "13B";
+ case MODEL_15B: return "15B";
+ case MODEL_30B: return "30B";
+ case MODEL_34B: return "34B";
+ case MODEL_40B: return "40B";
+ case MODEL_65B: return "65B";
+ case MODEL_70B: return "70B";
+ case MODEL_SMALL: return "0.1B";
+ case MODEL_MEDIUM: return "0.4B";
+ case MODEL_LARGE: return "0.8B";
+ case MODEL_XL: return "1.5B";
+ default: return "?B";
  }
  }
 
@@ -2738,6 +2785,26 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_PLAMO:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 40: model.type = e_model::MODEL_13B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_GPT2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ switch (hparams.n_layer) {
+ case 12: model.type = e_model::MODEL_SMALL; break;
+ case 24: model.type = e_model::MODEL_MEDIUM; break;
+ case 36: model.type = e_model::MODEL_LARGE; break;
+ case 48: model.type = e_model::MODEL_XL; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
 
  default: (void)0;
  }
@@ -3431,7 +3498,6 @@ static bool llm_load_tensors(
  case LLM_ARCH_MPT:
  {
  model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
-
  // output
  {
  ggml_backend_type backend_norm;
@@ -3469,6 +3535,9 @@ static bool llm_load_tensors(
 
  layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
  layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+ // AWQ ScaleActivation layer
+ layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend, false);
  }
  } break;
  case LLM_ARCH_STABLELM:
@@ -3619,6 +3688,105 @@ static bool llm_load_tensors(
  layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
  }
  } break;
+ case LLM_ARCH_PLAMO:
+ {
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+ // output
+ {
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;
+
+ if (n_gpu_layers > int(n_layer)) {
+ backend_norm = llama_backend_offload;
+ backend_output = llama_backend_offload_split;
+ } else {
+ backend_norm = GGML_BACKEND_CPU;
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+ }
+
+ const uint32_t n_ff = hparams.n_ff;
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
+
+ model.layers.resize(n_layer);
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+
+ layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+ layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+ layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ }
+ } break;
+ case LLM_ARCH_GPT2:
+ {
+ model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+
+ // output
+ {
+ ggml_backend_type backend_norm;
+ ggml_backend_type backend_output;
+
+ if (n_gpu_layers > int(n_layer)) {
+ backend_norm = llama_backend_offload;
+ backend_output = llama_backend_offload_split;
+ } else {
+ backend_norm = GGML_BACKEND_CPU;
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+ }
+
+ const uint32_t n_ff = hparams.n_ff;
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
+
+ model.layers.resize(n_layer);
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+ layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
+
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
+
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+ layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+ layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+ layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
+
+ layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
+ }
+ } break;
  default:
  throw std::runtime_error("unknown architecture");
  }
@@ -3954,6 +4122,7 @@ static struct ggml_tensor * llm_build_ffn(
  struct ggml_tensor * gate_b,
  struct ggml_tensor * down,
  struct ggml_tensor * down_b,
+ struct ggml_tensor * act_scales,
  llm_ffn_op_type type_op,
  llm_ffn_gate_type type_gate,
  const llm_build_cb & cb,
@@ -3998,6 +4167,10 @@ static struct ggml_tensor * llm_build_ffn(
  {
  cur = ggml_gelu(ctx, cur);
  cb(cur, "ffn_gelu", il);
+ if (act_scales != NULL) {
+ cur = ggml_div(ctx, cur, act_scales);
+ cb(cur, "ffn_act", il);
+ }
  } break;
  case LLM_FFN_RELU:
  {
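
The act_scales parameter threaded through llm_build_ffn above carries the AWQ ScaleActivation tensor loaded earlier for MPT: when it is present, the GELU output is divided element-wise by the per-channel ffn_act scales (ggml_gelu followed by ggml_div). A minimal sketch of the same arithmetic with plain vectors, using made-up values and the tanh GELU approximation, just to show what the new graph nodes compute:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<float> x      = {-1.0f, 0.0f, 0.5f, 2.0f}; // FFN activations (hypothetical)
        std::vector<float> scales = { 1.1f, 0.9f, 1.0f, 1.2f}; // per-channel ffn_act scales (hypothetical)
        for (size_t i = 0; i < x.size(); ++i) {
            // GELU (tanh approximation), then the element-wise divide added in this release
            const float g = 0.5f * x[i] * (1.0f + std::tanh(0.7978845608f * (x[i] + 0.044715f * x[i] * x[i] * x[i])));
            std::printf("%zu: %f\n", i, g / scales[i]);
        }
        return 0;
    }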
@@ -4316,6 +4489,7 @@ struct llm_build_context {
  model.layers[il].ffn_up, NULL,
  model.layers[il].ffn_gate, NULL,
  model.layers[il].ffn_down, NULL,
+ NULL,
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  cb(cur, "ffn_out", il);
  } else {
@@ -4495,6 +4669,7 @@ struct llm_build_context {
  model.layers[il].ffn_up, NULL,
  model.layers[il].ffn_gate, NULL,
  model.layers[il].ffn_down, NULL,
+ NULL,
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  cb(cur, "ffn_out", il);
  }
@@ -4609,6 +4784,7 @@ struct llm_build_context {
  model.layers[il].ffn_up, NULL,
  NULL, NULL,
  model.layers[il].ffn_down, NULL,
+ NULL,
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  cb(cur, "ffn_out", il);
  }
@@ -4713,6 +4889,7 @@ struct llm_build_context {
  model.layers[il].ffn_up, model.layers[il].ffn_up_b,
  NULL, NULL,
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ NULL,
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  cb(cur, "ffn_out", il);
  }
@@ -4917,6 +5094,7 @@ struct llm_build_context {
  model.layers[il].ffn_up, model.layers[il].ffn_up_b,
  NULL, NULL,
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ NULL,
  LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
  cb(cur, "ffn_out", il);
  }
@@ -5003,6 +5181,7 @@ struct llm_build_context {
  model.layers[il].ffn_up, NULL,
  model.layers[il].ffn_gate, NULL,
  model.layers[il].ffn_down, NULL,
+ NULL,
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  cb(cur, "ffn_out", il);
  }
@@ -5098,6 +5277,7 @@ struct llm_build_context {
  model.layers[il].ffn_up, model.layers[il].ffn_up_b,
  NULL, NULL,
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ NULL,
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  cb(cur, "ffn_out", il);
  }
@@ -5183,11 +5363,11 @@ struct llm_build_context {
  NULL,
  LLM_NORM, cb, il);
  cb(cur, "ffn_norm", il);
-
  cur = llm_build_ffn(ctx0, cur,
  model.layers[il].ffn_up, NULL,
  NULL, NULL,
  model.layers[il].ffn_down, NULL,
+ model.layers[il].ffn_act,
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  cb(cur, "ffn_out", il);
  }
@@ -5296,6 +5476,7 @@ struct llm_build_context {
  model.layers[il].ffn_up, NULL,
  model.layers[il].ffn_gate, NULL,
  model.layers[il].ffn_down, NULL,
+ NULL,
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  cb(cur, "ffn_out", il);
  }
@@ -5408,6 +5589,7 @@ struct llm_build_context {
  model.layers[il].ffn_up, NULL,
  model.layers[il].ffn_gate, NULL,
  model.layers[il].ffn_down, NULL,
+ NULL,
  LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  cb(cur, "ffn_out", il);
  }
@@ -5515,6 +5697,7 @@ struct llm_build_context {
  model.layers[il].ffn_up, model.layers[il].ffn_up_b,
  NULL, NULL,
  model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ NULL,
  LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  cb(ffn_output, "ffn_out", il);
  }
@@ -5544,6 +5727,206 @@ struct llm_build_context {
 
  return gf;
  }
+
+ struct ggml_cgraph * build_plamo() {
+ struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ cb(inpL, "inp_embd", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ cb(inp_pos, "inp_pos", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ cb(KQ_mask, "KQ_mask", -1);
+
+ // shift the entire K-cache if needed
+ if (do_rope_shift) {
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+ }
+
+ for (int il = 0; il < n_layer; ++il) {
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ struct ggml_tensor * attention_norm = cur;
+
+ // self-attention
+ {
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+ n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_custom(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+ n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ cb(Kcur, "Kcur", il);
+
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ model.layers[il].wo, NULL,
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ cb(cur, "kqv_out", il);
+ }
+ struct ggml_tensor * sa_out = cur;
+
+ cur = attention_norm;
+
+ // feed-forward network
+ {
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, NULL,
+ model.layers[il].ffn_gate, NULL,
+ model.layers[il].ffn_down, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, sa_out);
+ cb(cur, "l_out", il);
+
+ cur = ggml_add(ctx0, cur, inpL);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
+ struct ggml_cgraph * build_gpt2() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * pos;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+ cb(inpL, "inp_embd", -1);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ cb(inp_pos, "inp_pos", -1);
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ cb(KQ_mask, "KQ_mask", -1);
+
+ pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
+ cb(pos, "pos_embd", -1);
+
+ inpL = ggml_add(ctx0, inpL, pos);
+ cb(inpL, "inpL", -1);
+
+ for (int il = 0; il < n_layer; ++il) {
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm,
+ model.layers[il].attn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+ llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+ cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ cb(cur, "kqv_out", il);
+ }
+
+ // add the input
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FF
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm,
+ model.layers[il].ffn_norm_b,
+ LLM_NORM, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+ NULL, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+ NULL,
+ LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ inpL = ggml_add(ctx0, cur, ffn_inp);
+ cb(inpL, "l_out", il);
+ }
+
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.output_norm,
+ model.output_norm_b,
+ LLM_NORM, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
  };
 
  //
5699
6082
  { "ffn_gate", OFFLOAD_FUNC },
5700
6083
  { "ffn_gate_b", OFFLOAD_FUNC },
5701
6084
  { "ffn_gate_par", OFFLOAD_FUNC },
6085
+ { "ffn_act", OFFLOAD_FUNC },
5702
6086
  { "ffn_down", OFFLOAD_FUNC },
5703
6087
  { "ffn_down_b", OFFLOAD_FUNC },
5704
6088
  { "ffn_out", OFFLOAD_FUNC },
@@ -6054,6 +6438,14 @@ static struct ggml_cgraph * llama_build_graph(
6054
6438
  {
6055
6439
  result = llm.build_phi2();
6056
6440
  } break;
6441
+ case LLM_ARCH_PLAMO:
6442
+ {
6443
+ result = llm.build_plamo();
6444
+ } break;
6445
+ case LLM_ARCH_GPT2:
6446
+ {
6447
+ result = llm.build_gpt2();
6448
+ } break;
6057
6449
  default:
6058
6450
  GGML_ASSERT(false);
6059
6451
  }
@@ -9327,7 +9719,8 @@ struct llama_context * llama_new_context_with_model(
9327
9719
  ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
9328
9720
  #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
9329
9721
  if (model->n_gpu_layers > 0) {
9330
- ggml_cuda_set_scratch_size(alloc_size);
9722
+ // the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets
9723
+ ggml_cuda_set_scratch_size(alloc_size + 64);
9331
9724
  LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
9332
9725
 
9333
9726
  // calculate total VRAM usage
@@ -10289,7 +10682,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
10289
10682
  std::string result = model->vocab.id_to_token[token].text;
10290
10683
  llama_unescape_whitespace(result);
10291
10684
  if (length < (int) result.length()) {
10292
- return -result.length();
10685
+ return -(int) result.length();
10293
10686
  }
10294
10687
  memcpy(buf, result.c_str(), result.length());
10295
10688
  return result.length();
@@ -10319,7 +10712,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
10319
10712
  std::string result = model->vocab.id_to_token[token].text;
10320
10713
  result = llama_decode_text(result);
10321
10714
  if (length < (int) result.length()) {
10322
- return -result.length();
10715
+ return -(int) result.length();
10323
10716
  }
10324
10717
  memcpy(buf, result.c_str(), result.length());
10325
10718
  return result.length();
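
The two casts in the final hunks make llama_token_to_piece negate the required length as a signed int instead of negating an unsigned size_t, so a too-small buffer now reliably yields a negative return value. A minimal sketch of the resize-and-retry pattern this convention supports, assuming a loaded llama_model pointer and the signature shown in the hunk headers:

    #include <algorithm>
    #include <string>
    #include <vector>
    #include "llama.h"

    static std::string token_to_piece(const llama_model * model, llama_token token) {
        std::vector<char> buf(8);   // deliberately small first attempt
        int n = llama_token_to_piece(model, token, buf.data(), (int) buf.size());
        if (n < 0) {                // buffer too small: -n is the length actually needed
            buf.resize((size_t) -n);
            n = llama_token_to_piece(model, token, buf.data(), (int) buf.size());
        }
        return std::string(buf.data(), (size_t) std::max(n, 0));
    }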