llama-cpp-pydist 0.20.0__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. llama_cpp/binaries/{llama-b7621-bin-win-cpu-x64.zip → llama-b7631-bin-win-cpu-x64.zip} +0 -0
  2. {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/METADATA +146 -1
  3. {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/RECORD +76 -73
  4. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +18 -6
  5. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +3 -1
  6. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +18 -0
  7. vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +2 -1
  8. vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +7 -0
  9. vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +4 -4
  10. vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +19 -0
  11. vendor_llama_cpp_pydist/llama.cpp/common/common.h +4 -0
  12. vendor_llama_cpp_pydist/llama.cpp/common/llguidance.cpp +10 -6
  13. vendor_llama_cpp_pydist/llama.cpp/common/regex-partial.cpp +13 -13
  14. vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +58 -14
  15. vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +3 -1
  16. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +10 -4
  17. vendor_llama_cpp_pydist/llama.cpp/docs/backend/CANN.md +4 -0
  18. vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +50 -0
  19. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +55 -0
  20. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +14 -0
  21. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +44 -0
  22. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +24 -0
  23. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +50 -29
  24. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +16 -0
  25. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +9 -9
  26. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +37 -3
  27. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +22 -8
  28. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/softmax.cu +203 -6
  29. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cu +96 -0
  30. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cuh +3 -0
  31. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  32. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  33. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +32 -25
  34. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +8 -8
  35. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +12 -7
  36. vendor_llama_cpp_pydist/llama.cpp/include/llama.h +86 -8
  37. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +602 -18
  38. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +43 -1
  39. vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.cpp +40 -13
  40. vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.h +2 -0
  41. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +166 -2
  42. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +71 -6
  43. vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +2 -2
  44. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +43 -11
  45. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +1232 -170
  46. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +16 -7
  47. vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +1 -1
  48. vendor_llama_cpp_pydist/llama.cpp/src/models/afmoe.cpp +9 -5
  49. vendor_llama_cpp_pydist/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  50. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  51. vendor_llama_cpp_pydist/llama.cpp/src/models/llama-iswa.cpp +6 -2
  52. vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +4 -3
  53. vendor_llama_cpp_pydist/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  54. vendor_llama_cpp_pydist/llama.cpp/src/models/smallthinker.cpp +11 -5
  55. vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +12 -2
  56. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +93 -4
  57. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-sampler.cpp +1237 -0
  58. vendor_llama_cpp_pydist/llama.cpp/tests/test-regex-partial.cpp +14 -14
  59. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +8 -0
  60. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4
  61. vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
  62. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +12 -7
  63. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +19 -0
  64. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +47 -5
  65. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +3 -3
  66. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +3 -0
  67. vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +2 -2
  68. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +5 -0
  69. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +3 -0
  70. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +3 -0
  71. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +2 -0
  72. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +3 -0
  73. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +1 -0
  74. {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/LICENSE +0 -0
  75. {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/WHEEL +0 -0
  76. {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/top_level.txt +0 -0
vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp  (+43 -11)

@@ -578,6 +578,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
     GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
 
+    // TODO: Handle SWA metadata similarly when models start implementing it
     // rope_freq_scale (inverse of the kv) is optional
     float ropescale = 0.0f;
     if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
@@ -586,10 +587,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     }
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
 
-    // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
-    hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
-    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-
     ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
 
     // non-transformer models do not have attention heads
@@ -677,6 +674,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         hparams.f_attn_temp_scale = 0.1f;
         hparams.f_attn_temp_offset = 1.0f;
         hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+
+        hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+        hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
     }
 
     switch (hparams.n_expert) {
@@ -722,6 +723,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     if (hparams.n_swa > 0) {
         hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
         hparams.set_swa_pattern(4);
+
+        hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+        hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
     } else {
         hparams.swa_type = LLAMA_SWA_TYPE_NONE;
     }
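The two hunks above, like most of the load_hparams hunks that follow, apply one and the same pattern: rather than a single global default for the sliding-window rope parameters, each SWA-capable architecture now seeds rope_freq_base_train_swa / rope_freq_scale_train_swa from its full-attention values and then lets optional GGUF metadata (LLM_KV_ROPE_FREQ_BASE_SWA) override the base frequency. The sketch below is only an illustration of that default-then-optional-override pattern; hparams_t, get_key_optional, and the key string "rope.freq_base_swa" are hypothetical stand-ins, not llama.cpp's real llama_model_loader API or GGUF key names.

```cpp
#include <map>
#include <string>

// Hypothetical, stripped-down hparams; the real struct has many more fields.
struct hparams_t {
    float rope_freq_base_train      = 500000.0f;
    float rope_freq_scale_train     = 1.0f;
    float rope_freq_base_train_swa  = 10000.0f;
    float rope_freq_scale_train_swa = 1.0f;
};

// Stand-in for an optional metadata read (cf. ml.get_key(key, value, false)):
// returns false and leaves `out` untouched when the key is absent.
static bool get_key_optional(const std::map<std::string, float> & kv,
                             const std::string & key, float & out) {
    auto it = kv.find(key);
    if (it == kv.end()) {
        return false;
    }
    out = it->second;
    return true;
}

// The per-architecture pattern from the diff: default the SWA rope parameters to the
// full-attention values, then apply an optional override for the SWA base frequency.
static void load_swa_rope(hparams_t & hp, const std::map<std::string, float> & gguf_kv) {
    hp.rope_freq_base_train_swa  = hp.rope_freq_base_train;
    hp.rope_freq_scale_train_swa = hp.rope_freq_scale_train;
    get_key_optional(gguf_kv, "rope.freq_base_swa", hp.rope_freq_base_train_swa);
}

int main() {
    hparams_t hp;
    load_swa_rope(hp, {});                                   // SWA inherits 500000.0f
    load_swa_rope(hp, {{"rope.freq_base_swa", 10000.0f}});   // base overridden to 10000.0f
    return 0;
}
```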
@@ -1243,7 +1248,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     if (found_swa && hparams.n_swa > 0) {
         uint32_t swa_period = 8;
         hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-        hparams.rope_freq_scale_train_swa = 1.0f;
         ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
         ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
         hparams.set_swa_pattern(swa_period);
@@ -1309,7 +1313,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     hparams.n_swa = 4096; // default value of gemma 2
     hparams.set_swa_pattern(2);
     hparams.attn_soft_cap = true;
+    hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
 
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
     ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
     ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
@@ -1334,8 +1341,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
         hparams.set_swa_pattern(6);
 
-        hparams.rope_freq_base_train_swa = 10000.0f;
-        hparams.rope_freq_scale_train_swa = 1.0f;
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
     } else {
         hparams.swa_type = LLAMA_SWA_TYPE_NONE;
     }
@@ -1365,10 +1371,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     hparams.set_swa_pattern(5);
 
     hparams.n_layer_kv_from_start = 20;
-    hparams.rope_freq_base_train_swa = 10000.0f;
-    hparams.rope_freq_scale_train_swa = 1.0f;
     hparams.f_attention_scale = 1.0f;
 
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
@@ -1384,9 +1389,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     hparams.set_swa_pattern(6);
 
     hparams.causal_attn = false; // embeddings do not use causal attention
-    hparams.rope_freq_base_train_swa = 10000.0f;
-    hparams.rope_freq_scale_train_swa = 1.0f;
 
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
     ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
     ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
@@ -1525,7 +1529,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     {
         hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
         hparams.set_swa_pattern(4);
+        hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+        hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
 
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
         ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
         ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
         ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1564,6 +1571,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     if (found_swa && hparams.n_swa > 0) {
         hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
         hparams.set_swa_pattern(4);
+
+        hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+        hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
     } else {
         hparams.swa_type = LLAMA_SWA_TYPE_NONE;
     }
@@ -1906,6 +1917,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
         hparams.n_swa = 4096;
         hparams.set_swa_pattern(4);
+
+        hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+        hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
     }
 
     ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@@ -2208,6 +2223,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
     hparams.set_swa_pattern(2);
 
+    hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+
     switch (hparams.n_layer) {
         case 24: type = LLM_TYPE_20B; break;
         case 36: type = LLM_TYPE_120B; break;
@@ -2252,6 +2271,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
         hparams.n_swa = 4096;
         hparams.set_swa_pattern(4, true);
+
+        hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+        hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
     } else {
         hparams.swa_type = LLAMA_SWA_TYPE_NONE;
         hparams.n_no_rope_layer_step = hparams.n_layer;
@@ -7098,6 +7121,10 @@ void llama_model::print_info() const {
     LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
     LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+        LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
+        LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
+    }
     LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
     LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
     LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
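For readers skimming the new log lines: %.1f prints the SWA base frequency with one decimal place, and %g uses the shortest floating-point representation. A tiny standalone illustration of just that formatting, with made-up values (the real output depends entirely on the loaded model's metadata):

```cpp
#include <cstdio>

int main() {
    // Hypothetical values, used only to show the printf-style formats from the added log lines.
    const float freq_base_swa  = 10000.0f;
    const float freq_scale_swa = 1.0f;
    std::printf("freq_base_swa  = %.1f\n", freq_base_swa);  // prints "freq_base_swa  = 10000.0"
    std::printf("freq_scale_swa = %g\n",   freq_scale_swa); // prints "freq_scale_swa = 1"
    return 0;
}
```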
@@ -7910,12 +7937,17 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
     // add on pooling layer
     llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
 
+    // add backend sampling layers (if any)
+    llm->build_sampling();
+
     // if the gguf model was converted with --sentence-transformers-dense-modules
     // there will be two additional dense projection layers
     // dense linear projections are applied after pooling
     // TODO: move reranking logic here and generalize
     llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
 
+    llm->res->set_outputs();
+
     return llm->res->get_gf();
 }
 
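The last hunk threads two new steps into build_graph: backend sampling layers are appended right after pooling, and the graph result's outputs are finalized only after the optional dense projections have run. The sketch below illustrates just that ordering; graph_builder, graph_result, and the integer "nodes" are hypothetical stand-ins, not llama.cpp's actual llm_graph_* types or ggml graph machinery.

```cpp
#include <vector>

// Hypothetical result object; the real code accumulates tensors into a ggml_cgraph.
struct graph_result {
    std::vector<int> nodes;      // pretend graph nodes
    bool outputs_set = false;
    void set_outputs() { outputs_set = true; }            // mark final tensors as outputs
    const std::vector<int> & get_gf() const { return nodes; }
};

// Hypothetical builder; each build_* step may append zero or more nodes.
struct graph_builder {
    graph_result res;
    void build_pooling()   { res.nodes.push_back(1); }    // pooling layer
    void build_sampling()  { res.nodes.push_back(2); }    // backend sampling layers (if any)
    void build_dense_out() { res.nodes.push_back(3); }    // optional dense projections
};

// Ordering as in the hunk above: sampling is built after pooling, and outputs are
// only fixed once every optional stage has had a chance to append nodes.
const std::vector<int> & build_graph(graph_builder & llm) {
    llm.build_pooling();
    llm.build_sampling();
    llm.build_dense_out();
    llm.res.set_outputs();
    return llm.res.get_gf();
}

int main() {
    graph_builder llm;
    const std::vector<int> & gf = build_graph(llm);
    return (gf.size() == 3 && llm.res.outputs_set) ? 0 : 1;
}
```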