llama-cpp-pydist 0.20.0__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- llama_cpp/binaries/{llama-b7621-bin-win-cpu-x64.zip → llama-b7631-bin-win-cpu-x64.zip} +0 -0
- {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/METADATA +146 -1
- {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/RECORD +76 -73
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +18 -6
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +3 -1
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +18 -0
- vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +2 -1
- vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +7 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +4 -4
- vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +19 -0
- vendor_llama_cpp_pydist/llama.cpp/common/common.h +4 -0
- vendor_llama_cpp_pydist/llama.cpp/common/llguidance.cpp +10 -6
- vendor_llama_cpp_pydist/llama.cpp/common/regex-partial.cpp +13 -13
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +58 -14
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +3 -1
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +10 -4
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/CANN.md +4 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +50 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +55 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +14 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +44 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +24 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +50 -29
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +16 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +9 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +37 -3
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +22 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/softmax.cu +203 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cu +96 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cuh +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +32 -25
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +8 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +12 -7
- vendor_llama_cpp_pydist/llama.cpp/include/llama.h +86 -8
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +602 -18
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +43 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.cpp +40 -13
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +166 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +71 -6
- vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +2 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +43 -11
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +1232 -170
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +16 -7
- vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/models/afmoe.cpp +9 -5
- vendor_llama_cpp_pydist/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama-iswa.cpp +6 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +4 -3
- vendor_llama_cpp_pydist/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/smallthinker.cpp +11 -5
- vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +12 -2
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +93 -4
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-sampler.cpp +1237 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-regex-partial.cpp +14 -14
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +8 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +12 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +19 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +47 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +3 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +2 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +5 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +2 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +1 -0
- {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/LICENSE +0 -0
- {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/WHEEL +0 -0
- {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/top_level.txt +0 -0

vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp

```diff
@@ -578,6 +578,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
     GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
 
+    // TODO: Handle SWA metadata similarly when models start implementing it
     // rope_freq_scale (inverse of the kv) is optional
     float ropescale = 0.0f;
     if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
@@ -586,10 +587,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     }
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
 
-    // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
-    hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
-    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
-
     ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
 
     // non-transformer models do not have attention heads
@@ -677,6 +674,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.f_attn_temp_scale = 0.1f;
                     hparams.f_attn_temp_offset = 1.0f;
                     hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+
+                    hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                 }
 
                 switch (hparams.n_expert) {
@@ -722,6 +723,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 if (hparams.n_swa > 0) {
                     hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                     hparams.set_swa_pattern(4);
+
+                    hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                 } else {
                     hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                 }
@@ -1243,7 +1248,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 if (found_swa && hparams.n_swa > 0) {
                     uint32_t swa_period = 8;
                     hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                    hparams.rope_freq_scale_train_swa = 1.0f;
                     ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
                     ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
                     hparams.set_swa_pattern(swa_period);
@@ -1309,7 +1313,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.n_swa = 4096; // default value of gemma 2
                 hparams.set_swa_pattern(2);
                 hparams.attn_soft_cap = true;
+                hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
 
+                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
@@ -1334,8 +1341,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                     hparams.set_swa_pattern(6);
 
-                    hparams.rope_freq_base_train_swa = 10000.0f;
-                    hparams.rope_freq_scale_train_swa = 1.0f;
+                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                 } else {
                     hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                 }
@@ -1365,10 +1371,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.set_swa_pattern(5);
 
                 hparams.n_layer_kv_from_start = 20;
-                hparams.rope_freq_base_train_swa = 10000.0f;
-                hparams.rope_freq_scale_train_swa = 1.0f;
                 hparams.f_attention_scale = 1.0f;
 
+                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
@@ -1384,9 +1389,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.set_swa_pattern(6);
 
                 hparams.causal_attn = false; // embeddings do not use causal attention
-                hparams.rope_freq_base_train_swa = 10000.0f;
-                hparams.rope_freq_scale_train_swa = 1.0f;
 
+                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
@@ -1525,7 +1529,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                 hparams.set_swa_pattern(4);
+                hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
 
+                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1564,6 +1571,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 if (found_swa && hparams.n_swa > 0) {
                     hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                     hparams.set_swa_pattern(4);
+
+                    hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+                    hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
+                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                 } else {
                     hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                 }
@@ -1906,6 +1917,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                     hparams.n_swa = 4096;
                     hparams.set_swa_pattern(4);
+
+                    hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                 }
 
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@@ -2208,6 +2223,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                 hparams.set_swa_pattern(2);
 
+                hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+
                 switch (hparams.n_layer) {
                     case 24: type = LLM_TYPE_20B; break;
                     case 36: type = LLM_TYPE_120B; break;
@@ -2252,6 +2271,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                     hparams.n_swa = 4096;
                     hparams.set_swa_pattern(4, true);
+
+                    hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+                    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+                    ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                 } else {
                     hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                     hparams.n_no_rope_layer_step = hparams.n_layer;
@@ -7098,6 +7121,10 @@ void llama_model::print_info() const {
     LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
     LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+        LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
+        LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
+    }
     LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
     LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
     LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
@@ -7910,12 +7937,17 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
     // add on pooling layer
     llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
 
+    // add backend sampling layers (if any)
+    llm->build_sampling();
+
     // if the gguf model was converted with --sentence-transformers-dense-modules
     // there will be two additional dense projection layers
     // dense linear projections are applied after pooling
     // TODO: move reranking logic here and generalize
     llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
 
+    llm->res->set_outputs();
+
     return llm->res->get_gf();
 }
 
```
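The `load_hparams` hunks above all follow one recurring pattern: each sliding-window-attention (SWA) architecture now sets its own `rope_freq_base_train_swa` / `rope_freq_scale_train_swa` defaults and then lets an optional `LLM_KV_ROPE_FREQ_BASE_SWA` GGUF key override the base frequency, instead of relying on a single global fallback. Below is a minimal standalone sketch of that default-then-optional-override idiom; the struct, the `get_key` helper, and the `"rope.freq_base_swa"` key name are simplified, hypothetical stand-ins for illustration, not the real llama.cpp types or key strings.

```cpp
#include <cstdio>
#include <map>
#include <optional>
#include <string>

// Hypothetical, simplified stand-in for llama.cpp's hparams; only the field
// names mirror the diff above.
struct hparams_t {
    float rope_freq_base_train      = 500000.0f; // regular (non-SWA) layers
    float rope_freq_scale_train     = 1.0f;
    float rope_freq_base_train_swa  = 0.0f;      // sliding-window layers
    float rope_freq_scale_train_swa = 1.0f;
};

// Fake GGUF metadata: a key that is present overrides the default, an absent
// key leaves it untouched (the "false" / optional flavour of ml.get_key).
using metadata_t = std::map<std::string, float>;

static std::optional<float> get_key(const metadata_t & md, const std::string & key) {
    auto it = md.find(key);
    if (it == md.end()) {
        return std::nullopt;
    }
    return it->second;
}

static void load_swa_rope_hparams(hparams_t & hp, const metadata_t & md) {
    // 1. Per-architecture default: SWA layers reuse the non-SWA training
    //    values (some architectures in the diff pin the scale to 1.0 instead).
    hp.rope_freq_base_train_swa  = hp.rope_freq_base_train;
    hp.rope_freq_scale_train_swa = hp.rope_freq_scale_train;

    // 2. Optional override from the GGUF file ("rope.freq_base_swa" is an
    //    illustrative key name, not necessarily the real one).
    if (auto v = get_key(md, "rope.freq_base_swa")) {
        hp.rope_freq_base_train_swa = *v;
    }
}

int main() {
    hparams_t hp;
    load_swa_rope_hparams(hp, {{"rope.freq_base_swa", 10000.0f}});
    std::printf("freq_base_swa = %.1f, freq_scale_swa = %g\n",
                hp.rope_freq_base_train_swa, hp.rope_freq_scale_train_swa);
    return 0;
}
```

Keeping a per-architecture default while allowing newer GGUF files to carry an explicit SWA rope base means existing conversions load exactly as before, and the new `freq_base_swa` / `freq_scale_swa` lines in `print_info()` make the effective values visible at load time.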