llama-cpp-pydist 0.20.0__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. llama_cpp/binaries/{llama-b7621-bin-win-cpu-x64.zip → llama-b7631-bin-win-cpu-x64.zip} +0 -0
  2. {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/METADATA +146 -1
  3. {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/RECORD +76 -73
  4. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +18 -6
  5. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +3 -1
  6. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +18 -0
  7. vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +2 -1
  8. vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +7 -0
  9. vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +4 -4
  10. vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +19 -0
  11. vendor_llama_cpp_pydist/llama.cpp/common/common.h +4 -0
  12. vendor_llama_cpp_pydist/llama.cpp/common/llguidance.cpp +10 -6
  13. vendor_llama_cpp_pydist/llama.cpp/common/regex-partial.cpp +13 -13
  14. vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +58 -14
  15. vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +3 -1
  16. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +10 -4
  17. vendor_llama_cpp_pydist/llama.cpp/docs/backend/CANN.md +4 -0
  18. vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +50 -0
  19. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +55 -0
  20. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +14 -0
  21. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +44 -0
  22. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +24 -0
  23. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +50 -29
  24. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +16 -0
  25. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +9 -9
  26. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +37 -3
  27. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +22 -8
  28. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/softmax.cu +203 -6
  29. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cu +96 -0
  30. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cuh +3 -0
  31. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  32. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  33. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +32 -25
  34. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +8 -8
  35. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +12 -7
  36. vendor_llama_cpp_pydist/llama.cpp/include/llama.h +86 -8
  37. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +602 -18
  38. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +43 -1
  39. vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.cpp +40 -13
  40. vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.h +2 -0
  41. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +166 -2
  42. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +71 -6
  43. vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +2 -2
  44. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +43 -11
  45. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +1232 -170
  46. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +16 -7
  47. vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +1 -1
  48. vendor_llama_cpp_pydist/llama.cpp/src/models/afmoe.cpp +9 -5
  49. vendor_llama_cpp_pydist/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  50. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  51. vendor_llama_cpp_pydist/llama.cpp/src/models/llama-iswa.cpp +6 -2
  52. vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +4 -3
  53. vendor_llama_cpp_pydist/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  54. vendor_llama_cpp_pydist/llama.cpp/src/models/smallthinker.cpp +11 -5
  55. vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +12 -2
  56. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +93 -4
  57. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-sampler.cpp +1237 -0
  58. vendor_llama_cpp_pydist/llama.cpp/tests/test-regex-partial.cpp +14 -14
  59. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +8 -0
  60. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4
  61. vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
  62. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +12 -7
  63. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +19 -0
  64. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +47 -5
  65. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +3 -3
  66. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +3 -0
  67. vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +2 -2
  68. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +5 -0
  69. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +3 -0
  70. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +3 -0
  71. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +2 -0
  72. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +3 -0
  73. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +1 -0
  74. {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/LICENSE +0 -0
  75. {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/WHEEL +0 -0
  76. {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/top_level.txt +0 -0
vendor_llama_cpp_pydist/llama.cpp/include/llama.h
@@ -316,6 +316,11 @@ extern "C" {
         bool no_alloc;    // only load metadata and simulate memory allocations
     };
 
+    struct llama_sampler_seq_config {
+        llama_seq_id           seq_id;
+        struct llama_sampler * sampler;
+    };
+
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
     //       https://github.com/ggml-org/llama.cpp/pull/7544
     struct llama_context_params {
@@ -364,6 +369,12 @@ extern "C" {
         bool kv_unified; // use a unified buffer across the input sequences when computing the attention
                          // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
                          // ref: https://github.com/ggml-org/llama.cpp/pull/14363
+
+        // [EXPERIMENTAL]
+        // backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
+        // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
+        struct llama_sampler_seq_config * samplers;
+        size_t                            n_samplers;
     };
 
     // model quantization parameters
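
Together, the two hunks above add per-sequence backend (GPU) sampler configuration at context creation time. Below is a minimal sketch of how a caller might wire this up, assuming the existing llama.h helpers (llama_sampler_chain_init, llama_sampler_init_top_k/temp/dist, llama_init_from_model) and a previously loaded model; the function name and parameter values are illustrative only.

#include "llama.h"

// sketch: create a context with one backend sampler chain attached to sequence 0
// (per the header comments, the caller must keep the chain alive for the lifetime of the context)
static struct llama_context * init_ctx_with_backend_sampling(struct llama_model * model) {
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(1234)); // RNG seed

    // static so the config outlives this function, matching the lifetime requirement above
    static struct llama_sampler_seq_config seq_cfg[1];
    seq_cfg[0].seq_id  = 0;
    seq_cfg[0].sampler = chain;

    struct llama_context_params cparams = llama_context_default_params();
    cparams.samplers   = seq_cfg;
    cparams.n_samplers = 1;

    return llama_init_from_model(model, cparams);
}
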
@@ -992,6 +1003,32 @@ extern "C" {
     // otherwise: float[n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
 
+    //
+    // backend sampling API [EXPERIMENTAL]
+    // note: use only if the llama_context was created with at least one llama_sampler_seq_config
+    //
+
+    // Get the backend sampled token for the ith token.
+    // Returns LLAMA_TOKEN_NULL if no token was sampled.
+    LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
+
+    // Get the backend sampled probabilities for the ith token.
+    // The index matches llama_get_sampled_token_ith().
+    // Returns NULL if no probabilities were generated.
+    LLAMA_API float *  llama_get_sampled_probs_ith      (struct llama_context * ctx, int32_t i);
+    LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
+
+    // Get the backend sampled logits for the ith token.
+    // Returns NULL if no logits were sampled.
+    LLAMA_API float *  llama_get_sampled_logits_ith      (struct llama_context * ctx, int32_t i);
+    LLAMA_API uint32_t llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i);
+
+    // Get the backend sampled candidates (token ids) for the ith token.
+    // These are needed to map probability/logit indices to vocab token ids.
+    // Returns NULL if no candidates were sampled.
+    LLAMA_API llama_token * llama_get_sampled_candidates_ith      (struct llama_context * ctx, int32_t i);
+    LLAMA_API uint32_t      llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i);
+
     //
     // Vocab
     //
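
A usage sketch for the new getters follows, assuming a context configured as in the previous example and a batch whose last token has logits enabled; the CPU fallback chain (cpu_chain) and the assumption that the probability and candidate arrays are parallel follow from the comments above but are not spelled out in the diff.

#include <stdio.h>
#include "llama.h"

// sketch: read the backend-sampled token for the last output of a decoded batch,
// falling back to a regular CPU-side sampler chain when nothing was sampled on the backend
static llama_token sample_last(struct llama_context * lctx, struct llama_batch batch, struct llama_sampler * cpu_chain) {
    if (llama_decode(lctx, batch) != 0) {
        return LLAMA_TOKEN_NULL;
    }

    const int32_t i = batch.n_tokens - 1;

    llama_token tok = llama_get_sampled_token_ith(lctx, i);
    if (tok == LLAMA_TOKEN_NULL) {
        tok = llama_sampler_sample(cpu_chain, lctx, i); // no backend-sampled token for this output
    }

    // optional: inspect the backend-sampled distribution; candidates map
    // probability indices back to vocab token ids
    const float       * probs = llama_get_sampled_probs_ith(lctx, i);
    const llama_token * cands = llama_get_sampled_candidates_ith(lctx, i);
    if (probs && cands) {
        const uint32_t n = llama_get_sampled_probs_count_ith(lctx, i);
        for (uint32_t k = 0; k < n; ++k) {
            printf("candidate %d: p = %.4f\n", cands[k], probs[k]);
        }
    }

    return tok;
}
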
@@ -1163,11 +1200,16 @@ extern "C" {
     //
     //    llama_sampler_free(smpl);
     //
-    // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
-    //
 
     typedef void * llama_sampler_context_t;
 
+    struct llama_sampler_data {
+        struct ggml_tensor * logits;
+        struct ggml_tensor * probs;
+        struct ggml_tensor * sampled;
+        struct ggml_tensor * candidates;
+    };
+
     // user code can implement the interface below in order to create custom llama_sampler
     struct llama_sampler_i {
         const char *           (*name)  (const struct llama_sampler * smpl); // can be NULL
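
llama_sampler_data plays roughly the role that llama_token_data_array plays for CPU sampling: each field is a ggml tensor that a backend sampler stage reads or writes inside the compute graph. The following is only an illustration of how the struct is threaded through the backend_apply callback declared in the next hunk; it is a hypothetical greedy stage built from standard ggml ops, not code from the diff.

#include "ggml.h"
#include "llama.h"

// sketch (hypothetical): a greedy backend stage that selects the argmax of the logits
// and records it as the sampled token inside the compute graph
static void greedy_backend_apply(
        struct llama_sampler      * smpl,
        struct ggml_context       * ctx,
        struct ggml_cgraph        * gf,
        struct llama_sampler_data * data) {
    (void) smpl; // a stateless stage needs no sampler context

    data->sampled = ggml_argmax(ctx, data->logits);

    ggml_build_forward_expand(gf, data->sampled);
}
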
@@ -1177,17 +1219,45 @@ extern "C" {
         struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL
         void                   (*free)  (      struct llama_sampler * smpl); // can be NULL if ctx is NULL
 
-        // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
-        //void (*apply_ggml) (struct llama_sampler * smpl, ...);
+        // [EXPERIMENTAL]
+        // backend sampling interface:
+
+        // return true if the backend supports all ops needed by the sampler
+        // note: call once per sampler
+        bool (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft);
+
+        // call after .backend_apply()
+        void (*backend_accept)(
+                struct llama_sampler * smpl,
+                struct ggml_context  * ctx,
+                struct ggml_cgraph   * gf,
+                struct ggml_tensor   * selected_token);
+
+        // call after .backend_init()
+        void (*backend_apply)(
+                struct llama_sampler      * smpl,
+                struct ggml_context       * ctx,
+                struct ggml_cgraph        * gf,
+                struct llama_sampler_data * data);
+
+        // called before graph execution to set inputs for the current ubatch
+        void (*backend_set_input)(struct llama_sampler * smpl);
     };
 
     struct llama_sampler {
-        const struct llama_sampler_i * iface;
-        llama_sampler_context_t        ctx;
+        struct llama_sampler_i * iface;
+
+        llama_sampler_context_t  ctx;
     };
 
+    // [EXPERIMENTAL]
+    // attach a sampler to the context
+    // note: prefer initializing the context with llama_context_params.samplers when possible
+    // note: changing the samplers of a context can cause graph reallocations and degraded performance
+    LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
+
     // mirror of llama_sampler_i:
-    LLAMA_API struct llama_sampler * llama_sampler_init  (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
+    LLAMA_API struct llama_sampler * llama_sampler_init  (      struct llama_sampler_i * iface, llama_sampler_context_t ctx);
     LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
     LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
     LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
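
For contexts that already exist, the new llama_set_sampler entry point attaches or replaces a chain at runtime. A short sketch follows; it assumes that attachment can fail (for example if the backend does not support the required ops, which the diff only implies through the bool return) and that ownership stays with the caller, as stated for llama_context_params.samplers.

#include "llama.h"

// sketch: attach a greedy chain to sequence 0 of an existing context
// note: per the header comment, this may cause graph reallocations, so avoid doing it per token
static struct llama_sampler * attach_greedy_sampler(struct llama_context * lctx) {
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_greedy());

    if (!llama_set_sampler(lctx, /*seq_id =*/ 0, chain)) {
        llama_sampler_free(chain); // attachment failed; the caller still owns the chain (assumption)
        return NULL;
    }

    // keep the returned chain alive until the context is freed or the sampler is replaced
    return chain;
}
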
@@ -1203,7 +1273,15 @@ extern "C" {
 
     // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
     LLAMA_API void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl);
-    LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
+
+    // returns NULL if:
+    //   - the sampler is NULL
+    //   - the sampler is not a llama_sampler_chain
+    //   - the index is out of bounds and i != -1
+    // if i == -1, returns the chain itself (can be used to check if the sampler is a chain)
+    LLAMA_API struct llama_sampler * llama_sampler_chain_get(struct llama_sampler * chain, int32_t i);
+
+    // the total number of samplers in the chain
     LLAMA_API int llama_sampler_chain_n(const struct llama_sampler * chain);
 
     // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
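
Because llama_context_params.samplers and llama_set_sampler both require sampler chains, the new i == -1 behaviour gives callers a cheap validity check before handing a sampler to the context; a minimal sketch:

#include "llama.h"

// sketch: use i == -1 to verify that a sampler is actually a chain
// before using it as a per-sequence backend sampler
static bool is_sampler_chain(struct llama_sampler * smpl) {
    return llama_sampler_chain_get(smpl, -1) != NULL;
}
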