llama-cpp-pydist 0.20.0__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- llama_cpp/binaries/{llama-b7621-bin-win-cpu-x64.zip → llama-b7631-bin-win-cpu-x64.zip} +0 -0
- {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/METADATA +146 -1
- {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/RECORD +76 -73
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +18 -6
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +3 -1
- vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +18 -0
- vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +2 -1
- vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +7 -0
- vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +4 -4
- vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +19 -0
- vendor_llama_cpp_pydist/llama.cpp/common/common.h +4 -0
- vendor_llama_cpp_pydist/llama.cpp/common/llguidance.cpp +10 -6
- vendor_llama_cpp_pydist/llama.cpp/common/regex-partial.cpp +13 -13
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +58 -14
- vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +3 -1
- vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +10 -4
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/CANN.md +4 -0
- vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +50 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +55 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +14 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +44 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +24 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +50 -29
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +16 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +9 -9
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +37 -3
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +22 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/softmax.cu +203 -6
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cu +96 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cuh +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +32 -25
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +8 -8
- vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +12 -7
- vendor_llama_cpp_pydist/llama.cpp/include/llama.h +86 -8
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +602 -18
- vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +43 -1
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.cpp +40 -13
- vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.h +2 -0
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +166 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +71 -6
- vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +2 -2
- vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +43 -11
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +1232 -170
- vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +16 -7
- vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +1 -1
- vendor_llama_cpp_pydist/llama.cpp/src/models/afmoe.cpp +9 -5
- vendor_llama_cpp_pydist/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/llama-iswa.cpp +6 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +4 -3
- vendor_llama_cpp_pydist/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- vendor_llama_cpp_pydist/llama.cpp/src/models/smallthinker.cpp +11 -5
- vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +12 -2
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +93 -4
- vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-sampler.cpp +1237 -0
- vendor_llama_cpp_pydist/llama.cpp/tests/test-regex-partial.cpp +14 -14
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +8 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4
- vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +12 -7
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +19 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +47 -5
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +3 -3
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +2 -2
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +5 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +2 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +3 -0
- vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +1 -0
- {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/LICENSE +0 -0
- {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/WHEEL +0 -0
- {llama_cpp_pydist-0.20.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/top_level.txt +0 -0
vendor_llama_cpp_pydist/llama.cpp/include/llama.h (selected hunks):

@@ -316,6 +316,11 @@ extern "C" {
         bool no_alloc;    // only load metadata and simulate memory allocations
     };
 
+    struct llama_sampler_seq_config {
+        llama_seq_id seq_id;
+        struct llama_sampler * sampler;
+    };
+
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
     // https://github.com/ggml-org/llama.cpp/pull/7544
     struct llama_context_params {

@@ -364,6 +369,12 @@ extern "C" {
         bool kv_unified;  // use a unified buffer across the input sequences when computing the attention
                           // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
                           // ref: https://github.com/ggml-org/llama.cpp/pull/14363
+
+        // [EXPERIMENTAL]
+        // backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
+        // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
+        struct llama_sampler_seq_config * samplers;
+        size_t n_samplers;
     };
 
     // model quantization parameters
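The two hunks above introduce `llama_sampler_seq_config` and the `samplers`/`n_samplers` fields of `llama_context_params`, which let a caller attach one sampler chain per sequence when the context is created. A minimal usage sketch, not part of the package: the model path and sampling parameters are illustrative, and the decode loop is omitted.

```c
#include "llama.h"

int main(void) {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // illustrative path
    if (model == NULL) {
        return 1;
    }

    // per the header note, per-sequence samplers must be chains created with llama_sampler_chain_init
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(1234));

    // attach the chain to sequence 0 via the new llama_context_params fields
    struct llama_sampler_seq_config seq_cfg = {
        /*.seq_id  =*/ 0,
        /*.sampler =*/ chain,
    };

    llama_context_params cparams = llama_context_default_params();
    cparams.samplers   = &seq_cfg; // caller keeps the chain alive for the lifetime of the context
    cparams.n_samplers = 1;

    llama_context * ctx = llama_init_from_model(model, cparams);

    // ... build batches, call llama_decode(), read back sampled tokens (see the next hunk) ...

    llama_free(ctx);
    llama_sampler_free(chain);
    llama_model_free(model);
    return 0;
}
```

Per the header comments, the chains must outlive the context, and the configuration is marked [EXPERIMENTAL].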
@@ -992,6 +1003,32 @@ extern "C" {
     // otherwise: float[n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
 
+    //
+    // backend sampling API [EXPERIMENTAL]
+    // note: use only if the llama_context was created with at least one llama_sampler_seq_config
+    //
+
+    // Get the backend sampled token for the ith token.
+    // Returns LLAMA_TOKEN_NULL if no token was sampled.
+    LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
+
+    // Get the backend sampled probabilites for the ith token
+    // The index matches llama_get_sampled_token_ith().
+    // Returns NULL if no probabilites were generated.
+    LLAMA_API float *   llama_get_sampled_probs_ith      (struct llama_context * ctx, int32_t i);
+    LLAMA_API uint32_t  llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
+
+    // Get the backend sampled logits for the ith token
+    // Returns NULL if no logits were sampled.
+    LLAMA_API float *   llama_get_sampled_logits_ith      (struct llama_context * ctx, int32_t i);
+    LLAMA_API uint32_t  llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i);
+
+    // Get the backend sampled candidates (token ids) for the ith token
+    // These are needed to map probability/logit indices to vocab token ids.
+    // Returns NULL if no candidates were sampled.
+    LLAMA_API llama_token * llama_get_sampled_candidates_ith      (struct llama_context * ctx, int32_t i);
+    LLAMA_API uint32_t      llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i);
+
     //
     // Vocab
     //
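This hunk adds the read-back side of the experimental backend sampling API. A hedged sketch of how the getters might be used after `llama_decode()` on a context configured as above; the helper name, the output index `i`, and the printing are illustrative, not from the diff.

```c
#include <stdio.h>
#include "llama.h"

// print the backend-sampled result for output index i (hypothetical helper)
static void print_backend_sample(struct llama_context * ctx, int32_t i) {
    const llama_token tok = llama_get_sampled_token_ith(ctx, i);
    if (tok == LLAMA_TOKEN_NULL) {
        return; // nothing was sampled on the backend for this index
    }
    printf("sampled token: %d\n", tok);

    // probs and candidates are parallel arrays: candidates[j] is the vocab id that probs[j] refers to
    const float       * probs      = llama_get_sampled_probs_ith(ctx, i);
    const llama_token * candidates = llama_get_sampled_candidates_ith(ctx, i);
    const uint32_t      n_probs    = llama_get_sampled_probs_count_ith(ctx, i);

    if (probs != NULL && candidates != NULL) {
        for (uint32_t j = 0; j < n_probs; ++j) {
            printf("  candidate %d: p = %.4f\n", candidates[j], probs[j]);
        }
    }
}
```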
@@ -1163,11 +1200,16 @@ extern "C" {
     //
     //    llama_sampler_free(smpl);
     //
-    // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
-    //
 
     typedef void * llama_sampler_context_t;
 
+    struct llama_sampler_data {
+        struct ggml_tensor * logits;
+        struct ggml_tensor * probs;
+        struct ggml_tensor * sampled;
+        struct ggml_tensor * candidates;
+    };
+
     // user code can implement the interface below in order to create custom llama_sampler
     struct llama_sampler_i {
         const char * (*name) (const struct llama_sampler * smpl); // can be NULL

@@ -1177,17 +1219,45 @@ extern "C" {
         struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL
         void                   (*free)  (      struct llama_sampler * smpl); // can be NULL if ctx is NULL
 
-        //
-        //
+        // [EXPERIMENTAL]
+        // backend sampling interface:
+
+        // return true if the backend supports all ops needed by the sampler
+        // note: call once per sampler
+        bool (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft);
+
+        // call after .backend_apply()
+        void (*backend_accept)(
+                struct llama_sampler * smpl,
+                struct ggml_context * ctx,
+                struct ggml_cgraph * gf,
+                struct ggml_tensor * selected_token);
+
+        // call after .backend_init()
+        void (*backend_apply)(
+                struct llama_sampler * smpl,
+                struct ggml_context * ctx,
+                struct ggml_cgraph * gf,
+                struct llama_sampler_data * data);
+
+        // called before graph execution to set inputs for the current ubatch
+        void (*backend_set_input)(struct llama_sampler * smpl);
     };
 
     struct llama_sampler {
-        const struct llama_sampler_i * iface;
-        llama_sampler_context_t        ctx;
+        struct llama_sampler_i * iface;
+
+        llama_sampler_context_t ctx;
     };
 
+    // [EXPERIMENTAL]
+    // attach a sampler to the context
+    // note: prefer initializing the context with llama_context_params.samplers when possible
+    // note: changing the samplers of a context can cause graph reallocations and degraded performance
+    LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
+
     // mirror of llama_sampler_i:
-    LLAMA_API struct llama_sampler * llama_sampler_init  (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
+    LLAMA_API struct llama_sampler * llama_sampler_init  ( struct llama_sampler_i * iface, llama_sampler_context_t ctx);
     LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
     LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
     LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
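These two hunks add the `llama_sampler_data` tensor bundle, the backend callbacks in `llama_sampler_i` (`backend_init`, `backend_apply`, `backend_accept`, `backend_set_input`), and `llama_set_sampler` for attaching a chain to an already-created context. A fragment sketching the attach path only; `ctx` is an existing context, the sequence id and chain contents are illustrative, and the ownership assumption mirrors the header note about keeping chains alive. The custom backend callbacks are not exercised here.

```c
// attach a different sampler chain to sequence 1 of an existing context
llama_sampler * chain_seq1 = llama_sampler_chain_init(llama_sampler_chain_default_params());
llama_sampler_chain_add(chain_seq1, llama_sampler_init_top_k(20));
llama_sampler_chain_add(chain_seq1, llama_sampler_init_dist(42));

if (!llama_set_sampler(ctx, /*seq_id =*/ 1, chain_seq1)) {
    // the attach failed; assuming the caller still owns the chain, free it and fall back to CPU-side sampling
    llama_sampler_free(chain_seq1);
}
```

Per the header comments, prefer passing the chains through `llama_context_params.samplers` at creation time, since swapping samplers afterwards can cause graph reallocations and degraded performance.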
@@ -1203,7 +1273,15 @@ extern "C" {
 
     // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
     LLAMA_API void llama_sampler_chain_add( struct llama_sampler * chain, struct llama_sampler * smpl);
-
+
+    // return NULL if:
+    // - the sampler is NULL
+    // - the sampler is not a llama_sampler_chain
+    // - the index is out of bounds, unless i == -1
+    // - if i == -1, returns the chain itself (can be used to check if the sampler is a chain)
+    LLAMA_API struct llama_sampler * llama_sampler_chain_get( struct llama_sampler * chain, int32_t i);
+
+    // the total number of samplers in the chain
     LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain);
 
     // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
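The last hunk documents `llama_sampler_chain_get`, including the `i == -1` probe for whether a sampler is a chain. A small sketch using that behaviour to enumerate a chain; the helper name and the printing are illustrative.

```c
#include <stdio.h>
#include "llama.h"

// list the members of a sampler chain; returns the member count, or -1 if `smpl` is not a chain
static int list_chain(struct llama_sampler * smpl) {
    // per the header comment, i == -1 returns the chain itself, so it doubles as an "is this a chain?" check
    if (llama_sampler_chain_get(smpl, -1) == NULL) {
        return -1;
    }
    const int n = llama_sampler_chain_n(smpl);
    for (int32_t i = 0; i < n; ++i) {
        struct llama_sampler * member = llama_sampler_chain_get(smpl, i);
        printf("chain[%d] = %s\n", i, llama_sampler_name(member));
    }
    return n;
}
```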