@fugood/llama.node 0.3.16 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +238 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +6 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +160 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1760 -534
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
package/src/llama.cpp/common/common.h

@@ -121,10 +121,6 @@ struct common_grammar_trigger {
     common_grammar_trigger_type type;
     std::string value;
     llama_token token = LLAMA_TOKEN_NULL;
-
-    // T can only be nlohmann::ordered_json
-    template <class T> T to_json() const;
-    template <class T> static common_grammar_trigger from_json(const T & in);
 };
 
 // sampling parameters
@@ -184,6 +180,13 @@ struct common_params_sampling {
     std::string print() const;
 };
 
+struct common_params_model {
+    std::string path    = ""; // model local path // NOLINT
+    std::string url     = ""; // model url to download // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+};
+
 struct common_params_speculative {
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
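The new common_params_model struct above consolidates the four ways a model can be sourced. A minimal sketch of filling it in (the fields come from the hunk above; the selection logic and file names are illustrative, not from this diff):

    // Sketch only: choose a local file or a Hugging Face source.
    common_params_model src;
    if (have_local_file) {
        src.path = "models/7B/model-q4_0.gguf";   // local path wins
    } else {
        src.hf_repo = "ggml-org/models";          // repo to pull from
        src.hf_file = "model-q4_0.gguf";          // file within that repo
    }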
@@ -197,19 +200,11 @@ struct common_params_speculative {
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
-
-    std::string model = "";     // draft model for speculative decoding // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;
 };
 
 struct common_params_vocoder {
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
-
-    std::string model = "";     // model path // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;
 
     std::string speaker_file = ""; // speaker file path // NOLINT
 
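Downstream of this consolidation, call sites that read the old flat strings now reach through the embedded struct, e.g. (illustrative accesses, based only on the fields shown above):

    // Before: params.speculative.model was itself the draft-model path string.
    // After:  the path is a field of the embedded common_params_model.
    const std::string & draft_path = params.speculative.model.path;
    const std::string & voc_repo   = params.vocoder.model.hf_repo;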
@@ -267,12 +262,10 @@ struct common_params {
     struct common_params_speculative speculative;
     struct common_params_vocoder vocoder;
 
-    std::string model = ""; // model path // NOLINT
+    struct common_params_model model;
+
     std::string model_alias = ""; // model alias // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
     std::string hf_token = ""; // HF token // NOLINT
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
     std::string prompt = ""; // NOLINT
     std::string system_prompt = ""; // NOLINT
     std::string prompt_file = ""; // store the external prompt file name // NOLINT
@@ -286,6 +279,7 @@ struct common_params {
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 
     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
     std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
@@ -347,7 +341,9 @@ struct common_params {
     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
 
     // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector // NOLINT
+    struct common_params_model mmproj;
+    bool mmproj_use_gpu = true; // use GPU for multimodal model
+    bool no_mmproj = false; // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
 
     // embedding
@@ -546,26 +542,11 @@ struct llama_model_params common_model_params_to_llama  (      common_params & params);
 struct llama_context_params   common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct llama_model * common_load_model_from_url(
-    const std::string & model_url,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-struct llama_model * common_load_model_from_hf(
-    const std::string & repo,
-    const std::string & remote_path,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-std::pair<std::string, std::string> common_get_hf_file(
-    const std::string & hf_repo_with_tag,
-    const std::string & hf_token);
-
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 
+std::string get_model_endpoint();
+
 //
 // Batch utils
 //
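With the from-URL/from-HF loader declarations removed from the public header, a caller that used them directly would instead describe the source on common_params and let the common init path resolve it. A hedged before/after sketch (common_init_from_params is assumed to be the surviving entry point; that plumbing is not shown in this diff):

    // Before (declarations removed above):
    //   llama_model * m = common_load_model_from_hf(repo, file, local_path, hf_token, mparams);

    // After (sketch): describe the source; resolution/download happens inside common init.
    common_params params;
    params.model.hf_repo = "ggml-org/models";       // illustrative repo
    params.model.hf_file = "model-q4_0.gguf";       // illustrative file
    // auto init = common_init_from_params(params); // assumed entry point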
package/src/llama.cpp/common/json-schema-to-grammar.cpp

@@ -16,6 +16,9 @@ using json = nlohmann::ordered_json;
 static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
     auto has_max = max_items != std::numeric_limits<int>::max();
 
+    if (max_items == 0) {
+        return "";
+    }
     if (min_items == 0 && max_items == 1) {
         return item_rule + "?";
     }
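The added guard means a repetition capped at zero items now yields an empty rule instead of falling through to the general bounded-repetition logic. Roughly (the first two results follow from the lines above; the bounded form is an assumption, since that part of the function is not shown):

    build_repetition("item", 0, 0); // => ""      (new early return)
    build_repetition("item", 0, 1); // => "item?" (unchanged path shown above)
    build_repetition("item", 2, 4); // => a bounded form such as "item{2,4}" (assumed)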
package/src/llama.cpp/common/llguidance.cpp

@@ -11,25 +11,24 @@ struct llama_sampler_llg {
     std::string grammar_kind;
     std::string grammar_data;
     LlgTokenizer * tokenizer;
-    LlgConstraint * grammar;
-    LlgMaskResult llg_res;
-    bool has_llg_res;
+    LlgMatcher * grammar;
 };
 
-static LlgConstraint * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
-                                             const char * grammar_data) {
+static LlgMatcher * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
+                                          const char * grammar_data) {
     LlgConstraintInit cinit;
     llg_constraint_init_set_defaults(&cinit, tokenizer);
     const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL");
     if (log_level && *log_level) {
         cinit.log_stderr_level = atoi(log_level);
     }
-    auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data);
-    if (llg_get_error(c)) {
-        LOG_ERR("llg error: %s\n", llg_get_error(c));
-        llg_free_constraint(c);
+    auto c = llg_new_matcher(&cinit, grammar_kind, grammar_data);
+    if (llg_matcher_get_error(c)) {
+        LOG_ERR("llg error: %s\n", llg_matcher_get_error(c));
+        llg_free_matcher(c);
         return nullptr;
     }
+
     return c;
 }
 
@@ -40,39 +39,29 @@ static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) {
 static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) {
     auto * ctx = (llama_sampler_llg *) smpl->ctx;
     if (ctx->grammar) {
-        LlgCommitResult res;
-        llg_commit_token(ctx->grammar, token, &res);
-        ctx->has_llg_res = false;
+        llg_matcher_consume_token(ctx->grammar, token);
     }
 }
 
 static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_llg *) smpl->ctx;
     if (ctx->grammar) {
-        if (!ctx->has_llg_res) {
-            if (llg_compute_mask(ctx->grammar, &ctx->llg_res) == 0) {
-                ctx->has_llg_res = true;
+        const uint32_t * mask = llg_matcher_get_mask(ctx->grammar);
+        if (mask == nullptr) {
+            if (llg_matcher_compute_mask(ctx->grammar) == 0) {
+                mask = llg_matcher_get_mask(ctx->grammar);
             } else {
-                LOG_ERR("llg error: %s\n", llg_get_error(ctx->grammar));
-                llg_free_constraint(ctx->grammar);
+                LOG_ERR("llg error: %s\n", llg_matcher_get_error(ctx->grammar));
+                llg_free_matcher(ctx->grammar);
                 ctx->grammar = nullptr;
+                return;
             }
         }
-        if (ctx->has_llg_res) {
-            if (ctx->llg_res.is_stop) {
-                for (size_t i = 0; i < cur_p->size; ++i) {
-                    if (!llama_vocab_is_eog(ctx->vocab, cur_p->data[i].id)) {
-                        cur_p->data[i].logit = -INFINITY;
-                    }
-                }
-            } else {
-                const uint32_t * mask = ctx->llg_res.sample_mask;
-                for (size_t i = 0; i < cur_p->size; ++i) {
-                    auto token = cur_p->data[i].id;
-                    if ((mask[token / 32] & (1 << (token % 32))) == 0) {
-                        cur_p->data[i].logit = -INFINITY;
-                    }
-                }
-            }
-        }
+
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            auto token = cur_p->data[i].id;
+            if ((mask[token / 32] & (1 << (token % 32))) == 0) {
+                cur_p->data[i].logit = -INFINITY;
             }
         }
     }
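The rewritten apply path filters logits against a packed bitmask: one bit per token, 32 tokens per uint32_t word. A worked instance of the test in the loop above (token id chosen arbitrarily):

    llama_token token = 70;
    size_t   word = token / 32;         // word 2 of the mask
    uint32_t bit  = 1u << (token % 32); // bit 6 within that word
    // (mask[word] & bit) == 0 means the token is disallowed, so its
    // logit is forced to -INFINITY, exactly as in the loop above.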
@@ -80,14 +69,9 @@ static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
 
 static void llama_sampler_llg_reset(llama_sampler * smpl) {
     auto * ctx = (llama_sampler_llg *) smpl->ctx;
-    if (!ctx->grammar) {
-        return;
+    if (ctx->grammar) {
+        llg_matcher_reset(ctx->grammar);
     }
-
-    auto * grammar_new = llama_sampler_llg_new(ctx->tokenizer, ctx->grammar_kind.c_str(), ctx->grammar_data.c_str());
-    llg_free_constraint(ctx->grammar);
-    ctx->grammar = grammar_new;
-    ctx->has_llg_res = false;
 }
 
 static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
@@ -102,7 +86,7 @@ static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
         if (ctx->grammar) {
             result_ctx->grammar_kind = ctx->grammar_kind;
             result_ctx->grammar_data = ctx->grammar_data;
-            result_ctx->grammar = llg_clone_constraint(ctx->grammar);
+            result_ctx->grammar = llg_clone_matcher(ctx->grammar);
             result_ctx->tokenizer = llg_clone_tokenizer(ctx->tokenizer);
         }
     }
@@ -114,7 +98,7 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
     const auto * ctx = (llama_sampler_llg *) smpl->ctx;
 
     if (ctx->grammar) {
-        llg_free_constraint(ctx->grammar);
+        llg_free_matcher(ctx->grammar);
         llg_free_tokenizer(ctx->tokenizer);
     }
 
@@ -239,9 +223,11 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * grammar_kind,
             /* .grammar_data = */ grammar_data,
             /* .tokenizer = */ tokenizer,
             /* .grammar = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
-            /* .llg_res = */ {},
-            /* .has_llg_res = */ false,
         };
+        if (ctx->grammar) {
+            GGML_ASSERT(((size_t) llama_vocab_n_tokens(vocab) + 31) / 32 * 4 ==
+                        llg_matcher_get_mask_byte_size(ctx->grammar));
+        }
     } else {
         *ctx = {
             /* .vocab = */ vocab,
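The new assertion pins down the mask layout used by the apply path: one bit per vocabulary token, rounded up to whole 32-bit words. For an illustrative 128,256-token vocabulary, (128256 + 31) / 32 = 4008 words, and 4008 * 4 = 16,032 bytes, which must match llg_matcher_get_mask_byte_size().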
@@ -249,15 +235,12 @@
             /* .grammar_data = */ {},
             /* .tokenizer = */ nullptr,
             /* .grammar = */ nullptr,
-            /* .llg_res = */ {},
-            /* .has_llg_res = */ false,
         };
     }
 
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_llg_i,
-        /* .ctx = */ ctx
-    );
+        /* .ctx = */ ctx);
 }
 
 #else
package/src/llama.cpp/common/minja/chat-template.hpp

@@ -9,10 +9,19 @@
 #pragma once
 
 #include "minja.hpp"
-#include <json.hpp>
+
+#include <chrono>
+#include <cstddef>
+#include <cstdio>
+#include <exception>
+#include <iomanip>
+#include <memory>
+#include <sstream>
 #include <string>
 #include <vector>
 
+#include <json.hpp>
+
 using json = nlohmann::ordered_json;
 
 namespace minja {
@@ -425,7 +434,7 @@ class chat_template {
                 auto obj = json {
                     {"tool_calls", tool_calls},
                 };
-                if (!content.is_null() && content != "") {
+                if (!content.is_null() && !content.empty()) {
                     obj["content"] = content;
                 }
                 message["content"] = obj.dump(2);
@@ -435,13 +444,12 @@ class chat_template {
             if (polyfill_tool_responses && role == "tool") {
                 message["role"] = "user";
                 auto obj = json {
-                    {"tool_response", json {
-                        {"content", message.at("content")},
-                    }},
+                    {"tool_response", json::object()},
                 };
                 if (message.contains("name")) {
-                    obj["tool_response"]["name"] = message.at("name");
+                    obj["tool_response"]["tool"] = message.at("name");
                 }
+                obj["tool_response"]["content"] = message.at("content");
                 if (message.contains("tool_call_id")) {
                     obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
                 }
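Net effect on the polyfilled tool message: the tool's name now lands under "tool" rather than "name", and "content" is always attached. A small sketch with an illustrative input, using nlohmann json as the code above does:

    // Input (illustrative): role=tool, name=get_weather, tool_call_id=call_1
    json obj = { {"tool_response", json::object()} };
    obj["tool_response"]["tool"]         = "get_weather";   // previously keyed "name"
    obj["tool_response"]["content"]      = "{\"temp\":21}"; // now set unconditionally
    obj["tool_response"]["tool_call_id"] = "call_1";        // unchanged when present
    // obj.dump() => {"tool_response":{"tool":"get_weather","content":"{\"temp\":21}","tool_call_id":"call_1"}}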
@@ -510,7 +518,7 @@ class chat_template {
     static nlohmann::ordered_json add_system(const nlohmann::ordered_json & messages, const std::string & system_prompt) {
         json messages_with_system = messages;
 
-        if (messages_with_system.size() > 0 && messages_with_system[0].at("role") == "system") {
+        if (!messages_with_system.empty() && messages_with_system[0].at("role") == "system") {
             std::string existing_system = messages_with_system.at(0).at("content");
             messages_with_system[0] = json {
                 {"role", "system"},