@fugood/llama.node 0.3.13 → 0.3.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +1 -1
- package/src/LlamaContext.cpp +98 -76
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +89 -10
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/CMakeLists.txt +9 -1
- package/src/llama.cpp/cmake/common.cmake +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +3 -3
- package/src/llama.cpp/common/arg.cpp +132 -13
- package/src/llama.cpp/common/chat.cpp +960 -266
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +33 -174
- package/src/llama.cpp/common/common.h +27 -67
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +45 -7
- package/src/llama.cpp/common/speculative.cpp +10 -9
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +45 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
- package/src/llama.cpp/examples/infill/infill.cpp +2 -2
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +79 -34
- package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
- package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +196 -108
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
- package/src/llama.cpp/examples/server/server.cpp +113 -101
- package/src/llama.cpp/examples/server/utils.hpp +94 -105
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +263 -151
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
- package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +29 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
- package/src/llama.cpp/ggml/src/ggml.c +93 -5
- package/src/llama.cpp/include/llama.h +105 -27
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/CMakeLists.txt +5 -2
- package/src/llama.cpp/src/llama-adapter.cpp +19 -20
- package/src/llama.cpp/src/llama-adapter.h +11 -9
- package/src/llama.cpp/src/llama-arch.cpp +123 -16
- package/src/llama.cpp/src/llama-arch.h +19 -0
- package/src/llama.cpp/src/llama-batch.h +2 -2
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2253 -1222
- package/src/llama.cpp/src/llama-context.h +214 -77
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +182 -182
- package/src/llama.cpp/src/llama-grammar.h +12 -3
- package/src/llama.cpp/src/llama-graph.cpp +1662 -0
- package/src/llama.cpp/src/llama-graph.h +574 -0
- package/src/llama.cpp/src/llama-hparams.cpp +8 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -0
- package/src/llama.cpp/src/llama-io.cpp +15 -0
- package/src/llama.cpp/src/llama-io.h +35 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
- package/src/llama.cpp/src/llama-kv-cache.h +178 -109
- package/src/llama.cpp/src/llama-memory.cpp +1 -0
- package/src/llama.cpp/src/llama-memory.h +21 -0
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-model.cpp +8230 -122
- package/src/llama.cpp/src/llama-model.h +34 -1
- package/src/llama.cpp/src/llama-quant.cpp +10 -1
- package/src/llama.cpp/src/llama-sampling.cpp +43 -10
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +51 -9837
- package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +593 -395
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -55
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
- /package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/src/llama-model.h:

@@ -2,7 +2,9 @@
 
 #include "llama.h"
 #include "llama-arch.h"
+#include "llama-graph.h"
 #include "llama-hparams.h"
+#include "llama-memory.h"
 #include "llama-vocab.h"
 
 #include <memory>
@@ -10,6 +12,8 @@
 #include <unordered_map>
 #include <vector>
 
+struct llama_cparams;
+struct llama_ubatch;
 struct llama_model_loader;
 
 // available models
@@ -25,6 +29,7 @@ enum llm_type {
     LLM_TYPE_109M,
     LLM_TYPE_137M,
     LLM_TYPE_160M,
+    LLM_TYPE_190M,
     LLM_TYPE_220M,
     LLM_TYPE_250M,
     LLM_TYPE_270M,
@@ -41,6 +46,7 @@ enum llm_type {
     LLM_TYPE_1_6B,
     LLM_TYPE_2B,
     LLM_TYPE_2_8B,
+    LLM_TYPE_2_9B,
     LLM_TYPE_3B,
     LLM_TYPE_4B,
     LLM_TYPE_6B,
@@ -256,6 +262,20 @@ struct llama_layer {
     struct ggml_tensor * time_mix_receptance_b = nullptr;
     struct ggml_tensor * time_mix_gate = nullptr;
 
+    // rwkv7
+    struct ggml_tensor * time_mix_w0 = nullptr;
+    struct ggml_tensor * time_mix_a0 = nullptr;
+    struct ggml_tensor * time_mix_a1 = nullptr;
+    struct ggml_tensor * time_mix_a2 = nullptr;
+    struct ggml_tensor * time_mix_v0 = nullptr;
+    struct ggml_tensor * time_mix_v1 = nullptr;
+    struct ggml_tensor * time_mix_v2 = nullptr;
+    struct ggml_tensor * time_mix_g1 = nullptr;
+    struct ggml_tensor * time_mix_g2 = nullptr;
+    struct ggml_tensor * time_mix_k_k = nullptr;
+    struct ggml_tensor * time_mix_k_a = nullptr;
+    struct ggml_tensor * time_mix_r_k = nullptr;
+
     struct ggml_tensor * time_mix_ln = nullptr;
     struct ggml_tensor * time_mix_ln_b = nullptr;
     struct ggml_tensor * time_mix_output = nullptr;
@@ -347,7 +367,7 @@ struct llama_model {
     std::string desc() const;
 
     size_t size() const;
-    size_t
+    size_t n_tensors() const;
     size_t n_devices() const;
 
     // total number of parameters in the model
@@ -362,9 +382,22 @@ struct llama_model {
 
     const struct ggml_tensor * get_tensor(const char * name) const;
 
+    // TODO: move this to new llm_arch_model_i interface
+    llama_memory_i * create_memory() const; // TODO: params
+
+    // TODO: move this to new llm_arch_model_i interface
+    llm_graph_result_ptr build_graph(
+            const llm_graph_params & params,
+            ggml_cgraph * gf,
+            llm_graph_type type) const;
+
 private:
     struct impl;
     std::unique_ptr<impl> pimpl;
 };
 
 const char * llm_type_name(llm_type type);
+
+// For internal test use
+// TODO: remove
+const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);
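The accessor declared at the end of the llama-model.h hunks above is marked "for internal test use" and returns the full name-to-tensor map of a loaded model. A minimal sketch of how it could be walked, assuming the internal header is reachable from the vendored llama.cpp sources in this package and `model` points to a loaded llama_model (this is not part of the public C API):

#include <cstdio>
#include <string>

#include "llama-model.h" // internal header from the vendored llama.cpp sources

// Sketch only: dump every tensor name and its first two dimensions through the
// test-only accessor declared in the hunk above.
static void dump_tensor_map(const llama_model * model) {
    for (const auto & entry : llama_internal_get_tensor_map(model)) {
        const std::string & name   = entry.first;
        const ggml_tensor * tensor = entry.second;
        std::printf("%-48s %lld x %lld\n", name.c_str(),
                    (long long) tensor->ne[0], (long long) tensor->ne[1]);
    }
}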
package/src/llama.cpp/src/llama-quant.cpp:

@@ -756,10 +756,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // NOTE: can't use LLM_TN here because the layer number is not known
         quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
 
-        // do not quantize RWKV's
+        // do not quantize RWKV's small yet 2D weights
         quantize &= name.find("time_mix_first.weight") == std::string::npos;
+        quantize &= name.find("time_mix_w0.weight") == std::string::npos;
         quantize &= name.find("time_mix_w1.weight") == std::string::npos;
         quantize &= name.find("time_mix_w2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_v0.weight") == std::string::npos;
+        quantize &= name.find("time_mix_v1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_v2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_a0.weight") == std::string::npos;
+        quantize &= name.find("time_mix_a1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_a2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_g1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_g2.weight") == std::string::npos;
         quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
         quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
         quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
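The hunk above keeps the new RWKV7 projection weights unquantized using the same substring test as the existing RWKV exclusions. A self-contained sketch of that idiom, restricted to the names added in this diff (the real check in llama-quant.cpp covers many more tensors and other conditions):

#include <string>
#include <vector>

// Sketch of the exclusion idiom above: a tensor is kept unquantized if its
// name contains any of the listed substrings.
static bool skip_quantization(const std::string & name) {
    static const std::vector<std::string> rwkv7_keep = {
        "time_mix_w0.weight", "time_mix_v0.weight", "time_mix_v1.weight",
        "time_mix_v2.weight", "time_mix_a0.weight", "time_mix_a1.weight",
        "time_mix_a2.weight", "time_mix_g1.weight", "time_mix_g2.weight",
    };
    for (const auto & key : rwkv7_keep) {
        if (name.find(key) != std::string::npos) {
            return true;
        }
    }
    return false;
}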
package/src/llama.cpp/src/llama-sampling.cpp:

@@ -1449,7 +1449,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
        const char ** trigger_words,
        size_t num_trigger_words,
        const llama_token * trigger_tokens,
-       size_t num_trigger_tokens);
+       size_t num_trigger_tokens,
+       const char ** trigger_patterns,
+       size_t num_trigger_patterns);
 
 static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_grammar *) smpl->ctx;
@@ -1457,12 +1459,14 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
         return;
     }
 
-    std::vector<const char *>
-
-
+    std::vector<const char *> trigger_patterns_c;
+    trigger_patterns_c.reserve(ctx->grammar->trigger_patterns.size());
+    for (auto & trigger_pattern : ctx->grammar->trigger_patterns) {
+        trigger_patterns_c.push_back(trigger_pattern.pattern.c_str());
     }
+
     auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(),
-            ctx->grammar->lazy,
+            ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(),
             ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size());
 
     llama_grammar_free_impl(ctx->grammar);
@@ -1472,7 +1476,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
 static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
 
-    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0);
+    auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0, nullptr, 0);
 
     // copy the state
     {
@@ -1516,15 +1520,33 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
        const char ** trigger_words,
        size_t num_trigger_words,
        const llama_token * trigger_tokens,
-       size_t num_trigger_tokens) {
+       size_t num_trigger_tokens,
+       const char ** trigger_patterns,
+       size_t num_trigger_patterns) {
     auto * ctx = new llama_sampler_grammar;
 
     if (grammar_str != nullptr && grammar_str[0] != '\0') {
+        // TODO: remove trigger_words support.
+        if (trigger_words != nullptr && num_trigger_words > 0) {
+            GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0);
+            std::string trigger_pattern("[\\s\\S]*?(");
+            for (size_t i = 0; i < num_trigger_words; ++i) {
+                static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
+                if (i > 0) {
+                    trigger_pattern += "|";
+                }
+                trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
+            }
+            trigger_pattern += ")[\\s\\S]*";
+            auto trigger_pattern_c = trigger_pattern.c_str();
+            trigger_patterns = &trigger_pattern_c;
+            num_trigger_patterns = 1;
+        }
         *ctx = {
             /* .vocab = */ vocab,
             /* .grammar_str = */ grammar_str,
             /* .grammar_root = */ grammar_root,
-            /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy,
+            /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
         };
     } else {
         *ctx = {
@@ -1545,7 +1567,7 @@ struct llama_sampler * llama_sampler_init_grammar(
        const struct llama_vocab * vocab,
        const char * grammar_str,
        const char * grammar_root) {
-    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0);
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0, nullptr, 0);
 }
 
 struct llama_sampler * llama_sampler_init_grammar_lazy(
@@ -1556,7 +1578,18 @@ struct llama_sampler * llama_sampler_init_grammar_lazy(
        size_t num_trigger_words,
        const llama_token * trigger_tokens,
        size_t num_trigger_tokens) {
-    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens);
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens, nullptr, 0);
+}
+
+struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+       const struct llama_vocab * vocab,
+       const char * grammar_str,
+       const char * grammar_root,
+       const char ** trigger_patterns,
+       size_t num_trigger_patterns,
+       const llama_token * trigger_tokens,
+       size_t num_trigger_tokens) {
+    return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, nullptr, 0, trigger_tokens, num_trigger_tokens, trigger_patterns, num_trigger_patterns);
 }
 
 // penalties
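These hunks generalize lazy grammars from literal trigger words to regular-expression trigger patterns and add llama_sampler_init_grammar_lazy_patterns as the new entry point; the old word-based path now just builds a single "[\s\S]*?(word1|word2)[\s\S]*" pattern internally. A minimal usage sketch, assuming the declaration is exported through llama.h as the definition here implies, that `vocab` comes from a loaded model, and that "<tool_call>" is a hypothetical trigger marker:

#include "llama.h"

// Sketch only: the grammar stays dormant until the generated text matches the
// whole-output regex, after which it constrains subsequent sampling.
static llama_sampler * make_lazy_grammar_sampler(const llama_vocab * vocab,
                                                 const char * grammar_gbnf) {
    const char * trigger_patterns[] = {
        "[\\s\\S]*?(<tool_call>)[\\s\\S]*", // hypothetical trigger marker
    };
    return llama_sampler_init_grammar_lazy_patterns(
        vocab, grammar_gbnf, "root",
        trigger_patterns, /* num_trigger_patterns = */ 1,
        /* trigger_tokens = */ nullptr, /* num_trigger_tokens = */ 0);
}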
package/src/llama.cpp/src/llama-vocab.cpp:

@@ -16,6 +16,7 @@
 #include <queue>
 #include <set>
 #include <unordered_map>
+#include <cctype>
 
 //
 // helpers
@@ -392,6 +393,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_GPT4O:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -1592,6 +1600,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             } else if (
                 tokenizer_pre == "megrez") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+            } else if (
+                tokenizer_pre == "gpt-4o") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
+                clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }