@fugood/llama.node 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +8 -9
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +43 -9
- package/src/llama.cpp/.github/workflows/docker.yml +3 -0
- package/src/llama.cpp/CMakeLists.txt +7 -4
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +0 -2
- package/src/llama.cpp/common/arg.cpp +642 -607
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +79 -281
- package/src/llama.cpp/common/common.h +130 -100
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +116 -108
- package/src/llama.cpp/common/sampling.h +20 -20
- package/src/llama.cpp/docs/build.md +37 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +14 -14
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
- package/src/llama.cpp/examples/infill/infill.cpp +40 -86
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/clip.cpp +1 -0
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +37 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
- package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
- package/src/llama.cpp/examples/main/main.cpp +64 -109
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
- package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
- package/src/llama.cpp/examples/server/server.cpp +553 -691
- package/src/llama.cpp/examples/server/utils.hpp +312 -25
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +128 -96
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +53 -393
- package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
- package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
- package/src/llama.cpp/include/llama.h +67 -33
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-sampling.cpp +745 -105
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +49 -9
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +2636 -2406
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
- package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +1 -0
- package/src/llama.cpp/tests/test-sampling.cpp +162 -137
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
- /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
|
@@ -24,6 +24,22 @@
|
|
|
24
24
|
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
|
|
25
25
|
|
|
26
26
|
using json = nlohmann::ordered_json;
|
|
27
|
+
using llama_tokens = std::vector<llama_token>;
|
|
28
|
+
|
|
29
|
+
// Slot-scoped logging helpers. Each one forwards to the LOG_* macro of the same level and
// prefixes the message with "slot", the name of the calling function (__func__, printed
// through "%12.*s" with a precision of 12), the slot's `id` and the slot's `id_task`.
// `slot` may be any expression exposing `.id` and `.id_task`.
// NOTE(review): __VA_ARGS__ is expanded right after a comma, so presumably at least one
// argument has to follow `fmt` - confirm against the LOG_* definitions.
#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
|
|
30
|
+
#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
|
|
31
|
+
#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
|
|
32
|
+
#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
|
|
33
|
+
|
|
34
|
+
// Same scheme with an "srv" prefix and no slot information: only the calling function's
// name is prepended to the message.
#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
|
35
|
+
#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
|
36
|
+
#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
|
37
|
+
#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
|
38
|
+
|
|
39
|
+
// Same scheme again, with a "que" prefix.
#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
|
40
|
+
#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
|
41
|
+
#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
|
42
|
+
#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
|
|
27
43
|
|
|
28
44
|
// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
|
|
29
45
|
enum error_type {
|
|
@@ -52,12 +68,240 @@ static T json_value(const json & body, const std::string & key, const T & defaul
|
|
|
52
68
|
}
|
|
53
69
|
|
|
54
70
|
//
|
|
55
|
-
//
|
|
71
|
+
// tokenizer and input processing utils
|
|
56
72
|
//
|
|
57
73
|
|
|
74
|
+
static bool json_is_array_of_numbers(const json & data) {
|
|
75
|
+
if (data.is_array()) {
|
|
76
|
+
for (const auto & e : data) {
|
|
77
|
+
if (!e.is_number_integer()) {
|
|
78
|
+
return false;
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
return true;
|
|
82
|
+
}
|
|
83
|
+
return false;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
// is array having BOTH numbers & strings?
|
|
87
|
+
static bool json_is_array_of_mixed_numbers_strings(const json & data) {
|
|
88
|
+
bool seen_string = false;
|
|
89
|
+
bool seen_number = false;
|
|
90
|
+
if (data.is_array()) {
|
|
91
|
+
for (const auto & e : data) {
|
|
92
|
+
seen_string |= e.is_string();
|
|
93
|
+
seen_number |= e.is_number_integer();
|
|
94
|
+
if (seen_number && seen_string) {
|
|
95
|
+
return true;
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
return false;
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
/**
|
|
103
|
+
* this handles 2 cases:
|
|
104
|
+
* - only string, example: "string"
|
|
105
|
+
* - mixed string and tokens, example: [12, 34, "string", 56, 78]
|
|
106
|
+
*/
|
|
107
|
+
static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
|
|
108
|
+
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
|
|
109
|
+
// or the first element of the json_prompt array is a string.
|
|
110
|
+
llama_tokens prompt_tokens;
|
|
111
|
+
|
|
112
|
+
if (json_prompt.is_array()) {
|
|
113
|
+
bool first = true;
|
|
114
|
+
for (const auto & p : json_prompt) {
|
|
115
|
+
if (p.is_string()) {
|
|
116
|
+
auto s = p.template get<std::string>();
|
|
117
|
+
|
|
118
|
+
llama_tokens p;
|
|
119
|
+
if (first) {
|
|
120
|
+
p = common_tokenize(ctx, s, add_special, parse_special);
|
|
121
|
+
first = false;
|
|
122
|
+
} else {
|
|
123
|
+
p = common_tokenize(ctx, s, false, parse_special);
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
|
|
127
|
+
} else {
|
|
128
|
+
if (first) {
|
|
129
|
+
first = false;
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
prompt_tokens.push_back(p.template get<llama_token>());
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
} else {
|
|
136
|
+
auto s = json_prompt.template get<std::string>();
|
|
137
|
+
prompt_tokens = common_tokenize(ctx, s, add_special, parse_special);
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
return prompt_tokens;
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
/**
|
|
144
|
+
* break the input "prompt" object into multiple prompt if needed, then tokenize them
|
|
145
|
+
* this supports these cases:
|
|
146
|
+
* - "prompt": "string"
|
|
147
|
+
* - "prompt": [12, 34, 56]
|
|
148
|
+
* - "prompt": [12, 34, "string", 56, 78]
|
|
149
|
+
* and multiple prompts (multi-tasks):
|
|
150
|
+
* - "prompt": ["string1", "string2"]
|
|
151
|
+
* - "prompt": ["string1", [12, 34, 56]]
|
|
152
|
+
* - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
|
|
153
|
+
*/
|
|
154
|
+
static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
|
|
155
|
+
std::vector<llama_tokens> result;
|
|
156
|
+
if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
|
|
157
|
+
// string or mixed
|
|
158
|
+
result.push_back(tokenize_mixed(ctx, json_prompt, add_special, parse_special));
|
|
159
|
+
} else if (json_is_array_of_numbers(json_prompt)) {
|
|
160
|
+
// array of tokens
|
|
161
|
+
result.push_back(json_prompt.get<llama_tokens>());
|
|
162
|
+
} else if (json_prompt.is_array()) {
|
|
163
|
+
// array of prompts
|
|
164
|
+
result.reserve(json_prompt.size());
|
|
165
|
+
for (const auto & p : json_prompt) {
|
|
166
|
+
if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
|
|
167
|
+
result.push_back(tokenize_mixed(ctx, p, add_special, parse_special));
|
|
168
|
+
} else if (json_is_array_of_numbers(p)) {
|
|
169
|
+
// array of tokens
|
|
170
|
+
result.push_back(p.get<llama_tokens>());
|
|
171
|
+
} else {
|
|
172
|
+
throw std::runtime_error("element of \"prompt\" must be a string, an list of tokens, or a list of mixed strings & tokens");
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
} else {
|
|
176
|
+
throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts");
|
|
177
|
+
}
|
|
178
|
+
return result;
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
//
|
|
182
|
+
// template utils
|
|
183
|
+
//
|
|
184
|
+
|
|
185
|
+
// format rerank task: [BOS]query[EOS][SEP]doc[EOS]
|
|
186
|
+
static llama_tokens format_rerank(const struct llama_model * model, const llama_tokens & query, const llama_tokens & doc) {
|
|
187
|
+
llama_tokens result;
|
|
188
|
+
result.reserve(doc.size() + query.size() + 4);
|
|
189
|
+
result.push_back(llama_token_bos(model));
|
|
190
|
+
result.insert(result.end(), query.begin(), query.end());
|
|
191
|
+
result.push_back(llama_token_eos(model));
|
|
192
|
+
result.push_back(llama_token_sep(model));
|
|
193
|
+
result.insert(result.end(), doc.begin(), doc.end());
|
|
194
|
+
result.push_back(llama_token_eos(model));
|
|
195
|
+
return result;
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
// format infill task
|
|
199
|
+
static llama_tokens format_infill(
|
|
200
|
+
const llama_context * ctx,
|
|
201
|
+
const json & input_prefix,
|
|
202
|
+
const json & input_suffix,
|
|
203
|
+
const json & input_extra,
|
|
204
|
+
const int n_batch,
|
|
205
|
+
const int n_predict,
|
|
206
|
+
const int n_ctx,
|
|
207
|
+
const bool spm_infill,
|
|
208
|
+
const llama_tokens & tokens_prompt
|
|
209
|
+
) {
|
|
210
|
+
// TODO: optimize this block by reducing memory allocations and movement
|
|
211
|
+
|
|
212
|
+
// use FIM repo-level pattern:
|
|
213
|
+
// ref: https://arxiv.org/pdf/2409.12186
|
|
214
|
+
//
|
|
215
|
+
// [FIM_REP]myproject
|
|
216
|
+
// [FIM_SEP]filename0
|
|
217
|
+
// extra chunk 0
|
|
218
|
+
// [FIM_SEP]filename1
|
|
219
|
+
// extra chunk 1
|
|
220
|
+
// ...
|
|
221
|
+
// [FIM_SEP]filename
|
|
222
|
+
// [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
|
|
223
|
+
//
|
|
224
|
+
llama_tokens extra_tokens;
|
|
225
|
+
extra_tokens.reserve(n_ctx);
|
|
226
|
+
|
|
227
|
+
auto model = llama_get_model(ctx);
|
|
228
|
+
auto tokens_prefix = tokenize_mixed(ctx, input_prefix, false, false);
|
|
229
|
+
auto tokens_suffix = tokenize_mixed(ctx, input_suffix, false, false);
|
|
230
|
+
|
|
231
|
+
if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
|
|
232
|
+
// TODO: make project name an input
|
|
233
|
+
static const auto k_fim_repo = common_tokenize(ctx, "myproject\n", false, false);
|
|
234
|
+
|
|
235
|
+
extra_tokens.push_back(llama_token_fim_rep(model));
|
|
236
|
+
extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
|
|
237
|
+
}
|
|
238
|
+
for (const auto & chunk : input_extra) {
|
|
239
|
+
// { "text": string, "filename": string }
|
|
240
|
+
const std::string text = json_value(chunk, "text", std::string());
|
|
241
|
+
const std::string filename = json_value(chunk, "filename", std::string("tmp"));
|
|
242
|
+
|
|
243
|
+
if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
|
|
244
|
+
const auto k_fim_file = common_tokenize(ctx, filename + "\n", false, false);
|
|
245
|
+
|
|
246
|
+
extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
|
|
247
|
+
extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
|
|
248
|
+
} else {
|
|
249
|
+
// chunk separator in binary form to avoid confusing the AI
|
|
250
|
+
static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
|
|
251
|
+
static const auto k_chunk_prefix_tokens = common_tokenize(ctx, k_chunk_prefix_str, false, false);
|
|
252
|
+
|
|
253
|
+
extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
const auto chunk_tokens = common_tokenize(ctx, text, false, false);
|
|
257
|
+
extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
|
|
261
|
+
// TODO: current filename
|
|
262
|
+
static const auto k_fim_file = common_tokenize(ctx, "filename\n", false, false);
|
|
263
|
+
|
|
264
|
+
extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
|
|
265
|
+
extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
// for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
|
|
269
|
+
const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4));
|
|
270
|
+
const int n_suffix_take = std::min<int>(tokens_suffix.size(), std::max<int>(0, (n_batch/4) - (2 + tokens_prompt.size())));
|
|
271
|
+
|
|
272
|
+
SRV_DBG("n_prefix_take = %d, n_suffix_take = %d, total = %d\n", n_prefix_take, n_suffix_take, (n_prefix_take + n_suffix_take));
|
|
273
|
+
|
|
274
|
+
// fill the rest of the context with extra chunks
|
|
275
|
+
const int n_extra_take = std::min<int>(std::max<int>(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size());
|
|
276
|
+
|
|
277
|
+
tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
|
|
278
|
+
tokens_suffix.resize(n_suffix_take);
|
|
279
|
+
|
|
280
|
+
tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model));
|
|
281
|
+
tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end());
|
|
282
|
+
tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model));
|
|
283
|
+
|
|
284
|
+
auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix;
|
|
285
|
+
auto embd_end = spm_infill ? tokens_prefix : tokens_suffix;
|
|
286
|
+
|
|
287
|
+
if (llama_add_bos_token(model)) {
|
|
288
|
+
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size());
|
|
292
|
+
|
|
293
|
+
// put the extra context before the FIM prefix
|
|
294
|
+
embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end());
|
|
295
|
+
|
|
296
|
+
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
|
297
|
+
embd_inp.push_back(llama_token_fim_mid(model));
|
|
298
|
+
|
|
299
|
+
return embd_inp;
|
|
300
|
+
}
|
|
301
|
+
|
|
58
302
|
// Format given chat. If tmpl is empty, we take the template from model metadata
|
|
59
303
|
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
|
|
60
|
-
std::vector<
|
|
304
|
+
std::vector<common_chat_msg> chat;
|
|
61
305
|
|
|
62
306
|
for (size_t i = 0; i < messages.size(); ++i) {
|
|
63
307
|
const auto & curr_msg = messages[i];
|
|
@@ -84,12 +328,25 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
|
|
|
84
328
|
chat.push_back({role, content});
|
|
85
329
|
}
|
|
86
330
|
|
|
87
|
-
const auto formatted_chat =
|
|
331
|
+
const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
|
|
88
332
|
LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
|
|
89
333
|
|
|
90
334
|
return formatted_chat;
|
|
91
335
|
}
|
|
92
336
|
|
|
337
|
+
// Fetch the chat template stored in the model metadata under "tokenizer.chat_template".
// Returns an empty string when the metadata lookup reports failure (negative result).
static std::string llama_get_chat_template(const struct llama_model * model) {
    const std::string template_key = "tokenizer.chat_template";

    // call with a null buffer to get the total size of the string
    const int32_t res = llama_model_meta_val_str(model, template_key.c_str(), nullptr, 0);
    if (res < 0) {
        return "";
    }

    // llama.h documents the return value as the string length and the output as always
    // null-terminated, so the buffer needs one extra byte for the terminator; with only
    // `res` bytes the last character would be cut off
    std::vector<char> model_template(static_cast<size_t>(res) + 1, 0);
    llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());

    // exclude the terminator from the returned string
    return std::string(model_template.data(), model_template.size() - 1);
}
|
|
349
|
+
|
|
93
350
|
//
|
|
94
351
|
// base64 utils (TODO: move to common in the future)
|
|
95
352
|
//
|
|
@@ -182,18 +439,60 @@ static std::string gen_chatcmplid() {
|
|
|
182
439
|
// other common utils
|
|
183
440
|
//
|
|
184
441
|
|
|
185
|
-
static size_t
|
|
442
|
+
static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) {
|
|
186
443
|
size_t i;
|
|
187
444
|
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
|
|
188
445
|
|
|
189
446
|
return i;
|
|
190
447
|
}
|
|
191
448
|
|
|
192
|
-
static size_t
|
|
193
|
-
|
|
194
|
-
|
|
449
|
+
static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) {
|
|
450
|
+
// check for empty sequences
|
|
451
|
+
if (a.empty() || b.empty()) {
|
|
452
|
+
return 0;
|
|
453
|
+
}
|
|
195
454
|
|
|
196
|
-
|
|
455
|
+
// get the lengths of the input sequences
|
|
456
|
+
size_t a_len = a.size();
|
|
457
|
+
size_t b_len = b.size();
|
|
458
|
+
|
|
459
|
+
// initialize the maximum length of the longest common subsequence (LCS)
|
|
460
|
+
size_t max_length = 0;
|
|
461
|
+
|
|
462
|
+
// use two rows instead of a 2D matrix to optimize space
|
|
463
|
+
std::vector<size_t> prev_row(b_len + 1, 0);
|
|
464
|
+
std::vector<size_t> curr_row(b_len + 1, 0);
|
|
465
|
+
|
|
466
|
+
// iterate through the elements of a
|
|
467
|
+
for (size_t i = 1; i <= a_len; i++) {
|
|
468
|
+
// iterate through the elements of b
|
|
469
|
+
for (size_t j = 1; j <= b_len; j++) {
|
|
470
|
+
// if elements at the current positions match
|
|
471
|
+
if (a[i - 1] == b[j - 1]) {
|
|
472
|
+
// if it's the first element of either sequences, set LCS length to 1
|
|
473
|
+
if (i == 1 || j == 1) {
|
|
474
|
+
curr_row[j] = 1;
|
|
475
|
+
} else {
|
|
476
|
+
// increment LCS length by 1 compared to the previous element
|
|
477
|
+
curr_row[j] = prev_row[j - 1] + 1;
|
|
478
|
+
}
|
|
479
|
+
|
|
480
|
+
// update max_length if necessary
|
|
481
|
+
if (curr_row[j] > max_length) {
|
|
482
|
+
max_length = curr_row[j];
|
|
483
|
+
}
|
|
484
|
+
} else {
|
|
485
|
+
// reset LCS length if elements don't match
|
|
486
|
+
curr_row[j] = 0;
|
|
487
|
+
}
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
// update the previous row for the next iteration
|
|
491
|
+
prev_row = curr_row;
|
|
492
|
+
}
|
|
493
|
+
|
|
494
|
+
// return the maximum length of the LCS
|
|
495
|
+
return max_length;
|
|
197
496
|
}
|
|
198
497
|
|
|
199
498
|
static bool ends_with(const std::string & str, const std::string & suffix) {
|
|
@@ -216,24 +515,12 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin
|
|
|
216
515
|
return std::string::npos;
|
|
217
516
|
}
|
|
218
517
|
|
|
219
|
-
static bool json_is_array_of_numbers(const json & data) {
|
|
220
|
-
if (data.is_array()) {
|
|
221
|
-
for (const auto & e : data) {
|
|
222
|
-
if (!e.is_number()) {
|
|
223
|
-
return false;
|
|
224
|
-
}
|
|
225
|
-
}
|
|
226
|
-
return true;
|
|
227
|
-
}
|
|
228
|
-
return false;
|
|
229
|
-
}
|
|
230
|
-
|
|
231
518
|
// TODO: reuse llama_detokenize
|
|
232
519
|
template <class Iter>
|
|
233
520
|
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
|
|
234
521
|
std::string ret;
|
|
235
522
|
for (; begin != end; ++begin) {
|
|
236
|
-
ret +=
|
|
523
|
+
ret += common_token_to_piece(ctx, *begin);
|
|
237
524
|
}
|
|
238
525
|
|
|
239
526
|
return ret;
|
|
@@ -241,7 +528,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
|
|
|
241
528
|
|
|
242
529
|
// format incomplete utf-8 multibyte character for output
|
|
243
530
|
static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
|
|
244
|
-
std::string out = token == -1 ? "" :
|
|
531
|
+
std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
|
|
245
532
|
|
|
246
533
|
// if the size is 1 and first bit is 1, meaning it's a partial character
|
|
247
534
|
// (size > 1 meaning it's already a known token)
|
|
@@ -347,9 +634,9 @@ static json oaicompat_completion_params_parse(
|
|
|
347
634
|
|
|
348
635
|
// Handle "logprobs" field
|
|
349
636
|
// TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
|
|
350
|
-
if (body
|
|
637
|
+
if (json_value(body, "logprobs", false)) {
|
|
351
638
|
llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
|
|
352
|
-
} else if (body.contains("top_logprobs")) {
|
|
639
|
+
} else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
|
|
353
640
|
throw std::runtime_error("top_logprobs requires logprobs to be set to true");
|
|
354
641
|
}
|
|
355
642
|
|
|
@@ -362,7 +649,7 @@ static json oaicompat_completion_params_parse(
|
|
|
362
649
|
}
|
|
363
650
|
|
|
364
651
|
// Copy remaining properties to llama_params
|
|
365
|
-
// This allows user to use llama.cpp-specific params like "mirostat",
|
|
652
|
+
// This allows user to use llama.cpp-specific params like "mirostat", ... via OAI endpoint.
|
|
366
653
|
// See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
|
|
367
654
|
for (const auto & item : body.items()) {
|
|
368
655
|
// Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
set(TARGET llama-simple)
|
|
2
2
|
add_executable(${TARGET} simple.cpp)
|
|
3
3
|
install(TARGETS ${TARGET} RUNTIME)
|
|
4
|
-
target_link_libraries(${TARGET} PRIVATE
|
|
4
|
+
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
|
5
5
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|