@fugood/llama.node 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190)
  1. package/CMakeLists.txt +2 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +1 -1
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +8 -8
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +8 -9
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +4 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +43 -9
  25. package/src/llama.cpp/.github/workflows/docker.yml +3 -0
  26. package/src/llama.cpp/CMakeLists.txt +7 -4
  27. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  28. package/src/llama.cpp/common/CMakeLists.txt +0 -2
  29. package/src/llama.cpp/common/arg.cpp +642 -607
  30. package/src/llama.cpp/common/arg.h +22 -22
  31. package/src/llama.cpp/common/common.cpp +79 -281
  32. package/src/llama.cpp/common/common.h +130 -100
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  34. package/src/llama.cpp/common/log.cpp +50 -50
  35. package/src/llama.cpp/common/log.h +18 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  37. package/src/llama.cpp/common/ngram-cache.h +19 -19
  38. package/src/llama.cpp/common/sampling.cpp +116 -108
  39. package/src/llama.cpp/common/sampling.h +20 -20
  40. package/src/llama.cpp/docs/build.md +37 -17
  41. package/src/llama.cpp/examples/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +14 -14
  43. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  47. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  48. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  49. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  50. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  51. package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
  52. package/src/llama.cpp/examples/infill/infill.cpp +40 -86
  53. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
  54. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  55. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  56. package/src/llama.cpp/examples/llava/clip.cpp +1 -0
  57. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  58. package/src/llama.cpp/examples/llava/llava.cpp +37 -3
  59. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  60. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  61. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  62. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  63. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
  64. package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
  65. package/src/llama.cpp/examples/main/main.cpp +64 -109
  66. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  67. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  68. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  69. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  70. package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
  71. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  72. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
  73. package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
  74. package/src/llama.cpp/examples/server/server.cpp +553 -691
  75. package/src/llama.cpp/examples/server/utils.hpp +312 -25
  76. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  77. package/src/llama.cpp/examples/simple/simple.cpp +128 -96
  78. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  79. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  80. package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
  81. package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
  82. package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
  83. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  84. package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
  85. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  86. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  87. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  88. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  89. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  90. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  91. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  92. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  93. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  94. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  95. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  96. package/src/llama.cpp/ggml/include/ggml.h +53 -393
  97. package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
  98. package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
  99. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  100. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
  101. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  102. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  103. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  104. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  105. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  106. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
  107. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  108. package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
  109. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  110. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
  111. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  112. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
  113. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  114. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  115. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  116. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
  117. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  118. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  120. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  121. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
  122. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  123. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  124. package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
  125. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  126. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
  127. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  128. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  129. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  130. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  131. package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
  132. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  133. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  134. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
  135. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  136. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  137. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
  138. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
  141. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  142. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  143. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
  144. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
  145. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
  146. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  148. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  149. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  150. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  151. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  152. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  153. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  154. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  155. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
  156. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
  157. package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
  158. package/src/llama.cpp/include/llama.h +67 -33
  159. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  160. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  161. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  162. package/src/llama.cpp/src/llama-sampling.cpp +745 -105
  163. package/src/llama.cpp/src/llama-sampling.h +21 -2
  164. package/src/llama.cpp/src/llama-vocab.cpp +49 -9
  165. package/src/llama.cpp/src/llama-vocab.h +35 -11
  166. package/src/llama.cpp/src/llama.cpp +2636 -2406
  167. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  168. package/src/llama.cpp/tests/CMakeLists.txt +1 -2
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
  171. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  172. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  173. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  174. package/src/llama.cpp/tests/test-log.cpp +2 -2
  175. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  176. package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
  177. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  178. package/src/llama.cpp/tests/test-rope.cpp +1 -0
  179. package/src/llama.cpp/tests/test-sampling.cpp +162 -137
  180. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  181. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  182. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  183. package/src/llama.cpp/common/train.cpp +0 -1515
  184. package/src/llama.cpp/common/train.h +0 -233
  185. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  186. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  187. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  188. package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  189. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
  190. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
@@ -24,6 +24,22 @@
24
24
  #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
25
25
 
26
26
  using json = nlohmann::ordered_json;
27
+ using llama_tokens = std::vector<llama_token>;
28
+
29
+ #define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
30
+ #define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
31
+ #define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
32
+ #define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
33
+
34
+ #define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
35
+ #define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
36
+ #define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
37
+ #define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
38
+
39
+ #define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
40
+ #define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
41
+ #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
42
+ #define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
27
43
 
28
44
  // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
29
45
  enum error_type {
@@ -52,12 +68,240 @@ static T json_value(const json & body, const std::string & key, const T & defaul
52
68
  }
53
69
 
54
70
  //
55
- // chat template utils
71
+ // tokenizer and input processing utils
56
72
  //
57
73
 
74
+ static bool json_is_array_of_numbers(const json & data) {
75
+ if (data.is_array()) {
76
+ for (const auto & e : data) {
77
+ if (!e.is_number_integer()) {
78
+ return false;
79
+ }
80
+ }
81
+ return true;
82
+ }
83
+ return false;
84
+ }
85
+
86
+ // does the array contain BOTH numbers & strings?
87
+ static bool json_is_array_of_mixed_numbers_strings(const json & data) {
88
+ bool seen_string = false;
89
+ bool seen_number = false;
90
+ if (data.is_array()) {
91
+ for (const auto & e : data) {
92
+ seen_string |= e.is_string();
93
+ seen_number |= e.is_number_integer();
94
+ if (seen_number && seen_string) {
95
+ return true;
96
+ }
97
+ }
98
+ }
99
+ return false;
100
+ }
101
+
102
+ /**
103
+ * this handles 2 cases:
104
+ * - only string, example: "string"
105
+ * - mixed string and tokens, example: [12, 34, "string", 56, 78]
106
+ */
107
+ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
108
+ // If `add_special` is true, special tokens (e.g. BOS) are only added when json_prompt is a string,
109
+ // or the first element of the json_prompt array is a string.
110
+ llama_tokens prompt_tokens;
111
+
112
+ if (json_prompt.is_array()) {
113
+ bool first = true;
114
+ for (const auto & p : json_prompt) {
115
+ if (p.is_string()) {
116
+ auto s = p.template get<std::string>();
117
+
118
+ llama_tokens p;
119
+ if (first) {
120
+ p = common_tokenize(ctx, s, add_special, parse_special);
121
+ first = false;
122
+ } else {
123
+ p = common_tokenize(ctx, s, false, parse_special);
124
+ }
125
+
126
+ prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
127
+ } else {
128
+ if (first) {
129
+ first = false;
130
+ }
131
+
132
+ prompt_tokens.push_back(p.template get<llama_token>());
133
+ }
134
+ }
135
+ } else {
136
+ auto s = json_prompt.template get<std::string>();
137
+ prompt_tokens = common_tokenize(ctx, s, add_special, parse_special);
138
+ }
139
+
140
+ return prompt_tokens;
141
+ }
142
+
143
+ /**
144
+ * break the input "prompt" object into multiple prompts if needed, then tokenize them
145
+ * this supports these cases:
146
+ * - "prompt": "string"
147
+ * - "prompt": [12, 34, 56]
148
+ * - "prompt": [12, 34, "string", 56, 78]
149
+ * and multiple prompts (multi-tasks):
150
+ * - "prompt": ["string1", "string2"]
151
+ * - "prompt": ["string1", [12, 34, 56]]
152
+ * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
153
+ */
154
+ static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
155
+ std::vector<llama_tokens> result;
156
+ if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
157
+ // string or mixed
158
+ result.push_back(tokenize_mixed(ctx, json_prompt, add_special, parse_special));
159
+ } else if (json_is_array_of_numbers(json_prompt)) {
160
+ // array of tokens
161
+ result.push_back(json_prompt.get<llama_tokens>());
162
+ } else if (json_prompt.is_array()) {
163
+ // array of prompts
164
+ result.reserve(json_prompt.size());
165
+ for (const auto & p : json_prompt) {
166
+ if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
167
+ result.push_back(tokenize_mixed(ctx, p, add_special, parse_special));
168
+ } else if (json_is_array_of_numbers(p)) {
169
+ // array of tokens
170
+ result.push_back(p.get<llama_tokens>());
171
+ } else {
172
+ throw std::runtime_error("element of \"prompt\" must be a string, an list of tokens, or a list of mixed strings & tokens");
173
+ }
174
+ }
175
+ } else {
176
+ throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts");
177
+ }
178
+ return result;
179
+ }
180
+
181
+ //
182
+ // template utils
183
+ //
184
+
185
+ // format rerank task: [BOS]query[EOS][SEP]doc[EOS]
186
+ static llama_tokens format_rerank(const struct llama_model * model, const llama_tokens & query, const llama_tokens & doc) {
187
+ llama_tokens result;
188
+ result.reserve(doc.size() + query.size() + 4);
189
+ result.push_back(llama_token_bos(model));
190
+ result.insert(result.end(), query.begin(), query.end());
191
+ result.push_back(llama_token_eos(model));
192
+ result.push_back(llama_token_sep(model));
193
+ result.insert(result.end(), doc.begin(), doc.end());
194
+ result.push_back(llama_token_eos(model));
195
+ return result;
196
+ }
197
+
198
+ // format infill task
199
+ static llama_tokens format_infill(
200
+ const llama_context * ctx,
201
+ const json & input_prefix,
202
+ const json & input_suffix,
203
+ const json & input_extra,
204
+ const int n_batch,
205
+ const int n_predict,
206
+ const int n_ctx,
207
+ const bool spm_infill,
208
+ const llama_tokens & tokens_prompt
209
+ ) {
210
+ // TODO: optimize this block by reducing memory allocations and movement
211
+
212
+ // use FIM repo-level pattern:
213
+ // ref: https://arxiv.org/pdf/2409.12186
214
+ //
215
+ // [FIM_REP]myproject
216
+ // [FIM_SEP]filename0
217
+ // extra chunk 0
218
+ // [FIM_SEP]filename1
219
+ // extra chunk 1
220
+ // ...
221
+ // [FIM_SEP]filename
222
+ // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
223
+ //
224
+ llama_tokens extra_tokens;
225
+ extra_tokens.reserve(n_ctx);
226
+
227
+ auto model = llama_get_model(ctx);
228
+ auto tokens_prefix = tokenize_mixed(ctx, input_prefix, false, false);
229
+ auto tokens_suffix = tokenize_mixed(ctx, input_suffix, false, false);
230
+
231
+ if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
232
+ // TODO: make project name an input
233
+ static const auto k_fim_repo = common_tokenize(ctx, "myproject\n", false, false);
234
+
235
+ extra_tokens.push_back(llama_token_fim_rep(model));
236
+ extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
237
+ }
238
+ for (const auto & chunk : input_extra) {
239
+ // { "text": string, "filename": string }
240
+ const std::string text = json_value(chunk, "text", std::string());
241
+ const std::string filename = json_value(chunk, "filename", std::string("tmp"));
242
+
243
+ if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
244
+ const auto k_fim_file = common_tokenize(ctx, filename + "\n", false, false);
245
+
246
+ extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
247
+ extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
248
+ } else {
249
+ // chunk separator in binary form to avoid confusing the AI
250
+ static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
251
+ static const auto k_chunk_prefix_tokens = common_tokenize(ctx, k_chunk_prefix_str, false, false);
252
+
253
+ extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
254
+ }
255
+
256
+ const auto chunk_tokens = common_tokenize(ctx, text, false, false);
257
+ extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
258
+ }
259
+
260
+ if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
261
+ // TODO: current filename
262
+ static const auto k_fim_file = common_tokenize(ctx, "filename\n", false, false);
263
+
264
+ extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
265
+ extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
266
+ }
267
+
268
+ // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
269
+ const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4));
270
+ const int n_suffix_take = std::min<int>(tokens_suffix.size(), std::max<int>(0, (n_batch/4) - (2 + tokens_prompt.size())));
271
+
272
+ SRV_DBG("n_prefix_take = %d, n_suffix_take = %d, total = %d\n", n_prefix_take, n_suffix_take, (n_prefix_take + n_suffix_take));
273
+
274
+ // fill the rest of the context with extra chunks
275
+ const int n_extra_take = std::min<int>(std::max<int>(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size());
276
+
277
+ tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
278
+ tokens_suffix.resize(n_suffix_take);
279
+
280
+ tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model));
281
+ tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end());
282
+ tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model));
283
+
284
+ auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix;
285
+ auto embd_end = spm_infill ? tokens_prefix : tokens_suffix;
286
+
287
+ if (llama_add_bos_token(model)) {
288
+ embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
289
+ }
290
+
291
+ SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size());
292
+
293
+ // put the extra context before the FIM prefix
294
+ embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end());
295
+
296
+ embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
297
+ embd_inp.push_back(llama_token_fim_mid(model));
298
+
299
+ return embd_inp;
300
+ }
301
+
58
302
  // Format given chat. If tmpl is empty, we take the template from model metadata
59
303
  inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
60
- std::vector<llama_chat_msg> chat;
304
+ std::vector<common_chat_msg> chat;
61
305
 
62
306
  for (size_t i = 0; i < messages.size(); ++i) {
63
307
  const auto & curr_msg = messages[i];
@@ -84,12 +328,25 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
84
328
  chat.push_back({role, content});
85
329
  }
86
330
 
87
- const auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
331
+ const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
88
332
  LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
89
333
 
90
334
  return formatted_chat;
91
335
  }
92
336
 
337
+ static std::string llama_get_chat_template(const struct llama_model * model) {
338
+ std::string template_key = "tokenizer.chat_template";
339
+ // call with NULL buffer to get the total size of the string
340
+ int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
341
+ if (res < 0) {
342
+ return "";
343
+ } else {
344
+ std::vector<char> model_template(res, 0);
345
+ llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
346
+ return std::string(model_template.data(), model_template.size());
347
+ }
348
+ }
349
+
93
350
  //
94
351
  // base64 utils (TODO: move to common in the future)
95
352
  //
@@ -182,18 +439,60 @@ static std::string gen_chatcmplid() {
182
439
  // other common utils
183
440
  //
184
441
 
185
- static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
442
+ static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) {
186
443
  size_t i;
187
444
  for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
188
445
 
189
446
  return i;
190
447
  }
191
448
 
192
- static size_t common_part(const std::string & a, const std::string & b) {
193
- size_t i;
194
- for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
449
+ static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) {
450
+ // check for empty sequences
451
+ if (a.empty() || b.empty()) {
452
+ return 0;
453
+ }
195
454
 
196
- return i;
455
+ // get the lengths of the input sequences
456
+ size_t a_len = a.size();
457
+ size_t b_len = b.size();
458
+
459
+ // initialize the maximum length of the longest common subsequence (LCS)
460
+ size_t max_length = 0;
461
+
462
+ // use two rows instead of a 2D matrix to optimize space
463
+ std::vector<size_t> prev_row(b_len + 1, 0);
464
+ std::vector<size_t> curr_row(b_len + 1, 0);
465
+
466
+ // iterate through the elements of a
467
+ for (size_t i = 1; i <= a_len; i++) {
468
+ // iterate through the elements of b
469
+ for (size_t j = 1; j <= b_len; j++) {
470
+ // if elements at the current positions match
471
+ if (a[i - 1] == b[j - 1]) {
472
+ // if it's the first element of either sequences, set LCS length to 1
473
+ if (i == 1 || j == 1) {
474
+ curr_row[j] = 1;
475
+ } else {
476
+ // increment LCS length by 1 compared to the previous element
477
+ curr_row[j] = prev_row[j - 1] + 1;
478
+ }
479
+
480
+ // update max_length if necessary
481
+ if (curr_row[j] > max_length) {
482
+ max_length = curr_row[j];
483
+ }
484
+ } else {
485
+ // reset LCS length if elements don't match
486
+ curr_row[j] = 0;
487
+ }
488
+ }
489
+
490
+ // update the previous row for the next iteration
491
+ prev_row = curr_row;
492
+ }
493
+
494
+ // return the maximum length of the LCS
495
+ return max_length;
197
496
  }
198
497
 
199
498
  static bool ends_with(const std::string & str, const std::string & suffix) {
@@ -216,24 +515,12 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin
216
515
  return std::string::npos;
217
516
  }
218
517
 
219
- static bool json_is_array_of_numbers(const json & data) {
220
- if (data.is_array()) {
221
- for (const auto & e : data) {
222
- if (!e.is_number()) {
223
- return false;
224
- }
225
- }
226
- return true;
227
- }
228
- return false;
229
- }
230
-
231
518
  // TODO: reuse llama_detokenize
232
519
  template <class Iter>
233
520
  static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
234
521
  std::string ret;
235
522
  for (; begin != end; ++begin) {
236
- ret += llama_token_to_piece(ctx, *begin);
523
+ ret += common_token_to_piece(ctx, *begin);
237
524
  }
238
525
 
239
526
  return ret;
@@ -241,7 +528,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
241
528
 
242
529
  // format incomplete utf-8 multibyte character for output
243
530
  static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
244
- std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
531
+ std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
245
532
 
246
533
  // if the size is 1 and first bit is 1, meaning it's a partial character
247
534
  // (size > 1 meaning it's already a known token)
@@ -347,9 +634,9 @@ static json oaicompat_completion_params_parse(
347
634
 
348
635
  // Handle "logprobs" field
349
636
  // TODO: The response format of this option is not yet OAI-compatible, but it seems no one is really using it; we may need to fix it in the future
350
- if (body.contains("logprobs")) {
637
+ if (json_value(body, "logprobs", false)) {
351
638
  llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
352
- } else if (body.contains("top_logprobs")) {
639
+ } else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
353
640
  throw std::runtime_error("top_logprobs requires logprobs to be set to true");
354
641
  }
355
642
 
@@ -362,7 +649,7 @@ static json oaicompat_completion_params_parse(
362
649
  }
363
650
 
364
651
  // Copy remaining properties to llama_params
365
- // This allows user to use llama.cpp-specific params like "mirostat", "tfs_z",... via OAI endpoint.
652
+ // This allows user to use llama.cpp-specific params like "mirostat", ... via OAI endpoint.
366
653
  // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
367
654
  for (const auto & item : body.items()) {
368
655
  // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
@@ -1,5 +1,5 @@
1
1
  set(TARGET llama-simple)
2
2
  add_executable(${TARGET} simple.cpp)
3
3
  install(TARGETS ${TARGET} RUNTIME)
4
- target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
4
+ target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
5
5
  target_compile_features(${TARGET} PRIVATE cxx_std_11)