@fugood/llama.node 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204) hide show
  1. package/CMakeLists.txt +85 -0
  2. package/README.md +56 -0
  3. package/bin/darwin/arm64/llama-node.node +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/win32/arm64/llama-node.node +0 -0
  8. package/bin/win32/arm64/node.lib +0 -0
  9. package/bin/win32/x64/llama-node.node +0 -0
  10. package/bin/win32/x64/node.lib +0 -0
  11. package/lib/binding.js +13 -0
  12. package/lib/binding.ts +57 -0
  13. package/lib/index.js +24 -0
  14. package/lib/index.ts +13 -0
  15. package/package.json +65 -0
  16. package/src/addons.cpp +506 -0
  17. package/src/llama.cpp/CMakeLists.txt +1320 -0
  18. package/src/llama.cpp/build.zig +172 -0
  19. package/src/llama.cpp/cmake/FindSIMD.cmake +100 -0
  20. package/src/llama.cpp/common/CMakeLists.txt +87 -0
  21. package/src/llama.cpp/common/base64.hpp +392 -0
  22. package/src/llama.cpp/common/common.cpp +2949 -0
  23. package/src/llama.cpp/common/common.h +324 -0
  24. package/src/llama.cpp/common/console.cpp +501 -0
  25. package/src/llama.cpp/common/console.h +19 -0
  26. package/src/llama.cpp/common/grammar-parser.cpp +440 -0
  27. package/src/llama.cpp/common/grammar-parser.h +29 -0
  28. package/src/llama.cpp/common/json-schema-to-grammar.cpp +764 -0
  29. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  30. package/src/llama.cpp/common/json.hpp +24766 -0
  31. package/src/llama.cpp/common/log.h +724 -0
  32. package/src/llama.cpp/common/ngram-cache.cpp +282 -0
  33. package/src/llama.cpp/common/ngram-cache.h +94 -0
  34. package/src/llama.cpp/common/sampling.cpp +353 -0
  35. package/src/llama.cpp/common/sampling.h +147 -0
  36. package/src/llama.cpp/common/stb_image.h +8396 -0
  37. package/src/llama.cpp/common/train.cpp +1513 -0
  38. package/src/llama.cpp/common/train.h +233 -0
  39. package/src/llama.cpp/examples/CMakeLists.txt +52 -0
  40. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +5 -0
  41. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1640 -0
  42. package/src/llama.cpp/examples/batched/CMakeLists.txt +5 -0
  43. package/src/llama.cpp/examples/batched/batched.cpp +262 -0
  44. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +5 -0
  45. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +261 -0
  46. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +5 -0
  47. package/src/llama.cpp/examples/beam-search/beam-search.cpp +188 -0
  48. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +6 -0
  49. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +275 -0
  50. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +5 -0
  51. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +936 -0
  52. package/src/llama.cpp/examples/embedding/CMakeLists.txt +5 -0
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +211 -0
  54. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +9 -0
  55. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +195 -0
  56. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +5 -0
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +462 -0
  58. package/src/llama.cpp/examples/finetune/CMakeLists.txt +5 -0
  59. package/src/llama.cpp/examples/finetune/finetune.cpp +1861 -0
  60. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +5 -0
  61. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +132 -0
  62. package/src/llama.cpp/examples/gguf/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/gguf/gguf.cpp +256 -0
  64. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +5 -0
  65. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +553 -0
  66. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +5 -0
  67. package/src/llama.cpp/examples/gritlm/gritlm.cpp +215 -0
  68. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +5 -0
  69. package/src/llama.cpp/examples/imatrix/imatrix.cpp +655 -0
  70. package/src/llama.cpp/examples/infill/CMakeLists.txt +5 -0
  71. package/src/llama.cpp/examples/infill/infill.cpp +767 -0
  72. package/src/llama.cpp/examples/jeopardy/questions.txt +100 -0
  73. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +5 -0
  74. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +1286 -0
  75. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/CMakeLists.txt +50 -0
  76. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/llama-android.cpp +443 -0
  77. package/src/llama.cpp/examples/llava/CMakeLists.txt +37 -0
  78. package/src/llama.cpp/examples/llava/clip.cpp +2027 -0
  79. package/src/llama.cpp/examples/llava/clip.h +85 -0
  80. package/src/llama.cpp/examples/llava/llava-cli.cpp +309 -0
  81. package/src/llama.cpp/examples/llava/llava.cpp +426 -0
  82. package/src/llama.cpp/examples/llava/llava.h +50 -0
  83. package/src/llama.cpp/examples/llava/requirements.txt +3 -0
  84. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +5 -0
  85. package/src/llama.cpp/examples/lookahead/lookahead.cpp +485 -0
  86. package/src/llama.cpp/examples/lookup/CMakeLists.txt +23 -0
  87. package/src/llama.cpp/examples/lookup/lookup-create.cpp +41 -0
  88. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +47 -0
  89. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +160 -0
  90. package/src/llama.cpp/examples/lookup/lookup.cpp +258 -0
  91. package/src/llama.cpp/examples/main/CMakeLists.txt +5 -0
  92. package/src/llama.cpp/examples/main/main.cpp +957 -0
  93. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +33 -0
  94. package/src/llama.cpp/examples/parallel/CMakeLists.txt +5 -0
  95. package/src/llama.cpp/examples/parallel/parallel.cpp +427 -0
  96. package/src/llama.cpp/examples/passkey/CMakeLists.txt +5 -0
  97. package/src/llama.cpp/examples/passkey/passkey.cpp +302 -0
  98. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +5 -0
  99. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1943 -0
  100. package/src/llama.cpp/examples/quantize/CMakeLists.txt +6 -0
  101. package/src/llama.cpp/examples/quantize/quantize.cpp +423 -0
  102. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +6 -0
  103. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +424 -0
  104. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/retrieval/retrieval.cpp +350 -0
  106. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +5 -0
  107. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +246 -0
  108. package/src/llama.cpp/examples/server/CMakeLists.txt +40 -0
  109. package/src/llama.cpp/examples/server/bench/requirements.txt +2 -0
  110. package/src/llama.cpp/examples/server/httplib.h +9465 -0
  111. package/src/llama.cpp/examples/server/server.cpp +3826 -0
  112. package/src/llama.cpp/examples/server/tests/requirements.txt +6 -0
  113. package/src/llama.cpp/examples/server/utils.hpp +653 -0
  114. package/src/llama.cpp/examples/simple/CMakeLists.txt +5 -0
  115. package/src/llama.cpp/examples/simple/simple.cpp +183 -0
  116. package/src/llama.cpp/examples/speculative/CMakeLists.txt +5 -0
  117. package/src/llama.cpp/examples/speculative/speculative.cpp +614 -0
  118. package/src/llama.cpp/examples/sycl/CMakeLists.txt +9 -0
  119. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +13 -0
  120. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +5 -0
  121. package/src/llama.cpp/examples/tokenize/tokenize.cpp +42 -0
  122. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +5 -0
  123. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +1252 -0
  124. package/src/llama.cpp/ggml-alloc.c +985 -0
  125. package/src/llama.cpp/ggml-alloc.h +76 -0
  126. package/src/llama.cpp/ggml-backend-impl.h +141 -0
  127. package/src/llama.cpp/ggml-backend.c +2099 -0
  128. package/src/llama.cpp/ggml-backend.h +233 -0
  129. package/src/llama.cpp/ggml-common.h +1853 -0
  130. package/src/llama.cpp/ggml-cuda.h +43 -0
  131. package/src/llama.cpp/ggml-impl.h +265 -0
  132. package/src/llama.cpp/ggml-kompute.cpp +2006 -0
  133. package/src/llama.cpp/ggml-kompute.h +46 -0
  134. package/src/llama.cpp/ggml-metal.h +66 -0
  135. package/src/llama.cpp/ggml-mpi.c +216 -0
  136. package/src/llama.cpp/ggml-mpi.h +39 -0
  137. package/src/llama.cpp/ggml-opencl.cpp +2301 -0
  138. package/src/llama.cpp/ggml-opencl.h +36 -0
  139. package/src/llama.cpp/ggml-quants.c +12678 -0
  140. package/src/llama.cpp/ggml-quants.h +133 -0
  141. package/src/llama.cpp/ggml-sycl.cpp +17882 -0
  142. package/src/llama.cpp/ggml-sycl.h +49 -0
  143. package/src/llama.cpp/ggml-vulkan-shaders.hpp +69849 -0
  144. package/src/llama.cpp/ggml-vulkan.cpp +6442 -0
  145. package/src/llama.cpp/ggml-vulkan.h +29 -0
  146. package/src/llama.cpp/ggml.c +21819 -0
  147. package/src/llama.cpp/ggml.h +2403 -0
  148. package/src/llama.cpp/llama.cpp +17468 -0
  149. package/src/llama.cpp/llama.h +1117 -0
  150. package/src/llama.cpp/pocs/CMakeLists.txt +12 -0
  151. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +9 -0
  152. package/src/llama.cpp/pocs/vdot/q8dot.cpp +172 -0
  153. package/src/llama.cpp/pocs/vdot/vdot.cpp +310 -0
  154. package/src/llama.cpp/prompts/LLM-questions.txt +49 -0
  155. package/src/llama.cpp/prompts/alpaca.txt +1 -0
  156. package/src/llama.cpp/prompts/assistant.txt +31 -0
  157. package/src/llama.cpp/prompts/chat-with-baichuan.txt +4 -0
  158. package/src/llama.cpp/prompts/chat-with-bob.txt +7 -0
  159. package/src/llama.cpp/prompts/chat-with-qwen.txt +1 -0
  160. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +7 -0
  161. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +7 -0
  162. package/src/llama.cpp/prompts/chat.txt +28 -0
  163. package/src/llama.cpp/prompts/dan-modified.txt +1 -0
  164. package/src/llama.cpp/prompts/dan.txt +1 -0
  165. package/src/llama.cpp/prompts/mnemonics.txt +93 -0
  166. package/src/llama.cpp/prompts/parallel-questions.txt +43 -0
  167. package/src/llama.cpp/prompts/reason-act.txt +18 -0
  168. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +3 -0
  169. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +1 -0
  170. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +2 -0
  171. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +2 -0
  172. package/src/llama.cpp/requirements/requirements-convert.txt +5 -0
  173. package/src/llama.cpp/requirements.txt +12 -0
  174. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +24 -0
  175. package/src/llama.cpp/scripts/xxd.cmake +16 -0
  176. package/src/llama.cpp/sgemm.cpp +999 -0
  177. package/src/llama.cpp/sgemm.h +12 -0
  178. package/src/llama.cpp/tests/CMakeLists.txt +78 -0
  179. package/src/llama.cpp/tests/get-model.cpp +21 -0
  180. package/src/llama.cpp/tests/get-model.h +2 -0
  181. package/src/llama.cpp/tests/test-autorelease.cpp +24 -0
  182. package/src/llama.cpp/tests/test-backend-ops.cpp +2266 -0
  183. package/src/llama.cpp/tests/test-c.c +7 -0
  184. package/src/llama.cpp/tests/test-chat-template.cpp +107 -0
  185. package/src/llama.cpp/tests/test-double-float.cpp +57 -0
  186. package/src/llama.cpp/tests/test-grad0.cpp +1606 -0
  187. package/src/llama.cpp/tests/test-grammar-integration.cpp +243 -0
  188. package/src/llama.cpp/tests/test-grammar-parser.cpp +250 -0
  189. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +899 -0
  190. package/src/llama.cpp/tests/test-llama-grammar.cpp +402 -0
  191. package/src/llama.cpp/tests/test-model-load-cancel.cpp +27 -0
  192. package/src/llama.cpp/tests/test-opt.cpp +181 -0
  193. package/src/llama.cpp/tests/test-quantize-fns.cpp +185 -0
  194. package/src/llama.cpp/tests/test-quantize-perf.cpp +363 -0
  195. package/src/llama.cpp/tests/test-rope.cpp +221 -0
  196. package/src/llama.cpp/tests/test-sampling.cpp +301 -0
  197. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +187 -0
  198. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +190 -0
  199. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +123 -0
  200. package/src/llama.cpp/tests/test-tokenizer-1-llama.cpp +111 -0
  201. package/src/llama.cpp/unicode-data.cpp +1651 -0
  202. package/src/llama.cpp/unicode-data.h +16 -0
  203. package/src/llama.cpp/unicode.cpp +277 -0
  204. package/src/llama.cpp/unicode.h +28 -0
@@ -0,0 +1,6 @@
# Python dependencies for the llama.cpp server behave-based test suite.
# Versions are pinned with compatible-release (~=) specifiers.
aiohttp~=3.9.3
behave~=1.2.6
huggingface_hub~=0.20.3
numpy~=1.24.4
openai~=0.25.0
prometheus-client~=0.20.0
@@ -0,0 +1,653 @@
#pragma once

#include "llama.h"
#include "common.h"

#include "json.hpp"

#include <cctype>
#include <cstdint>
#include <cstdio>
#include <ctime>
#include <string>
#include <vector>
#include <sstream>
#include <random>
#include <thread>
12
+
13
+ #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
14
+
15
+ using json = nlohmann::ordered_json;
16
+
// Error categories used by the server's OAI-compatible error payloads.
// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type {
    ERROR_TYPE_INVALID_REQUEST,   // maps to HTTP 400
    ERROR_TYPE_AUTHENTICATION,    // maps to HTTP 401
    ERROR_TYPE_SERVER,            // maps to HTTP 500
    ERROR_TYPE_NOT_FOUND,         // maps to HTTP 404
    ERROR_TYPE_PERMISSION,        // maps to HTTP 403
    ERROR_TYPE_UNAVAILABLE, // custom error, maps to HTTP 503
    ERROR_TYPE_NOT_SUPPORTED, // custom error, maps to HTTP 501
};
27
+
// Runtime logging switches; defined in the server translation unit.
extern bool server_verbose;   // when true, LOG_VERBOSE messages are emitted
extern bool server_log_json;  // when true, server_log prints JSON lines

// Verbose logging can be compiled out entirely by defining SERVER_VERBOSE=0.
#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
#endif

#if SERVER_VERBOSE != 1
#define LOG_VERBOSE(MSG, ...)
#else
// Verbose log line; only emitted when the runtime flag server_verbose is set.
#define LOG_VERBOSE(MSG, ...)                                            \
    do                                                                   \
    {                                                                    \
        if (server_verbose)                                              \
        {                                                                \
            server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
        }                                                                \
    } while (0)
#endif

// Level-tagged logging helpers; the variadic argument is a json object of
// extra key/value fields (see server_log below).
#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)

// Forward declaration so json_value (below) can log type mismatches.
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra);
53
+
54
+ template <typename T>
55
+ static T json_value(const json &body, const std::string &key, const T &default_value) {
56
+ // Fallback null to default value
57
+ if (body.contains(key) && !body.at(key).is_null()){
58
+ try {
59
+ return body.value(key, default_value);
60
+ }
61
+ catch (nlohmann::json_abi_v3_11_3::detail::type_error const&){
62
+ std::string message = "Wrong type supplied for parameter '" + key + "'. Expected '" + typeid(default_value).name() + "', using default value.";
63
+ server_log("WARN", __func__, __LINE__, message.c_str(), body);
64
+ return default_value;
65
+ }
66
+ } else {
67
+ return default_value;
68
+ }
69
+ }
70
+
// Emit one log record to stdout and flush. With server_log_json the record is
// a single JSON line; otherwise a fixed-width text header is printed followed
// by the remaining fields as " key=value" pairs. `extra` supplies additional
// key/value pairs merged into the record.
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
    // Thread id is only obtainable as a stream insertion, hence the stringstream.
    std::stringstream ss_tid;
    ss_tid << std::this_thread::get_id();
    // Fields common to both output formats.
    json log = nlohmann::ordered_json{
        {"tid", ss_tid.str()},
        {"timestamp", time(nullptr)},
    };

    if (server_log_json) {
        log.merge_patch( {
            {"level", level},
            {"function", function},
            {"line", line},
            {"msg", message},
        });

        if (!extra.empty()) {
            log.merge_patch(extra);
        }

        // dump(-1, ...) keeps it on one line; invalid UTF-8 is replaced.
        printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str());
    } else {
        char buf[1024];
        snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);

        if (!extra.empty()) {
            log.merge_patch(extra);
        }
        std::stringstream ss;
        ss << buf << " |";
        // Append every record field (tid, timestamp, extras) after the header.
        for (const auto& el : log.items())
        {
            const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
            ss << " " << el.key() << "=" << value;
        }

        const std::string str = ss.str();
        printf("%.*s\n", (int)str.size(), str.data());
    }
    // Flush so log lines appear promptly when stdout is piped.
    fflush(stdout);
}
112
+
113
+ //
114
+ // chat template utils
115
+ //
116
+
117
+ // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
118
+ inline bool verify_custom_template(const std::string & tmpl) {
119
+ llama_chat_message chat[] = {{"user", "test"}};
120
+ int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
121
+ return res >= 0;
122
+ }
123
+
// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
    size_t alloc_size = 0;
    // vector holding all allocated string to be passed to llama_chat_apply_template
    // (llama_chat_message stores raw const char* pointers, so the std::strings
    // must outlive the API call)
    std::vector<std::string> str(messages.size() * 2);
    std::vector<llama_chat_message> chat(messages.size());

    for (size_t i = 0; i < messages.size(); ++i) {
        const auto & curr_msg = messages[i];
        str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
        str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
        // NOTE(review): only content lengths are summed; the *2 factor below is
        // what leaves headroom for roles and template markup
        alloc_size += str[i*2 + 1].length();
        chat[i].role = str[i*2 + 0].c_str();
        chat[i].content = str[i*2 + 1].c_str();
    }

    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
    std::vector<char> buf(alloc_size * 2);

    // run the first time to get the total output length
    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());

    // if it turns out that our buffer is too small, we resize it
    // NOTE(review): a negative res (unsupported template) would wrap to a huge
    // size_t here and make resize() throw — presumably the template is
    // validated beforehand via verify_custom_template; confirm at call sites.
    if ((size_t) res > buf.size()) {
        buf.resize(res);
        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
    }

    const std::string formatted_chat(buf.data(), res);

    LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});

    return formatted_chat;
}
158
+
159
+ //
160
+ // base64 utils (TODO: move to common in the future)
161
+ //
162
+
// Alphabet used to map base64 characters back to their 6-bit values.
static const std::string base64_chars =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789+/";

// True when c belongs to the base64 alphabet (padding '=' excluded).
static inline bool is_base64(uint8_t c) {
    return (isalnum(c) || (c == '+') || (c == '/'));
}

// Decode a base64 string into raw bytes. Decoding stops at the first padding
// character ('=') or any character outside the base64 alphabet; a trailing
// partial group (input length not a multiple of 4) is flushed at the end.
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string) {
    uint8_t quad[4];   // buffered 6-bit input values
    uint8_t triple[3]; // reconstructed output bytes

    std::vector<uint8_t> decoded;

    int filled = 0;  // how many values are currently buffered in quad
    int pos = 0;     // read cursor into encoded_string
    int remaining = encoded_string.size();

    while (remaining-- && (encoded_string[pos] != '=') && is_base64(encoded_string[pos])) {
        quad[filled++] = encoded_string[pos];
        pos++;
        if (filled == 4) {
            // Translate characters to their alphabet indices.
            for (int k = 0; k < 4; k++) {
                quad[k] = base64_chars.find(quad[k]);
            }

            // Repack four 6-bit values into three bytes.
            triple[0] = ( quad[0]        << 2) + ((quad[1] & 0x30) >> 4);
            triple[1] = ((quad[1] & 0xf) << 4) + ((quad[2] & 0x3c) >> 2);
            triple[2] = ((quad[2] & 0x3) << 6) +   quad[3];

            for (int k = 0; k < 3; k++) {
                decoded.push_back(triple[k]);
            }

            filled = 0;
        }
    }

    // Flush the trailing partial group, if any: `filled` buffered values
    // decode to `filled - 1` bytes.
    if (filled) {
        for (int k = filled; k < 4; k++) {
            quad[k] = 0;
        }

        for (int k = 0; k < 4; k++) {
            quad[k] = base64_chars.find(quad[k]);
        }

        triple[0] = ( quad[0]        << 2) + ((quad[1] & 0x30) >> 4);
        triple[1] = ((quad[1] & 0xf) << 4) + ((quad[2] & 0x3c) >> 2);
        triple[2] = ((quad[2] & 0x3) << 6) +   quad[3];

        for (int k = 0; k < filled - 1; k++) {
            decoded.push_back(triple[k]);
        }
    }

    return decoded;
}
223
+
224
+ //
225
+ // random string / id
226
+ //
227
+
// Produce a 32-character alphanumeric identifier. A fresh std::random_device
// seeds a Mersenne Twister on every call, so results are non-deterministic.
static std::string random_string() {
    static const std::string alphabet("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");

    std::random_device seed_source;
    std::mt19937 engine(seed_source());

    std::string result(32, ' ');
    for (char & c : result) {
        c = alphabet[engine() % alphabet.size()];
    }

    return result;
}
242
+
243
+ static std::string gen_chatcmplid() {
244
+ std::stringstream chatcmplid;
245
+ chatcmplid << "chatcmpl-" << random_string();
246
+
247
+ return chatcmplid.str();
248
+ }
249
+
250
+ //
251
+ // other common utils
252
+ //
253
+
254
+ static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
255
+ size_t i;
256
+ for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
257
+
258
+ return i;
259
+ }
260
+
// True when `str` ends with `suffix`; an empty suffix always matches.
static bool ends_with(const std::string & str, const std::string & suffix) {
    if (suffix.size() > str.size()) {
        return false;
    }
    return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
}

// If `text` ends with a (possibly partial) prefix of `stop`, return the offset
// in `text` where that overlap begins; otherwise return std::string::npos.
// The streaming path uses this to hold back output that might still turn out
// to be a stop sequence.
static size_t find_partial_stop_string(const std::string &stop, const std::string &text) {
    if (text.empty() || stop.empty()) {
        return std::string::npos;
    }

    const char last = text.back();
    // Try the longest candidate prefix of `stop` first.
    for (int64_t idx = stop.size() - 1; idx >= 0; idx--) {
        if (stop[idx] != last) {
            continue;
        }
        const std::string candidate = stop.substr(0, idx + 1);
        if (ends_with(text, candidate)) {
            return text.size() - idx - 1;
        }
    }

    return std::string::npos;
}
280
+
281
+ // TODO: reuse llama_detokenize
282
+ template <class Iter>
283
+ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
284
+ std::string ret;
285
+ for (; begin != end; ++begin) {
286
+ ret += llama_token_to_piece(ctx, *begin);
287
+ }
288
+
289
+ return ret;
290
+ }
291
+
292
+ // format incomplete utf-8 multibyte character for output
293
+ static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
294
+ std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
295
+
296
+ // if the size is 1 and first bit is 1, meaning it's a partial character
297
+ // (size > 1 meaning it's already a known token)
298
+ if (out.size() == 1 && (out[0] & 0x80) == 0x80) {
299
+ std::stringstream ss;
300
+ ss << std::hex << (out[0] & 0xff);
301
+ std::string res(ss.str());
302
+ out = "byte: \\x" + res;
303
+ }
304
+
305
+ return out;
306
+ }
307
+
// One generated token plus the data needed to report it to the client.
struct completion_token_output {
    llama_token tok;           // the sampled token id
    std::string text_to_send;  // detokenized text for this token

    // A candidate token and its sampling probability.
    struct token_prob {
        llama_token tok;
        float prob;
    };

    // Top candidate probabilities recorded at the sampling step (may be empty).
    std::vector<token_prob> probs;
};
319
+
320
+ // convert a vector of completion_token_output to json
321
+ static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
322
+ json out = json::array();
323
+
324
+ for (const auto & prob : probs) {
325
+ json probs_for_token = json::array();
326
+
327
+ for (const auto & p : prob.probs) {
328
+ const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
329
+ probs_for_token.push_back(json {
330
+ {"tok_str", tok_str},
331
+ {"prob", p.prob},
332
+ });
333
+ }
334
+
335
+ const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
336
+ out.push_back(json {
337
+ {"content", tok_str},
338
+ {"probs", probs_for_token},
339
+ });
340
+ }
341
+
342
+ return out;
343
+ }
344
+
345
+ //
346
+ // OAI utils
347
+ //
348
+
349
+ static json oaicompat_completion_params_parse(
350
+ const struct llama_model * model,
351
+ const json & body, /* openai api json semantics */
352
+ const std::string & chat_template) {
353
+ json llama_params;
354
+
355
+ llama_params["__oaicompat"] = true;
356
+
357
+ // Map OpenAI parameters to llama.cpp parameters
358
+ //
359
+ // For parameters that are defined by the OpenAI documentation (e.g.
360
+ // temperature), we explicitly specify OpenAI's intended default; we
361
+ // need to do that because sometimes OpenAI disagrees with llama.cpp
362
+ //
363
+ // https://platform.openai.com/docs/api-reference/chat/create
364
+ llama_sampling_params default_sparams;
365
+ llama_params["model"] = json_value(body, "model", std::string("unknown"));
366
+ llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
367
+ llama_params["logit_bias"] = json_value(body, "logit_bias", json::object());
368
+ llama_params["n_predict"] = json_value(body, "max_tokens", -1);
369
+ llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
370
+ llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
371
+ llama_params["stream"] = json_value(body, "stream", false);
372
+ llama_params["temperature"] = json_value(body, "temperature", 0.0);
373
+ llama_params["top_p"] = json_value(body, "top_p", 1.0);
374
+
375
+ // Apply chat template to the list of messages
376
+ llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);
377
+
378
+ // Handle "stop" field
379
+ if (body.contains("stop") && body["stop"].is_string()) {
380
+ llama_params["stop"] = json::array({body["stop"].get<std::string>()});
381
+ } else {
382
+ llama_params["stop"] = json_value(body, "stop", json::array());
383
+ }
384
+
385
+ // Handle "response_format" field
386
+ if (body.contains("response_format")) {
387
+ json response_format = json_value(body, "response_format", json::object());
388
+ std::string response_type = json_value(response_format, "type", std::string());
389
+ if (response_type == "json_object") {
390
+ llama_params["json_schema"] = json_value(response_format, "schema", json::object());
391
+ } else if (!response_type.empty() && response_type != "text") {
392
+ throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
393
+ }
394
+ }
395
+
396
+ // Handle "n" field
397
+ int n_choices = json_value(body, "n", 1);
398
+ if (n_choices != 1) {
399
+ throw std::runtime_error("Only one completion choice is allowed");
400
+ }
401
+
402
+ // Handle "logprobs" field
403
+ // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
404
+ if (body.contains("logprobs")) {
405
+ llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
406
+ } else if (body.contains("top_logprobs")) {
407
+ throw std::runtime_error("top_logprobs requires logprobs to be set to true");
408
+ }
409
+
410
+ // Params supported by OAI but unsupported by llama.cpp
411
+ static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
412
+ for (auto & param : unsupported_params) {
413
+ if (body.contains(param)) {
414
+ throw std::runtime_error("Unsupported param: " + param);
415
+ }
416
+ }
417
+
418
+ // Copy remaining properties to llama_params
419
+ // This allows user to use llama.cpp-specific params like "mirostat", "tfs_z",... via OAI endpoint.
420
+ // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
421
+ for (const auto & item : body.items()) {
422
+ // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
423
+ if (!llama_params.contains(item.key()) || item.key() == "n_predict") {
424
+ llama_params[item.key()] = item.value();
425
+ }
426
+ }
427
+
428
+ return llama_params;
429
+ }
430
+
// Build the final OAI-compatible chat completion response from an internal
// completion `result`. When `streaming` is true this produces the terminating
// chunk (empty delta) instead of a full message object.
static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) {
    bool stopped_word = result.count("stopped_word") != 0;
    bool stopped_eos = json_value(result, "stopped_eos", false);
    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
    int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
    std::string content = json_value(result, "content", std::string(""));

    // "length" (token limit reached) unless generation ended on a stop word
    // or an EOS token.
    std::string finish_reason = "length";
    if (stopped_word || stopped_eos) {
        finish_reason = "stop";
    }

    // Streaming termination uses an empty delta; non-streaming carries the
    // full assistant message.
    json choices =
        streaming ? json::array({json{{"finish_reason", finish_reason},
                                        {"index", 0},
                                        {"delta", json::object()}}})
                  : json::array({json{{"finish_reason", finish_reason},
                                        {"index", 0},
                                        {"message", json{{"content", content},
                                                         {"role", "assistant"}}}}});

    std::time_t t = std::time(0);

    json res = json {
        {"choices", choices},
        {"created", t},
        {"model",
            json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
        {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
        {"usage", json {
            {"completion_tokens", num_tokens_predicted},
            {"prompt_tokens", num_prompt_tokens},
            {"total_tokens", num_tokens_predicted + num_prompt_tokens}
        }},
        {"id", completion_id}
    };

    // In verbose mode echo the raw internal result for debugging.
    if (server_verbose) {
        res["__verbose"] = result;
    }

    if (result.contains("completion_probabilities")) {
        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
    }

    return res;
}
478
+
479
+ // return value is vector as there is one case where we might need to generate two responses
// Build streaming OAI-compatible chunk(s) from an internal partial result.
// return value is vector as there is one case where we might need to generate two responses
static std::vector<json> format_partial_response_oaicompat(json result, const std::string & completion_id) {
    // Results without OAI bookkeeping fields are passed through untouched.
    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
        return std::vector<json>({result});
    }

    // First chunk of the stream carries the assistant role delta.
    bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));

    bool stopped_word = json_value(result, "stopped_word", false);
    bool stopped_eos = json_value(result, "stopped_eos", false);
    bool stopped_limit = json_value(result, "stopped_limit", false);
    std::string content = json_value(result, "content", std::string(""));

    // Empty finish_reason (serialized as null) while generation continues.
    std::string finish_reason;
    if (stopped_word || stopped_eos) {
        finish_reason = "stop";
    }
    if (stopped_limit) {
        finish_reason = "length";
    }

    std::time_t t = std::time(0);

    json choices;

    if (!finish_reason.empty()) {
        // Terminating chunk: empty delta plus the finish reason.
        choices = json::array({json{{"finish_reason", finish_reason},
                                    {"index", 0},
                                    {"delta", json::object()}}});
    } else {
        if (first) {
            if (content.empty()) {
                choices = json::array({json{{"finish_reason", nullptr},
                                            {"index", 0},
                                            {"delta", json{{"role", "assistant"}}}}});
            } else {
                // We have to send this as two updates to conform to openai behavior
                // (role-only delta first, then the content delta).
                json initial_ret = json{{"choices", json::array({json{
                                        {"finish_reason", nullptr},
                                        {"index", 0},
                                        {"delta", json{
                                            {"role", "assistant"}
                                        }}}})},
                            {"created", t},
                            {"id", completion_id},
                            {"model", modelname},
                            {"object", "chat.completion.chunk"}};

                json second_ret = json{
                            {"choices", json::array({json{{"finish_reason", nullptr},
                                                            {"index", 0},
                                                            {"delta", json{
                                                            {"content", content}}}
                                                            }})},
                            {"created", t},
                            {"id", completion_id},
                            {"model", modelname},
                            {"object", "chat.completion.chunk"}};

                return std::vector<json>({initial_ret, second_ret});
            }
        } else {
            // Some idiosyncrasy in task processing logic makes several trailing calls
            // with empty content, we ignore these at the callee site.
            if (content.empty()) {
                return std::vector<json>({json::object()});
            }

            choices = json::array({json{
                {"finish_reason", nullptr},
                {"index", 0},
                {"delta",
                json{
                    {"content", content},
                }},
            }});
        }
    }

    json ret = json {
        {"choices", choices},
        {"created", t},
        {"id", completion_id},
        {"model", modelname},
        {"object", "chat.completion.chunk"}
    };
    // The final chunk also reports token usage.
    if (!finish_reason.empty()) {
        int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
        int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
        ret.push_back({"usage", json {
            {"completion_tokens", num_tokens_predicted},
            {"prompt_tokens", num_prompt_tokens},
            {"total_tokens", num_tokens_predicted + num_prompt_tokens}
        }});
    }

    return std::vector<json>({ret});
}
578
+
579
+ static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
580
+ json data = json::array();
581
+ int i = 0;
582
+ for (auto & elem : embeddings) {
583
+ data.push_back(json{
584
+ {"embedding", json_value(elem, "embedding", json::array())},
585
+ {"index", i++},
586
+ {"object", "embedding"}
587
+ });
588
+ }
589
+
590
+ json res = json {
591
+ {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
592
+ {"object", "list"},
593
+ {"usage", json {
594
+ {"prompt_tokens", 0},
595
+ {"total_tokens", 0}
596
+ }},
597
+ {"data", data}
598
+ };
599
+
600
+ return res;
601
+ }
602
+
603
+ static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
604
+ return json {
605
+ {"tokens", tokens}
606
+ };
607
+ }
608
+
609
+ static json format_detokenized_response(const std::string & content) {
610
+ return json {
611
+ {"content", content}
612
+ };
613
+ }
614
+
615
+ static json format_error_response(const std::string & message, const enum error_type type) {
616
+ std::string type_str;
617
+ int code = 500;
618
+ switch (type) {
619
+ case ERROR_TYPE_INVALID_REQUEST:
620
+ type_str = "invalid_request_error";
621
+ code = 400;
622
+ break;
623
+ case ERROR_TYPE_AUTHENTICATION:
624
+ type_str = "authentication_error";
625
+ code = 401;
626
+ break;
627
+ case ERROR_TYPE_NOT_FOUND:
628
+ type_str = "not_found_error";
629
+ code = 404;
630
+ break;
631
+ case ERROR_TYPE_SERVER:
632
+ type_str = "server_error";
633
+ code = 500;
634
+ break;
635
+ case ERROR_TYPE_PERMISSION:
636
+ type_str = "permission_error";
637
+ code = 403;
638
+ break;
639
+ case ERROR_TYPE_NOT_SUPPORTED:
640
+ type_str = "not_supported_error";
641
+ code = 501;
642
+ break;
643
+ case ERROR_TYPE_UNAVAILABLE:
644
+ type_str = "unavailable_error";
645
+ code = 503;
646
+ break;
647
+ }
648
+ return json {
649
+ {"code", code},
650
+ {"message", message},
651
+ {"type", type_str},
652
+ };
653
+ }
@@ -0,0 +1,5 @@
# Build the "simple" example: a minimal llama.cpp inference driver.
set(TARGET simple)
add_executable(${TARGET} simple.cpp)
install(TARGETS ${TARGET} RUNTIME)
# Link the shared example helpers, the core llama library and platform threads.
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)