@fugood/llama.node 0.3.17 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193) hide show
  1. package/CMakeLists.txt +3 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +39 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +366 -19
  24. package/src/LlamaCompletionWorker.h +30 -10
  25. package/src/LlamaContext.cpp +213 -5
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
  29. package/src/llama.cpp/.github/workflows/build.yml +41 -762
  30. package/src/llama.cpp/.github/workflows/docker.yml +5 -2
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +12 -12
  33. package/src/llama.cpp/CMakeLists.txt +5 -17
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +31 -3
  37. package/src/llama.cpp/common/arg.cpp +48 -29
  38. package/src/llama.cpp/common/chat.cpp +128 -106
  39. package/src/llama.cpp/common/chat.h +2 -0
  40. package/src/llama.cpp/common/common.cpp +37 -1
  41. package/src/llama.cpp/common/common.h +18 -9
  42. package/src/llama.cpp/common/llguidance.cpp +1 -0
  43. package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
  44. package/src/llama.cpp/common/minja/minja.hpp +69 -36
  45. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  46. package/src/llama.cpp/common/regex-partial.h +56 -0
  47. package/src/llama.cpp/common/sampling.cpp +57 -50
  48. package/src/llama.cpp/examples/CMakeLists.txt +2 -23
  49. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
  50. package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
  51. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  52. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  53. package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
  54. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  55. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  56. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  57. package/src/llama.cpp/ggml/include/ggml.h +10 -7
  58. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  60. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  61. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
  62. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
  63. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
  64. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
  65. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
  66. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  67. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  68. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  69. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
  71. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  73. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
  74. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
  75. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  76. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  77. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
  78. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
  80. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
  81. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
  82. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  83. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  84. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  85. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  86. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
  87. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  88. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
  89. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  90. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  91. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  92. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
  93. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
  94. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
  95. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
  96. package/src/llama.cpp/ggml/src/ggml.c +29 -20
  97. package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
  98. package/src/llama.cpp/include/llama.h +52 -11
  99. package/src/llama.cpp/requirements/requirements-all.txt +3 -3
  100. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  101. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  102. package/src/llama.cpp/src/llama-adapter.cpp +6 -0
  103. package/src/llama.cpp/src/llama-arch.cpp +3 -0
  104. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  105. package/src/llama.cpp/src/llama-batch.h +2 -1
  106. package/src/llama.cpp/src/llama-chat.cpp +17 -7
  107. package/src/llama.cpp/src/llama-chat.h +1 -0
  108. package/src/llama.cpp/src/llama-context.cpp +389 -501
  109. package/src/llama.cpp/src/llama-context.h +44 -32
  110. package/src/llama.cpp/src/llama-cparams.h +1 -0
  111. package/src/llama.cpp/src/llama-graph.cpp +20 -38
  112. package/src/llama.cpp/src/llama-graph.h +12 -8
  113. package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
  114. package/src/llama.cpp/src/llama-kv-cache.h +271 -85
  115. package/src/llama.cpp/src/llama-memory.h +11 -1
  116. package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
  117. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  118. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  119. package/src/llama.cpp/src/llama-model.cpp +316 -69
  120. package/src/llama.cpp/src/llama-model.h +8 -1
  121. package/src/llama.cpp/src/llama-quant.cpp +15 -13
  122. package/src/llama.cpp/src/llama-sampling.cpp +18 -6
  123. package/src/llama.cpp/src/llama-vocab.cpp +42 -4
  124. package/src/llama.cpp/src/llama-vocab.h +6 -0
  125. package/src/llama.cpp/src/llama.cpp +14 -0
  126. package/src/llama.cpp/tests/CMakeLists.txt +10 -2
  127. package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
  128. package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
  129. package/src/llama.cpp/tests/test-chat.cpp +3 -1
  130. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  131. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  132. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  133. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  134. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  135. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
  136. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  137. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
  138. package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
  139. package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
  140. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
  141. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
  142. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  143. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
  144. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  145. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
  146. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  147. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
  148. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
  149. package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
  150. package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
  151. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  152. package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
  153. package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
  154. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  155. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  156. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  157. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  158. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  159. package/src/llama.cpp/examples/llava/clip.h +0 -135
  160. package/src/llama.cpp/examples/llava/llava.cpp +0 -586
  161. package/src/llama.cpp/examples/llava/llava.h +0 -49
  162. package/src/llama.cpp/examples/llava/mtmd.h +0 -168
  163. package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
  164. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  165. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  166. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  167. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  168. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  169. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  170. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  171. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  172. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  173. /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
  174. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  175. /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
  176. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  177. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  178. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  179. /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
  180. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  181. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  182. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  183. /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
  184. /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
  185. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  186. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  187. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  188. /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
  189. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  190. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  191. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  192. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
  193. /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
@@ -1,590 +0,0 @@
1
- #include "arg.h"
2
- #include "common.h"
3
- #include "console.h"
4
- #include "sampling.h"
5
- #include "log.h"
6
- #include "llama.h"
7
-
8
- #include <cassert>
9
- #include <cinttypes>
10
- #include <cmath>
11
- #include <cstdio>
12
- #include <cstring>
13
- #include <ctime>
14
- #include <fstream>
15
- #include <iostream>
16
- #include <sstream>
17
- #include <string>
18
- #include <vector>
19
-
20
- #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
21
- #include <signal.h>
22
- #include <unistd.h>
23
- #elif defined (_WIN32)
24
- #define WIN32_LEAN_AND_MEAN
25
- #ifndef NOMINMAX
26
- #define NOMINMAX
27
- #endif
28
- #include <windows.h>
29
- #include <signal.h>
30
- #endif
31
-
32
- #if defined(_MSC_VER)
33
- #pragma warning(disable: 4244 4267) // possible loss of data
34
- #endif
35
-
36
- static llama_context ** g_ctx;
37
- static llama_model ** g_model;
38
- static common_sampler ** g_smpl;
39
- static common_params * g_params;
40
- static std::vector<llama_token> * g_input_tokens;
41
- static std::ostringstream * g_output_ss;
42
- static std::vector<llama_token> * g_output_tokens;
43
-
44
- static bool is_interacting = false;
45
-
46
- #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
47
- static void sigint_handler(int signo) {
48
- if (signo == SIGINT) {
49
- if (!is_interacting) {
50
- is_interacting = true;
51
- } else {
52
- console::cleanup();
53
- LOG("\n");
54
- common_perf_print(*g_ctx, *g_smpl);
55
-
56
- // make sure all logs are flushed
57
- LOG("Interrupted by user\n");
58
- common_log_pause(common_log_main());
59
-
60
- _exit(130);
61
- }
62
- }
63
- }
64
- #endif
65
-
66
- int main(int argc, char ** argv) {
67
- common_params params;
68
- g_params = &params;
69
-
70
- if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
71
- return 1;
72
- }
73
-
74
- common_init();
75
-
76
- auto & sparams = params.sampling;
77
-
78
- console::init(params.simple_io, params.use_color);
79
- atexit([]() { console::cleanup(); });
80
-
81
- if (params.logits_all) {
82
- LOG_ERR("\n************\n");
83
- LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
84
- LOG_ERR("************\n\n");
85
-
86
- return 0;
87
- }
88
-
89
- if (params.embedding) {
90
- LOG_ERR("\n************\n");
91
- LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
92
- LOG_ERR("************\n\n");
93
-
94
- return 0;
95
- }
96
-
97
- if (params.n_ctx != 0 && params.n_ctx < 8) {
98
- LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
99
- params.n_ctx = 8;
100
- }
101
-
102
- if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
103
- LOG_ERR("\n************\n");
104
- LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
105
- LOG_ERR("************\n\n");
106
-
107
- return 0;
108
- }
109
-
110
- if (params.rope_freq_base != 0.0) {
111
- LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
112
- }
113
-
114
- if (params.rope_freq_scale != 0.0) {
115
- LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
116
- }
117
-
118
- LOG_INF("%s: llama backend init\n", __func__);
119
- llama_backend_init();
120
- llama_numa_init(params.numa);
121
-
122
- llama_model * model = nullptr;
123
- llama_context * ctx = nullptr;
124
- common_sampler * smpl = nullptr;
125
-
126
- g_model = &model;
127
- g_ctx = &ctx;
128
- g_smpl = &smpl;
129
-
130
- // load the model and apply lora adapter, if any
131
- LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
132
- common_init_result llama_init = common_init_from_params(params);
133
-
134
- model = llama_init.model.get();
135
- ctx = llama_init.context.get();
136
-
137
- if (model == NULL) {
138
- LOG_ERR("%s: unable to load model\n", __func__);
139
- return 1;
140
- }
141
-
142
- const llama_vocab * vocab = llama_model_get_vocab(model);
143
-
144
- const int n_ctx_train = llama_model_n_ctx_train(model);
145
- const int n_ctx = llama_n_ctx(ctx);
146
- LOG_DBG("n_ctx: %d\n", n_ctx);
147
-
148
- if (n_ctx > n_ctx_train) {
149
- LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
150
- }
151
-
152
- // print system information
153
- {
154
- LOG_INF("\n");
155
- LOG_INF("%s\n", common_params_get_system_info(params).c_str());
156
- }
157
- const bool add_bos = llama_vocab_get_add_bos(vocab);
158
- GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
159
-
160
- std::vector<llama_token> embd_inp;
161
- std::vector<llama_token> embd_end;
162
- std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
163
- std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
164
-
165
- GGML_ASSERT(llama_vocab_fim_pre(vocab) >= 0);
166
- GGML_ASSERT(llama_vocab_fim_suf(vocab) >= 0);
167
-
168
- inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
169
- inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
170
-
171
- embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
172
- embd_end = params.spm_infill ? inp_pfx : inp_sfx;
173
- if (add_bos) {
174
- embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
175
- }
176
- embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
177
-
178
- const llama_token middle_token = llama_vocab_fim_mid(vocab);
179
- if (middle_token >= 0) {
180
- embd_inp.push_back(middle_token);
181
- }
182
-
183
- LOG_DBG("add_bos: %d\n", add_bos);
184
- LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
185
- LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
186
- LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
187
-
188
- // Should not run without any tokens
189
- if (embd_inp.empty()) {
190
- embd_inp.push_back(llama_vocab_bos(vocab));
191
- LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
192
- }
193
-
194
- if ((int) embd_inp.size() > n_ctx - 4) {
195
- LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
196
- return 1;
197
- }
198
-
199
- // number of tokens to keep when resetting context
200
- if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
201
- params.n_keep = (int)embd_inp.size();
202
- }
203
-
204
- LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
205
- LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
206
-
207
- // enable interactive mode if interactive start is specified
208
- if (params.interactive_first) {
209
- params.interactive = true;
210
- }
211
-
212
- if (params.verbose_prompt) {
213
- LOG_INF("\n");
214
- LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
215
- LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
216
- for (int i = 0; i < (int) embd_inp.size(); i++) {
217
- LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
218
- }
219
-
220
- if (params.n_keep > 0) {
221
- LOG_INF("%s: static prompt based on n_keep: '", __func__);
222
- for (int i = 0; i < params.n_keep; i++) {
223
- LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
224
- }
225
- LOG_CNT("'\n");
226
- }
227
- LOG_INF("\n");
228
- }
229
-
230
- if (params.interactive) {
231
- #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
232
- struct sigaction sigint_action;
233
- sigint_action.sa_handler = sigint_handler;
234
- sigemptyset (&sigint_action.sa_mask);
235
- sigint_action.sa_flags = 0;
236
- sigaction(SIGINT, &sigint_action, NULL);
237
- #elif defined (_WIN32)
238
- auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
239
- return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
240
- };
241
- SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
242
- #endif
243
-
244
- LOG_INF("%s: interactive mode on.\n", __func__);
245
-
246
- if (params.input_prefix_bos) {
247
- LOG_INF("Input prefix with BOS\n");
248
- }
249
-
250
- if (!params.input_prefix.empty()) {
251
- LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
252
- }
253
-
254
- if (!params.input_suffix.empty()) {
255
- LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
256
- }
257
- }
258
- smpl = common_sampler_init(model, sparams);
259
-
260
- LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
261
- LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
262
- LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
263
-
264
- LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
265
-
266
- LOG_INF("\n");
267
- LOG_INF("\n##### Infill mode #####\n\n");
268
- if (params.interactive) {
269
- const char *control_message;
270
- if (params.multiline_input) {
271
- control_message = " - To return control to LLaMA, end your input with '\\'.\n"
272
- " - To return control without starting a new line, end your input with '/'.\n";
273
- } else {
274
- control_message = " - Press Return to return control to LLaMA.\n"
275
- " - To return control without starting a new line, end your input with '/'.\n"
276
- " - If you want to submit another line, end your input with '\\'.\n";
277
- }
278
- LOG_INF("== Running in interactive mode. ==\n");
279
- #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
280
- LOG_INF( " - Press Ctrl+C to interject at any time.\n");
281
- #endif
282
- LOG_INF( "%s\n", control_message);
283
-
284
- is_interacting = params.interactive_first;
285
- }
286
-
287
- bool input_echo = true;
288
-
289
- int n_past = 0;
290
- int n_remain = params.n_predict;
291
- int n_consumed = 0;
292
-
293
- std::vector<int> input_tokens; g_input_tokens = &input_tokens;
294
- std::vector<int> output_tokens; g_output_tokens = &output_tokens;
295
- std::ostringstream output_ss; g_output_ss = &output_ss;
296
-
297
- // the first thing we will do is to output the prompt, so set color accordingly
298
- console::set_display(console::prompt);
299
-
300
- std::vector<llama_token> embd;
301
-
302
- while (n_remain != 0 || params.interactive) {
303
- // predict
304
- if (!embd.empty()) {
305
- // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
306
- // --prompt or --file which uses the same value.
307
- int max_embd_size = n_ctx - 4;
308
-
309
- // Ensure the input doesn't exceed the context size by truncating embd if necessary.
310
- if ((int) embd.size() > max_embd_size) {
311
- const int skipped_tokens = (int) embd.size() - max_embd_size;
312
- embd.resize(max_embd_size);
313
-
314
- console::set_display(console::error);
315
- LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
316
- console::set_display(console::reset);
317
- }
318
-
319
- // infinite text generation via context swapping
320
- // if we run out of context:
321
- // - take the n_keep first tokens from the original prompt (via n_past)
322
- // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
323
- if (n_past + (int) embd.size() > n_ctx) {
324
- if (params.n_predict == -2) {
325
- LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
326
- break;
327
- }
328
-
329
- const int n_left = n_past - params.n_keep - 1;
330
- const int n_discard = n_left/2;
331
-
332
- LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
333
- n_past, n_left, n_ctx, params.n_keep, n_discard);
334
-
335
- llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
336
- llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
337
-
338
- n_past -= n_discard;
339
-
340
- LOG_DBG("after swap: n_past = %d\n", n_past);
341
-
342
- LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
343
-
344
- }
345
-
346
- // evaluate tokens in batches
347
- // embd is typically prepared beforehand to fit within a batch, but not always
348
- for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
349
- int n_eval = (int) embd.size() - i;
350
- if (n_eval > params.n_batch) {
351
- n_eval = params.n_batch;
352
- }
353
-
354
- LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
355
-
356
- if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
357
- LOG_ERR("%s : failed to eval\n", __func__);
358
- return 1;
359
- }
360
-
361
- n_past += n_eval;
362
-
363
- LOG_DBG("n_past = %d\n", n_past);
364
- }
365
-
366
- }
367
-
368
- embd.clear();
369
-
370
- if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
371
- const llama_token id = common_sampler_sample(smpl, ctx, -1);
372
-
373
- common_sampler_accept(smpl, id, true);
374
-
375
- // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
376
-
377
- embd.push_back(id);
378
-
379
- // echo this to console
380
- input_echo = true;
381
-
382
- // decrement remaining sampling budget
383
- --n_remain;
384
-
385
- LOG_DBG("n_remain: %d\n", n_remain);
386
- } else {
387
- // some user input remains from prompt or interaction, forward it to processing
388
- LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
389
- while ((int) embd_inp.size() > n_consumed) {
390
- embd.push_back(embd_inp[n_consumed]);
391
-
392
- // push the prompt in the sampling context in order to apply repetition penalties later
393
- // for the prompt, we don't apply grammar rules
394
- common_sampler_accept(smpl, embd_inp[n_consumed], false);
395
-
396
- ++n_consumed;
397
- if ((int) embd.size() >= params.n_batch) {
398
- break;
399
- }
400
- }
401
- }
402
-
403
- // display text
404
- if (input_echo) {
405
- for (auto id : embd) {
406
- const std::string token_str = common_token_to_piece(ctx, id);
407
- LOG("%s", token_str.c_str());
408
-
409
- if (embd.size() > 1) {
410
- input_tokens.push_back(id);
411
- } else {
412
- output_tokens.push_back(id);
413
- output_ss << token_str;
414
- }
415
- }
416
- }
417
- // reset color to default if we there is no pending user input
418
- if (input_echo && (int) embd_inp.size() == n_consumed) {
419
- console::set_display(console::reset);
420
- }
421
-
422
- // if not currently processing queued inputs;
423
- if ((int) embd_inp.size() <= n_consumed) {
424
- // deal with eot token in infill mode
425
- if ((common_sampler_last(smpl) == llama_vocab_eot(vocab) || is_interacting) && params.interactive){
426
- if (is_interacting && !params.interactive_first) {
427
- // print an eot token
428
- LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
429
- }
430
- LOG("\n");
431
- console::set_display(console::user_input);
432
- std::string buffer;
433
- std::string line;
434
- bool another_line=true;
435
- // set a new prefix via stdin
436
- do {
437
- another_line = console::readline(line, params.multiline_input);
438
- buffer += line;
439
- } while (another_line);
440
- // check if we got an empty line, if so we use the old input
441
- if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
442
- params.input_prefix = buffer;
443
- }
444
- buffer.clear();
445
- // set a new suffix via stdin
446
- do {
447
- another_line = console::readline(line, params.multiline_input);
448
- buffer += line;
449
- } while (another_line);
450
- // check if we got an empty line
451
- if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
452
- params.input_suffix = buffer;
453
- }
454
- buffer.clear();
455
- // done taking input, reset color
456
- console::set_display(console::reset);
457
-
458
- if (params.escape) {
459
- //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
460
- string_process_escapes(params.input_prefix);
461
- string_process_escapes(params.input_suffix);
462
- }
463
-
464
- // tokenize new prefix and suffix
465
- std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
466
- std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
467
-
468
- inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
469
- inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
470
-
471
- embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
472
- embd_end = params.spm_infill ? inp_pfx : inp_sfx;
473
- if (add_bos) {
474
- embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
475
- }
476
- embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
477
-
478
- if (middle_token >= 0) {
479
- embd_inp.push_back(middle_token);
480
- }
481
-
482
- embd.clear();
483
- n_remain = params.n_predict;
484
- n_past = 0;
485
- n_consumed = 0;
486
- is_interacting = false;
487
- }
488
- // deal with end of generation tokens in interactive mode
489
- else if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
490
- LOG_DBG("found EOS token\n");
491
-
492
- if (params.interactive) {
493
-
494
- is_interacting = true;
495
- LOG("\n");
496
- console::set_display(console::user_input);
497
- }
498
- }
499
-
500
- if (n_past > 0 && is_interacting && !params.interactive) {
501
- LOG_DBG("waiting for user input\n");
502
-
503
- if (params.input_prefix_bos) {
504
- LOG_DBG("adding input prefix BOS token\n");
505
- embd_inp.push_back(llama_vocab_bos(vocab));
506
- }
507
-
508
- std::string buffer;
509
- if (!params.input_prefix.empty()) {
510
- LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
511
- buffer += params.input_prefix;
512
- LOG("%s", buffer.c_str());
513
- }
514
-
515
- std::string line;
516
- bool another_line = true;
517
- do {
518
- another_line = console::readline(line, params.multiline_input);
519
- buffer += line;
520
- } while (another_line);
521
-
522
- // done taking input, reset color
523
- console::set_display(console::reset);
524
-
525
- // Add tokens to embd only if the input buffer is non-empty
526
- // Entering a empty line lets the user pass control back
527
- if (buffer.length() > 1) {
528
- // append input suffix if any
529
- if (!params.input_suffix.empty()) {
530
- LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
531
- buffer += params.input_suffix;
532
- LOG("%s", params.input_suffix.c_str());
533
- }
534
-
535
- LOG_DBG("buffer: '%s'\n", buffer.c_str());
536
-
537
- const size_t original_size = embd_inp.size();
538
-
539
- const auto line_inp = common_tokenize(ctx, buffer, false);
540
- LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
541
-
542
- embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
543
-
544
- for (size_t i = original_size; i < embd_inp.size(); ++i) {
545
- const llama_token token = embd_inp[i];
546
- output_tokens.push_back(token);
547
- output_ss << common_token_to_piece(ctx, token);
548
- }
549
-
550
- n_remain -= line_inp.size();
551
- LOG_DBG("n_remain: %d\n", n_remain);
552
- } else {
553
- LOG_DBG("empty line, passing control back\n");
554
- }
555
-
556
- input_echo = false; // do not echo this again
557
- }
558
-
559
- if (n_past > 0) {
560
- if (is_interacting) {
561
- common_sampler_reset(smpl);
562
- }
563
- is_interacting = false;
564
- }
565
- }
566
-
567
- // end of generation
568
- if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !params.interactive) {
569
- break;
570
- }
571
-
572
- // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
573
- // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
574
- if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
575
- n_remain = params.n_predict;
576
- is_interacting = true;
577
- }
578
- }
579
- if (!params.interactive && n_remain <= 0) {
580
- LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
581
- }
582
-
583
- LOG("\n");
584
- common_perf_print(ctx, smpl);
585
-
586
- common_sampler_free(smpl);
587
- llama_backend_free();
588
-
589
- return 0;
590
- }
@@ -1,8 +0,0 @@
1
- #!/bin/bash
2
- cmake ../../../../ \
3
- -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
4
- -DCMAKE_BUILD_TYPE=Release \
5
- -DANDROID_ABI="arm64-v8a" \
6
- -DANDROID_PLATFORM=android-23 $1
7
-
8
- make -j4
@@ -1,59 +0,0 @@
1
- #include "arg.h"
2
- #include "base64.hpp"
3
- #include "log.h"
4
- #include "common.h"
5
- #include "sampling.h"
6
- #include "clip.h"
7
- #include "llava.h"
8
- #include "llama.h"
9
- #include "ggml.h"
10
-
11
- static void print_usage(int argc, char ** argv) {
12
- (void) argc;
13
-
14
- fprintf(stderr, "usage: %s /path/to/ggml-model-f32.gguf /path/to/ggml-model-quantized.gguf type\n", argv[0]);
15
- fprintf(stderr, " type = 2 - q4_0\n");
16
- fprintf(stderr, " type = 3 - q4_1\n");
17
- fprintf(stderr, " type = 6 - q5_0\n");
18
- fprintf(stderr, " type = 7 - q5_1\n");
19
- fprintf(stderr, " type = 8 - q8_0\n");
20
- }
21
-
22
- int main(int argc, char ** argv) {
23
- if (argc != 4) {
24
- print_usage(argc, argv);
25
- return 1;
26
- }
27
-
28
- const std::string fname_inp = argv[1];
29
- const std::string fname_out = argv[2];
30
-
31
- const int itype = atoi(argv[3]);
32
-
33
- const int64_t t_main_start_us = ggml_time_us();
34
-
35
- int64_t t_quantize_us = 0;
36
-
37
- // load the model
38
- {
39
- const int64_t t_start_us = ggml_time_us();
40
-
41
- if (!clip_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
42
- fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
43
- return 1;
44
- }
45
-
46
- t_quantize_us = ggml_time_us() - t_start_us;
47
- }
48
-
49
- // report timing
50
- {
51
- const int64_t t_main_end_us = ggml_time_us();
52
-
53
- printf("\n");
54
- printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us / 1000.0f);
55
- printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
56
- }
57
-
58
- return 0;
59
- }