@fugood/llama.node 0.3.6 → 0.3.8

This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
Files changed (186)
  1. package/README.md +17 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +3 -1
  19. package/lib/index.js +16 -1
  20. package/lib/index.ts +16 -0
  21. package/package.json +1 -1
  22. package/src/EmbeddingWorker.cpp +4 -3
  23. package/src/LlamaCompletionWorker.cpp +4 -2
  24. package/src/LlamaContext.cpp +61 -6
  25. package/src/LlamaContext.h +1 -0
  26. package/src/common.hpp +6 -11
  27. package/src/llama.cpp/.github/workflows/build.yml +19 -17
  28. package/src/llama.cpp/.github/workflows/docker.yml +77 -30
  29. package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +22 -3
  31. package/src/llama.cpp/CMakeLists.txt +49 -24
  32. package/src/llama.cpp/common/arg.cpp +82 -26
  33. package/src/llama.cpp/common/arg.h +3 -0
  34. package/src/llama.cpp/common/common.cpp +192 -72
  35. package/src/llama.cpp/common/common.h +51 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +12 -12
  37. package/src/llama.cpp/common/ngram-cache.h +2 -2
  38. package/src/llama.cpp/common/sampling.cpp +11 -6
  39. package/src/llama.cpp/common/speculative.cpp +18 -15
  40. package/src/llama.cpp/docs/build.md +2 -0
  41. package/src/llama.cpp/examples/batched/batched.cpp +9 -7
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
  43. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
  44. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
  45. package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
  46. package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
  47. package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
  48. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
  50. package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
  51. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
  52. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
  53. package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
  54. package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
  55. package/src/llama.cpp/examples/infill/infill.cpp +23 -24
  56. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
  57. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
  58. package/src/llama.cpp/examples/llava/clip.cpp +4 -2
  59. package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
  60. package/src/llama.cpp/examples/llava/llava.cpp +2 -2
  61. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
  62. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
  63. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
  64. package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
  65. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
  66. package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
  67. package/src/llama.cpp/examples/main/main.cpp +51 -29
  68. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
  69. package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
  70. package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
  71. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
  72. package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
  73. package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
  74. package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
  76. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
  77. package/src/llama.cpp/examples/run/run.cpp +175 -61
  78. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
  79. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
  80. package/src/llama.cpp/examples/server/httplib.h +1295 -409
  81. package/src/llama.cpp/examples/server/server.cpp +387 -181
  82. package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
  83. package/src/llama.cpp/examples/server/utils.hpp +170 -58
  84. package/src/llama.cpp/examples/simple/simple.cpp +9 -8
  85. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
  86. package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
  87. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
  88. package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
  89. package/src/llama.cpp/examples/tts/tts.cpp +64 -23
  90. package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
  91. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  92. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
  93. package/src/llama.cpp/ggml/include/ggml.h +36 -145
  94. package/src/llama.cpp/ggml/include/gguf.h +202 -0
  95. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  96. package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
  97. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
  98. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
  99. package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
  100. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
  101. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
  102. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
  103. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
  105. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
  106. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
  107. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  109. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
  111. package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
  112. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
  113. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
  115. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
  117. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
  120. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
  121. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
  124. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
  125. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
  126. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
  128. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
  129. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
  130. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
  131. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
  132. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
  133. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
  134. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
  135. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
  138. package/src/llama.cpp/ggml/src/ggml.c +117 -1327
  139. package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
  140. package/src/llama.cpp/include/llama-cpp.h +6 -1
  141. package/src/llama.cpp/include/llama.h +138 -75
  142. package/src/llama.cpp/src/CMakeLists.txt +13 -1
  143. package/src/llama.cpp/src/llama-adapter.cpp +347 -0
  144. package/src/llama.cpp/src/llama-adapter.h +74 -0
  145. package/src/llama.cpp/src/llama-arch.cpp +1487 -0
  146. package/src/llama.cpp/src/llama-arch.h +400 -0
  147. package/src/llama.cpp/src/llama-batch.cpp +368 -0
  148. package/src/llama.cpp/src/llama-batch.h +88 -0
  149. package/src/llama.cpp/src/llama-chat.cpp +578 -0
  150. package/src/llama.cpp/src/llama-chat.h +52 -0
  151. package/src/llama.cpp/src/llama-context.cpp +1775 -0
  152. package/src/llama.cpp/src/llama-context.h +128 -0
  153. package/src/llama.cpp/src/llama-cparams.cpp +1 -0
  154. package/src/llama.cpp/src/llama-cparams.h +37 -0
  155. package/src/llama.cpp/src/llama-grammar.cpp +5 -4
  156. package/src/llama.cpp/src/llama-grammar.h +3 -1
  157. package/src/llama.cpp/src/llama-hparams.cpp +71 -0
  158. package/src/llama.cpp/src/llama-hparams.h +139 -0
  159. package/src/llama.cpp/src/llama-impl.cpp +167 -0
  160. package/src/llama.cpp/src/llama-impl.h +16 -136
  161. package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
  162. package/src/llama.cpp/src/llama-kv-cache.h +218 -0
  163. package/src/llama.cpp/src/llama-mmap.cpp +589 -0
  164. package/src/llama.cpp/src/llama-mmap.h +67 -0
  165. package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
  166. package/src/llama.cpp/src/llama-model-loader.h +167 -0
  167. package/src/llama.cpp/src/llama-model.cpp +3953 -0
  168. package/src/llama.cpp/src/llama-model.h +370 -0
  169. package/src/llama.cpp/src/llama-quant.cpp +934 -0
  170. package/src/llama.cpp/src/llama-quant.h +1 -0
  171. package/src/llama.cpp/src/llama-sampling.cpp +147 -32
  172. package/src/llama.cpp/src/llama-sampling.h +3 -19
  173. package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
  174. package/src/llama.cpp/src/llama-vocab.h +97 -142
  175. package/src/llama.cpp/src/llama.cpp +7160 -20314
  176. package/src/llama.cpp/src/unicode.cpp +8 -3
  177. package/src/llama.cpp/tests/CMakeLists.txt +2 -0
  178. package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
  179. package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
  180. package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
  181. package/src/llama.cpp/tests/test-gguf.cpp +222 -187
  182. package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
  183. package/src/llama.cpp/tests/test-sampling.cpp +0 -1
  184. package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
  185. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
  186. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h
@@ -0,0 +1,114 @@
+ /* linenoise.h -- VERSION 1.0
+ *
+ * Guerrilla line editing library against the idea that a line editing lib
+ * needs to be 20,000 lines of C++ code.
+ *
+ * See linenoise.cpp for more information.
+ *
+ * ------------------------------------------------------------------------
+ *
+ * Copyright (c) 2010-2023, Salvatore Sanfilippo <antirez at gmail dot com>
+ * Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
+ * Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+ #ifndef __LINENOISE_H
+ #define __LINENOISE_H
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ #include <stddef.h> /* For size_t. */
+
+ extern const char *linenoiseEditMore;
+
+ /* The linenoiseState structure represents the state during line editing.
+ * We pass this state to functions implementing specific editing
+ * functionalities. */
+ struct linenoiseState {
+ int in_completion; /* The user pressed TAB and we are now in completion
+ * mode, so input is handled by completeLine(). */
+ size_t completion_idx; /* Index of next completion to propose. */
+ int ifd; /* Terminal stdin file descriptor. */
+ int ofd; /* Terminal stdout file descriptor. */
+ char *buf; /* Edited line buffer. */
+ size_t buflen; /* Edited line buffer size. */
+ const char *prompt; /* Prompt to display. */
+ size_t plen; /* Prompt length. */
+ size_t pos; /* Current cursor position. */
+ size_t oldpos; /* Previous refresh cursor position. */
+ size_t len; /* Current edited line length. */
+ size_t cols; /* Number of columns in terminal. */
+ size_t oldrows; /* Rows used by last refrehsed line (multiline mode) */
+ int history_index; /* The history index we are currently editing. */
+ };
+
+ typedef struct linenoiseCompletions {
+ size_t len;
+ char **cvec;
+ } linenoiseCompletions;
+
+ /* Non blocking API. */
+ int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt);
+ const char *linenoiseEditFeed(struct linenoiseState *l);
+ void linenoiseEditStop(struct linenoiseState *l);
+ void linenoiseHide(struct linenoiseState *l);
+ void linenoiseShow(struct linenoiseState *l);
+
+ /* Blocking API. */
+ const char *linenoise(const char *prompt);
+ void linenoiseFree(void *ptr);
+
+ /* Completion API. */
+ typedef void(linenoiseCompletionCallback)(const char *, linenoiseCompletions *);
+ typedef const char*(linenoiseHintsCallback)(const char *, int *color, int *bold);
+ typedef void(linenoiseFreeHintsCallback)(const char *);
+ void linenoiseSetCompletionCallback(linenoiseCompletionCallback *);
+ void linenoiseSetHintsCallback(linenoiseHintsCallback *);
+ void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *);
+ void linenoiseAddCompletion(linenoiseCompletions *, const char *);
+
+ /* History API. */
+ int linenoiseHistoryAdd(const char *line);
+ int linenoiseHistorySetMaxLen(int len);
+ int linenoiseHistorySave(const char *filename);
+ int linenoiseHistoryLoad(const char *filename);
+
+ /* Other utilities. */
+ void linenoiseClearScreen(void);
+ void linenoiseSetMultiLine(int ml);
+ void linenoisePrintKeyCodes(void);
+ void linenoiseMaskModeEnable(void);
+ void linenoiseMaskModeDisable(void);
+
+ #ifdef __cplusplus
+ }
+ #endif
+
+ #endif /* __LINENOISE_H */
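
For orientation, the blocking calls declared above (linenoise, linenoiseHistoryAdd, linenoiseFree) are the ones the updated llama-run uses for its interactive prompt further down in this diff. A minimal, hypothetical usage sketch, not part of the package, assuming the header is compiled together with the bundled linenoise.cpp:

// Hypothetical sketch of the blocking linenoise API declared above.
#include <cstdio>
#include "linenoise.cpp/linenoise.h"

int main() {
    linenoiseHistoryLoad("history.txt");            // illustrative file name; best effort
    while (const char * line = linenoise("> ")) {   // blocks until Enter; returns NULL on EOF/Ctrl-D
        printf("you typed: %s\n", line);
        linenoiseHistoryAdd(line);                  // enable up-arrow recall of this line
        linenoiseFree((void *) line);               // caller owns the returned buffer
    }
    linenoiseHistorySave("history.txt");
    return 0;
}
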
package/src/llama.cpp/examples/run/run.cpp
@@ -1,5 +1,6 @@
  #if defined(_WIN32)
  # include <windows.h>
+ # include <io.h>
  #else
  # include <sys/file.h>
  # include <sys/ioctl.h>
@@ -10,20 +11,31 @@
  # include <curl/curl.h>
  #endif

+ #include <signal.h>
+
  #include <climits>
  #include <cstdarg>
  #include <cstdio>
  #include <cstring>
  #include <filesystem>
  #include <iostream>
+ #include <list>
  #include <sstream>
  #include <string>
  #include <vector>

  #include "common.h"
  #include "json.hpp"
+ #include "linenoise.cpp/linenoise.h"
  #include "llama-cpp.h"

+ #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32)
+ [[noreturn]] static void sigint_handler(int) {
+ printf("\n\033[0m");
+ exit(0); // not ideal, but it's the only way to guarantee exit in all cases
+ }
+ #endif
+
  GGML_ATTRIBUTE_FORMAT(1, 2)
  static std::string fmt(const char * fmt, ...) {
  va_list ap;
@@ -55,29 +67,52 @@ static int printe(const char * fmt, ...) {
  class Opt {
  public:
  int init(int argc, const char ** argv) {
+ ctx_params = llama_context_default_params();
+ model_params = llama_model_default_params();
+ context_size_default = ctx_params.n_batch;
+ ngl_default = model_params.n_gpu_layers;
+ common_params_sampling sampling;
+ temperature_default = sampling.temp;
+
+ if (argc < 2) {
+ printe("Error: No arguments provided.\n");
+ print_help();
+ return 1;
+ }
+
  // Parse arguments
  if (parse(argc, argv)) {
  printe("Error: Failed to parse arguments.\n");
- help();
+ print_help();
  return 1;
  }

  // If help is requested, show help and exit
- if (help_) {
- help();
+ if (help) {
+ print_help();
  return 2;
  }

+ ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default;
+ ctx_params.n_ctx = ctx_params.n_batch;
+ model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
+ temperature = temperature >= 0 ? temperature : temperature_default;
+
  return 0; // Success
  }

+ llama_context_params ctx_params;
+ llama_model_params model_params;
  std::string model_;
- std::string user_;
- int context_size_ = -1, ngl_ = -1;
- bool verbose_ = false;
+ std::string user;
+ int context_size = -1, ngl = -1;
+ float temperature = -1;
+ bool verbose = false;

  private:
- bool help_ = false;
+ int context_size_default = -1, ngl_default = -1;
+ float temperature_default = -1;
+ bool help = false;

  bool parse_flag(const char ** argv, int i, const char * short_opt, const char * long_opt) {
  return strcmp(argv[i], short_opt) == 0 || strcmp(argv[i], long_opt) == 0;
@@ -89,6 +124,17 @@ class Opt {
  }

  option_value = std::atoi(argv[++i]);
+
+ return 0;
+ }
+
+ int handle_option_with_value(int argc, const char ** argv, int & i, float & option_value) {
+ if (i + 1 >= argc) {
+ return 1;
+ }
+
+ option_value = std::atof(argv[++i]);
+
  return 0;
  }

@@ -96,18 +142,22 @@ class Opt {
  bool options_parsing = true;
  for (int i = 1, positional_args_i = 0; i < argc; ++i) {
  if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
- if (handle_option_with_value(argc, argv, i, context_size_) == 1) {
+ if (handle_option_with_value(argc, argv, i, context_size) == 1) {
  return 1;
  }
  } else if (options_parsing && (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0)) {
- if (handle_option_with_value(argc, argv, i, ngl_) == 1) {
+ if (handle_option_with_value(argc, argv, i, ngl) == 1) {
+ return 1;
+ }
+ } else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
+ if (handle_option_with_value(argc, argv, i, temperature) == 1) {
  return 1;
  }
  } else if (options_parsing &&
  (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
- verbose_ = true;
+ verbose = true;
  } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
- help_ = true;
+ help = true;
  return 0;
  } else if (options_parsing && strcmp(argv[i], "--") == 0) {
  options_parsing = false;
@@ -120,16 +170,16 @@ class Opt {
  model_ = argv[i];
  } else if (positional_args_i == 1) {
  ++positional_args_i;
- user_ = argv[i];
+ user = argv[i];
  } else {
- user_ += " " + std::string(argv[i]);
+ user += " " + std::string(argv[i]);
  }
  }

  return 0;
  }

- void help() const {
+ void print_help() const {
  printf(
  "Description:\n"
  " Runs a llm\n"
@@ -142,6 +192,8 @@ class Opt {
  " Context size (default: %d)\n"
  " -n, --ngl <value>\n"
  " Number of GPU layers (default: %d)\n"
+ " --temp <value>\n"
+ " Temperature (default: %.1f)\n"
  " -v, --verbose, --log-verbose\n"
  " Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
  " -h, --help\n"
@@ -170,7 +222,7 @@ class Opt {
  " llama-run file://some-file3.gguf\n"
  " llama-run --ngl 999 some-file4.gguf\n"
  " llama-run --ngl 999 some-file5.gguf Hello World\n",
- llama_context_default_params().n_batch, llama_model_default_params().n_gpu_layers);
+ context_size_default, ngl_default, temperature_default);
  }
  };

@@ -214,7 +266,7 @@ class File {
  return 1;
  }

- OVERLAPPED overlapped = { 0 };
+ OVERLAPPED overlapped = {};
  if (!LockFileEx(hFile, LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 0, MAXDWORD, MAXDWORD,
  &overlapped)) {
  fd = -1;
@@ -238,7 +290,7 @@ class File {
  if (fd >= 0) {
  # ifdef _WIN32
  if (hFile != INVALID_HANDLE_VALUE) {
- OVERLAPPED overlapped = { 0 };
+ OVERLAPPED overlapped = {};
  UnlockFileEx(hFile, 0, MAXDWORD, MAXDWORD, &overlapped);
  }
  # else
@@ -254,7 +306,7 @@ class File {
  private:
  int fd = -1;
  # ifdef _WIN32
- HANDLE hFile;
+ HANDLE hFile = nullptr;
  # endif
  };

@@ -425,7 +477,7 @@ class HttpClient {
  return (now_downloaded_plus_file_size * 100) / total_to_download;
  }

- static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", percentage); }
+ static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", static_cast<long int>(percentage)); }

  static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) {
  const auto now = std::chrono::steady_clock::now();
@@ -486,7 +538,7 @@ class LlamaData {
  llama_sampler_ptr sampler;
  llama_context_ptr context;
  std::vector<llama_chat_message> messages;
- std::vector<std::string> msg_strs;
+ std::list<std::string> msg_strs;
  std::vector<char> fmtted;

  int init(Opt & opt) {
@@ -495,12 +547,12 @@ class LlamaData {
  return 1;
  }

- context = initialize_context(model, opt.context_size_);
+ context = initialize_context(model, opt);
  if (!context) {
  return 1;
  }

- sampler = initialize_sampler();
+ sampler = initialize_sampler(opt);
  return 0;
  }

@@ -619,14 +671,12 @@ class LlamaData {
  // Initializes the model and returns a unique pointer to it
  llama_model_ptr initialize_model(Opt & opt) {
  ggml_backend_load_all();
- llama_model_params model_params = llama_model_default_params();
- model_params.n_gpu_layers = opt.ngl_ >= 0 ? opt.ngl_ : model_params.n_gpu_layers;
  resolve_model(opt.model_);
  printe(
  "\r%*s"
  "\rLoading model",
  get_terminal_width(), " ");
- llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), model_params));
+ llama_model_ptr model(llama_model_load_from_file(opt.model_.c_str(), opt.model_params));
  if (!model) {
  printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
  }
@@ -636,10 +686,8 @@ class LlamaData {
  }

  // Initializes the context with the specified parameters
- llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
- llama_context_params ctx_params = llama_context_default_params();
- ctx_params.n_ctx = ctx_params.n_batch = n_ctx >= 0 ? n_ctx : ctx_params.n_batch;
- llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
+ llama_context_ptr initialize_context(const llama_model_ptr & model, const Opt & opt) {
+ llama_context_ptr context(llama_init_from_model(model.get(), opt.ctx_params));
  if (!context) {
  printe("%s: error: failed to create the llama_context\n", __func__);
  }
@@ -648,10 +696,10 @@ class LlamaData {
  }

  // Initializes and configures the sampler
- llama_sampler_ptr initialize_sampler() {
+ llama_sampler_ptr initialize_sampler(const Opt & opt) {
  llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
  llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
- llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f));
+ llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(opt.temperature));
  llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

  return sampler;
@@ -667,11 +715,11 @@ static void add_message(const char * role, const std::string & text, LlamaData &
  // Function to apply the chat template and resize `formatted` if needed
  static int apply_chat_template(LlamaData & llama_data, const bool append) {
  int result = llama_chat_apply_template(
- llama_data.model.get(), nullptr, llama_data.messages.data(), llama_data.messages.size(), append,
+ llama_model_chat_template(llama_data.model.get()), llama_data.messages.data(), llama_data.messages.size(), append,
  append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0);
  if (append && result > static_cast<int>(llama_data.fmtted.size())) {
  llama_data.fmtted.resize(result);
- result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
+ result = llama_chat_apply_template(llama_model_chat_template(llama_data.model.get()), llama_data.messages.data(),
  llama_data.messages.size(), append, llama_data.fmtted.data(),
  llama_data.fmtted.size());
  }
@@ -680,11 +728,11 @@ static int apply_chat_template(LlamaData & llama_data, const bool append) {
  }

  // Function to tokenize the prompt
- static int tokenize_prompt(const llama_model_ptr & model, const std::string & prompt,
+ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
  std::vector<llama_token> & prompt_tokens) {
- const int n_prompt_tokens = -llama_tokenize(model.get(), prompt.c_str(), prompt.size(), NULL, 0, true, true);
+ const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
  prompt_tokens.resize(n_prompt_tokens);
- if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
+ if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
  true) < 0) {
  printe("failed to tokenize the prompt\n");
  return -1;
@@ -707,9 +755,9 @@ static int check_context_size(const llama_context_ptr & ctx, const llama_batch &
  }

  // convert the token to a string
- static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) {
+ static int convert_token_to_string(const llama_vocab * vocab, const llama_token token_id, std::string & piece) {
  char buf[256];
- int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
+ int n = llama_token_to_piece(vocab, token_id, buf, sizeof(buf), 0, true);
  if (n < 0) {
  printe("failed to convert token to piece\n");
  return 1;
@@ -727,8 +775,10 @@ static void print_word_and_concatenate_to_response(const std::string & piece, st

  // helper function to evaluate a prompt and generate a response
  static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
+ const llama_vocab * vocab = llama_model_get_vocab(llama_data.model.get());
+
  std::vector<llama_token> tokens;
- if (tokenize_prompt(llama_data.model, prompt, tokens) < 0) {
+ if (tokenize_prompt(vocab, prompt, tokens) < 0) {
  return 1;
  }

@@ -744,12 +794,12 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str

  // sample the next token, check is it an end of generation?
  new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1);
- if (llama_token_is_eog(llama_data.model.get(), new_token_id)) {
+ if (llama_vocab_is_eog(vocab, new_token_id)) {
  break;
  }

  std::string piece;
- if (convert_token_to_string(llama_data.model, new_token_id, piece)) {
+ if (convert_token_to_string(vocab, new_token_id, piece)) {
  return 1;
  }

@@ -759,12 +809,45 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
  batch = llama_batch_get_one(&new_token_id, 1);
  }

+ printf("\033[0m");
  return 0;
  }

- static int read_user_input(std::string & user) {
- std::getline(std::cin, user);
- return user.empty(); // Should have data in happy path
+ static int read_user_input(std::string & user_input) {
+ static const char * prompt_prefix = "> ";
+ #ifdef WIN32
+ printf(
+ "\r%*s"
+ "\r\033[0m%s",
+ get_terminal_width(), " ", prompt_prefix);
+
+ std::getline(std::cin, user_input);
+ if (std::cin.eof()) {
+ printf("\n");
+ return 1;
+ }
+ #else
+ std::unique_ptr<char, decltype(&std::free)> line(const_cast<char *>(linenoise(prompt_prefix)), free);
+ if (!line) {
+ return 1;
+ }
+
+ user_input = line.get();
+ #endif
+
+ if (user_input == "/bye") {
+ return 1;
+ }
+
+ if (user_input.empty()) {
+ return 2;
+ }
+
+ #ifndef WIN32
+ linenoiseHistoryAdd(line.get());
+ #endif
+
+ return 0; // Should have data in happy path
  }

  // Function to generate a response based on the prompt
@@ -798,16 +881,12 @@ static int apply_chat_template_with_error_handling(LlamaData & llama_data, const
  }

  // Helper function to handle user input
- static int handle_user_input(std::string & user_input, const std::string & user_) {
- if (!user_.empty()) {
- user_input = user_;
+ static int handle_user_input(std::string & user_input, const std::string & user) {
+ if (!user.empty()) {
+ user_input = user;
  return 0; // No need for interactive input
  }

- printf(
- "\r%*s"
- "\r\033[32m> \033[0m",
- get_terminal_width(), " ");
  return read_user_input(user_input); // Returns true if input ends the loop
  }

@@ -831,18 +910,37 @@ static bool is_stdout_a_terminal() {
  #endif
  }

- // Function to tokenize the prompt
- static int chat_loop(LlamaData & llama_data, const std::string & user_) {
+ // Function to handle user input
+ static int get_user_input(std::string & user_input, const std::string & user) {
+ while (true) {
+ const int ret = handle_user_input(user_input, user);
+ if (ret == 1) {
+ return 1;
+ }
+
+ if (ret == 2) {
+ continue;
+ }
+
+ break;
+ }
+
+ return 0;
+ }
+
+ // Main chat loop function
+ static int chat_loop(LlamaData & llama_data, const std::string & user) {
  int prev_len = 0;
  llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
  static const bool stdout_a_terminal = is_stdout_a_terminal();
  while (true) {
  // Get user input
  std::string user_input;
- while (handle_user_input(user_input, user_)) {
+ if (get_user_input(user_input, user) == 1) {
+ return 0;
  }

- add_message("user", user_.empty() ? user_input : user_, llama_data);
+ add_message("user", user.empty() ? user_input : user, llama_data);
  int new_len;
  if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) {
  return 1;
@@ -854,7 +952,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user_) {
  return 1;
  }

- if (!user_.empty()) {
+ if (!user.empty()) {
  break;
  }

@@ -869,7 +967,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user_) {

  static void log_callback(const enum ggml_log_level level, const char * text, void * p) {
  const Opt * opt = static_cast<Opt *>(p);
- if (opt->verbose_ || level == GGML_LOG_LEVEL_ERROR) {
+ if (opt->verbose || level == GGML_LOG_LEVEL_ERROR) {
  printe("%s", text);
  }
  }
@@ -880,7 +978,23 @@ static std::string read_pipe_data() {
  return result.str();
  }

+ static void ctrl_c_handling() {
+ #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))
+ struct sigaction sigint_action;
+ sigint_action.sa_handler = sigint_handler;
+ sigemptyset(&sigint_action.sa_mask);
+ sigint_action.sa_flags = 0;
+ sigaction(SIGINT, &sigint_action, NULL);
+ #elif defined(_WIN32)
+ auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+ return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
+ };
+ SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+ #endif
+ }
+
  int main(int argc, const char ** argv) {
+ ctrl_c_handling();
  Opt opt;
  const int ret = opt.init(argc, argv);
  if (ret == 2) {
@@ -890,11 +1004,11 @@ int main(int argc, const char ** argv) {
  }

  if (!is_stdin_a_terminal()) {
- if (!opt.user_.empty()) {
- opt.user_ += "\n\n";
+ if (!opt.user.empty()) {
+ opt.user += "\n\n";
  }

- opt.user_ += read_pipe_data();
+ opt.user += read_pipe_data();
  }

  llama_log_set(log_callback, &opt);
@@ -903,7 +1017,7 @@ int main(int argc, const char ** argv) {
  return 1;
  }

- if (chat_loop(llama_data, opt.user_)) {
+ if (chat_loop(llama_data, opt.user)) {
  return 1;
  }
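
For context, the initialize_sampler change in the run.cpp diff above is what wires the new --temp flag into generation: the temperature now comes from Opt instead of the previously hard-coded 0.8f. A condensed, hypothetical sketch of that sampler chain, built only from the llama.cpp calls that appear in the diff (the helper name make_sampler is illustrative):

// Hypothetical condensation of the sampler setup shown in the run.cpp diff.
#include "llama-cpp.h"  // llama_sampler_ptr plus the llama.h C API

static llama_sampler_ptr make_sampler(float temperature) {
    // Chain: min-p filter -> temperature scaling -> random draw from the remaining distribution.
    llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
    llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
    llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(temperature)); // value parsed from --temp
    llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
    return sampler;
}
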