@fugood/llama.node 0.3.13 → 0.3.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +1 -1
- package/src/LlamaContext.cpp +98 -76
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +89 -10
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/CMakeLists.txt +9 -1
- package/src/llama.cpp/cmake/common.cmake +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +3 -3
- package/src/llama.cpp/common/arg.cpp +132 -13
- package/src/llama.cpp/common/chat.cpp +960 -266
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +33 -174
- package/src/llama.cpp/common/common.h +27 -67
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +45 -7
- package/src/llama.cpp/common/speculative.cpp +10 -9
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +45 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
- package/src/llama.cpp/examples/infill/infill.cpp +2 -2
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +79 -34
- package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
- package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +196 -108
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
- package/src/llama.cpp/examples/server/server.cpp +113 -101
- package/src/llama.cpp/examples/server/utils.hpp +94 -105
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +263 -151
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
- package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +29 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
- package/src/llama.cpp/ggml/src/ggml.c +93 -5
- package/src/llama.cpp/include/llama.h +105 -27
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/CMakeLists.txt +5 -2
- package/src/llama.cpp/src/llama-adapter.cpp +19 -20
- package/src/llama.cpp/src/llama-adapter.h +11 -9
- package/src/llama.cpp/src/llama-arch.cpp +123 -16
- package/src/llama.cpp/src/llama-arch.h +19 -0
- package/src/llama.cpp/src/llama-batch.h +2 -2
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2253 -1222
- package/src/llama.cpp/src/llama-context.h +214 -77
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +182 -182
- package/src/llama.cpp/src/llama-grammar.h +12 -3
- package/src/llama.cpp/src/llama-graph.cpp +1662 -0
- package/src/llama.cpp/src/llama-graph.h +574 -0
- package/src/llama.cpp/src/llama-hparams.cpp +8 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -0
- package/src/llama.cpp/src/llama-io.cpp +15 -0
- package/src/llama.cpp/src/llama-io.h +35 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
- package/src/llama.cpp/src/llama-kv-cache.h +178 -109
- package/src/llama.cpp/src/llama-memory.cpp +1 -0
- package/src/llama.cpp/src/llama-memory.h +21 -0
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-model.cpp +8230 -122
- package/src/llama.cpp/src/llama-model.h +34 -1
- package/src/llama.cpp/src/llama-quant.cpp +10 -1
- package/src/llama.cpp/src/llama-sampling.cpp +43 -10
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +51 -9837
- package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +593 -395
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -55
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
- /package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h

@@ -47,27 +47,27 @@ extern "C" {
 #include <stddef.h> /* For size_t. */
 #include <stdlib.h>
 
-extern const char *linenoiseEditMore;
+extern const char * linenoiseEditMore;
 
 /* The linenoiseState structure represents the state during line editing.
  * We pass this state to functions implementing specific editing
  * functionalities. */
 struct linenoiseState {
-    int
+    int in_completion;     /* The user pressed TAB and we are now in completion
                             * mode, so input is handled by completeLine(). */
-    size_t
-    int
-    int
-    char *buf;
-    size_t
-    const char *prompt;
-    size_t
-    size_t
-    size_t
-    size_t
-    size_t
-    size_t
-    int
+    size_t completion_idx; /* Index of next completion to propose. */
+    int ifd;               /* Terminal stdin file descriptor. */
+    int ofd;               /* Terminal stdout file descriptor. */
+    char * buf;            /* Edited line buffer. */
+    size_t buflen;         /* Edited line buffer size. */
+    const char * prompt;   /* Prompt to display. */
+    size_t plen;           /* Prompt length. */
+    size_t pos;            /* Current cursor position. */
+    size_t oldcolpos;      /* Previous refresh cursor column position. */
+    size_t len;            /* Current edited line length. */
+    size_t cols;           /* Number of columns in terminal. */
+    size_t oldrows;        /* Rows used by last refreshed line (multiline mode) */
+    int history_index;     /* The history index we are currently editing. */
 };
 
 struct linenoiseCompletions {
@@ -89,19 +89,20 @@ struct linenoiseCompletions {
 };
 
 /* Non blocking API. */
-int
-const char *
-
-void
-void
+int linenoiseEditStart(struct linenoiseState * l, int stdin_fd, int stdout_fd, char * buf, size_t buflen,
+                       const char * prompt);
+const char * linenoiseEditFeed(struct linenoiseState * l);
+void linenoiseEditStop(struct linenoiseState * l);
+void linenoiseHide(struct linenoiseState * l);
+void linenoiseShow(struct linenoiseState * l);
 
 /* Blocking API. */
-const char *linenoise(const char *prompt);
-void
+const char * linenoise(const char * prompt);
+void linenoiseFree(void * ptr);
 
 /* Completion API. */
 typedef void(linenoiseCompletionCallback)(const char *, linenoiseCompletions *);
-typedef const char*(linenoiseHintsCallback)(const char *, int *color, int *bold);
+typedef const char *(linenoiseHintsCallback) (const char *, int * color, int * bold);
 typedef void(linenoiseFreeHintsCallback)(const char *);
 void linenoiseSetCompletionCallback(linenoiseCompletionCallback *);
 void linenoiseSetHintsCallback(linenoiseHintsCallback *);
@@ -109,10 +110,10 @@ void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *);
 void linenoiseAddCompletion(linenoiseCompletions *, const char *);
 
 /* History API. */
-int linenoiseHistoryAdd(const char *line);
+int linenoiseHistoryAdd(const char * line);
 int linenoiseHistorySetMaxLen(int len);
-int linenoiseHistorySave(const char *filename);
-int linenoiseHistoryLoad(const char *filename);
+int linenoiseHistorySave(const char * filename);
+int linenoiseHistoryLoad(const char * filename);
 
 /* Other utilities. */
 void linenoiseClearScreen(void);
@@ -121,6 +122,14 @@ void linenoisePrintKeyCodes(void);
 void linenoiseMaskModeEnable(void);
 void linenoiseMaskModeDisable(void);
 
+/* Encoding functions. */
+typedef size_t(linenoisePrevCharLen)(const char * buf, size_t buf_len, size_t pos, size_t * col_len);
+typedef size_t(linenoiseNextCharLen)(const char * buf, size_t buf_len, size_t pos, size_t * col_len);
+typedef size_t(linenoiseReadCode)(int fd, char * buf, size_t buf_len, int * c);
+
+void linenoiseSetEncodingFunctions(linenoisePrevCharLen * prevCharLenFunc, linenoiseNextCharLen * nextCharLenFunc,
+                                   linenoiseReadCode * readCodeFunc);
+
 #ifdef __cplusplus
 }
 #endif
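The hunks above expose linenoise's non-blocking ("multiplexing") editing API and pluggable encoding hooks to llama-run. The sketch below is not part of the package; it uses only the declarations shown above and assumes the usual linenoise semantics, in which linenoiseEditFeed() returns the linenoiseEditMore sentinel while the line is still being edited, NULL on EOF, and the finished (heap-allocated) line otherwise.

```cpp
// Minimal sketch, assuming standard linenoise behaviour (not from this diff).
#include <cstdio>

#include "linenoise.cpp/linenoise.h"

int main() {
    struct linenoiseState ls;
    char buf[1024];

    // Start an edit session on stdin (fd 0) / stdout (fd 1) with a "> " prompt.
    linenoiseEditStart(&ls, 0, 1, buf, sizeof(buf), "> ");

    // Pump the state machine until a full line is available; a real caller
    // would usually poll()/select() on ls.ifd between calls so other work
    // can be interleaved instead of looping blindly.
    const char * line;
    while ((line = linenoiseEditFeed(&ls)) == linenoiseEditMore) {
        // timers, sockets, background work ...
    }
    linenoiseEditStop(&ls);

    if (line != nullptr) {
        std::printf("you typed: %s\n", line);
        linenoiseHistoryAdd(line);
        linenoiseFree((void *) line); // assumed heap-allocated, as in upstream linenoise
    }
    return 0;
}
```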
package/src/llama.cpp/examples/run/run.cpp

@@ -24,7 +24,7 @@
 #include <string>
 #include <vector>
 
-#include "chat
+#include "chat.h"
 #include "common.h"
 #include "json.hpp"
 #include "linenoise.cpp/linenoise.h"
@@ -79,6 +79,7 @@ class Opt {
         ctx_params = llama_context_default_params();
         model_params = llama_model_default_params();
         context_size_default = ctx_params.n_batch;
+        n_threads_default = ctx_params.n_threads;
         ngl_default = model_params.n_gpu_layers;
         common_params_sampling sampling;
         temperature_default = sampling.temp;
@@ -104,6 +105,7 @@ class Opt {
 
         ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default;
         ctx_params.n_ctx = ctx_params.n_batch;
+        ctx_params.n_threads = ctx_params.n_threads_batch = n_threads >= 0 ? n_threads : n_threads_default;
         model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
         temperature = temperature >= 0 ? temperature : temperature_default;
 
@@ -113,14 +115,15 @@ class Opt {
     llama_context_params ctx_params;
     llama_model_params model_params;
     std::string model_;
+    std::string chat_template_file;
     std::string user;
     bool use_jinja = false;
-    int context_size = -1, ngl = -1;
+    int context_size = -1, ngl = -1, n_threads = -1;
     float temperature = -1;
     bool verbose = false;
 
   private:
-    int context_size_default = -1, ngl_default = -1;
+    int context_size_default = -1, ngl_default = -1, n_threads_default = -1;
     float temperature_default = -1;
     bool help = false;
 
@@ -148,48 +151,104 @@ class Opt {
         return 0;
     }
 
+    int handle_option_with_value(int argc, const char ** argv, int & i, std::string & option_value) {
+        if (i + 1 >= argc) {
+            return 1;
+        }
+
+        option_value = argv[++i];
+
+        return 0;
+    }
+
+    int parse_options_with_value(int argc, const char ** argv, int & i, bool & options_parsing) {
+        if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
+            if (handle_option_with_value(argc, argv, i, context_size) == 1) {
+                return 1;
+            }
+        } else if (options_parsing &&
+                   (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
+            if (handle_option_with_value(argc, argv, i, ngl) == 1) {
+                return 1;
+            }
+        } else if (options_parsing && (strcmp(argv[i], "-t") == 0 || strcmp(argv[i], "--threads") == 0)) {
+            if (handle_option_with_value(argc, argv, i, n_threads) == 1) {
+                return 1;
+            }
+        } else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
+            if (handle_option_with_value(argc, argv, i, temperature) == 1) {
+                return 1;
+            }
+        } else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0) {
+            if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) {
+                return 1;
+            }
+            use_jinja = true;
+        } else {
+            return 2;
+        }
+
+        return 0;
+    }
+
+    int parse_options(const char ** argv, int & i, bool & options_parsing) {
+        if (options_parsing && (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
+            verbose = true;
+        } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
+            use_jinja = true;
+        } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
+            help = true;
+            return 0;
+        } else if (options_parsing && strcmp(argv[i], "--") == 0) {
+            options_parsing = false;
+        } else {
+            return 2;
+        }
+
+        return 0;
+    }
+
+    int parse_positional_args(const char ** argv, int & i, int & positional_args_i) {
+        if (positional_args_i == 0) {
+            if (!argv[i][0] || argv[i][0] == '-') {
+                return 1;
+            }
+
+            ++positional_args_i;
+            model_ = argv[i];
+        } else if (positional_args_i == 1) {
+            ++positional_args_i;
+            user = argv[i];
+        } else {
+            user += " " + std::string(argv[i]);
+        }
+
+        return 0;
+    }
+
     int parse(int argc, const char ** argv) {
         bool options_parsing = true;
         for (int i = 1, positional_args_i = 0; i < argc; ++i) {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
-                use_jinja = true;
-            } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
-                help = true;
-                return 0;
-            } else if (options_parsing && strcmp(argv[i], "--") == 0) {
-                options_parsing = false;
-            } else if (positional_args_i == 0) {
-                if (!argv[i][0] || argv[i][0] == '-') {
-                    return 1;
-                }
-
-                ++positional_args_i;
-                model_ = argv[i];
-            } else if (positional_args_i == 1) {
-                ++positional_args_i;
-                user = argv[i];
-            } else {
-                user += " " + std::string(argv[i]);
+            int ret = parse_options_with_value(argc, argv, i, options_parsing);
+            if (ret == 0) {
+                continue;
+            } else if (ret == 1) {
+                return ret;
+            }
+
+            ret = parse_options(argv, i, options_parsing);
+            if (ret == 0) {
+                continue;
+            } else if (ret == 1) {
+                return ret;
+            }
+
+            if (parse_positional_args(argv, i, positional_args_i)) {
+                return 1;
             }
         }
 
-        if (model_.empty()){
+        if (model_.empty()) {
             return 1;
         }
 
@@ -207,10 +266,17 @@ class Opt {
             "Options:\n"
             " -c, --context-size <value>\n"
             " Context size (default: %d)\n"
+            " --chat-template-file <path>\n"
+            " Path to the file containing the chat template to use with the model.\n"
+            " Only supports jinja templates and implicitly sets the --jinja flag.\n"
+            " --jinja\n"
+            " Use jinja templating for the chat template of the model\n"
             " -n, -ngl, --ngl <value>\n"
             " Number of GPU layers (default: %d)\n"
             " --temp <value>\n"
             " Temperature (default: %.1f)\n"
+            " -t, --threads <value>\n"
+            " Number of threads to use during generation (default: %d)\n"
             " -v, --verbose, --log-verbose\n"
             " Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
             " -h, --help\n"
@@ -239,7 +305,7 @@ class Opt {
             " llama-run file://some-file3.gguf\n"
             " llama-run --ngl 999 some-file4.gguf\n"
             " llama-run --ngl 999 some-file5.gguf Hello World\n",
-            context_size_default, ngl_default, temperature_default);
+            context_size_default, ngl_default, temperature_default, n_threads_default);
     }
 };
 
@@ -261,13 +327,12 @@ static int get_terminal_width() {
 #endif
 }
 
-#ifdef LLAMA_USE_CURL
 class File {
   public:
     FILE * file = nullptr;
 
     FILE * open(const std::string & filename, const char * mode) {
-        file =
+        file = ggml_fopen(filename.c_str(), mode);
 
         return file;
     }
@@ -303,6 +368,20 @@ class File {
         return 0;
     }
 
+    std::string to_string() {
+        fseek(file, 0, SEEK_END);
+        const size_t size = ftell(file);
+        fseek(file, 0, SEEK_SET);
+        std::string out;
+        out.resize(size);
+        const size_t read_size = fread(&out[0], 1, size, file);
+        if (read_size != size) {
+            printe("Error reading file: %s", strerror(errno));
+        }
+
+        return out;
+    }
+
     ~File() {
         if (fd >= 0) {
 # ifdef _WIN32
@@ -327,6 +406,7 @@ class File {
 # endif
 };
 
+#ifdef LLAMA_USE_CURL
 class HttpClient {
   public:
     int init(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
@@ -557,7 +637,7 @@ class LlamaData {
     llama_model_ptr model;
     llama_sampler_ptr sampler;
    llama_context_ptr context;
-    std::vector<llama_chat_message> messages;
+    std::vector<llama_chat_message> messages; // TODO: switch to common_chat_msg
     std::list<std::string> msg_strs;
     std::vector<char> fmtted;
 
@@ -834,50 +914,29 @@ static void add_message(const char * role, const std::string & text, LlamaData &
 }
 
 // Function to apply the chat template and resize `formatted` if needed
-static int apply_chat_template(const
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
-            llama_data.fmtted.resize(result.size() + 1);
-            memcpy(llama_data.fmtted.data(), result.c_str(), result.size() + 1);
-            return result.size();
-        } catch (const std::exception & e) {
-            printe("failed to render the chat template: %s\n", e.what());
-            return -1;
-        }
-    }
-    int result = llama_chat_apply_template(
-        tmpl.source().c_str(), llama_data.messages.data(), llama_data.messages.size(), append,
-        append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0);
-    if (append && result > static_cast<int>(llama_data.fmtted.size())) {
-        llama_data.fmtted.resize(result);
-        result = llama_chat_apply_template(tmpl.source().c_str(), llama_data.messages.data(),
-                                           llama_data.messages.size(), append, llama_data.fmtted.data(),
-                                           llama_data.fmtted.size());
-    }
-
-    return result;
+static int apply_chat_template(const struct common_chat_templates * tmpls, LlamaData & llama_data, const bool append, bool use_jinja) {
+    common_chat_templates_inputs inputs;
+    for (const auto & msg : llama_data.messages) {
+        common_chat_msg cmsg;
+        cmsg.role = msg.role;
+        cmsg.content = msg.content;
+        inputs.messages.push_back(cmsg);
+    }
+    inputs.add_generation_prompt = append;
+    inputs.use_jinja = use_jinja;
+
+    auto chat_params = common_chat_templates_apply(tmpls, inputs);
+    // TODO: use other params for tool calls.
+    auto result = chat_params.prompt;
+    llama_data.fmtted.resize(result.size() + 1);
+    memcpy(llama_data.fmtted.data(), result.c_str(), result.size() + 1);
+    return result.size();
 }
 
 // Function to tokenize the prompt
 static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
                            std::vector<llama_token> & prompt_tokens, const LlamaData & llama_data) {
-    const bool is_first =
+    const bool is_first = llama_kv_self_used_cells(llama_data.context.get()) == 0;
 
     const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
     prompt_tokens.resize(n_prompt_tokens);
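The rewrite above replaces llama-run's old minja / llama_chat_apply_template code path with the common_chat_templates API from common/chat.h. Below is a minimal sketch of that API, not taken from the diff: render_prompt is an illustrative helper name, a loaded llama_model * is assumed, and an empty override string falls back to the template bundled with the model, which is what run.cpp does when no --chat-template-file is given.

```cpp
// Sketch only: render a one-message conversation with common_chat_templates.
#include <string>

#include "chat.h" // common_chat_templates_*, common_chat_msg

static std::string render_prompt(const llama_model * model, bool use_jinja) {
    // Empty override string -> use the chat template stored in the GGUF model.
    common_chat_templates_ptr tmpls = common_chat_templates_init(model, "");

    common_chat_templates_inputs inputs;
    common_chat_msg msg;
    msg.role    = "user";
    msg.content = "Hello!";
    inputs.messages.push_back(msg);
    inputs.add_generation_prompt = true; // append the assistant prefix
    inputs.use_jinja             = use_jinja;

    // Only the rendered prompt is used here; the tool-call related fields of
    // the returned params are ignored, as in apply_chat_template() above.
    return common_chat_templates_apply(tmpls.get(), inputs).prompt;
}
```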
@@ -893,7 +952,7 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt
 // Check if we have enough space in the context to evaluate this batch
 static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
     const int n_ctx = llama_n_ctx(ctx.get());
-    const int n_ctx_used =
+    const int n_ctx_used = llama_kv_self_used_cells(ctx.get());
     if (n_ctx_used + batch.n_tokens > n_ctx) {
         printf(LOG_COL_DEFAULT "\n");
         printe("context size exceeded\n");
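Both tokenize_prompt() and check_context_size() above now read KV-cache occupancy through llama_kv_self_used_cells() from the bundled llama.h. A small sketch of the same fits-in-context check, not part of the diff (batch_fits is an illustrative name):

```cpp
// Sketch only: does this batch still fit into the context window?
#include "llama.h"

static bool batch_fits(llama_context * ctx, const llama_batch & batch) {
    const int n_ctx      = llama_n_ctx(ctx);              // total context size
    const int n_ctx_used = llama_kv_self_used_cells(ctx); // cells already occupied
    return n_ctx_used + batch.n_tokens <= n_ctx;
}
```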
@@ -963,7 +1022,8 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
 }
 
 static int read_user_input(std::string & user_input) {
-    static const char *
+    static const char * prompt_prefix_env = std::getenv("LLAMA_PROMPT_PREFIX");
+    static const char * prompt_prefix = prompt_prefix_env ? prompt_prefix_env : "> ";
 #ifdef WIN32
     printf("\r" LOG_CLR_TO_EOL LOG_COL_DEFAULT "%s", prompt_prefix);
 
@@ -1015,8 +1075,8 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
 }
 
 // Helper function to apply the chat template and handle errors
-static int apply_chat_template_with_error_handling(const
-    const int new_len = apply_chat_template(
+static int apply_chat_template_with_error_handling(const common_chat_templates * tmpls, LlamaData & llama_data, const bool append, int & output_length, bool use_jinja) {
+    const int new_len = apply_chat_template(tmpls, llama_data, append, use_jinja);
     if (new_len < 0) {
         printe("failed to apply the chat template\n");
         return -1;
@@ -1074,40 +1134,68 @@ static int get_user_input(std::string & user_input, const std::string & user) {
     return 0;
 }
 
+// Reads a chat template file to be used
+static std::string read_chat_template_file(const std::string & chat_template_file) {
+    File file;
+    if (!file.open(chat_template_file, "r")) {
+        printe("Error opening chat template file '%s': %s", chat_template_file.c_str(), strerror(errno));
+        return "";
+    }
+
+    return file.to_string();
+}
+
+static int process_user_message(const Opt & opt, const std::string & user_input, LlamaData & llama_data,
+                                const common_chat_templates_ptr & chat_templates, int & prev_len,
+                                const bool stdout_a_terminal) {
+    add_message("user", opt.user.empty() ? user_input : opt.user, llama_data);
+    int new_len;
+    if (apply_chat_template_with_error_handling(chat_templates.get(), llama_data, true, new_len, opt.use_jinja) < 0) {
+        return 1;
+    }
+
+    std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len);
+    std::string response;
+    if (generate_response(llama_data, prompt, response, stdout_a_terminal)) {
+        return 1;
+    }
+
+    if (!opt.user.empty()) {
+        return 2;
+    }
+
+    add_message("assistant", response, llama_data);
+    if (apply_chat_template_with_error_handling(chat_templates.get(), llama_data, false, prev_len, opt.use_jinja) < 0) {
+        return 1;
+    }
+
+    return 0;
+}
+
 // Main chat loop function
-static int chat_loop(LlamaData & llama_data, const
+static int chat_loop(LlamaData & llama_data, const Opt & opt) {
     int prev_len = 0;
     llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
-
-
+    std::string chat_template;
+    if (!opt.chat_template_file.empty()) {
+        chat_template = read_chat_template_file(opt.chat_template_file);
+    }
+
+    common_chat_templates_ptr chat_templates = common_chat_templates_init(llama_data.model.get(), chat_template);
     static const bool stdout_a_terminal = is_stdout_a_terminal();
     while (true) {
         // Get user input
         std::string user_input;
-        if (get_user_input(user_input, user) == 1) {
+        if (get_user_input(user_input, opt.user) == 1) {
             return 0;
         }
 
-
-
-        if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, true, new_len, use_jinja) < 0) {
-            return 1;
-        }
-
-        std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len);
-        std::string response;
-        if (generate_response(llama_data, prompt, response, stdout_a_terminal)) {
+        const int ret = process_user_message(opt, user_input, llama_data, chat_templates, prev_len, stdout_a_terminal);
+        if (ret == 1) {
             return 1;
-        }
-
-        if (!user.empty()) {
+        } else if (ret == 2) {
             break;
         }
-
-        add_message("assistant", response, llama_data);
-        if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, false, prev_len, use_jinja) < 0) {
-            return 1;
-        }
     }
 
     return 0;
@@ -1165,7 +1253,7 @@ int main(int argc, const char ** argv) {
         return 1;
     }
 
-    if (chat_loop(llama_data, opt
+    if (chat_loop(llama_data, opt)) {
         return 1;
     }
 
package/src/llama.cpp/examples/save-load-state/save-load-state.cpp

@@ -15,7 +15,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-
+    common_init();
 
     if (params.n_predict < 0) {
         params.n_predict = 16;
@@ -196,7 +196,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
 
     // erase whole kv
-
+    llama_kv_self_clear(ctx3);
     fprintf(stderr, "%s : kv cache cleared\n", __func__);
 
     // restore kv into seq 1