@fugood/llama.node 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +9 -0
- package/README.md +1 -1
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +2 -1
- package/patches/llama.patch +22 -0
- package/src/LlamaContext.cpp +2 -2
- package/src/TokenizeWorker.cpp +1 -1
- package/src/llama.cpp/CMakeLists.txt +82 -54
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
- package/src/llama.cpp/common/common.cpp +748 -754
- package/src/llama.cpp/common/common.h +49 -41
- package/src/llama.cpp/common/grammar-parser.cpp +10 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
- package/src/llama.cpp/common/log.h +5 -5
- package/src/llama.cpp/common/sampling.cpp +92 -10
- package/src/llama.cpp/common/sampling.h +6 -1
- package/src/llama.cpp/common/train.cpp +2 -2
- package/src/llama.cpp/examples/CMakeLists.txt +3 -0
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +13 -4
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
- package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
- package/src/llama.cpp/examples/infill/infill.cpp +8 -8
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +57 -8
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +55 -0
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/CMakeLists.txt +7 -8
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
- package/src/llama.cpp/examples/llava/clip.h +1 -1
- package/src/llama.cpp/examples/llava/llava-cli.cpp +27 -7
- package/src/llama.cpp/examples/llava/llava.cpp +0 -15
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +29 -17
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
- package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +134 -0
- package/src/llama.cpp/examples/server/server.cpp +33 -25
- package/src/llama.cpp/examples/server/utils.hpp +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
- package/src/llama.cpp/ggml-backend.c +2 -3
- package/src/llama.cpp/ggml-common.h +0 -54
- package/src/llama.cpp/ggml-cuda.h +1 -0
- package/src/llama.cpp/ggml-impl.h +51 -0
- package/src/llama.cpp/ggml-kompute.cpp +13 -3
- package/src/llama.cpp/ggml-opencl.cpp +4 -1
- package/src/llama.cpp/ggml-quants.c +3715 -2050
- package/src/llama.cpp/ggml-rpc.cpp +1155 -0
- package/src/llama.cpp/ggml-rpc.h +24 -0
- package/src/llama.cpp/ggml-sycl.cpp +119 -673
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- package/src/llama.cpp/ggml-vulkan.cpp +203 -224
- package/src/llama.cpp/ggml.c +1208 -1483
- package/src/llama.cpp/ggml.h +71 -46
- package/src/llama.cpp/llama.cpp +1374 -938
- package/src/llama.cpp/llama.h +22 -6
- package/src/llama.cpp/requirements.txt +0 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -1
- package/src/llama.cpp/tests/test-backend-ops.cpp +120 -57
- package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
- package/src/llama.cpp/tests/test-grad0.cpp +43 -83
- package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
- package/src/llama.cpp/unicode-data.cpp +6969 -2169
- package/src/llama.cpp/unicode-data.h +15 -12
- package/src/llama.cpp/unicode.cpp +89 -111
- package/src/llama.cpp/unicode.h +44 -12
- package/src/llama.cpp/build.zig +0 -172
- package/src/llama.cpp/ggml-mpi.c +0 -216
- package/src/llama.cpp/ggml-mpi.h +0 -39
- package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
|
@@ -3,40 +3,390 @@
|
|
|
3
3
|
|
|
4
4
|
#include <cmath>
|
|
5
5
|
#include <cstdio>
|
|
6
|
+
#include <fstream>
|
|
6
7
|
#include <string>
|
|
7
8
|
#include <vector>
|
|
8
9
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
10
|
+
#if defined(_WIN32)
|
|
11
|
+
#define WIN32_LEAN_AND_MEAN
|
|
12
|
+
#include <windows.h>
|
|
13
|
+
#include <shellapi.h> // For CommandLineToArgvW
|
|
14
|
+
#endif
|
|
15
|
+
|
|
16
|
+
static void print_usage_information(const char * argv0, FILE * stream) {
|
|
17
|
+
fprintf(stream, "usage: %s [options]\n\n", argv0);
|
|
18
|
+
fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n");
|
|
19
|
+
fprintf(stream, "and prints the resulting tokens to standard output.\n\n");
|
|
20
|
+
fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n");
|
|
21
|
+
fprintf(stream, "to control the behavior of the tokenizer.\n\n");
|
|
22
|
+
fprintf(stream, " The possible options are:\n");
|
|
23
|
+
fprintf(stream, "\n");
|
|
24
|
+
fprintf(stream, " -h, --help print this help and exit\n");
|
|
25
|
+
fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n");
|
|
26
|
+
fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n");
|
|
27
|
+
fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
|
|
28
|
+
fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
|
|
29
|
+
fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
|
|
30
|
+
fprintf(stream, " --stdin read prompt from standard input.\n");
|
|
31
|
+
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
|
|
32
|
+
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
|
|
36
|
+
(void) level;
|
|
37
|
+
(void) text;
|
|
38
|
+
(void) user_data;
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
static std::string read_prompt_from_file(const char * filepath, bool & success) {
|
|
42
|
+
success = false;
|
|
43
|
+
|
|
44
|
+
std::ifstream in(filepath, std::ios::binary);
|
|
45
|
+
if (!in) {
|
|
46
|
+
fprintf(stderr, "%s: could not open file '%s' for reading: %s\n", __func__, filepath, strerror(errno));
|
|
47
|
+
return std::string();
|
|
48
|
+
}
|
|
49
|
+
// do not assume the file is seekable (e.g. /dev/stdin)
|
|
50
|
+
std::stringstream buffer;
|
|
51
|
+
buffer << in.rdbuf();
|
|
52
|
+
if (in.fail()) {
|
|
53
|
+
fprintf(stderr, "%s: could not read the entire file '%s': %s\n", __func__, filepath, strerror(errno));
|
|
54
|
+
return std::string();
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
success = true;
|
|
58
|
+
return buffer.str();
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
//
|
|
62
|
+
// Function: ingest_args(...) -> vector<string>
|
|
63
|
+
//
|
|
64
|
+
// Takes argc and argv arguments, and converts them to a vector of UTF-8 encoded
|
|
65
|
+
// strings, as an STL vector<string>.
|
|
66
|
+
//
|
|
67
|
+
// In particular, it handles character encoding shenanigans on Windows.
|
|
68
|
+
//
|
|
69
|
+
// Note: raw_argc and raw_argv are not actually read at all on Windows.
|
|
70
|
+
// On Windows we call GetCommandLineW to get the arguments in wchar_t
|
|
71
|
+
// format, ignoring the regular argc/argv arguments to main().
|
|
72
|
+
//
|
|
73
|
+
// TODO: potential opportunity to roll common stuff into common/console.cpp
|
|
74
|
+
// in relation to Windows wchar_t shenanigans.
|
|
75
|
+
static std::vector<std::string> ingest_args(int raw_argc, char ** raw_argv) {
|
|
76
|
+
std::vector<std::string> argv;
|
|
77
|
+
|
|
78
|
+
// Handle Windows, if given non-ASCII arguments.
|
|
79
|
+
// We convert wchar_t arguments into UTF-8 char* on this platform.
|
|
80
|
+
// Lets you invoke 'tokenize' on Windows cmd.exe with non-ASCII characters
|
|
81
|
+
// without throwing tantrums.
|
|
82
|
+
#if defined(_WIN32)
|
|
83
|
+
int argc;
|
|
84
|
+
const LPWSTR cmdline_wargv = GetCommandLineW();
|
|
85
|
+
LPWSTR * wargv = CommandLineToArgvW(cmdline_wargv, &argc);
|
|
86
|
+
|
|
87
|
+
// silence unused arg warnings
|
|
88
|
+
(void) raw_argc;
|
|
89
|
+
(void) raw_argv;
|
|
90
|
+
|
|
91
|
+
for (int i = 0; i < argc; ++i) {
|
|
92
|
+
int length_needed = WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), 0, 0, NULL, NULL);
|
|
93
|
+
char * output_buf = (char *) calloc(length_needed+1, sizeof(char));
|
|
94
|
+
GGML_ASSERT(output_buf);
|
|
95
|
+
|
|
96
|
+
WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), output_buf, length_needed, NULL, NULL);
|
|
97
|
+
output_buf[length_needed] = '\0';
|
|
98
|
+
|
|
99
|
+
argv.push_back(output_buf);
|
|
100
|
+
free(output_buf);
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
LocalFree((HLOCAL) wargv);
|
|
104
|
+
#else
|
|
105
|
+
int argc = raw_argc;
|
|
106
|
+
for (int i = 0; i < argc; ++i) {
|
|
107
|
+
argv.push_back(raw_argv[i]);
|
|
108
|
+
}
|
|
109
|
+
#endif
|
|
110
|
+
|
|
111
|
+
GGML_ASSERT((unsigned int) argc == argv.size());
|
|
112
|
+
|
|
113
|
+
return argv;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
//
|
|
117
|
+
// Function: write_utf8_cstr_to_stdout(const char *) -> <writes to stdout>
|
|
118
|
+
//
|
|
119
|
+
// writes a string to standard output; taking into account that on Windows
|
|
120
|
+
// to display correctly you have to use special handling. Works even if the
|
|
121
|
+
// user has not set a unicode code page on a Windows cmd.exe.
|
|
122
|
+
//
|
|
123
|
+
// In case of invalid UTF-8, invalid_utf8 is set to true on Windows, and something
|
|
124
|
+
// human-readable is written instead.
|
|
125
|
+
//
|
|
126
|
+
// On non-Windows systems, simply printfs() the string.
|
|
127
|
+
static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
|
|
128
|
+
invalid_utf8 = false;
|
|
129
|
+
|
|
130
|
+
#if defined(_WIN32)
|
|
131
|
+
// Are we in a console?
|
|
132
|
+
HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
|
|
133
|
+
DWORD dwMode = 0;
|
|
134
|
+
|
|
135
|
+
// According to Microsoft docs:
|
|
136
|
+
// "WriteConsole fails if it is used with a standard handle that is redirected to a file."
|
|
137
|
+
// Also according to the docs, you can use GetConsoleMode to check for that.
|
|
138
|
+
if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
|
|
139
|
+
printf("%s", str);
|
|
140
|
+
return;
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
// MultiByteToWideChar reports an error if str is empty, don't report
|
|
144
|
+
// them as invalid_utf8.
|
|
145
|
+
if (*str == 0) {
|
|
146
|
+
return;
|
|
147
|
+
}
|
|
148
|
+
int length_needed = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, strlen(str), NULL, 0);
|
|
149
|
+
if (length_needed == 0) {
|
|
150
|
+
DWORD err = GetLastError();
|
|
151
|
+
if (err == ERROR_NO_UNICODE_TRANSLATION) {
|
|
152
|
+
invalid_utf8 = true;
|
|
153
|
+
int len = strlen(str);
|
|
154
|
+
printf("<");
|
|
155
|
+
for (int i = 0; i < len; ++i) {
|
|
156
|
+
if (i > 0) {
|
|
157
|
+
printf(" ");
|
|
158
|
+
}
|
|
159
|
+
printf("%02x", (uint8_t) str[i]);
|
|
160
|
+
}
|
|
161
|
+
printf(">");
|
|
162
|
+
return;
|
|
163
|
+
}
|
|
164
|
+
GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way.");
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr));
|
|
168
|
+
GGML_ASSERT(wstr);
|
|
169
|
+
|
|
170
|
+
MultiByteToWideChar(CP_UTF8, 0, str, strlen(str), wstr, length_needed);
|
|
171
|
+
WriteConsoleW(hConsole, wstr, length_needed, NULL, NULL);
|
|
172
|
+
|
|
173
|
+
free(wstr);
|
|
174
|
+
#else
|
|
175
|
+
// TODO: reporting invalid_utf8 would be useful on non-Windows too.
|
|
176
|
+
// printf will silently just write bad unicode.
|
|
177
|
+
printf("%s", str);
|
|
178
|
+
#endif
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
int main(int raw_argc, char ** raw_argv) {
|
|
182
|
+
const std::vector<std::string> argv = ingest_args(raw_argc, raw_argv);
|
|
183
|
+
const int argc = argv.size();
|
|
184
|
+
|
|
185
|
+
if (argc <= 1) {
|
|
186
|
+
print_usage_information(argv[0].c_str(), stderr);
|
|
187
|
+
return 1;
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
//////
|
|
191
|
+
// Read out all the command line arguments.
|
|
192
|
+
//////
|
|
193
|
+
|
|
194
|
+
// variables where to put any arguments we see.
|
|
195
|
+
bool printing_ids = false;
|
|
196
|
+
bool no_bos = false;
|
|
197
|
+
bool disable_logging = false;
|
|
198
|
+
const char * model_path = NULL;
|
|
199
|
+
const char * prompt_path = NULL;
|
|
200
|
+
const char * prompt_arg = NULL;
|
|
201
|
+
|
|
202
|
+
// track which arguments were explicitly given
|
|
203
|
+
// used for sanity checking down the line
|
|
204
|
+
bool model_path_set = false;
|
|
205
|
+
bool prompt_path_set = false;
|
|
206
|
+
bool prompt_set = false;
|
|
207
|
+
bool stdin_set = false;
|
|
208
|
+
|
|
209
|
+
int iarg = 1;
|
|
210
|
+
for (; iarg < argc; ++iarg) {
|
|
211
|
+
std::string arg{argv[iarg]};
|
|
212
|
+
if (arg == "-h" || arg == "--help") {
|
|
213
|
+
print_usage_information(argv[0].c_str(), stdout);
|
|
214
|
+
return 0;
|
|
215
|
+
}
|
|
216
|
+
else if (arg == "--ids") {
|
|
217
|
+
printing_ids = true;
|
|
218
|
+
}
|
|
219
|
+
else if (arg == "-m" || arg == "--model") {
|
|
220
|
+
if (model_path_set) {
|
|
221
|
+
fprintf(stderr, "Error: -m or --model specified multiple times.\n");
|
|
222
|
+
return 1;
|
|
223
|
+
}
|
|
224
|
+
model_path = argv[++iarg].c_str();
|
|
225
|
+
model_path_set = true;
|
|
226
|
+
}
|
|
227
|
+
else if (arg == "--no-bos") {
|
|
228
|
+
no_bos = true;
|
|
229
|
+
}
|
|
230
|
+
else if (arg == "-p" || arg == "--prompt") {
|
|
231
|
+
if (prompt_set) {
|
|
232
|
+
fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
|
|
233
|
+
return 1;
|
|
234
|
+
}
|
|
235
|
+
prompt_arg = argv[++iarg].c_str();
|
|
236
|
+
prompt_set = true;
|
|
237
|
+
}
|
|
238
|
+
else if (arg == "-f" || arg == "--file") {
|
|
239
|
+
if (prompt_path_set) {
|
|
240
|
+
fprintf(stderr, "Error: -f or --file specified multiple times.\n");
|
|
241
|
+
return 1;
|
|
242
|
+
}
|
|
243
|
+
prompt_path = argv[++iarg].c_str();
|
|
244
|
+
prompt_path_set = true;
|
|
245
|
+
}
|
|
246
|
+
else if (arg == "--stdin") {
|
|
247
|
+
stdin_set = true;
|
|
248
|
+
}
|
|
249
|
+
else if (arg == "--log-disable") {
|
|
250
|
+
disable_logging = true;
|
|
251
|
+
}
|
|
252
|
+
else {
|
|
253
|
+
fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
|
|
254
|
+
return 1;
|
|
255
|
+
}
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
//////
|
|
259
|
+
// Sanity check the command line arguments.
|
|
260
|
+
//////
|
|
261
|
+
|
|
262
|
+
// Check that we have the required stuff set.
|
|
263
|
+
if (model_path_set && model_path == NULL) {
|
|
264
|
+
fprintf(stderr, "Error: --model requires an argument.\n");
|
|
265
|
+
return 1;
|
|
266
|
+
}
|
|
267
|
+
if (!model_path_set) {
|
|
268
|
+
fprintf(stderr, "Error: must specify --model.\n");
|
|
269
|
+
return 1;
|
|
270
|
+
}
|
|
271
|
+
if (prompt_path_set && prompt_path == NULL) {
|
|
272
|
+
fprintf(stderr, "Error: --file requires an argument.\n");
|
|
273
|
+
return 1;
|
|
274
|
+
}
|
|
275
|
+
if (prompt_set && prompt_arg == NULL) {
|
|
276
|
+
fprintf(stderr, "Error: --prompt requires an argument.\n");
|
|
277
|
+
return 1;
|
|
278
|
+
}
|
|
279
|
+
const int prompts_set = !!(prompt_path_set) + !!(prompt_set) + !!(stdin_set);
|
|
280
|
+
if (prompts_set > 1) {
|
|
281
|
+
fprintf(stderr, "Error: --stdin, --file and --prompt are mutually exclusive.\n");
|
|
282
|
+
return 1;
|
|
283
|
+
}
|
|
284
|
+
// Must have some prompt.
|
|
285
|
+
if (prompts_set == 0) {
|
|
286
|
+
fprintf(stderr, "Error: must specify one of: --stdin, --file or --prompt.\n");
|
|
12
287
|
return 1;
|
|
13
288
|
}
|
|
14
289
|
|
|
15
|
-
|
|
16
|
-
|
|
290
|
+
GGML_ASSERT(model_path);
|
|
291
|
+
GGML_ASSERT(prompt_path || prompt_arg || stdin_set);
|
|
17
292
|
|
|
18
|
-
|
|
293
|
+
//////
|
|
294
|
+
// Figure out where the prompt will come from.
|
|
295
|
+
//////
|
|
296
|
+
|
|
297
|
+
std::string prompt;
|
|
298
|
+
if (prompt_path_set) {
|
|
299
|
+
bool success = false;
|
|
300
|
+
prompt = read_prompt_from_file(prompt_path, success);
|
|
301
|
+
if (!success) {
|
|
302
|
+
return 1;
|
|
303
|
+
}
|
|
304
|
+
} else if (prompt_set) {
|
|
305
|
+
prompt = prompt_arg;
|
|
306
|
+
} else {
|
|
307
|
+
GGML_ASSERT(stdin_set);
|
|
308
|
+
// we read stdin *after* loading model (early exit if model cannot
|
|
309
|
+
// be loaded, which can be a nicer user experience)
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
//////
|
|
313
|
+
// Start actually doing the tokenizing stuff.
|
|
314
|
+
//////
|
|
315
|
+
|
|
316
|
+
#ifdef LOG_DISABLE_LOGS
|
|
317
|
+
disable_logging = true;
|
|
318
|
+
#endif
|
|
319
|
+
|
|
320
|
+
if (disable_logging) {
|
|
321
|
+
llama_log_set(llama_log_callback_null, NULL);
|
|
322
|
+
}
|
|
19
323
|
|
|
20
324
|
llama_backend_init();
|
|
21
325
|
|
|
22
326
|
llama_model_params model_params = llama_model_default_params();
|
|
23
327
|
model_params.vocab_only = true;
|
|
24
328
|
llama_model * model = llama_load_model_from_file(model_path, model_params);
|
|
329
|
+
if (!model) {
|
|
330
|
+
fprintf(stderr, "Error: could not load model from file '%s'.\n", model_path);
|
|
331
|
+
return 1;
|
|
332
|
+
}
|
|
25
333
|
|
|
26
334
|
llama_context_params ctx_params = llama_context_default_params();
|
|
27
335
|
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
|
336
|
+
if (!ctx) {
|
|
337
|
+
fprintf(stderr, "Error: could not create context.\n");
|
|
338
|
+
return 1;
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
// read entire prompt from stdin?
|
|
342
|
+
if (stdin_set) {
|
|
343
|
+
GGML_ASSERT(!prompt_path_set && !prompt_set);
|
|
344
|
+
|
|
345
|
+
std::stringstream stdin_buffer;
|
|
346
|
+
stdin_buffer << std::cin.rdbuf();
|
|
347
|
+
if (std::cin.fail()) {
|
|
348
|
+
fprintf(stderr, "Error: could not read the entire standard input.\n");
|
|
349
|
+
return 1;
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
prompt = stdin_buffer.str();
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
const bool model_wants_add_bos = llama_should_add_bos_token(model);
|
|
356
|
+
const bool add_bos = model_wants_add_bos && !no_bos;
|
|
28
357
|
|
|
29
358
|
std::vector<llama_token> tokens;
|
|
359
|
+
tokens = ::llama_tokenize(model, prompt, add_bos, true);
|
|
30
360
|
|
|
31
|
-
|
|
361
|
+
if (printing_ids) {
|
|
362
|
+
printf("[");
|
|
363
|
+
}
|
|
32
364
|
|
|
33
365
|
for (int i = 0; i < (int) tokens.size(); i++) {
|
|
34
366
|
if (printing_ids) {
|
|
35
|
-
|
|
367
|
+
if (i > 0) {
|
|
368
|
+
printf(", ");
|
|
369
|
+
}
|
|
370
|
+
printf("%d", tokens[i]);
|
|
36
371
|
} else {
|
|
37
|
-
|
|
372
|
+
bool invalid_utf8 = false;
|
|
373
|
+
printf("%6d -> '", tokens[i]);
|
|
374
|
+
write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
|
|
375
|
+
if (invalid_utf8) {
|
|
376
|
+
printf("' (utf-8 decode failure)\n");
|
|
377
|
+
} else {
|
|
378
|
+
printf("'\n");
|
|
379
|
+
}
|
|
38
380
|
}
|
|
39
381
|
}
|
|
40
382
|
|
|
383
|
+
if (printing_ids) {
|
|
384
|
+
printf("]\n");
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
// silence valgrind
|
|
388
|
+
llama_free(ctx);
|
|
389
|
+
llama_free_model(model);
|
|
390
|
+
|
|
41
391
|
return 0;
|
|
42
392
|
}
|
|
@@ -301,8 +301,8 @@ static struct ggml_tensor * llama_build_train_graphs(
|
|
|
301
301
|
// not capturing these, to silence warnings
|
|
302
302
|
const int rope_mode = 0;
|
|
303
303
|
|
|
304
|
-
return
|
|
305
|
-
ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
|
|
304
|
+
return ggml_rope_ext(
|
|
305
|
+
ctx, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
|
|
306
306
|
);
|
|
307
307
|
};
|
|
308
308
|
|
|
@@ -341,7 +341,8 @@ static struct ggml_tensor * llama_build_train_graphs(
|
|
|
341
341
|
struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch);
|
|
342
342
|
struct ggml_tensor * t16;
|
|
343
343
|
if (enable_flash_attn) {
|
|
344
|
-
|
|
344
|
+
GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
|
|
345
|
+
//t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
|
|
345
346
|
} else {
|
|
346
347
|
struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
|
|
347
348
|
struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
|
|
@@ -1182,9 +1182,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|
|
1182
1182
|
static char * fmt_size(size_t size) {
|
|
1183
1183
|
static char buffer[128];
|
|
1184
1184
|
if (size >= 1024*1024) {
|
|
1185
|
-
|
|
1185
|
+
snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
|
|
1186
1186
|
} else {
|
|
1187
|
-
|
|
1187
|
+
snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
|
|
1188
1188
|
}
|
|
1189
1189
|
return buffer;
|
|
1190
1190
|
}
|
|
@@ -1895,7 +1895,6 @@ void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * t
|
|
|
1895
1895
|
|
|
1896
1896
|
tensor->buffer = buffer;
|
|
1897
1897
|
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
|
|
1898
|
-
tensor->backend = tensor->view_src->backend;
|
|
1899
1898
|
ggml_backend_buffer_init_tensor(buffer, tensor);
|
|
1900
1899
|
}
|
|
1901
1900
|
|
|
@@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
|
|
|
65
65
|
// QK = number of values after dequantization
|
|
66
66
|
// QK_K = super-block size
|
|
67
67
|
|
|
68
|
-
#ifdef GGML_QKK_64
|
|
69
|
-
#define QK_K 64
|
|
70
|
-
#define K_SCALE_SIZE 4
|
|
71
|
-
#else
|
|
72
68
|
#define QK_K 256
|
|
73
69
|
#define K_SCALE_SIZE 12
|
|
74
|
-
#endif // GGML_QKK_64
|
|
75
70
|
|
|
76
71
|
#if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
|
|
77
72
|
// QR = QK / number of values before dequantization
|
|
@@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
|
|
|
131
126
|
#define QI4_NL (QK4_NL / (4*QR4_NL))
|
|
132
127
|
#define QR4_NL 2
|
|
133
128
|
|
|
134
|
-
#if QK_K == 64
|
|
135
|
-
#define QI4_XS QI4_NL
|
|
136
|
-
#define QR4_XS QR4_NL
|
|
137
|
-
#else
|
|
138
129
|
#define QI4_XS (QK_K / (4*QR4_XS))
|
|
139
130
|
#define QR4_XS 8
|
|
140
|
-
#endif
|
|
141
131
|
|
|
142
132
|
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
|
|
143
133
|
|
|
@@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
|
|
|
228
218
|
// weight is represented as x = a * q
|
|
229
219
|
// 16 blocks of 16 elements each
|
|
230
220
|
// Effectively 3.4375 bits per weight
|
|
231
|
-
#ifdef GGML_QKK_64
|
|
232
|
-
typedef struct {
|
|
233
|
-
uint8_t hmask[QK_K/8]; // quants - high bit
|
|
234
|
-
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
|
235
|
-
uint8_t scales[2];
|
|
236
|
-
ggml_half d; // super-block scale
|
|
237
|
-
} block_q3_K;
|
|
238
|
-
static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
|
|
239
|
-
#else
|
|
240
221
|
typedef struct {
|
|
241
222
|
uint8_t hmask[QK_K/8]; // quants - high bit
|
|
242
223
|
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
|
@@ -244,20 +225,11 @@ typedef struct {
|
|
|
244
225
|
ggml_half d; // super-block scale
|
|
245
226
|
} block_q3_K;
|
|
246
227
|
static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
|
|
247
|
-
#endif
|
|
248
228
|
|
|
249
229
|
// 4-bit quantization
|
|
250
230
|
// 8 blocks of 32 elements each
|
|
251
231
|
// weight is represented as x = a * q + b
|
|
252
232
|
// Effectively 4.5 bits per weight
|
|
253
|
-
#ifdef GGML_QKK_64
|
|
254
|
-
typedef struct {
|
|
255
|
-
ggml_half d[2]; // super-block scales/mins
|
|
256
|
-
uint8_t scales[2]; // 4-bit block scales/mins
|
|
257
|
-
uint8_t qs[QK_K/2]; // 4--bit quants
|
|
258
|
-
} block_q4_K;
|
|
259
|
-
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
|
260
|
-
#else
|
|
261
233
|
typedef struct {
|
|
262
234
|
union {
|
|
263
235
|
struct {
|
|
@@ -270,21 +242,11 @@ typedef struct {
|
|
|
270
242
|
uint8_t qs[QK_K/2]; // 4--bit quants
|
|
271
243
|
} block_q4_K;
|
|
272
244
|
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
|
|
273
|
-
#endif
|
|
274
245
|
|
|
275
246
|
// 5-bit quantization
|
|
276
247
|
// 8 blocks of 32 elements each
|
|
277
248
|
// weight is represented as x = a * q + b
|
|
278
249
|
// Effectively 5.5 bits per weight
|
|
279
|
-
#ifdef GGML_QKK_64
|
|
280
|
-
typedef struct {
|
|
281
|
-
ggml_half d; // super-block scale
|
|
282
|
-
int8_t scales[QK_K/16]; // 8-bit block scales
|
|
283
|
-
uint8_t qh[QK_K/8]; // quants, high bit
|
|
284
|
-
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
|
285
|
-
} block_q5_K;
|
|
286
|
-
static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
|
|
287
|
-
#else
|
|
288
250
|
typedef struct {
|
|
289
251
|
union {
|
|
290
252
|
struct {
|
|
@@ -298,7 +260,6 @@ typedef struct {
|
|
|
298
260
|
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
|
299
261
|
} block_q5_K;
|
|
300
262
|
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
|
301
|
-
#endif
|
|
302
263
|
|
|
303
264
|
// 6-bit quantization
|
|
304
265
|
// weight is represented as x = a * q
|
|
@@ -356,11 +317,7 @@ typedef struct {
|
|
|
356
317
|
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
|
357
318
|
|
|
358
319
|
// 3.4375 bpw
|
|
359
|
-
#if QK_K == 64
|
|
360
|
-
#define IQ3S_N_SCALE 2
|
|
361
|
-
#else
|
|
362
320
|
#define IQ3S_N_SCALE QK_K/64
|
|
363
|
-
#endif
|
|
364
321
|
typedef struct {
|
|
365
322
|
ggml_half d;
|
|
366
323
|
uint8_t qs[QK_K/4];
|
|
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
|
|
|
381
338
|
typedef struct {
|
|
382
339
|
uint8_t qs[QK_K/8]; // grid index, low 8 bits
|
|
383
340
|
uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
|
|
384
|
-
#if QK_K == 64
|
|
385
|
-
ggml_half d;
|
|
386
|
-
#endif
|
|
387
341
|
uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
|
|
388
342
|
} block_iq1_m;
|
|
389
|
-
#if QK_K == 64
|
|
390
|
-
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
|
|
391
|
-
#else
|
|
392
343
|
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
|
|
393
|
-
#endif
|
|
394
344
|
|
|
395
345
|
// Used by IQ1_M quants
|
|
396
346
|
typedef union {
|
|
@@ -406,9 +356,6 @@ typedef struct {
|
|
|
406
356
|
} block_iq4_nl;
|
|
407
357
|
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
|
408
358
|
|
|
409
|
-
#if QK_K == 64
|
|
410
|
-
#define block_iq4_xs block_iq4_nl
|
|
411
|
-
#else
|
|
412
359
|
typedef struct {
|
|
413
360
|
ggml_half d;
|
|
414
361
|
uint16_t scales_h;
|
|
@@ -416,7 +363,6 @@ typedef struct {
|
|
|
416
363
|
uint8_t qs[QK_K/2];
|
|
417
364
|
} block_iq4_xs;
|
|
418
365
|
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
|
|
419
|
-
#endif
|
|
420
366
|
|
|
421
367
|
#endif // GGML_COMMON_DECL
|
|
422
368
|
#endif // GGML_COMMON_DECL
|
|
@@ -38,6 +38,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
|
|
|
38
38
|
GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
|
|
39
39
|
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
|
|
40
40
|
|
|
41
|
+
GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
|
|
41
42
|
#ifdef __cplusplus
|
|
42
43
|
}
|
|
43
44
|
#endif
|
|
@@ -17,6 +17,18 @@
|
|
|
17
17
|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
|
18
18
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
|
19
19
|
|
|
20
|
+
#if defined(_WIN32)
|
|
21
|
+
|
|
22
|
+
#define m512bh(p) p
|
|
23
|
+
#define m512i(p) p
|
|
24
|
+
|
|
25
|
+
#else
|
|
26
|
+
|
|
27
|
+
#define m512bh(p) (__m512bh)(p)
|
|
28
|
+
#define m512i(p) (__m512i)(p)
|
|
29
|
+
|
|
30
|
+
#endif
|
|
31
|
+
|
|
20
32
|
/**
|
|
21
33
|
* Converts brain16 to float32.
|
|
22
34
|
*
|
|
@@ -120,9 +132,20 @@ extern "C" {
|
|
|
120
132
|
#ifndef __F16C__
|
|
121
133
|
#define __F16C__
|
|
122
134
|
#endif
|
|
135
|
+
#endif
|
|
136
|
+
|
|
137
|
+
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
|
|
138
|
+
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
|
|
123
139
|
#ifndef __SSE3__
|
|
124
140
|
#define __SSE3__
|
|
125
141
|
#endif
|
|
142
|
+
#ifndef __SSSE3__
|
|
143
|
+
#define __SSSE3__
|
|
144
|
+
#endif
|
|
145
|
+
#endif
|
|
146
|
+
|
|
147
|
+
#if defined(__ARM_FEATURE_SVE)
|
|
148
|
+
#include <arm_sve.h>
|
|
126
149
|
#endif
|
|
127
150
|
|
|
128
151
|
// 16-bit float
|
|
@@ -436,6 +459,34 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
|
436
459
|
#include <riscv_vector.h>
|
|
437
460
|
#endif
|
|
438
461
|
|
|
462
|
+
#if defined(__loongarch64)
|
|
463
|
+
#if defined(__loongarch_asx)
|
|
464
|
+
#include <lasxintrin.h>
|
|
465
|
+
#endif
|
|
466
|
+
#if defined(__loongarch_sx)
|
|
467
|
+
#include <lsxintrin.h>
|
|
468
|
+
#endif
|
|
469
|
+
#endif
|
|
470
|
+
|
|
471
|
+
#if defined(__loongarch_asx)
|
|
472
|
+
|
|
473
|
+
typedef union {
|
|
474
|
+
int32_t i;
|
|
475
|
+
float f;
|
|
476
|
+
} ft_union;
|
|
477
|
+
|
|
478
|
+
/* float type data load instructions */
|
|
479
|
+
static __m128 __lsx_vreplfr2vr_s(float val) {
|
|
480
|
+
ft_union fi_tmpval = {.f = val};
|
|
481
|
+
return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
static __m256 __lasx_xvreplfr2vr_s(float val) {
|
|
485
|
+
ft_union fi_tmpval = {.f = val};
|
|
486
|
+
return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
|
|
487
|
+
}
|
|
488
|
+
#endif
|
|
489
|
+
|
|
439
490
|
#ifdef __F16C__
|
|
440
491
|
|
|
441
492
|
#ifdef _MSC_VER
|