@fugood/llama.node 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. package/bin/darwin/arm64/default.metallib +0 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/default.metallib +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/package.json +1 -1
  18. package/src/LlamaContext.cpp +2 -2
  19. package/src/llama.cpp/CMakeLists.txt +72 -46
  20. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
  21. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
  22. package/src/llama.cpp/common/common.cpp +732 -752
  23. package/src/llama.cpp/common/common.h +47 -41
  24. package/src/llama.cpp/common/grammar-parser.cpp +1 -1
  25. package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
  26. package/src/llama.cpp/common/log.h +5 -5
  27. package/src/llama.cpp/common/sampling.cpp +89 -7
  28. package/src/llama.cpp/common/sampling.h +5 -0
  29. package/src/llama.cpp/common/train.cpp +2 -2
  30. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  31. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  32. package/src/llama.cpp/examples/embedding/embedding.cpp +3 -2
  33. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
  34. package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
  35. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
  36. package/src/llama.cpp/examples/infill/infill.cpp +8 -8
  37. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  38. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +13 -8
  39. package/src/llama.cpp/examples/llava/clip.h +1 -1
  40. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  41. package/src/llama.cpp/examples/llava/llava.cpp +0 -15
  42. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  43. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  44. package/src/llama.cpp/examples/main/main.cpp +24 -16
  45. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  46. package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
  47. package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
  48. package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
  49. package/src/llama.cpp/examples/rpc/rpc-server.cpp +78 -14
  50. package/src/llama.cpp/examples/server/server.cpp +21 -9
  51. package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
  52. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
  53. package/src/llama.cpp/ggml-backend.c +0 -1
  54. package/src/llama.cpp/ggml-common.h +0 -54
  55. package/src/llama.cpp/ggml-cuda.h +1 -0
  56. package/src/llama.cpp/ggml-impl.h +51 -0
  57. package/src/llama.cpp/ggml-kompute.cpp +4 -0
  58. package/src/llama.cpp/ggml-opencl.cpp +4 -1
  59. package/src/llama.cpp/ggml-quants.c +3700 -2041
  60. package/src/llama.cpp/ggml-rpc.cpp +188 -56
  61. package/src/llama.cpp/ggml-sycl.cpp +99 -530
  62. package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
  63. package/src/llama.cpp/ggml-vulkan.cpp +202 -225
  64. package/src/llama.cpp/ggml.c +1034 -1154
  65. package/src/llama.cpp/ggml.h +59 -31
  66. package/src/llama.cpp/llama.cpp +859 -609
  67. package/src/llama.cpp/llama.h +19 -6
  68. package/src/llama.cpp/requirements.txt +0 -1
  69. package/src/llama.cpp/tests/test-backend-ops.cpp +113 -47
  70. package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
  71. package/src/llama.cpp/tests/test-grad0.cpp +43 -83
  72. package/src/llama.cpp/unicode-data.cpp +6969 -2169
  73. package/src/llama.cpp/unicode-data.h +15 -12
  74. package/src/llama.cpp/unicode.cpp +89 -111
  75. package/src/llama.cpp/unicode.h +44 -12
  76. package/src/llama.cpp/build.zig +0 -172
  77. package/src/llama.cpp/ggml-mpi.c +0 -216
  78. package/src/llama.cpp/ggml-mpi.h +0 -39
  79. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
package/src/llama.cpp/examples/tokenize/tokenize.cpp

@@ -3,40 +3,390 @@
 
 #include <cmath>
 #include <cstdio>
+#include <fstream>
 #include <string>
 #include <vector>
 
-int main(int argc, char ** argv) {
-    if (argc < 3 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH PROMPT [--ids]\n" , argv[0]);
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <shellapi.h> // For CommandLineToArgvW
+#endif
+
+static void print_usage_information(const char * argv0, FILE * stream) {
+    fprintf(stream, "usage: %s [options]\n\n", argv0);
+    fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n");
+    fprintf(stream, "and prints the resulting tokens to standard output.\n\n");
+    fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n");
+    fprintf(stream, "to control the behavior of the tokenizer.\n\n");
+    fprintf(stream, "  The possible options are:\n");
+    fprintf(stream, "\n");
+    fprintf(stream, "  -h, --help                           print this help and exit\n");
+    fprintf(stream, "  -m MODEL_PATH, --model MODEL_PATH    path to model.\n");
+    fprintf(stream, "  --ids                                if given, only print numerical token IDs, and not token strings.\n");
+    fprintf(stream, "                                       The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
+    fprintf(stream, "  -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
+    fprintf(stream, "  -p PROMPT, --prompt PROMPT           read prompt from the argument.\n");
+    fprintf(stream, "  --stdin                              read prompt from standard input.\n");
+    fprintf(stream, "  --no-bos                             do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+    fprintf(stream, "  --log-disable                        disable logs. Makes stderr quiet when loading the model.\n");
+}
+
+static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) text;
+    (void) user_data;
+}
+
+static std::string read_prompt_from_file(const char * filepath, bool & success) {
+    success = false;
+
+    std::ifstream in(filepath, std::ios::binary);
+    if (!in) {
+        fprintf(stderr, "%s: could not open file '%s' for reading: %s\n", __func__, filepath, strerror(errno));
+        return std::string();
+    }
+    // do not assume the file is seekable (e.g. /dev/stdin)
+    std::stringstream buffer;
+    buffer << in.rdbuf();
+    if (in.fail()) {
+        fprintf(stderr, "%s: could not read the entire file '%s': %s\n", __func__, filepath, strerror(errno));
+        return std::string();
+    }
+
+    success = true;
+    return buffer.str();
+}
+
+//
+// Function: ingest_args(...) -> vector<string>
+//
+// Takes argc and argv arguments, and converts them to a vector of UTF-8 encoded
+// strings, as an STL vector<string>.
+//
+// In particular, it handles character encoding shenanigans on Windows.
+//
+// Note: raw_argc and raw_argv are not actually read at all on Windows.
+//       On Windows we call GetCommandLineW to get the arguments in wchar_t
+//       format, ignoring the regular argc/argv arguments to main().
+//
+// TODO: potential opportunity to roll common stuff into common/console.cpp
+//       in relation to Windows wchar_t shenanigans.
+static std::vector<std::string> ingest_args(int raw_argc, char ** raw_argv) {
+    std::vector<std::string> argv;
+
+    // Handle Windows, if given non-ASCII arguments.
+    // We convert wchar_t arguments into UTF-8 char* on this platform.
+    // Lets you invoke 'tokenize' on Windows cmd.exe with non-ASCII characters
+    // without throwing tantrums.
+#if defined(_WIN32)
+    int argc;
+    const LPWSTR cmdline_wargv = GetCommandLineW();
+    LPWSTR * wargv = CommandLineToArgvW(cmdline_wargv, &argc);
+
+    // silence unused arg warnings
+    (void) raw_argc;
+    (void) raw_argv;
+
+    for (int i = 0; i < argc; ++i) {
+        int length_needed = WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), 0, 0, NULL, NULL);
+        char * output_buf = (char *) calloc(length_needed+1, sizeof(char));
+        GGML_ASSERT(output_buf);
+
+        WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), output_buf, length_needed, NULL, NULL);
+        output_buf[length_needed] = '\0';
+
+        argv.push_back(output_buf);
+        free(output_buf);
+    }
+
+    LocalFree((HLOCAL) wargv);
+#else
+    int argc = raw_argc;
+    for (int i = 0; i < argc; ++i) {
+        argv.push_back(raw_argv[i]);
+    }
+#endif
+
+    GGML_ASSERT((unsigned int) argc == argv.size());
+
+    return argv;
+}
+
+//
+// Function: write_utf8_cstr_to_stdout(const char *) -> <writes to stdout>
+//
+// writes a string to standard output; taking into account that on Windows
+// to display correctly you have to use special handling. Works even if the
+// user has not set a unicode code page on a Windows cmd.exe.
+//
+// In case of invalid UTF-8, invalid_utf8 is set to true on Windows, and something
+// a human-readable is written instead.
+//
+// On non-Windows systems, simply printfs() the string.
+static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
+    invalid_utf8 = false;
+
+#if defined(_WIN32)
+    // Are we in a console?
+    HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
+    DWORD dwMode = 0;
+
+    // According to Microsoft docs:
+    // "WriteConsole fails if it is used with a standard handle that is redirected to a file."
+    // Also according to the docs, you can use GetConsoleMode to check for that.
+    if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
+        printf("%s", str);
+        return;
+    }
+
+    // MultiByteToWideChar reports an error if str is empty, don't report
+    // them as invalid_utf8.
+    if (*str == 0) {
+        return;
+    }
+    int length_needed = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, strlen(str), NULL, 0);
+    if (length_needed == 0) {
+        DWORD err = GetLastError();
+        if (err == ERROR_NO_UNICODE_TRANSLATION) {
+            invalid_utf8 = true;
+            int len = strlen(str);
+            printf("<");
+            for (int i = 0; i < len; ++i) {
+                if (i > 0) {
+                    printf(" ");
+                }
+                printf("%02x", (uint8_t) str[i]);
+            }
+            printf(">");
+            return;
+        }
+        GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way.");
+    }
+
+    LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr));
+    GGML_ASSERT(wstr);
+
+    MultiByteToWideChar(CP_UTF8, 0, str, strlen(str), wstr, length_needed);
+    WriteConsoleW(hConsole, wstr, length_needed, NULL, NULL);
+
+    free(wstr);
+#else
+    // TODO: reporting invalid_utf8 would be useful on non-Windows too.
+    // printf will silently just write bad unicode.
+    printf("%s", str);
+#endif
+}
+
+int main(int raw_argc, char ** raw_argv) {
+    const std::vector<std::string> argv = ingest_args(raw_argc, raw_argv);
+    const int argc = argv.size();
+
+    if (argc <= 1) {
+        print_usage_information(argv[0].c_str(), stderr);
+        return 1;
+    }
+
+    //////
+    // Read out all the command line arguments.
+    //////
+
+    // variables where to put any arguments we see.
+    bool printing_ids = false;
+    bool no_bos = false;
+    bool disable_logging = false;
+    const char * model_path = NULL;
+    const char * prompt_path = NULL;
+    const char * prompt_arg = NULL;
+
+    // track which arguments were explicitly given
+    // used for sanity checking down the line
+    bool model_path_set = false;
+    bool prompt_path_set = false;
+    bool prompt_set = false;
+    bool stdin_set = false;
+
+    int iarg = 1;
+    for (; iarg < argc; ++iarg) {
+        std::string arg{argv[iarg]};
+        if (arg == "-h" || arg == "--help") {
+            print_usage_information(argv[0].c_str(), stdout);
+            return 0;
+        }
+        else if (arg == "--ids") {
+            printing_ids = true;
+        }
+        else if (arg == "-m" || arg == "--model") {
+            if (model_path_set) {
+                fprintf(stderr, "Error: -m or --model specified multiple times.\n");
+                return 1;
+            }
+            model_path = argv[++iarg].c_str();
+            model_path_set = true;
+        }
+        else if (arg == "--no-bos") {
+            no_bos = true;
+        }
+        else if (arg == "-p" || arg == "--prompt") {
+            if (prompt_set) {
+                fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
+                return 1;
+            }
+            prompt_arg = argv[++iarg].c_str();
+            prompt_set = true;
+        }
+        else if (arg == "-f" || arg == "--file") {
+            if (prompt_path_set) {
+                fprintf(stderr, "Error: -f or --file specified multiple times.\n");
+                return 1;
+            }
+            prompt_path = argv[++iarg].c_str();
+            prompt_path_set = true;
+        }
+        else if (arg == "--stdin") {
+            stdin_set = true;
+        }
+        else if (arg == "--log-disable") {
+            disable_logging = true;
+        }
+        else {
+            fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
+            return 1;
+        }
+    }
+
+    //////
+    // Sanity check the command line arguments.
+    //////
+
+    // Check that we have the required stuff set.
+    if (model_path_set && model_path == NULL) {
+        fprintf(stderr, "Error: --model requires an argument.\n");
+        return 1;
+    }
+    if (!model_path_set) {
+        fprintf(stderr, "Error: must specify --model.\n");
+        return 1;
+    }
+    if (prompt_path_set && prompt_path == NULL) {
+        fprintf(stderr, "Error: --file requires an argument.\n");
+        return 1;
+    }
+    if (prompt_set && prompt_arg == NULL) {
+        fprintf(stderr, "Error: --prompt requires an argument.\n");
+        return 1;
+    }
+    const int prompts_set = !!(prompt_path_set) + !!(prompt_set) + !!(stdin_set);
+    if (prompts_set > 1) {
+        fprintf(stderr, "Error: --stdin, --file and --prompt are mutually exclusive.\n");
+        return 1;
+    }
+    // Must have some prompt.
+    if (prompts_set == 0) {
+        fprintf(stderr, "Error: must specify one of: --stdin, --file or --prompt.\n");
         return 1;
     }
 
-    const char * model_path = argv[1];
-    const char * prompt = argv[2];
+    GGML_ASSERT(model_path);
+    GGML_ASSERT(prompt_path || prompt_arg || stdin_set);
 
-    const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";
+    //////
+    // Figure out where will the prompt come from.
+    //////
+
+    std::string prompt;
+    if (prompt_path_set) {
+        bool success = false;
+        prompt = read_prompt_from_file(prompt_path, success);
+        if (!success) {
+            return 1;
+        }
+    } else if (prompt_set) {
+        prompt = prompt_arg;
+    } else {
+        GGML_ASSERT(stdin_set);
+        // we read stdin *after* loading model (early exit if model cannot
+        // be loaded, which can be a nicer user experience)
+    }
+
+    //////
+    // Start actually doing the tokenizing stuff.
+    //////
+
+#ifdef LOG_DISABLE_LOGS
+    disable_logging = true;
+#endif
+
+    if (disable_logging) {
+        llama_log_set(llama_log_callback_null, NULL);
+    }
 
     llama_backend_init();
 
     llama_model_params model_params = llama_model_default_params();
     model_params.vocab_only = true;
     llama_model * model = llama_load_model_from_file(model_path, model_params);
+    if (!model) {
+        fprintf(stderr, "Error: could not load model from file '%s'.\n", model_path);
+        return 1;
+    }
 
     llama_context_params ctx_params = llama_context_default_params();
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    if (!ctx) {
+        fprintf(stderr, "Error: could not create context.\n");
+        return 1;
+    }
+
+    // read entire prompt from stdin?
+    if (stdin_set) {
+        GGML_ASSERT(!prompt_path_set && !prompt_set);
+
+        std::stringstream stdin_buffer;
+        stdin_buffer << std::cin.rdbuf();
+        if (std::cin.fail()) {
+            fprintf(stderr, "Error: could not read the entire standard input.\n");
+            return 1;
+        }
+
+        prompt = stdin_buffer.str();
+    }
+
+    const bool model_wants_add_bos = llama_should_add_bos_token(model);
+    const bool add_bos = model_wants_add_bos && !no_bos;
 
     std::vector<llama_token> tokens;
+    tokens = ::llama_tokenize(model, prompt, add_bos, true);
 
-    tokens = ::llama_tokenize(model, prompt, true, true);
+    if (printing_ids) {
+        printf("[");
+    }
 
     for (int i = 0; i < (int) tokens.size(); i++) {
         if (printing_ids) {
-            printf("%d\n", tokens[i]);
+            if (i > 0) {
+                printf(", ");
+            }
+            printf("%d", tokens[i]);
         } else {
-            printf("%6d -> '%s'\n", tokens[i], llama_token_to_piece(ctx, tokens[i]).c_str());
+            bool invalid_utf8 = false;
+            printf("%6d -> '", tokens[i]);
+            write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
+            if (invalid_utf8) {
+                printf("' (utf-8 decode failure)\n");
+            } else {
+                printf("'\n");
+            }
         }
     }
 
+    if (printing_ids) {
+        printf("]\n");
+    }
+
+    // silence valgrind
+    llama_free(ctx);
+    llama_free_model(model);
+
     return 0;
 }
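Note: the tokenize example above grew from a fixed positional interface into a full CLI. As a usage sketch (the binary name and model file are illustrative, and the printed token IDs depend on the model's vocabulary):

    ./tokenize -m model.gguf -p "Hello world" --ids
    # prints a Python-parseable list, e.g. [1, 15043, 3186]

    printf 'Hello world' | ./tokenize -m model.gguf --stdin --no-bos
    # tokenizes standard input without a leading BOS token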
package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp

@@ -301,8 +301,8 @@ static struct ggml_tensor * llama_build_train_graphs(
         // not capturing these, to silcence warnings
         const int rope_mode = 0;
 
-        return ggml_rope_custom(
-            ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
+        return ggml_rope_ext(
+            ctx, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
         );
     };
 
@@ -341,7 +341,8 @@ static struct ggml_tensor * llama_build_train_graphs(
     struct ggml_tensor * t15 = ggml_permute      (ctx, t12, 0, 3, 1, 2);               set_name(t15, "t15");     assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch);
     struct ggml_tensor * t16;
     if (enable_flash_attn) {
-        t16 = ggml_flash_attn(ctx, t13, t14, t15, true);                               set_name(t16, "t16");     assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
+        GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
+        //t16 = ggml_flash_attn(ctx, t13, t14, t15, true);                             set_name(t16, "t16");     assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
     } else {
         struct ggml_tensor * t16_0 = ggml_mul_mat              (ctx, t14, t13);        set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
         struct ggml_tensor * t16_1 = ggml_scale_inplace        (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
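Note: ggml_rope_ext() extends the retired ggml_rope_custom() with one extra tensor argument after the positions tensor; per the ggml-kompute.cpp hunk further down, that tensor carries per-dimension RoPE frequency factors (as used by phi3), and passing nullptr, as here, keeps the previous behavior. A minimal before/after sketch, with the argument name assumed for illustration:

    // 0.2.1: ggml_rope_custom(ctx, t, KQ_pos,                           n_rot, rope_mode, n_ctx, ...);
    // 0.2.2: ggml_rope_ext   (ctx, t, KQ_pos, /*freq_factors=*/nullptr, n_rot, rope_mode, n_ctx, ...);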
package/src/llama.cpp/ggml-backend.c

@@ -1895,7 +1895,6 @@ void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * t
 
     tensor->buffer = buffer;
     tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
-    tensor->backend = tensor->view_src->backend;
     ggml_backend_buffer_init_tensor(buffer, tensor);
 }
 
package/src/llama.cpp/ggml-common.h

@@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
 // QK = number of values after dequantization
 // QK_K = super-block size
 
-#ifdef GGML_QKK_64
-#define QK_K 64
-#define K_SCALE_SIZE 4
-#else
 #define QK_K 256
 #define K_SCALE_SIZE 12
-#endif // GGML_QKK_64
 
 #if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
 // QR = QK / number of values before dequantization
@@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
 #define QI4_NL (QK4_NL / (4*QR4_NL))
 #define QR4_NL 2
 
-#if QK_K == 64
-#define QI4_XS QI4_NL
-#define QR4_XS QR4_NL
-#else
 #define QI4_XS (QK_K / (4*QR4_XS))
 #define QR4_XS 8
-#endif
 
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
@@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
 // weight is represented as x = a * q
 // 16 blocks of 16 elements each
 // Effectively 3.4375 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    uint8_t hmask[QK_K/8]; // quants - high bit
-    uint8_t qs[QK_K/4];    // quants - low 2 bits
-    uint8_t scales[2];
-    ggml_half d;           // super-block scale
-} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
-#else
 typedef struct {
     uint8_t hmask[QK_K/8]; // quants - high bit
     uint8_t qs[QK_K/4];    // quants - low 2 bits
@@ -244,20 +225,11 @@ typedef struct {
     ggml_half d;           // super-block scale
 } block_q3_K;
 static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
-#endif
 
 // 4-bit quantization
 // 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_half d[2];     // super-block scales/mins
-    uint8_t scales[2];  // 4-bit block scales/mins
-    uint8_t qs[QK_K/2]; // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
-#else
 typedef struct {
     union {
         struct {
@@ -270,21 +242,11 @@ typedef struct {
     uint8_t qs[QK_K/2]; // 4--bit quants
 } block_q4_K;
 static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
-#endif
 
 // 5-bit quantization
 // 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_half d;            // super-block scale
-    int8_t scales[QK_K/16]; // 8-bit block scales
-    uint8_t qh[QK_K/8];     // quants, high bit
-    uint8_t qs[QK_K/2];     // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
-#else
 typedef struct {
     union {
         struct {
@@ -298,7 +260,6 @@ typedef struct {
     uint8_t qs[QK_K/2]; // quants, low 4 bits
 } block_q5_K;
 static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
-#endif
 
 // 6-bit quantization
 // weight is represented as x = a * q
@@ -356,11 +317,7 @@ typedef struct {
 static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
 
 // 3.4375 bpw
-#if QK_K == 64
-#define IQ3S_N_SCALE 2
-#else
 #define IQ3S_N_SCALE QK_K/64
-#endif
 typedef struct {
     ggml_half d;
     uint8_t qs[QK_K/4];
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
 typedef struct {
     uint8_t qs[QK_K/8];      // grid index, low 8 bits
     uint8_t qh[QK_K/16];     // grid index, high 3 bits + grid shift bit (for two groups of 8)
-#if QK_K == 64
-    ggml_half d;
-#endif
     uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
 } block_iq1_m;
-#if QK_K == 64
-static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
-#else
 static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
-#endif
 
 // Used by IQ1_M quants
 typedef union {
@@ -406,9 +356,6 @@ typedef struct {
 } block_iq4_nl;
 static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
 
-#if QK_K == 64
-#define block_iq4_xs block_iq4_nl
-#else
 typedef struct {
     ggml_half d;
     uint16_t scales_h;
@@ -416,7 +363,6 @@ typedef struct {
     uint8_t qs[QK_K/2];
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
-#endif
 
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL
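As a worked check of the surviving static_asserts (QK_K = 256, K_SCALE_SIZE = 12, sizeof(ggml_half) = 2):

    // block_q4_K: 2*2 + 12 + 256/2 = 144 bytes per super-block
    //             144 * 8 / 256    = 4.5 bits per weight, matching the comment above
    // block_q5_K: 2*2 + 12 + 256/2 + 256/8 = 176 bytes
    //             176 * 8 / 256            = 5.5 bits per weight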
package/src/llama.cpp/ggml-cuda.h

@@ -38,6 +38,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
 GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
 GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
 
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
 #ifdef __cplusplus
 }
 #endif
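A minimal sketch of installing the new CUDA log callback (the handler name is hypothetical; ggml_log_callback is the existing callback type from ggml.h):

    static void my_cuda_log(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level; (void) user_data;
        fputs(text, stderr); // route backend messages wherever the host app wants
    }

    // during initialization:
    ggml_backend_cuda_log_set_callback(my_cuda_log, NULL);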
package/src/llama.cpp/ggml-impl.h

@@ -17,6 +17,18 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+#if defined(_WIN32)
+
+#define m512bh(p) p
+#define m512i(p) p
+
+#else
+
+#define m512bh(p) (__m512bh)(p)
+#define m512i(p) (__m512i)(p)
+
+#endif
+
 /**
  * Converts brain16 to float32.
  *
@@ -120,9 +132,20 @@ extern "C" {
 #ifndef __F16C__
 #define __F16C__
 #endif
+#endif
+
+// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
+#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
 #ifndef __SSE3__
 #define __SSE3__
 #endif
+#ifndef __SSSE3__
+#define __SSSE3__
+#endif
+#endif
+
+#if defined(__ARM_FEATURE_SVE)
+#include <arm_sve.h>
 #endif
 
 // 16-bit float
@@ -436,6 +459,34 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 #include <riscv_vector.h>
 #endif
 
+#if defined(__loongarch64)
+#if defined(__loongarch_asx)
+#include <lasxintrin.h>
+#endif
+#if defined(__loongarch_sx)
+#include <lsxintrin.h>
+#endif
+#endif
+
+#if defined(__loongarch_asx)
+
+typedef union {
+    int32_t i;
+    float f;
+} ft_union;
+
+/* float type data load instructions */
+static __m128 __lsx_vreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+}
+
+static __m256 __lasx_xvreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+}
+#endif
+
 #ifdef __F16C__
 
 #ifdef _MSC_VER
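Note on the ggml-impl.h hunks: m512bh()/m512i() are identity macros on _WIN32, presumably because MSVC does not accept the GCC/Clang-style casts to __m512bh/__m512i, and the LoongArch helpers use the ft_union pun to hand a float's bit pattern to the integer replicate intrinsics. Roughly, in portable terms (names assumed for illustration):

    float x = 1.5f;
    int32_t bits;
    memcpy(&bits, &x, sizeof(bits));      // the same bit-for-bit move the ft_union performs
    // __lsx_vreplgr2vr_w(bits) then broadcasts those 32 bits into every vector lane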
package/src/llama.cpp/ggml-kompute.cpp

@@ -1677,6 +1677,10 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                 } break;
             case GGML_OP_ROPE:
                 {
+#pragma message("TODO: implement phi3 frequency factors support")
+#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7225")
+                    GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
+
                     GGML_ASSERT(ne10 == ne02);
                     GGML_ASSERT(src0t == dstt);
                     // const int n_past = ((int32_t *) dst->op_params)[0];
package/src/llama.cpp/ggml-opencl.cpp

@@ -1835,7 +1835,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
         }
 
-        for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+        int64_t i12 = i02 * r2;
+        int64_t e12 = i12 + r2;
+        events.reserve(e12 - i12);
+        for (; i12 < e12; i12++) {
             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                 // copy src1 to device
                 events.emplace_back();
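Note: splitting the loop header lets events.reserve(e12 - i12) run before the loop, which reduces reallocation of the events vector while the loop body appends to it with emplace_back().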