@fugood/llama.node 1.1.5 → 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/lib/binding.ts +4 -0
  2. package/lib/index.js +6 -1
  3. package/lib/index.ts +6 -0
  4. package/lib/version.js +5 -0
  5. package/lib/version.ts +2 -0
  6. package/package.json +14 -14
  7. package/scripts/llama.cpp.patch +19 -15
  8. package/src/LlamaCompletionWorker.cpp +73 -18
  9. package/src/LlamaCompletionWorker.h +8 -0
  10. package/src/llama.cpp/CMakeLists.txt +2 -0
  11. package/src/llama.cpp/common/arg.cpp +147 -46
  12. package/src/llama.cpp/common/chat-parser.cpp +9 -1
  13. package/src/llama.cpp/common/chat.cpp +350 -3
  14. package/src/llama.cpp/common/chat.h +11 -3
  15. package/src/llama.cpp/common/common.cpp +54 -0
  16. package/src/llama.cpp/common/common.h +44 -9
  17. package/src/llama.cpp/ggml/CMakeLists.txt +5 -2
  18. package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
  19. package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
  20. package/src/llama.cpp/ggml/include/ggml.h +65 -3
  21. package/src/llama.cpp/ggml/src/CMakeLists.txt +13 -1
  22. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
  25. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
  26. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +20 -1
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
  29. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +270 -11
  31. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +3 -8
  32. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
  35. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
  37. package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
  39. package/src/llama.cpp/include/llama.h +26 -0
  40. package/src/llama.cpp/src/llama-arch.cpp +65 -0
  41. package/src/llama.cpp/src/llama-arch.h +10 -0
  42. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  43. package/src/llama.cpp/src/llama-chat.cpp +15 -4
  44. package/src/llama.cpp/src/llama-chat.h +1 -0
  45. package/src/llama.cpp/src/llama-context.cpp +37 -25
  46. package/src/llama.cpp/src/llama-context.h +6 -5
  47. package/src/llama.cpp/src/llama-graph.cpp +118 -9
  48. package/src/llama.cpp/src/llama-graph.h +38 -0
  49. package/src/llama.cpp/src/llama-hparams.h +5 -3
  50. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
  51. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
  52. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +93 -69
  53. package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
  54. package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
  55. package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
  56. package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
  57. package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
  58. package/src/llama.cpp/src/llama-memory.h +2 -2
  59. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  60. package/src/llama.cpp/src/llama-model-loader.h +3 -2
  61. package/src/llama.cpp/src/llama-model.cpp +500 -4
  62. package/src/llama.cpp/src/llama-model.h +25 -4
  63. package/src/llama.cpp/src/llama-quant.cpp +37 -1
  64. package/src/llama.cpp/src/llama-vocab.cpp +43 -0
package/src/llama.cpp/common/common.h

@@ -2,14 +2,17 @@
 
 #pragma once
 
-#include "llama-cpp.h"
-
 #include <set>
+#include <sstream>
 #include <string>
 #include <string_view>
 #include <vector>
 #include <map>
 #include <sstream>
+#include <cmath>
+
+#include "ggml-opt.h"
+#include "llama-cpp.h"
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -82,6 +85,7 @@ enum llama_example {
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
     LLAMA_EXAMPLE_DIFFUSION,
+    LLAMA_EXAMPLE_FINETUNE,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -202,6 +206,7 @@ struct common_params_speculative {
     float p_split = 0.1f; // speculative decoding split probability
     float p_min = 0.75f; // minimum speculative decoding probability (greedy)
     std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 
     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
@@ -236,10 +241,31 @@ struct common_params_diffusion {
 
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
+    COMMON_REASONING_FORMAT_AUTO,
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
     COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
+    COMMON_REASONING_FORMAT_GRANITE, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };
 
+
+struct lr_opt {
+    float lr0 = 1e-5; // learning rate at first epoch
+    float lr_min = -1;
+    float decay_epochs = -1; // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
+    float scale_epoch = 0;
+    float wd = 0;
+    unsigned epochs = 2;
+
+    unsigned epoch; // set by optimizer outer (epochs) loop
+    // learning rate decay - constant LR per epoch only for now
+    float get_lr(float e) const;
+    float get_lr() const { return get_lr(epoch); }
+    // must call after arg parse, before get_lr
+    void init();
+};
+
+struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
+
 struct common_params {
     bool vocab_only = false;
     int32_t n_predict = -1; // new tokens to predict
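The decay policy itself lives in common.cpp, which is not part of this excerpt. As a minimal sketch only, assuming a geometric decay from lr0 down to lr_min over decay_epochs (the behaviour is inferred from the field comments above, not taken from the package's implementation):

    // hypothetical sketch of lr_opt::get_lr, not the upstream code
    float lr_opt::get_lr(float e) const {
        if (decay_epochs <= 0 || lr_min <= 0 || lr_min >= lr0) {
            return lr0;                                   // no decay configured
        }
        const float t = std::min(e / decay_epochs, 1.0f); // clamp once decay_epochs is reached
        return lr0 * std::pow(lr_min / lr0, t);           // lr0 -> lr_min geometrically
    }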
@@ -375,6 +401,11 @@ struct common_params {
     bool no_mmproj = false; // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
 
+    // finetune
+    struct lr_opt lr;
+    enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
+    float val_split = 0.05f; // fraction of the data used for the validation set
+
     // embedding
     bool embedding = false; // get only sentence embedding
     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
@@ -383,11 +414,12 @@
     std::string cls_sep = "\t"; // separator of classification sequences
 
     // server params
-    int32_t port = 8080; // server listens on this network port
-    int32_t timeout_read = 600; // http read timeout in seconds
-    int32_t timeout_write = timeout_read; // http write timeout in seconds
-    int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
-    int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
+    int32_t port = 8080; // server listens on this network port
+    int32_t timeout_read = 600; // http read timeout in seconds
+    int32_t timeout_write = timeout_read; // http write timeout in seconds
+    int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
+    int32_t n_swa_checkpoints = 3; // max number of SWA checkpoints per slot
 
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
@@ -395,7 +427,7 @@
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
     int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 
@@ -440,7 +472,7 @@
     int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
     int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
     int32_t i_chunk = 0; // start processing from this chunk
-    bool imat_dat = false; // whether the legacy imatrix.dat format should be output
+    int8_t imat_dat = 0; // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)
 
     bool process_output = false; // collect data for the output tensor
     bool compute_ppl = true; // whether to compute perplexity
@@ -702,3 +734,6 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 //
 
 ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
+
+// "adamw" or "sgd" (case insensitive)
+enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
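The helper's definition lives in common.cpp, which this excerpt does not show. A sketch consistent with the comment above (the COUNT fallback for unrecognized names is an assumption):

    // hypothetical sketch of common_opt_get_optimizer, not the upstream code
    enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
        std::string name(n);
        std::transform(name.begin(), name.end(), name.begin(), ::tolower);
        if (name == "adamw") { return GGML_OPT_OPTIMIZER_TYPE_ADAMW; }
        if (name == "sgd")   { return GGML_OPT_OPTIMIZER_TYPE_SGD; }
        return GGML_OPT_OPTIMIZER_TYPE_COUNT; // treated as "no such optimizer"
    }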
package/src/llama.cpp/ggml/CMakeLists.txt

@@ -39,8 +39,9 @@ if (WIN32)
     set(CMAKE_SHARED_MODULE_PREFIX "")
 endif()
 
-option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
-option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
+option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
+option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
+set(GGML_BACKEND_DIR "" CACHE PATH "ggml: directory to load dynamic backends from (requires GGML_BACKEND_DL")
 
 #
 # option list
@@ -175,6 +176,7 @@ option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM"
 option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
 option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
 option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
+option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics output" OFF)
 option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
 option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
@@ -186,6 +188,7 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
 option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
 option(GGML_WEBGPU "ggml: use WebGPU" OFF)
 option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
+option(GGML_ZDNN "ggml: use zDNN" OFF)
 option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
 option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
 option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@@ -74,16 +74,26 @@ extern "C" {
74
74
  GGML_OPT_BUILD_TYPE_OPT = 30,
75
75
  };
76
76
 
77
+ enum ggml_opt_optimizer_type {
78
+ GGML_OPT_OPTIMIZER_TYPE_ADAMW,
79
+ GGML_OPT_OPTIMIZER_TYPE_SGD,
80
+
81
+ GGML_OPT_OPTIMIZER_TYPE_COUNT
82
+ };
83
+
77
84
  // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
78
85
  struct ggml_opt_optimizer_params {
79
- // AdamW optimizer parameters
80
86
  struct {
81
87
  float alpha; // learning rate
82
- float beta1;
83
- float beta2;
88
+ float beta1; // first AdamW momentum
89
+ float beta2; // second AdamW momentum
84
90
  float eps; // epsilon for numerical stability
85
- float wd; // weight decay for AdamW, use 0.0f to disable
91
+ float wd; // weight decay - 0.0f to disable
86
92
  } adamw;
93
+ struct {
94
+ float alpha; // learning rate
95
+ float wd; // weight decay
96
+ } sgd;
87
97
  };
88
98
 
89
99
  // callback to calculate optimizer parameters prior to a backward pass
@@ -112,8 +122,11 @@ extern "C" {
112
122
 
113
123
  int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done
114
124
 
115
- ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
116
- void * get_opt_pars_ud; // userdata for calculating optimizer parameters
125
+ ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
126
+ void * get_opt_pars_ud; // userdata for calculating optimizer parameters
127
+
128
+ // only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor
129
+ enum ggml_opt_optimizer_type optimizer;
117
130
  };
118
131
 
119
132
  // get parameters for an optimization context with defaults set where possible
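The new optimizer field is read next to the existing callback. As a usage illustration only (the helper name and the userdata layout below are assumptions, not this package's code), a get_opt_pars callback can fill both sub-structs and let the selected optimizer decide which one is consumed:

    // hypothetical callback sketch; userdata is assumed to point at a float learning rate
    static struct ggml_opt_optimizer_params my_opt_pars(void * userdata) {
        const float lr = *(const float *) userdata;
        struct ggml_opt_optimizer_params p = ggml_opt_get_default_optimizer_params(NULL);
        p.adamw.alpha = lr;   // read when optimizer == GGML_OPT_OPTIMIZER_TYPE_ADAMW
        p.sgd.alpha   = lr;   // read when optimizer == GGML_OPT_OPTIMIZER_TYPE_SGD
        p.sgd.wd      = 0.0f; // weight decay disabled
        return p;
    }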
@@ -142,6 +155,10 @@
     // get the gradient accumulator for a node from the forward graph
     GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
 
+    GGML_API enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t); //TODO consistent naming scheme
+
+    GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type);
+
     // ====== Optimization Result ======
 
     GGML_API ggml_opt_result_t ggml_opt_result_init(void);
@@ -226,12 +243,14 @@
             struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
             ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
             enum ggml_opt_loss_type loss_type, // loss to minimize
+            enum ggml_opt_optimizer_type optimizer, // sgd or adamw
             ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
             int64_t nepoch, // how many times the dataset should be iterated over
             int64_t nbatch_logical, // datapoints optimizer step, must be a multiple of ndata_batch in inputs/outputs
             float val_split, // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
             bool silent); // whether or not info prints to stderr should be suppressed
 
+
 #ifdef __cplusplus
 }
 #endif
package/src/llama.cpp/ggml/include/ggml-zdnn.h (new file)

@@ -0,0 +1,16 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+GGML_BACKEND_API ggml_backend_t ggml_backend_zdnn_init(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
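A hedged sketch of how an application might probe the new IBM zDNN backend, using only the two declarations above plus ggml_backend_free() from ggml-backend.h (the function name have_zdnn is illustrative):

    #include "ggml-zdnn.h"
    #include "ggml-backend.h"

    // returns 0 if the zDNN backend initialized on this machine, -1 otherwise
    int have_zdnn(void) {
        ggml_backend_t be = ggml_backend_zdnn_init();
        if (be == NULL) {
            return -1;         // backend unavailable
        }
        ggml_backend_free(be); // release the probe instance
        return 0;
    }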
package/src/llama.cpp/ggml/include/ggml.h

@@ -241,6 +241,8 @@
 #define GGML_ROPE_TYPE_MROPE 8
 #define GGML_ROPE_TYPE_VISION 24
 
+#define GGML_MROPE_SECTIONS 4
+
 #define GGML_UNUSED(x) (void)(x)
 
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
@@ -304,6 +306,16 @@
     GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
     GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
 
+#define GGML_TENSOR_TERNARY_OP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne2, src2, ne) \
+    GGML_TENSOR_LOCALS(size_t, nb2, src2, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
+    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
+
 #define GGML_TENSOR_BINARY_OP_LOCALS01 \
     GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
     GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
@@ -395,7 +407,8 @@ extern "C" {
         // GGML_TYPE_IQ4_NL_4_4 = 36,
         // GGML_TYPE_IQ4_NL_4_8 = 37,
        // GGML_TYPE_IQ4_NL_8_8 = 38,
-        GGML_TYPE_COUNT = 39,
+        GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
+        GGML_TYPE_COUNT = 40,
     };
 
     // precision
@@ -430,6 +443,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
         GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
+        GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
     };
 
     // available tensor operations:
@@ -438,6 +452,7 @@ extern "C" {
 
         GGML_OP_DUP,
         GGML_OP_ADD,
+        GGML_OP_ADD_ID,
         GGML_OP_ADD1,
         GGML_OP_ACC,
         GGML_OP_SUB,
@@ -527,6 +542,7 @@ extern "C" {
         GGML_OP_CROSS_ENTROPY_LOSS,
         GGML_OP_CROSS_ENTROPY_LOSS_BACK,
         GGML_OP_OPT_STEP_ADAMW,
+        GGML_OP_OPT_STEP_SGD,
 
         GGML_OP_GLU,
 
@@ -557,6 +573,7 @@ extern "C" {
         GGML_GLU_OP_REGLU,
         GGML_GLU_OP_GEGLU,
         GGML_GLU_OP_SWIGLU,
+        GGML_GLU_OP_SWIGLU_OAI,
         GGML_GLU_OP_GEGLU_ERF,
         GGML_GLU_OP_GEGLU_QUICK,
 
@@ -831,6 +848,13 @@ extern "C" {
             struct ggml_tensor * b,
             enum ggml_type type);
 
+    // dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
+    GGML_API struct ggml_tensor * ggml_add_id(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * ids);
+
     GGML_API struct ggml_tensor * ggml_add1(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
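For intuition only, the indexing documented above can be read as the following naive reference loop over contiguous F32 data (an illustration of the stated semantics, not the actual ggml kernel; all names are local to the sketch):

    // hypothetical reference loop for ggml_add_id:
    // dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
    for (int64_t i2 = 0; i2 < ne2; ++i2) {
        for (int64_t i1 = 0; i1 < ne1; ++i1) {
            const int32_t row = ids[i2*ne1 + i1]; // which row of b gets added
            for (int64_t i0 = 0; i0 < ne0; ++i0) {
                dst[(i2*ne1 + i1)*ne0 + i0] = a[(i2*ne1 + i1)*ne0 + i0] + b[row*ne0 + i0];
            }
        }
    }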
@@ -1198,6 +1222,13 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_swiglu_oai(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            float alpha,
+            float limit);
+
     // normalize along rows
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,
@@ -1570,6 +1601,10 @@ extern "C" {
             float scale,
             float max_bias);
 
+    GGML_API void ggml_soft_max_add_sinks(
+            struct ggml_tensor * a,
+            struct ggml_tensor * sinks);
+
     GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -1628,7 +1663,7 @@ extern "C" {
             struct ggml_tensor * b,
             struct ggml_tensor * c,
             int n_dims,
-            int sections[4],
+            int sections[GGML_MROPE_SECTIONS],
             int mode,
             int n_ctx_orig,
             float freq_base,
@@ -1654,6 +1689,22 @@ extern "C" {
             float beta_fast,
             float beta_slow);
 
+    GGML_API struct ggml_tensor * ggml_rope_multi_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * c,
+            int n_dims,
+            int sections[GGML_MROPE_SECTIONS],
+            int mode,
+            int n_ctx_orig,
+            float freq_base,
+            float freq_scale,
+            float ext_factor,
+            float attn_factor,
+            float beta_fast,
+            float beta_slow);
+
     GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -2052,6 +2103,10 @@ extern "C" {
     GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
             const struct ggml_tensor * a);
 
+    GGML_API void ggml_flash_attn_ext_add_sinks(
+            struct ggml_tensor * a,
+            struct ggml_tensor * sinks);
+
     // TODO: needs to be adapted to ggml_flash_attn_ext
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
@@ -2257,7 +2312,14 @@ extern "C" {
             struct ggml_tensor * grad,
             struct ggml_tensor * m,
             struct ggml_tensor * v,
-            struct ggml_tensor * adamw_params); // parameters such a the learning rate
+            struct ggml_tensor * adamw_params); // parameters such as the learning rate
+
+    // stochastic gradient descent step (with weight decay)
+    GGML_API struct ggml_tensor * ggml_opt_step_sgd(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * grad,
+            struct ggml_tensor * sgd_params); // alpha, weight decay
 
     //
     // automatic differentiation
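Element-wise, the step this op suggests can be pictured as conventional SGD with decoupled weight decay, with alpha and wd carried in sgd_params. This is the textbook formulation, offered as a reading aid rather than code taken from the package:

    // hypothetical per-element view of GGML_OP_OPT_STEP_SGD
    // alpha = learning rate, wd = weight decay (0.0f disables it)
    for (int64_t i = 0; i < n; ++i) {
        a[i] = a[i]*(1.0f - alpha*wd) - alpha*grad[i];
    }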
package/src/llama.cpp/ggml/src/CMakeLists.txt

@@ -214,6 +214,13 @@ add_library(ggml
             ggml-backend-reg.cpp)
 add_library(ggml::ggml ALIAS ggml)
 
+if (GGML_BACKEND_DIR)
+    if (NOT GGML_BACKEND_DL)
+        message(FATAL_ERROR "GGML_BACKEND_DIR requires GGML_BACKEND_DL")
+    endif()
+    target_compile_definitions(ggml PUBLIC GGML_BACKEND_DIR="${GGML_BACKEND_DIR}")
+endif()
+
 target_link_libraries(ggml PUBLIC ggml-base)
 
 if (CMAKE_SYSTEM_NAME MATCHES "Linux")
@@ -227,7 +234,11 @@ function(ggml_add_backend_library backend)
         set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
         target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
         add_dependencies(ggml ${backend})
-        install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
+        if (GGML_BACKEND_DIR)
+            install(TARGETS ${backend} LIBRARY DESTINATION ${GGML_BACKEND_DIR})
+        else()
+            install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
+        endif()
     else()
         add_library(${backend} ${ARGN})
         target_link_libraries(ggml PUBLIC ${backend})
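On the consumer side, backends built as dynamic libraries are picked up at runtime through ggml's existing loader API in ggml-backend.h; a hedged sketch (the path below is an example value, not something this package configures):

    #include "ggml-backend.h"

    void load_backends(void) {
        // scan the known locations, including GGML_BACKEND_DIR when it was baked in at build time
        ggml_backend_load_all();
        // or load a single backend explicitly by path (hypothetical example path)
        ggml_backend_load("/usr/local/lib/ggml/libggml-cuda.so");
    }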
@@ -371,6 +382,7 @@ ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
 ggml_add_backend(Vulkan)
 ggml_add_backend(WebGPU)
+ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)
 
 foreach (target ggml-base ggml)
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt

@@ -460,7 +460,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
         # binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
         message(STATUS "z17 target")
-        list(APPEND ARCH_FLAGS -march=z17)
+        list(APPEND ARCH_FLAGS -march=arch15)
     else()
         message(STATUS "Unknown target")
         message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c

@@ -589,6 +589,67 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;
 }
 
+void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_MXFP4 == 0);
+    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
+
+    const block_mxfp4 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_MXFP4;
+
+    int ib = 0;
+    float sumf = 0;
+
+#if defined __ARM_NEON
+    const int8x16_t values = vld1q_s8(kvalues_mxfp4);
+    const uint8x16_t m4b = vdupq_n_u8(0x0f);
+    uint8x16x2_t q4bits;
+    int8x16x4_t q4b;
+    int8x16x4_t q8b;
+    int32x4_t prod_1;
+    int32x4_t prod_2;
+
+    for (; ib + 1 < nb; ib += 2) {
+        q4bits.val[0] = vld1q_u8(x[ib + 0].qs);
+        q4bits.val[1] = vld1q_u8(x[ib + 1].qs);
+        q8b.val[0] = vld1q_s8(y[ib + 0].qs);
+        q8b.val[1] = vld1q_s8(y[ib + 0].qs + 16);
+        q8b.val[2] = vld1q_s8(y[ib + 1].qs);
+        q8b.val[3] = vld1q_s8(y[ib + 1].qs + 16);
+
+        q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
+        q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
+        q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
+        q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
+
+        prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
+        prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
+
+        sumf +=
+            GGML_E8M0_TO_FP32_HALF(x[ib + 0].e) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
+            GGML_E8M0_TO_FP32_HALF(x[ib + 1].e) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
+    }
+
+#endif
+    for (; ib < nb; ++ib) {
+        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
+        int sumi1 = 0;
+        int sumi2 = 0;
+        for (int j = 0; j < QK_MXFP4/2; ++j) {
+            sumi1 += y[ib].qs[j + 0] * kvalues_mxfp4[x[ib].qs[j] & 0xf];
+            sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >> 4];
+        }
+        sumf += d * (sumi1 + sumi2);
+    }
+    *s = sumf;
+}
+
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
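The per-block scale x[ib].e above is an E8M0 value: an 8-bit, exponent-only float used by the MXFP4 format. GGML_E8M0_TO_FP32_HALF is defined elsewhere in ggml; as a rough mental model only (assuming the usual bias of 127 and taking the "half" in the name at face value), it behaves roughly like:

    #include <math.h>
    #include <stdint.h>

    // hypothetical model of GGML_E8M0_TO_FP32_HALF: decode 2^(e - 127) and halve it;
    // the real macro may handle edge cases (e.g. e == 0) differently
    static inline float e8m0_to_fp32_half(uint8_t e) {
        return ldexpf(0.5f, (int) e - 127);
    }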
package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c

@@ -66,6 +66,12 @@ static inline int hsum_i32_4(const __m128i a) {
 }
 
 #if defined(__AVX2__) || defined(__AVX512F__)
+static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
+    const __m256i ax = _mm256_sign_epi8(x, x);
+    const __m256i sy = _mm256_sign_epi8(y, x);
+    return _mm256_maddubs_epi16(ax, sy);
+}
+
 // spread 32 bits to 32 bytes { 0x00, 0xFF }
 static inline __m256i bytes_from_bits_32(const uint8_t * x) {
     uint32_t x32;
@@ -261,6 +267,11 @@ static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const
     return _mm256_set_m128(_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x1) * GGML_CPU_FP16_TO_FP32(y1)),
                            _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0)));
 }
+
+static inline __m256 quad_mx_delta_float(const int8_t x0, const float y0, const int8_t x1, const float y1) {
+    return _mm256_set_m128(_mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x1) * GGML_CPU_FP16_TO_FP32(y1)),
+                           _mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x0) * GGML_CPU_FP16_TO_FP32(y0)));
+}
 #endif
 #elif defined(__SSSE3__)
 // horizontally add 4x4 floats
@@ -746,6 +757,91 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 #endif
 }
 
+void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_MXFP4 == 0);
+    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
+
+    const block_mxfp4 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_MXFP4;
+
+    int ib = 0;
+    float sumf = 0;
+
+#if defined __AVX2__
+
+    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_mxfp4);
+    const __m128i m4b = _mm_set1_epi8(0x0f);
+    const __m256i mone = _mm256_set1_epi16(1);
+
+    __m256 accum1 = _mm256_setzero_ps();
+    __m256 accum2 = _mm256_setzero_ps();
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
+        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
+        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs);
+        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs);
+        const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
+                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
+        const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
+                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
+        const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
+        const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
+        const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
+        const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
+        accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 0].e)),
+                                 _mm256_cvtepi32_ps(p_1), accum1);
+        accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 1].e)),
+                                 _mm256_cvtepi32_ps(p_2), accum2);
+    }
+
+    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+
+#elif defined __AVX__
+    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_mxfp4);
+    const __m128i m4b = _mm_set1_epi8(0x0f);
+
+    __m256 accum = _mm256_setzero_ps();
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
+        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
+        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
+        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
+
+        const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
+        const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
+        const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
+        const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
+
+        const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
+        const __m256 deltas = quad_mx_delta_float(x[ib].e, y[ib].d, x[ib + 1].e, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
+    }
+
+    sumf = hsum_float_8(accum);
+
+#endif
+    for (; ib < nb; ++ib) {
+        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
+        int sumi1 = 0;
+        int sumi2 = 0;
+        for (int j = 0; j < QK_MXFP4/2; ++j) {
+            sumi1 += y[ib].qs[j + 0] * kvalues_mxfp4[x[ib].qs[j] & 0xf];
+            sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >> 4];
+        }
+        sumf += d * (sumi1 + sumi2);
+    }
+    *s = sumf;
+}
+
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
@@ -3206,14 +3302,6 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 #endif
 }
 
-#if defined(__AVX2__)
-static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
-    const __m256i ax = _mm256_sign_epi8(x, x);
-    const __m256i sy = _mm256_sign_epi8(y, x);
-    return _mm256_maddubs_epi16(ax, sy);
-}
-#endif
-
 void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
     assert(nrc == 1);