@fugood/llama.node 0.3.12 → 0.3.14

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (159)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +2 -1
  18. package/package.json +1 -1
  19. package/src/LlamaCompletionWorker.cpp +14 -0
  20. package/src/LlamaContext.cpp +110 -79
  21. package/src/LlamaContext.h +1 -1
  22. package/src/common.hpp +1 -2
  23. package/src/llama.cpp/.github/workflows/build.yml +95 -13
  24. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  25. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  27. package/src/llama.cpp/common/CMakeLists.txt +23 -6
  28. package/src/llama.cpp/common/arg.cpp +292 -14
  29. package/src/llama.cpp/common/chat.cpp +1128 -315
  30. package/src/llama.cpp/common/chat.h +135 -0
  31. package/src/llama.cpp/common/common.cpp +27 -171
  32. package/src/llama.cpp/common/common.h +41 -73
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  34. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  35. package/src/llama.cpp/common/llguidance.cpp +3 -3
  36. package/src/llama.cpp/common/log.cpp +1 -0
  37. package/src/llama.cpp/common/log.h +2 -1
  38. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
  39. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
  40. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  41. package/src/llama.cpp/common/sampling.cpp +93 -49
  42. package/src/llama.cpp/common/speculative.cpp +6 -5
  43. package/src/llama.cpp/common/speculative.h +1 -1
  44. package/src/llama.cpp/docs/build.md +47 -9
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  47. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  48. package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
  49. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  50. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  52. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  54. package/src/llama.cpp/examples/llava/clip.h +19 -3
  55. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  56. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  57. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  58. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  59. package/src/llama.cpp/examples/main/main.cpp +73 -28
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +115 -79
  67. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/server/httplib.h +381 -292
  69. package/src/llama.cpp/examples/server/server.cpp +134 -128
  70. package/src/llama.cpp/examples/server/utils.hpp +95 -106
  71. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  72. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  73. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  74. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  75. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  76. package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
  77. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  79. package/src/llama.cpp/ggml/include/ggml.h +6 -2
  80. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  81. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  82. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  83. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  84. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  85. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  86. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  87. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  88. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  89. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  90. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
  96. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
  102. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  103. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  104. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  105. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  106. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  107. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
  109. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  110. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  111. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  112. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  115. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  116. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  117. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  121. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
  124. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  125. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  128. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
  129. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
  130. package/src/llama.cpp/ggml/src/ggml.c +9 -4
  131. package/src/llama.cpp/include/llama.h +32 -14
  132. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  133. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  134. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  135. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  136. package/src/llama.cpp/requirements.txt +1 -0
  137. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  138. package/src/llama.cpp/src/llama-arch.h +1 -0
  139. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  140. package/src/llama.cpp/src/llama-grammar.cpp +183 -183
  141. package/src/llama.cpp/src/llama-grammar.h +13 -4
  142. package/src/llama.cpp/src/llama-impl.h +6 -6
  143. package/src/llama.cpp/src/llama-kv-cache.h +2 -1
  144. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  145. package/src/llama.cpp/src/llama-mmap.h +1 -0
  146. package/src/llama.cpp/src/llama-model.cpp +70 -6
  147. package/src/llama.cpp/src/llama-sampling.cpp +174 -67
  148. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  149. package/src/llama.cpp/src/llama.cpp +154 -5
  150. package/src/llama.cpp/src/unicode.cpp +9 -2
  151. package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
  152. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  153. package/src/llama.cpp/tests/test-chat.cpp +691 -325
  154. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  155. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  156. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  157. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
  158. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  159. package/src/llama.cpp/common/chat.hpp +0 -52
@@ -28,7 +28,7 @@
28
28
  #define UNUSED GGML_UNUSED
29
29
 
30
30
  // reference implementation for deterministic creation of model files
31
- void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
31
+ void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k) {
32
32
  static const int qk = QK4_0;
33
33
 
34
34
  assert(k % qk == 0);
@@ -65,7 +65,7 @@ void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, in
65
65
  }
66
66
  }
67
67
 
68
- void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
68
+ void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k) {
69
69
  const int qk = QK4_1;
70
70
 
71
71
  assert(k % qk == 0);
@@ -102,7 +102,7 @@ void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, in
102
102
  }
103
103
  }
104
104
 
105
- void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
105
+ void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k) {
106
106
  static const int qk = QK5_0;
107
107
 
108
108
  assert(k % qk == 0);
@@ -146,7 +146,7 @@ void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, in
146
146
  }
147
147
  }
148
148
 
149
- void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
149
+ void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k) {
150
150
  const int qk = QK5_1;
151
151
 
152
152
  assert(k % qk == 0);
@@ -191,7 +191,7 @@ void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, in
191
191
  }
192
192
 
193
193
  // reference implementation for deterministic creation of model files
194
- void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
194
+ void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k) {
195
195
  assert(k % QK8_0 == 0);
196
196
  const int nb = k / QK8_0;
197
197
 
@@ -217,7 +217,7 @@ void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, in
217
217
  }
218
218
 
219
219
  // reference implementation for deterministic creation of model files
220
- void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
220
+ void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k) {
221
221
  assert(QK8_1 == 32);
222
222
  assert(k % QK8_1 == 0);
223
223
  const int nb = k / QK8_1;
@@ -252,7 +252,7 @@ void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, in
252
252
  }
253
253
  }
254
254
 
255
- void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
255
+ void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
256
256
  static const int qk = QK4_0;
257
257
 
258
258
  assert(k % qk == 0);
@@ -272,7 +272,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int6
272
272
  }
273
273
  }
274
274
 
275
- void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int64_t k) {
275
+ void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
276
276
  static const int qk = QK4_1;
277
277
 
278
278
  assert(k % qk == 0);
@@ -293,7 +293,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int6
293
293
  }
294
294
  }
295
295
 
296
- void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int64_t k) {
296
+ void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
297
297
  static const int qk = QK5_0;
298
298
 
299
299
  assert(k % qk == 0);
@@ -319,7 +319,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int6
319
319
  }
320
320
  }
321
321
 
322
- void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int64_t k) {
322
+ void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
323
323
  static const int qk = QK5_1;
324
324
 
325
325
  assert(k % qk == 0);
@@ -346,7 +346,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int6
346
346
  }
347
347
  }
348
348
 
349
- void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) {
349
+ void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
350
350
  static const int qk = QK8_0;
351
351
 
352
352
  assert(k % qk == 0);
@@ -376,8 +376,8 @@ static inline int nearest_int(float fval) {
376
376
  return (i & 0x007fffff) - 0x00400000;
377
377
  }
378
378
 
379
- static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type,
380
- const float * restrict qw) {
379
+ static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type,
380
+ const float * GGML_RESTRICT qw) {
381
381
  float max = 0;
382
382
  float amax = 0;
383
383
  for (int i = 0; i < n; ++i) {
@@ -445,7 +445,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
445
445
  return scale;
446
446
  }
447
447
 
448
- static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) {
448
+ static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) {
449
449
  float max = 0;
450
450
  float amax = 0;
451
451
  for (int i = 0; i < n; ++i) {
@@ -504,7 +504,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
504
504
  return 1/iscale;
505
505
  }
506
506
 
507
- static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
507
+ static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min,
508
508
  int ntry, float alpha) {
509
509
  float min = x[0];
510
510
  float max = x[0];
@@ -547,8 +547,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
547
547
  return scale;
548
548
  }
549
549
 
550
- static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
551
- uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
550
+ static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
551
+ uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
552
552
  float rmin, float rdelta, int nstep, bool use_mad) {
553
553
  float min = x[0];
554
554
  float max = x[0];
@@ -628,7 +628,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
628
628
  return scale;
629
629
  }
630
630
 
631
- static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
631
+ static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) {
632
632
  if (j < 4) {
633
633
  *d = q[j] & 63; *m = q[j + 4] & 63;
634
634
  } else {
@@ -639,7 +639,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
639
639
 
640
640
  //========================- 2-bit (de)-quantization
641
641
 
642
- void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, int64_t k) {
642
+ void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k) {
643
643
  assert(k % QK_K == 0);
644
644
  const int nb = k / QK_K;
645
645
 
@@ -709,7 +709,7 @@ void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, in
709
709
  }
710
710
  }
711
711
 
712
- void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int64_t k) {
712
+ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
713
713
  assert(k % QK_K == 0);
714
714
  const int nb = k / QK_K;
715
715
 
@@ -741,8 +741,8 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int6
741
741
  }
742
742
  }
743
743
 
744
- static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
745
- uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
744
+ static float make_qkx3_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
745
+ uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
746
746
  float rmin, float rdelta, int nstep, bool use_mad) {
747
747
  float min = x[0];
748
748
  float max = x[0];
@@ -824,7 +824,7 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
824
824
  return scale;
825
825
  }
826
826
 
827
- static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) {
827
+ static float make_qp_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, const float * quant_weights) {
828
828
  float max = 0;
829
829
  for (int i = 0; i < n; ++i) {
830
830
  max = MAX(max, x[i]);
@@ -897,7 +897,7 @@ static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t *
897
897
  return sumlx/suml2;
898
898
  }
899
899
 
900
- static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
900
+ static void quantize_row_q2_K_impl(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k, const float * GGML_RESTRICT quant_weights) {
901
901
  GGML_ASSERT(quant_weights);
902
902
  assert(k % QK_K == 0);
903
903
  const int nb = k / QK_K;
@@ -917,7 +917,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
917
917
  for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
918
918
  float sigma2 = sumx2/QK_K;
919
919
  for (int j = 0; j < QK_K/16; ++j) {
920
- const float * restrict qw = quant_weights + QK_K * i + 16*j;
920
+ const float * GGML_RESTRICT qw = quant_weights + QK_K * i + 16*j;
921
921
  for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
922
922
  for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
923
923
  scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
@@ -959,7 +959,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
959
959
  }
960
960
  }
961
961
 
962
- size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
962
+ size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
963
963
  size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
964
964
  if (!quant_weights) {
965
965
  quantize_row_q2_K_ref(src, dst, (int64_t)nrow*n_per_row);
@@ -977,7 +977,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nr
977
977
 
978
978
  //========================= 3-bit (de)-quantization
979
979
 
980
- void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, int64_t k) {
980
+ void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k) {
981
981
  assert(k % QK_K == 0);
982
982
  const int nb = k / QK_K;
983
983
 
@@ -1053,7 +1053,7 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in
1053
1053
  }
1054
1054
  }
1055
1055
 
1056
- void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) {
1056
+ void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
1057
1057
  assert(k % QK_K == 0);
1058
1058
  const int nb = k / QK_K;
1059
1059
 
@@ -1067,8 +1067,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6
1067
1067
 
1068
1068
  const float d_all = GGML_FP16_TO_FP32(x[i].d);
1069
1069
 
1070
- const uint8_t * restrict q = x[i].qs;
1071
- const uint8_t * restrict hm = x[i].hmask;
1070
+ const uint8_t * GGML_RESTRICT q = x[i].qs;
1071
+ const uint8_t * GGML_RESTRICT hm = x[i].hmask;
1072
1072
  uint8_t m = 1;
1073
1073
 
1074
1074
  memcpy(aux, x[i].scales, 12);
@@ -1103,7 +1103,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6
1103
1103
  }
1104
1104
  }
1105
1105
 
1106
- static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
1106
+ static void quantize_row_q3_K_impl(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t n_per_row, const float * GGML_RESTRICT quant_weights) {
1107
1107
  assert(n_per_row % QK_K == 0);
1108
1108
  const int nb = n_per_row / QK_K;
1109
1109
 
@@ -1187,7 +1187,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
1187
1187
  }
1188
1188
  }
1189
1189
 
1190
- size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1190
+ size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1191
1191
  size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
1192
1192
  if (!quant_weights) {
1193
1193
  quantize_row_q3_K_ref(src, dst, (int64_t)nrow*n_per_row);
@@ -1205,7 +1205,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nr
1205
1205
 
1206
1206
  // ====================== 4-bit (de)-quantization
1207
1207
 
1208
- void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, int64_t k) {
1208
+ void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k) {
1209
1209
  assert(k % QK_K == 0);
1210
1210
  const int nb = k / QK_K;
1211
1211
 
@@ -1277,7 +1277,7 @@ void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, in
1277
1277
  }
1278
1278
  }
1279
1279
 
1280
- void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int64_t k) {
1280
+ void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
1281
1281
  assert(k % QK_K == 0);
1282
1282
  const int nb = k / QK_K;
1283
1283
 
@@ -1301,7 +1301,7 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int6
1301
1301
  }
1302
1302
  }
1303
1303
 
1304
- static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
1304
+ static void quantize_row_q4_K_impl(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
1305
1305
  assert(n_per_row % QK_K == 0);
1306
1306
  const int64_t nb = n_per_row / QK_K;
1307
1307
 
@@ -1374,7 +1374,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
1374
1374
  }
1375
1375
  }
1376
1376
 
1377
- size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1377
+ size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1378
1378
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
1379
1379
  if (!quant_weights) {
1380
1380
  quantize_row_q4_K_ref(src, dst, (int64_t)nrow*n_per_row);
@@ -1392,7 +1392,7 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nr
1392
1392
 
1393
1393
  // ====================== 5-bit (de)-quantization
1394
1394
 
1395
- void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, int64_t k) {
1395
+ void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k) {
1396
1396
  assert(k % QK_K == 0);
1397
1397
  const int64_t nb = k / QK_K;
1398
1398
 
@@ -1454,8 +1454,8 @@ void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, in
1454
1454
  }
1455
1455
  }
1456
1456
 
1457
- uint8_t * restrict qh = y[i].qh;
1458
- uint8_t * restrict ql = y[i].qs;
1457
+ uint8_t * GGML_RESTRICT qh = y[i].qh;
1458
+ uint8_t * GGML_RESTRICT ql = y[i].qs;
1459
1459
  memset(qh, 0, QK_K/8);
1460
1460
 
1461
1461
  uint8_t m1 = 1, m2 = 2;
@@ -1479,7 +1479,7 @@ void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, in
1479
1479
  }
1480
1480
  }
1481
1481
 
1482
- void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int64_t k) {
1482
+ void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
1483
1483
  assert(k % QK_K == 0);
1484
1484
  const int64_t nb = k / QK_K;
1485
1485
 
@@ -1506,7 +1506,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int6
1506
1506
  }
1507
1507
  }
1508
1508
 
1509
- static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
1509
+ static void quantize_row_q5_K_impl(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
1510
1510
  assert(n_per_row % QK_K == 0);
1511
1511
  const int64_t nb = n_per_row / QK_K;
1512
1512
 
@@ -1573,8 +1573,8 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
1573
1573
  }
1574
1574
  }
1575
1575
 
1576
- uint8_t * restrict qh = y[i].qh;
1577
- uint8_t * restrict ql = y[i].qs;
1576
+ uint8_t * GGML_RESTRICT qh = y[i].qh;
1577
+ uint8_t * GGML_RESTRICT ql = y[i].qs;
1578
1578
  memset(qh, 0, QK_K/8);
1579
1579
 
1580
1580
  uint8_t m1 = 1, m2 = 2;
@@ -1599,7 +1599,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
1599
1599
  }
1600
1600
  }
1601
1601
 
1602
- size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1602
+ size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1603
1603
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
1604
1604
  if (!quant_weights) {
1605
1605
  quantize_row_q5_K_ref(src, dst, (int64_t)nrow*n_per_row);
@@ -1617,7 +1617,7 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nr
1617
1617
 
1618
1618
  // ====================== 6-bit (de)-quantization
1619
1619
 
1620
- void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, int64_t k) {
1620
+ void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k) {
1621
1621
  assert(k % QK_K == 0);
1622
1622
  const int64_t nb = k / QK_K;
1623
1623
 
@@ -1667,8 +1667,8 @@ void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, in
1667
1667
  }
1668
1668
  }
1669
1669
 
1670
- uint8_t * restrict ql = y[i].ql;
1671
- uint8_t * restrict qh = y[i].qh;
1670
+ uint8_t * GGML_RESTRICT ql = y[i].ql;
1671
+ uint8_t * GGML_RESTRICT qh = y[i].qh;
1672
1672
  for (int j = 0; j < QK_K; j += 128) {
1673
1673
  for (int l = 0; l < 32; ++l) {
1674
1674
  const uint8_t q1 = L[j + l + 0] & 0xF;
@@ -1687,16 +1687,16 @@ void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, in
1687
1687
  }
1688
1688
  }
1689
1689
 
1690
- void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int64_t k) {
1690
+ void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
1691
1691
  assert(k % QK_K == 0);
1692
1692
  const int64_t nb = k / QK_K;
1693
1693
 
1694
1694
  for (int i = 0; i < nb; i++) {
1695
1695
  const float d = GGML_FP16_TO_FP32(x[i].d);
1696
1696
 
1697
- const uint8_t * restrict ql = x[i].ql;
1698
- const uint8_t * restrict qh = x[i].qh;
1699
- const int8_t * restrict sc = x[i].scales;
1697
+ const uint8_t * GGML_RESTRICT ql = x[i].ql;
1698
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
1699
+ const int8_t * GGML_RESTRICT sc = x[i].scales;
1700
1700
 
1701
1701
  for (int n = 0; n < QK_K; n += 128) {
1702
1702
  for (int l = 0; l < 32; ++l) {
@@ -1718,7 +1718,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int6
1718
1718
  }
1719
1719
  }
1720
1720
 
1721
- static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
1721
+ static void quantize_row_q6_K_impl(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
1722
1722
  assert(n_per_row % QK_K == 0);
1723
1723
  const int64_t nb = n_per_row / QK_K;
1724
1724
 
@@ -1781,8 +1781,8 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
1781
1781
  }
1782
1782
  }
1783
1783
 
1784
- uint8_t * restrict ql = y[i].ql;
1785
- uint8_t * restrict qh = y[i].qh;
1784
+ uint8_t * GGML_RESTRICT ql = y[i].ql;
1785
+ uint8_t * GGML_RESTRICT qh = y[i].qh;
1786
1786
  for (int j = 0; j < QK_K; j += 128) {
1787
1787
  for (int l = 0; l < 32; ++l) {
1788
1788
  const uint8_t q1 = L[j + l + 0] & 0xF;
@@ -1802,7 +1802,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
1802
1802
  }
1803
1803
  }
1804
1804
 
1805
- size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1805
+ size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1806
1806
  size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
1807
1807
  if (!quant_weights) {
1808
1808
  quantize_row_q6_K_ref(src, dst, (int64_t)nrow*n_per_row);
@@ -1818,7 +1818,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nr
1818
1818
  return nrow * row_size;
1819
1819
  }
1820
1820
 
1821
- static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
1821
+ static void quantize_row_q4_0_impl(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
1822
1822
  static_assert(QK4_0 == 32, "QK4_0 must be 32");
1823
1823
 
1824
1824
  if (!quant_weights) {
@@ -1846,7 +1846,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
1846
1846
  }
1847
1847
  }
1848
1848
 
1849
- size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1849
+ size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1850
1850
  if (!quant_weights) {
1851
1851
  quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row);
1852
1852
  return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
@@ -1861,7 +1861,7 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nr
1861
1861
  return nrow * row_size;
1862
1862
  }
1863
1863
 
1864
- static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
1864
+ static void quantize_row_q4_1_impl(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
1865
1865
  static_assert(QK4_1 == 32, "QK4_1 must be 32");
1866
1866
 
1867
1867
  if (!quant_weights) {
@@ -1891,7 +1891,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
1891
1891
  }
1892
1892
  }
1893
1893
 
1894
- size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1894
+ size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1895
1895
  if (!quant_weights) {
1896
1896
  quantize_row_q4_1_ref(src, dst, (int64_t)nrow*n_per_row);
1897
1897
  return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
@@ -1906,7 +1906,7 @@ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nr
1906
1906
  return nrow * row_size;
1907
1907
  }
1908
1908
 
1909
- static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
1909
+ static void quantize_row_q5_0_impl(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
1910
1910
  static_assert(QK5_0 == 32, "QK5_0 must be 32");
1911
1911
 
1912
1912
  if (!quant_weights) {
@@ -1945,7 +1945,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
1945
1945
  }
1946
1946
  }
1947
1947
 
1948
- size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1948
+ size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
1949
1949
  if (!quant_weights) {
1950
1950
  quantize_row_q5_0_ref(src, dst, (int64_t)nrow*n_per_row);
1951
1951
  return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
@@ -1960,7 +1960,7 @@ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nr
1960
1960
  return nrow * row_size;
1961
1961
  }
1962
1962
 
1963
- static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int64_t n_per_row, const float * quant_weights) {
1963
+ static void quantize_row_q5_1_impl(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
1964
1964
  static_assert(QK5_1 == 32, "QK5_1 must be 32");
1965
1965
 
1966
1966
  if (!quant_weights) {
@@ -1998,7 +1998,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
1998
1998
  }
1999
1999
  }
2000
2000
 
2001
- size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2001
+ size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2002
2002
  if (!quant_weights) {
2003
2003
  quantize_row_q5_1_ref(src, dst, (int64_t)nrow*n_per_row);
2004
2004
  return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
@@ -2013,7 +2013,7 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nr
2013
2013
  return nrow * row_size;
2014
2014
  }
2015
2015
 
2016
- size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2016
+ size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2017
2017
  (void)quant_weights; // not used
2018
2018
  const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
2019
2019
  quantize_row_q8_0_ref(src, dst, (int64_t)nrow*n_per_row);
@@ -2022,7 +2022,7 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr
2022
2022
 
2023
2023
  // ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
2024
2024
 
2025
- void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, int64_t k) {
2025
+ void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k) {
2026
2026
  assert(k % QK_K == 0);
2027
2027
  const int64_t nb = k / QK_K;
2028
2028
 
@@ -2088,7 +2088,7 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y,
2088
2088
  }
2089
2089
  }
2090
2090
 
2091
- void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, int64_t k) {
2091
+ void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k) {
2092
2092
  assert(k % QK_K == 0);
2093
2093
  const int64_t nb = k / QK_K;
2094
2094
 
@@ -2120,21 +2120,21 @@ void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y,
2120
2120
  }
2121
2121
  }
2122
2122
 
2123
- size_t quantize_tq1_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2123
+ size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2124
2124
  (void)quant_weights; // not used
2125
2125
  const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row);
2126
2126
  quantize_row_tq1_0_ref(src, dst, (int64_t)nrow*n_per_row);
2127
2127
  return nrow * row_size;
2128
2128
  }
2129
2129
 
2130
- size_t quantize_tq2_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2130
+ size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2131
2131
  (void)quant_weights; // not used
2132
2132
  const size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row);
2133
2133
  quantize_row_tq2_0_ref(src, dst, (int64_t)nrow*n_per_row);
2134
2134
  return nrow * row_size;
2135
2135
  }
2136
2136
 
2137
- void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, int64_t k) {
2137
+ void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2138
2138
  assert(k % QK_K == 0);
2139
2139
  const int64_t nb = k / QK_K;
2140
2140
 
@@ -2173,7 +2173,7 @@ void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, in
2173
2173
  }
2174
2174
  }
2175
2175
 
2176
- void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, int64_t k) {
2176
+ void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2177
2177
  assert(k % QK_K == 0);
2178
2178
  const int64_t nb = k / QK_K;
2179
2179
 
@@ -2194,7 +2194,7 @@ void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, in
2194
2194
 
2195
2195
  // ====================== "True" 2-bit (de)-quantization
2196
2196
 
2197
- void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) {
2197
+ void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2198
2198
  assert(k % QK_K == 0);
2199
2199
  const int64_t nb = k / QK_K;
2200
2200
 
@@ -2222,7 +2222,7 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
2222
2222
 
2223
2223
  // ====================== 2.3125 bpw (de)-quantization
2224
2224
 
2225
- void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int64_t k) {
2225
+ void dequantize_row_iq2_xs(const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2226
2226
  assert(k % QK_K == 0);
2227
2227
  const int64_t nb = k / QK_K;
2228
2228
 
@@ -2249,7 +2249,7 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
2249
2249
 
2250
2250
  // ====================== 2.5625 bpw (de)-quantization
2251
2251
 
2252
- void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int64_t k) {
2252
+ void dequantize_row_iq2_s(const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2253
2253
  assert(k % QK_K == 0);
2254
2254
  const int64_t nb = k / QK_K;
2255
2255
 
@@ -2281,7 +2281,7 @@ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, in
2281
2281
 
2282
2282
  // ====================== 3.0625 bpw (de)-quantization
2283
2283
 
2284
- void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int64_t k) {
2284
+ void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2285
2285
  assert(k % QK_K == 0);
2286
2286
  const int64_t nb = k / QK_K;
2287
2287
 
@@ -2313,7 +2313,7 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
2313
2313
 
2314
2314
  // ====================== 3.3125 bpw (de)-quantization
2315
2315
 
2316
- void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int64_t k) {
2316
+ void dequantize_row_iq3_s(const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2317
2317
  assert(k % QK_K == 0);
2318
2318
  const int64_t nb = k / QK_K;
2319
2319
 
@@ -2356,7 +2356,7 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
2356
2356
 
2357
2357
  // ====================== 1.5625 bpw (de)-quantization
2358
2358
 
2359
- void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int64_t k) {
2359
+ void dequantize_row_iq1_s(const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2360
2360
  assert(k % QK_K == 0);
2361
2361
  const int64_t nb = k / QK_K;
2362
2362
 
@@ -2381,7 +2381,7 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
2381
2381
  }
2382
2382
  }
2383
2383
 
2384
- void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int64_t k) {
2384
+ void dequantize_row_iq1_m(const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2385
2385
  assert(k % QK_K == 0);
2386
2386
  const int64_t nb = k / QK_K;
2387
2387
 
@@ -2433,7 +2433,7 @@ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, in
2433
2433
 
2434
2434
  static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
2435
2435
 
2436
- void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int64_t k) {
2436
+ void dequantize_row_iq4_nl(const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2437
2437
  assert(k % QK4_NL == 0);
2438
2438
  const int64_t nb = k / QK4_NL;
2439
2439
 
@@ -2451,7 +2451,7 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
2451
2451
  }
2452
2452
  }
2453
2453
 
2454
- void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) {
2454
+ void dequantize_row_iq4_xs(const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2455
2455
  assert(k % QK_K == 0);
2456
2456
  const int64_t nb = k / QK_K;
2457
2457
 
@@ -2476,7 +2476,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
2476
2476
 
2477
2477
  //===================================== Q8_K ==============================================
2478
2478
 
2479
- void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, int64_t k) {
2479
+ void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k) {
2480
2480
  assert(k % QK_K == 0);
2481
2481
  const int64_t nb = k / QK_K;
2482
2482
 
@@ -2515,7 +2515,7 @@ void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, in
2515
2515
  }
2516
2516
  }
2517
2517
 
2518
- void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int64_t k) {
2518
+ void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
2519
2519
  assert(k % QK_K == 0);
2520
2520
  const int64_t nb = k / QK_K;
2521
2521
 
@@ -2927,8 +2927,8 @@ void iq2xs_free_impl(enum ggml_type type) {
2927
2927
  }
2928
2928
  }
2929
2929
 
2930
- static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
2931
- const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
2930
+ static int iq2_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
2931
+ const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, int8_t * GGML_RESTRICT L) {
2932
2932
  int num_neighbors = neighbours[0];
2933
2933
  GGML_ASSERT(num_neighbors > 0);
2934
2934
  float best_d2 = FLT_MAX;
@@ -2951,7 +2951,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u
2951
2951
  return grid_index;
2952
2952
  }
2953
2953
 
2954
- static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
2954
+ static void quantize_row_iq2_xxs_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
2955
2955
 
2956
2956
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
2957
2957
 
@@ -3124,7 +3124,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
3124
3124
  }
3125
3125
  }
3126
3126
 
3127
- static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
3127
+ static void quantize_row_iq2_xs_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
3128
3128
 
3129
3129
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
3130
3130
 
@@ -3304,7 +3304,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
3304
3304
  }
3305
3305
  }
3306
3306
 
3307
- size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3307
+ size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3308
3308
  GGML_ASSERT(n_per_row%QK_K == 0);
3309
3309
  int64_t nblock = n_per_row/QK_K;
3310
3310
  char * qrow = (char *)dst;
@@ -3316,7 +3316,7 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t
3316
3316
  return nrow * nblock * sizeof(block_iq2_xxs);
3317
3317
  }
3318
3318
 
3319
- size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3319
+ size_t quantize_iq2_xs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3320
3320
  GGML_ASSERT(n_per_row%QK_K == 0);
3321
3321
  int64_t nblock = n_per_row/QK_K;
3322
3322
  char * qrow = (char *)dst;
@@ -3521,8 +3521,8 @@ void iq3xs_free_impl(int grid_size) {
3521
3521
  }
3522
3522
  }
3523
3523
 
3524
- static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const uint32_t * restrict grid,
3525
- const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
3524
+ static int iq3_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint32_t * GGML_RESTRICT grid,
3525
+ const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, int8_t * GGML_RESTRICT L) {
3526
3526
  int num_neighbors = neighbours[0];
3527
3527
  GGML_ASSERT(num_neighbors > 0);
3528
3528
  float best_d2 = FLT_MAX;
@@ -3545,8 +3545,8 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u
3545
3545
  return grid_index;
3546
3546
  }
3547
3547
 
3548
- static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int64_t n,
3549
- const float * restrict quant_weights) {
3548
+ static void quantize_row_iq3_xxs_impl(int grid_size, const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n,
3549
+ const float * GGML_RESTRICT quant_weights) {
3550
3550
 
3551
3551
  const int gindex = iq3_data_index(grid_size);
3552
3552
 
@@ -3758,7 +3758,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
3758
3758
  }
3759
3759
  }
3760
3760
 
3761
- size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3761
+ size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3762
3762
  GGML_ASSERT(n_per_row%QK_K == 0);
3763
3763
  int64_t nblock = n_per_row/QK_K;
3764
3764
  char * qrow = (char *)dst;
@@ -3770,13 +3770,13 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t
3770
3770
  return nrow * nblock * sizeof(block_iq3_xxs);
3771
3771
  }
3772
3772
 
3773
- void quantize_row_iq3_xxs_ref(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
3773
+ void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k) {
3774
3774
  assert(k % QK_K == 0);
3775
3775
  quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
3776
3776
  }
3777
3777
 
3778
- static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n,
3779
- const float * restrict quant_weights,
3778
+ static void quantize_row_iq3_s_impl(int block_size, const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int n,
3779
+ const float * GGML_RESTRICT quant_weights,
3780
3780
  float * scales,
3781
3781
  float * weight,
3782
3782
  float * xval,
@@ -3958,7 +3958,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
3958
3958
  }
3959
3959
 
3960
3960
  #define IQ3S_BLOCK_SIZE 32
3961
- size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3961
+ size_t quantize_iq3_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3962
3962
  GGML_ASSERT(n_per_row%QK_K == 0);
3963
3963
  int64_t nblock = n_per_row/QK_K;
3964
3964
  float scales[QK_K/IQ3S_BLOCK_SIZE];
@@ -3980,7 +3980,7 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t n
3980
3980
  return nrow * nblock * sizeof(block_iq3_s);
3981
3981
  }
3982
3982
 
3983
- void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
3983
+ void quantize_row_iq3_s_ref(const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k) {
3984
3984
  assert(k % QK_K == 0);
3985
3985
  quantize_iq3_s(x, y, 1, k, NULL);
3986
3986
  }
@@ -3988,8 +3988,8 @@ void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y,
3988
3988
 
3989
3989
  // =================================== 1.5 bpw ===================================================
3990
3990
 
3991
- static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
3992
- const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) {
3991
+ static int iq1_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
3992
+ const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float * scale, int8_t * GGML_RESTRICT L, int ngrid) {
3993
3993
  int num_neighbors = neighbours[0];
3994
3994
  GGML_ASSERT(num_neighbors > 0);
3995
3995
  float best_score = -FLT_MAX;
@@ -4048,8 +4048,8 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u
4048
4048
  return grid_index;
4049
4049
  }
4050
4050
 
4051
- static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
4052
- const float * restrict xval, const float * restrict weight, float scale, const float * restrict xg, int8_t * restrict L, int ngrid) {
4051
+ static int iq1_find_best_neighbour2(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
4052
+ const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, const float * GGML_RESTRICT xg, int8_t * GGML_RESTRICT L, int ngrid) {
4053
4053
  int num_neighbors = neighbours[0];
4054
4054
  GGML_ASSERT(num_neighbors > 0);
4055
4055
  float best_score = FLT_MAX;
@@ -4113,7 +4113,7 @@ static int iq1_sort_helper(const void * left, const void * right) {
4113
4113
 
4114
4114
  #define IQ1S_BLOCK_SIZE 32
4115
4115
  #define IQ1M_BLOCK_SIZE 16
4116
- static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
4116
+ static void quantize_row_iq1_s_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights,
4117
4117
  float * scales,
4118
4118
  float * weight,
4119
4119
  float * sumx,
@@ -4271,7 +4271,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
4271
4271
  }
4272
4272
  }
4273
4273
 
4274
- size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4274
+ size_t quantize_iq1_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4275
4275
  GGML_ASSERT(n_per_row%QK_K == 0);
4276
4276
  float scales[QK_K/IQ1S_BLOCK_SIZE];
4277
4277
  float weight[IQ1S_BLOCK_SIZE];
@@ -4291,7 +4291,7 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t n
4291
4291
  return nrow * nblock * sizeof(block_iq1_s);
4292
4292
  }
4293
4293
 
4294
- static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights,
4294
+ static void quantize_row_iq1_m_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights,
4295
4295
  float * scales,
4296
4296
  float * weight,
4297
4297
  float * pairs,
@@ -4539,7 +4539,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
4539
4539
  }
4540
4540
  }
4541
4541
 
4542
- size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4542
+ size_t quantize_iq1_m(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4543
4543
  GGML_ASSERT(n_per_row%QK_K == 0);
4544
4544
  float scales[QK_K/IQ1M_BLOCK_SIZE];
4545
4545
  float weight[IQ1M_BLOCK_SIZE];
@@ -4570,7 +4570,7 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
4570
4570
  return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
4571
4571
  }
4572
4572
 
4573
- static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x,
4573
+ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
4574
4574
  ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
4575
4575
  float * scales, float * weight, uint8_t * L,
4576
4576
  const int8_t * values,
@@ -4681,7 +4681,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
4681
4681
  }
4682
4682
  }
4683
4683
 
4684
- size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4684
+ size_t quantize_iq4_nl(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4685
4685
  GGML_ASSERT(n_per_row%QK4_NL == 0);
4686
4686
  int64_t nblock = n_per_row/QK4_NL;
4687
4687
  char * qrow = (char *)dst;
@@ -4703,8 +4703,8 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t
4703
4703
  return nrow * nblock * sizeof(block_iq4_nl);
4704
4704
  }
4705
4705
 
4706
- //void quantize_row_iq4_nl_ref(const float * restrict x, void * restrict vy, int64_t k) {
4707
- void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
4706
+ //void quantize_row_iq4_nl_ref(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
4707
+ void quantize_row_iq4_nl_ref(const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k) {
4708
4708
  GGML_ASSERT(k%QK4_NL == 0);
4709
4709
  int64_t nblock = k/QK4_NL;
4710
4710
  uint8_t L[QK4_NL];
@@ -4719,7 +4719,7 @@ void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y
4719
4719
  }
4720
4720
  }
4721
4721
 
4722
- size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4722
+ size_t quantize_iq4_xs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4723
4723
  GGML_ASSERT(n_per_row%QK_K == 0);
4724
4724
  int64_t nblock = n_per_row/QK_K;
4725
4725
  char * qrow = (char *)dst;
@@ -4739,14 +4739,14 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t
4739
4739
  return nrow * nblock * sizeof(block_iq4_xs);
4740
4740
  }
4741
4741
 
4742
- void quantize_row_iq4_xs_ref(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
4742
+ void quantize_row_iq4_xs_ref(const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k) {
4743
4743
  assert(k % QK_K == 0);
4744
4744
  quantize_iq4_xs(x, y, 1, k, NULL);
4745
4745
  }
4746
4746
 
4747
4747
  // =============================== 2.5625 bpw
4748
4748
 
4749
- static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) {
4749
+ static void quantize_row_iq2_s_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
4750
4750
 
4751
4751
  const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
4752
4752
 
@@ -4914,7 +4914,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
4914
4914
  }
4915
4915
  }
4916
4916
 
4917
- size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4917
+ size_t quantize_iq2_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
4918
4918
  GGML_ASSERT(n_per_row%QK_K == 0);
4919
4919
  int64_t nblock = n_per_row/QK_K;
4920
4920
  char * qrow = (char *)dst;
@@ -4926,7 +4926,7 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t n
4926
4926
  return nrow * nblock * sizeof(block_iq2_s);
4927
4927
  }
4928
4928
 
4929
- void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
4929
+ void quantize_row_iq2_s_ref(const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k) {
4930
4930
  assert(k % QK_K == 0);
4931
4931
  quantize_iq2_s(x, y, 1, k, NULL);
4932
4932
  }