cui-llama.rn 1.3.0 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/android/src/main/CMakeLists.txt +6 -1
  2. package/android/src/main/jni.cpp +6 -6
  3. package/cpp/amx/amx.cpp +196 -0
  4. package/cpp/amx/amx.h +20 -0
  5. package/cpp/amx/common.h +101 -0
  6. package/cpp/amx/mmq.cpp +2524 -0
  7. package/cpp/amx/mmq.h +16 -0
  8. package/cpp/common.cpp +1981 -1682
  9. package/cpp/common.h +636 -600
  10. package/cpp/ggml-aarch64.c +129 -129
  11. package/cpp/ggml-aarch64.h +19 -19
  12. package/cpp/ggml-alloc.c +1038 -1040
  13. package/cpp/ggml-alloc.h +76 -76
  14. package/cpp/ggml-backend-impl.h +238 -216
  15. package/cpp/ggml-backend-reg.cpp +423 -195
  16. package/cpp/ggml-backend.cpp +1999 -1997
  17. package/cpp/ggml-backend.h +351 -328
  18. package/cpp/ggml-common.h +1859 -1853
  19. package/cpp/ggml-cpp.h +38 -38
  20. package/cpp/ggml-cpu-aarch64.c +3823 -3560
  21. package/cpp/ggml-cpu-aarch64.h +32 -30
  22. package/cpp/ggml-cpu-impl.h +386 -371
  23. package/cpp/ggml-cpu-quants.c +10835 -10822
  24. package/cpp/ggml-cpu-quants.h +63 -63
  25. package/cpp/ggml-cpu.c +99 -103
  26. package/cpp/ggml-cpu.cpp +69 -17
  27. package/cpp/ggml-cpu.h +152 -177
  28. package/cpp/ggml-impl.h +556 -550
  29. package/cpp/ggml-metal.h +66 -66
  30. package/cpp/ggml-metal.m +4426 -4294
  31. package/cpp/ggml-quants.c +5247 -5247
  32. package/cpp/ggml-quants.h +100 -100
  33. package/cpp/ggml-threading.cpp +12 -12
  34. package/cpp/ggml-threading.h +12 -12
  35. package/cpp/ggml.c +7618 -8180
  36. package/cpp/ggml.h +2255 -2411
  37. package/cpp/json-schema-to-grammar.cpp +1045 -0
  38. package/cpp/json-schema-to-grammar.h +8 -0
  39. package/cpp/json.hpp +24766 -0
  40. package/cpp/llama-grammar.cpp +1138 -1138
  41. package/cpp/llama-grammar.h +144 -144
  42. package/cpp/llama-impl.h +181 -181
  43. package/cpp/llama-sampling.cpp +2348 -2348
  44. package/cpp/llama-sampling.h +48 -48
  45. package/cpp/llama-vocab.cpp +1984 -1984
  46. package/cpp/llama-vocab.h +170 -170
  47. package/cpp/llama.cpp +22332 -22132
  48. package/cpp/llama.h +1259 -1253
  49. package/cpp/log.cpp +401 -401
  50. package/cpp/log.h +121 -121
  51. package/cpp/rn-llama.hpp +6 -6
  52. package/cpp/sampling.cpp +505 -466
  53. package/cpp/sampling.h +22 -1
  54. package/cpp/sgemm.cpp +1884 -1884
  55. package/cpp/speculative.cpp +270 -0
  56. package/cpp/speculative.h +28 -0
  57. package/cpp/unicode.cpp +11 -0
  58. package/ios/RNLlamaContext.mm +13 -0
  59. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  60. package/lib/commonjs/grammar.js +4 -2
  61. package/lib/commonjs/grammar.js.map +1 -1
  62. package/lib/commonjs/index.js.map +1 -1
  63. package/lib/module/NativeRNLlama.js.map +1 -1
  64. package/lib/module/grammar.js +2 -1
  65. package/lib/module/grammar.js.map +1 -1
  66. package/lib/module/index.js.map +1 -1
  67. package/lib/typescript/NativeRNLlama.d.ts +94 -4
  68. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  69. package/lib/typescript/grammar.d.ts +5 -6
  70. package/lib/typescript/grammar.d.ts.map +1 -1
  71. package/lib/typescript/index.d.ts +4 -2
  72. package/lib/typescript/index.d.ts.map +1 -1
  73. package/package.json +2 -1
  74. package/src/NativeRNLlama.ts +97 -10
  75. package/src/grammar.ts +10 -8
  76. package/src/index.ts +22 -1
@@ -1,63 +1,63 @@
1
- #pragma once
2
-
3
- #define LM_GGML_COMMON_DECL_C
4
- #include "ggml-common.h"
5
-
6
- #include "ggml.h"
7
-
8
- // GGML CPU internal header
9
-
10
- #ifdef __cplusplus
11
- extern "C" {
12
- #endif
13
-
14
- // Quantization
15
- void quantize_row_q4_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
16
- void quantize_row_q4_1(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
17
- void quantize_row_q5_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
18
- void quantize_row_q5_1(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
19
- void quantize_row_q8_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
20
- void quantize_row_q8_1(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
21
-
22
- void quantize_row_q2_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
23
- void quantize_row_q3_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
24
- void quantize_row_q4_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
25
- void quantize_row_q5_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
26
- void quantize_row_q6_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
27
- void quantize_row_q8_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
28
-
29
- void quantize_row_tq1_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
30
- void quantize_row_tq2_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
31
-
32
- void quantize_row_iq4_nl (const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
33
- void quantize_row_iq4_xs (const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
34
-
35
- // Dot product
36
- void lm_ggml_vec_dot_q4_0_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
37
- void lm_ggml_vec_dot_q4_1_q8_1(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
38
- void lm_ggml_vec_dot_q5_0_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
39
- void lm_ggml_vec_dot_q5_1_q8_1(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
40
- void lm_ggml_vec_dot_q8_0_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
41
-
42
- void lm_ggml_vec_dot_q2_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
43
- void lm_ggml_vec_dot_q3_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
44
- void lm_ggml_vec_dot_q4_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
45
- void lm_ggml_vec_dot_q5_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
46
- void lm_ggml_vec_dot_q6_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
47
-
48
- void lm_ggml_vec_dot_tq1_0_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
49
- void lm_ggml_vec_dot_tq2_0_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
50
-
51
- void lm_ggml_vec_dot_iq2_xxs_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
52
- void lm_ggml_vec_dot_iq2_xs_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
53
- void lm_ggml_vec_dot_iq2_s_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
54
- void lm_ggml_vec_dot_iq3_xxs_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
55
- void lm_ggml_vec_dot_iq1_s_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
56
- void lm_ggml_vec_dot_iq1_m_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
57
- void lm_ggml_vec_dot_iq4_nl_q8_0 (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
58
- void lm_ggml_vec_dot_iq4_xs_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
59
- void lm_ggml_vec_dot_iq3_s_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
60
-
61
- #ifdef __cplusplus
62
- }
63
- #endif
1
+ #pragma once
2
+
3
+ #define LM_GGML_COMMON_DECL_C
4
+ #include "ggml-common.h"
5
+
6
+ #include "ggml.h"
7
+
8
+ // GGML CPU internal header
9
+
10
+ #ifdef __cplusplus
11
+ extern "C" {
12
+ #endif
13
+
14
+ // Quantization
15
+ void quantize_row_q4_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
16
+ void quantize_row_q4_1(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
17
+ void quantize_row_q5_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
18
+ void quantize_row_q5_1(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
19
+ void quantize_row_q8_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
20
+ void quantize_row_q8_1(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
21
+
22
+ void quantize_row_q2_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
23
+ void quantize_row_q3_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
24
+ void quantize_row_q4_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
25
+ void quantize_row_q5_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
26
+ void quantize_row_q6_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
27
+ void quantize_row_q8_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
28
+
29
+ void quantize_row_tq1_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
30
+ void quantize_row_tq2_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
31
+
32
+ void quantize_row_iq4_nl (const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
33
+ void quantize_row_iq4_xs (const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
34
+
35
+ // Dot product
36
+ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
37
+ void lm_ggml_vec_dot_q4_1_q8_1(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
38
+ void lm_ggml_vec_dot_q5_0_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
39
+ void lm_ggml_vec_dot_q5_1_q8_1(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
40
+ void lm_ggml_vec_dot_q8_0_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
41
+
42
+ void lm_ggml_vec_dot_q2_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
43
+ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
44
+ void lm_ggml_vec_dot_q4_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
45
+ void lm_ggml_vec_dot_q5_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
46
+ void lm_ggml_vec_dot_q6_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
47
+
48
+ void lm_ggml_vec_dot_tq1_0_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
49
+ void lm_ggml_vec_dot_tq2_0_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
50
+
51
+ void lm_ggml_vec_dot_iq2_xxs_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
52
+ void lm_ggml_vec_dot_iq2_xs_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
53
+ void lm_ggml_vec_dot_iq2_s_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
54
+ void lm_ggml_vec_dot_iq3_xxs_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
55
+ void lm_ggml_vec_dot_iq1_s_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
56
+ void lm_ggml_vec_dot_iq1_m_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
57
+ void lm_ggml_vec_dot_iq4_nl_q8_0 (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
58
+ void lm_ggml_vec_dot_iq4_xs_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
59
+ void lm_ggml_vec_dot_iq3_s_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
60
+
61
+ #ifdef __cplusplus
62
+ }
63
+ #endif
package/cpp/ggml-cpu.c CHANGED
@@ -10,6 +10,7 @@
10
10
  #include "ggml-quants.h"
11
11
  #include "ggml-cpu-quants.h"
12
12
  #include "ggml-threading.h"
13
+ #include "amx/amx.h"
13
14
  #include "ggml.h"
14
15
 
15
16
  #if defined(_MSC_VER) || defined(__MINGW32__)
@@ -109,10 +110,11 @@ static lm_ggml_fp16_t lm_ggml_table_gelu_quick_f16[1 << 16];
109
110
  #if defined(__ARM_ARCH)
110
111
  struct lm_ggml_arm_arch_features_type {
111
112
  int has_neon;
113
+ int has_dotprod;
112
114
  int has_i8mm;
113
115
  int has_sve;
114
116
  int sve_cnt;
115
- } lm_ggml_arm_arch_features = {-1, -1, -1, 0};
117
+ } lm_ggml_arm_arch_features = {-1, -1, -1, -1, 0};
116
118
  #endif
117
119
 
118
120
 
@@ -446,6 +448,15 @@ static const struct lm_ggml_type_traits_cpu type_traits_cpu[LM_GGML_TYPE_COUNT]
446
448
  .vec_dot_type = LM_GGML_TYPE_Q8_K,
447
449
  .nrows = 1,
448
450
  },
451
+ [LM_GGML_TYPE_IQ4_NL_4_4] = {
452
+ .from_float = NULL,
453
+ .vec_dot = NULL,
454
+ .vec_dot_type = LM_GGML_TYPE_Q8_0,
455
+ .nrows = 1,
456
+ .ncols = 4,
457
+ .gemv = lm_ggml_gemv_iq4_nl_4x4_q8_0,
458
+ .gemm = lm_ggml_gemm_iq4_nl_4x4_q8_0,
459
+ },
449
460
  };
450
461
 
451
462
  const struct lm_ggml_type_traits_cpu * lm_ggml_get_type_traits_cpu(enum lm_ggml_type type) {
@@ -614,7 +625,7 @@ do { \
614
625
  for (int i = 0; i < offset; ++i) { \
615
626
  x[i] = _mm512_add_ps(x[i], x[offset+i]); \
616
627
  } \
617
- res = _mm512_reduce_add_ps(x[0]); \
628
+ res = (lm_ggml_float) _mm512_reduce_add_ps(x[0]); \
618
629
  } while (0)
619
630
 
620
631
  // TODO: is this optimal ?
@@ -664,7 +675,7 @@ do { \
664
675
  for (int i = 0; i < offset; ++i) { \
665
676
  x[i] = _mm512_add_ps(x[i], x[offset+i]); \
666
677
  } \
667
- res = _mm512_reduce_add_ps(x[0]); \
678
+ res = (lm_ggml_float) _mm512_reduce_add_ps(x[0]); \
668
679
  } while (0)
669
680
 
670
681
  #define LM_GGML_F16_VEC LM_GGML_F32Cx16
@@ -675,8 +686,8 @@ do { \
675
686
  #define LM_GGML_F16_VEC_FMA LM_GGML_F32Cx16_FMA
676
687
  #define LM_GGML_F16_VEC_ADD LM_GGML_F32Cx16_ADD
677
688
  #define LM_GGML_F16_VEC_MUL LM_GGML_F32Cx16_MUL
678
- #define LM_GGML_F16_VEC_REDUCE LM_GGML_F32Cx16_REDUCE
679
689
 
690
+ #define LM_GGML_F16_VEC_REDUCE LM_GGML_F32Cx16_REDUCE
680
691
  #elif defined(__AVX__)
681
692
 
682
693
  #define LM_GGML_SIMD
@@ -1168,28 +1179,28 @@ static inline void __lasx_f32cx8_store(lm_ggml_fp16_t * x, __m256 y) {
1168
1179
  #define LM_GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
1169
1180
  #define LM_GGML_F32x4_ADD __lsx_vfadd_s
1170
1181
  #define LM_GGML_F32x4_MUL __lsx_vfmul_s
1171
- #define LM_GGML_F32x4_REDUCE(res, x) \
1172
- { \
1173
- int offset = LM_GGML_F32_ARR >> 1; \
1174
- for (int i = 0; i < offset; ++i) { \
1175
- x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
1176
- } \
1177
- offset >>= 1; \
1178
- for (int i = 0; i < offset; ++i) { \
1179
- x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
1180
- } \
1181
- offset >>= 1; \
1182
- for (int i = 0; i < offset; ++i) { \
1183
- x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
1184
- } \
1185
- __m128i tmp = __lsx_vsrli_d((__m128i)x[0], 32); \
1186
- tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, x[0]); \
1187
- tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
1188
- const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
1189
- tmp = __lsx_vsrli_d((__m128i)t0, 32); \
1190
- tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, t0); \
1191
- tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
1192
- res = (lm_ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
1182
+ #define LM_GGML_F32x4_REDUCE(res, x) \
1183
+ { \
1184
+ int offset = LM_GGML_F32_ARR >> 1; \
1185
+ for (int i = 0; i < offset; ++i) { \
1186
+ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
1187
+ } \
1188
+ offset >>= 1; \
1189
+ for (int i = 0; i < offset; ++i) { \
1190
+ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
1191
+ } \
1192
+ offset >>= 1; \
1193
+ for (int i = 0; i < offset; ++i) { \
1194
+ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
1195
+ } \
1196
+ __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
1197
+ tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
1198
+ tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
1199
+ const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
1200
+ tmp = __lsx_vsrli_d((__m128i) t0, 32); \
1201
+ tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
1202
+ tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
1203
+ res = (lm_ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
1193
1204
  }
1194
1205
 
1195
1206
  #define LM_GGML_F32_VEC LM_GGML_F32x4
@@ -1357,31 +1368,15 @@ struct lm_ggml_compute_state {
1357
1368
  int ith;
1358
1369
  };
1359
1370
 
1360
- struct lm_ggml_compute_params {
1361
- // ith = thread index, nth = number of threads
1362
- int ith, nth;
1363
-
1364
- // work buffer for all threads
1365
- size_t wsize;
1366
- void * wdata;
1367
-
1368
- struct lm_ggml_threadpool * threadpool;
1369
- };
1370
-
1371
1371
  //
1372
1372
  // fundamental operations
1373
1373
  //
1374
1374
 
1375
1375
  inline static void lm_ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
1376
-
1377
1376
  inline static void lm_ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
1378
-
1379
1377
  inline static void lm_ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
1380
-
1381
1378
  inline static void lm_ggml_vec_set_f16(const int n, lm_ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
1382
-
1383
1379
  inline static void lm_ggml_vec_set_bf16(const int n, lm_ggml_bf16_t * x, const lm_ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
1384
-
1385
1380
  inline static void lm_ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
1386
1381
  inline static void lm_ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
1387
1382
  inline static void lm_ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
@@ -2276,7 +2271,7 @@ struct lm_ggml_state {
2276
2271
 
2277
2272
  static struct lm_ggml_state g_state = {0};
2278
2273
 
2279
- static void lm_ggml_barrier(struct lm_ggml_threadpool * tp) {
2274
+ void lm_ggml_barrier(struct lm_ggml_threadpool * tp) {
2280
2275
  int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
2281
2276
  if (n_threads == 1) {
2282
2277
  return;
@@ -2369,7 +2364,7 @@ void lm_ggml_numa_init(enum lm_ggml_numa_strategy numa_flag) {
2369
2364
  // figure out which node we're on
2370
2365
  uint current_cpu;
2371
2366
  int getcpu_ret = 0;
2372
- #if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__)
2367
+ #if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 33) || defined(__COSMOPOLITAN__)
2373
2368
  getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
2374
2369
  #else
2375
2370
  // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
@@ -2439,6 +2434,7 @@ static void lm_ggml_init_arm_arch_features(void) {
2439
2434
  uint32_t hwcap2 = getauxval(AT_HWCAP2);
2440
2435
 
2441
2436
  lm_ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
2437
+ lm_ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP); // bitwise mask test, matching the sibling feature probes; logical && would report dotprod whenever hwcap is non-zero
2442
2438
  lm_ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
2443
2439
  lm_ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
2444
2440
 
@@ -2453,6 +2449,11 @@ static void lm_ggml_init_arm_arch_features(void) {
2453
2449
  }
2454
2450
  lm_ggml_arm_arch_features.has_neon = oldp;
2455
2451
 
2452
+ if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) {
2453
+ oldp = 0;
2454
+ }
2455
+ lm_ggml_arm_arch_features.has_dotprod = oldp;
2456
+
2456
2457
  if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
2457
2458
  oldp = 0;
2458
2459
  }
@@ -7439,6 +7440,13 @@ static void lm_ggml_compute_forward_mul_mat(
7439
7440
  type = (enum lm_ggml_type)(intptr_t)src0->extra;
7440
7441
  }
7441
7442
 
7443
+ #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
7444
+ if (src0->buffer && lm_ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
7445
+ lm_ggml_backend_amx_mul_mat(params, dst);
7446
+ return;
7447
+ }
7448
+ #endif
7449
+
7442
7450
  enum lm_ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
7443
7451
  lm_ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
7444
7452
  lm_ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat;
@@ -7560,14 +7568,6 @@ UseGgmlGemm2:;
7560
7568
  // This is the size of the rest of the dimensions of the result
7561
7569
  const int64_t nr1 = ne1 * ne2 * ne3;
7562
7570
 
7563
- // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
7564
- int64_t num_rows_per_vec_dot = vec_dot_num_rows;
7565
- // TODO: currently the mmla kernels support only even numbered rows/cols.
7566
- // this check can be removed once they are extended to support odd numbered rows/cols too
7567
- if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
7568
- num_rows_per_vec_dot = 1;
7569
- }
7570
-
7571
7571
  // Now select a reasonable chunk size.
7572
7572
  int chunk_size = 16;
7573
7573
 
@@ -7630,6 +7630,15 @@ UseGgmlGemm2:;
7630
7630
  const int64_t ir1_start = dr1 * ith1;
7631
7631
  const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
7632
7632
 
7633
+ // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
7634
+ int64_t num_rows_per_vec_dot = vec_dot_num_rows;
7635
+
7636
+ // these checks are needed to avoid crossing dim1 boundaries
7637
+ // can be optimized, but the logic would become more complicated, so keeping it like this for simplicity
7638
+ if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
7639
+ num_rows_per_vec_dot = 1;
7640
+ }
7641
+
7633
7642
  lm_ggml_compute_forward_mul_mat_one_chunk(params, dst, type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
7634
7643
 
7635
7644
  if (nth >= nchunk0 * nchunk1) {
@@ -9133,6 +9142,7 @@ static void lm_ggml_compute_forward_clamp(
9133
9142
  case LM_GGML_TYPE_Q4_0_4_4:
9134
9143
  case LM_GGML_TYPE_Q4_0_4_8:
9135
9144
  case LM_GGML_TYPE_Q4_0_8_8:
9145
+ case LM_GGML_TYPE_IQ4_NL_4_4:
9136
9146
  case LM_GGML_TYPE_I8:
9137
9147
  case LM_GGML_TYPE_I16:
9138
9148
  case LM_GGML_TYPE_I32:
@@ -12216,11 +12226,16 @@ static void lm_ggml_compute_forward_opt_step_adamw_f32(
12216
12226
  const struct lm_ggml_compute_params * params,
12217
12227
  struct lm_ggml_tensor * dst) {
12218
12228
 
12219
- const struct lm_ggml_tensor * src0 = dst->src[0];
12220
- const struct lm_ggml_tensor * src0_grad = dst->src[1];
12221
- const struct lm_ggml_tensor * src0_grad_m = dst->src[2];
12222
- const struct lm_ggml_tensor * src0_grad_v = dst->src[3];
12229
+ const struct lm_ggml_tensor * src0 = dst->src[0];
12230
+ const struct lm_ggml_tensor * src0_grad = dst->src[1];
12231
+ const struct lm_ggml_tensor * src0_grad_m = dst->src[2];
12232
+ const struct lm_ggml_tensor * src0_grad_v = dst->src[3];
12233
+ const struct lm_ggml_tensor * adamw_params = dst->src[4];
12234
+
12223
12235
  LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src0_grad));
12236
+ LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src0_grad_m));
12237
+ LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src0_grad_v));
12238
+ LM_GGML_ASSERT(lm_ggml_nelements(adamw_params) == 7);
12224
12239
 
12225
12240
  const int ith = params->ith;
12226
12241
  const int nth = params->nth;
@@ -12237,16 +12252,14 @@ static void lm_ggml_compute_forward_opt_step_adamw_f32(
12237
12252
  const int ir0 = dr*ith;
12238
12253
  const int ir1 = MIN(ir0 + dr, nr);
12239
12254
 
12240
- /* const float gnorm = 1.0f; */
12241
- int64_t iter; memcpy(&iter, &dst->op_params[0], sizeof(int64_t));
12242
- const float alpha = lm_ggml_get_op_params_f32(dst, 2);
12243
- const float beta1 = lm_ggml_get_op_params_f32(dst, 3);
12244
- const float beta2 = lm_ggml_get_op_params_f32(dst, 4);
12245
- const float eps = lm_ggml_get_op_params_f32(dst, 5);
12246
- const float wd = lm_ggml_get_op_params_f32(dst, 6);
12247
-
12248
- const float beta1h = alpha/(1.0f - powf(beta1, iter));
12249
- const float beta2h = 1.0f/(1.0f - powf(beta2, iter));
12255
+ const float * adamw_params_ptr = lm_ggml_get_data_f32(adamw_params);
12256
+ const float alpha = adamw_params_ptr[0];
12257
+ const float beta1 = adamw_params_ptr[1];
12258
+ const float beta2 = adamw_params_ptr[2];
12259
+ const float eps = adamw_params_ptr[3];
12260
+ const float wd = adamw_params_ptr[4];
12261
+ const float beta1h = adamw_params_ptr[5];
12262
+ const float beta2h = adamw_params_ptr[6];
12250
12263
 
12251
12264
  for (int ir = ir0; ir < ir1; ++ir) {
12252
12265
  const int64_t i03 = ir/(ne02*ne01);
@@ -12270,17 +12283,9 @@ static void lm_ggml_compute_forward_opt_step_adamw_f32(
12270
12283
  // The weight decay is applied independently of the Adam momenta m and v.
12271
12284
  // This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss.
12272
12285
  // See: https://arxiv.org/pdf/1711.05101v3.pdf
12273
- w[i00] = w[i00]*(1.0f - alpha*wd) - mh/vh;
12286
+ w[i00] = w[i00]*(1.0f - alpha*wd) - alpha*mh/vh;
12274
12287
  }
12275
12288
  }
12276
-
12277
- lm_ggml_barrier(params->threadpool);
12278
- if (ith != 0) {
12279
- return;
12280
- }
12281
-
12282
- iter++;
12283
- memcpy(&dst->op_params[0], &iter, sizeof(int64_t));
12284
12289
  }
12285
12290
 
12286
12291
  static void lm_ggml_compute_forward_opt_step_adamw(
@@ -13281,10 +13286,16 @@ struct lm_ggml_cplan lm_ggml_graph_plan(
13281
13286
  } break;
13282
13287
  case LM_GGML_OP_MUL_MAT:
13283
13288
  {
13289
+ #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
13290
+ if (node->src[0]->buffer && lm_ggml_backend_amx_buft_is_amx(node->src[0]->buffer->buft)) {
13291
+ cur = lm_ggml_backend_amx_desired_wsize(node);
13292
+ }
13293
+ #endif
13284
13294
  const enum lm_ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
13285
13295
 
13286
13296
  if (node->src[1]->type != vec_dot_type) {
13287
- cur = lm_ggml_row_size(vec_dot_type, lm_ggml_nelements(node->src[1]));
13297
+ size_t cur2 = lm_ggml_row_size(vec_dot_type, lm_ggml_nelements(node->src[1]));
13298
+ cur = MAX(cur, cur2);
13288
13299
  }
13289
13300
  } break;
13290
13301
  case LM_GGML_OP_MUL_MAT_ID:
@@ -13583,29 +13594,6 @@ static void lm_ggml_graph_compute_kickoff(struct lm_ggml_threadpool * threadpool
13583
13594
 
13584
13595
  #endif // LM_GGML_USE_OPENMP
13585
13596
 
13586
- void lm_ggml_threadpool_params_init(struct lm_ggml_threadpool_params * p, int n_threads) {
13587
- p->n_threads = n_threads;
13588
- p->prio = 0; // default priority (usually means normal or inherited)
13589
- p->poll = 50; // hybrid-polling enabled
13590
- p->strict_cpu = false; // no strict placement (all threads share same cpumask)
13591
- p->paused = false; // threads are ready to go
13592
- memset(p->cpumask, 0, LM_GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
13593
- }
13594
-
13595
- struct lm_ggml_threadpool_params lm_ggml_threadpool_params_default(int n_threads) {
13596
- struct lm_ggml_threadpool_params p;
13597
- lm_ggml_threadpool_params_init(&p, n_threads);
13598
- return p;
13599
- }
13600
-
13601
- bool lm_ggml_threadpool_params_match(const struct lm_ggml_threadpool_params * p0, const struct lm_ggml_threadpool_params * p1) {
13602
- if (p0->n_threads != p1->n_threads ) return false;
13603
- if (p0->prio != p1->prio ) return false;
13604
- if (p0->poll != p1->poll ) return false;
13605
- if (p0->strict_cpu != p1->strict_cpu ) return false;
13606
- return memcmp(p0->cpumask, p1->cpumask, LM_GGML_MAX_N_THREADS) == 0;
13607
- }
13608
-
13609
13597
  static struct lm_ggml_threadpool * lm_ggml_threadpool_new_impl(
13610
13598
  struct lm_ggml_threadpool_params * tpp,
13611
13599
  struct lm_ggml_cgraph * cgraph,
@@ -13901,15 +13889,23 @@ int lm_ggml_cpu_has_vsx(void) {
13901
13889
  }
13902
13890
 
13903
13891
  int lm_ggml_cpu_has_neon(void) {
13904
- #if defined(__ARM_ARCH)
13892
+ #if defined(__ARM_ARCH) && defined(__ARM_NEON)
13905
13893
  return lm_ggml_arm_arch_features.has_neon;
13906
13894
  #else
13907
13895
  return 0;
13908
13896
  #endif
13909
13897
  }
13910
13898
 
13899
+ int lm_ggml_cpu_has_dotprod(void) {
13900
+ #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
13901
+ return lm_ggml_arm_arch_features.has_dotprod;
13902
+ #else
13903
+ return 0;
13904
+ #endif
13905
+ }
13906
+
13911
13907
  int lm_ggml_cpu_has_sve(void) {
13912
- #if defined(__ARM_ARCH)
13908
+ #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
13913
13909
  return lm_ggml_arm_arch_features.has_sve;
13914
13910
  #else
13915
13911
  return 0;
@@ -13917,7 +13913,7 @@ int lm_ggml_cpu_has_sve(void) {
13917
13913
  }
13918
13914
 
13919
13915
  int lm_ggml_cpu_has_matmul_int8(void) {
13920
- #if defined(__ARM_ARCH)
13916
+ #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8)
13921
13917
  return lm_ggml_arm_arch_features.has_i8mm;
13922
13918
  #else
13923
13919
  return 0;
@@ -13925,7 +13921,7 @@ int lm_ggml_cpu_has_matmul_int8(void) {
13925
13921
  }
13926
13922
 
13927
13923
  int lm_ggml_cpu_get_sve_cnt(void) {
13928
- #if defined(__ARM_ARCH)
13924
+ #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
13929
13925
  return lm_ggml_arm_arch_features.sve_cnt;
13930
13926
  #else
13931
13927
  return 0;