cui-llama.rn 1.3.0 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +6 -1
- package/android/src/main/jni.cpp +6 -6
- package/cpp/amx/amx.cpp +196 -0
- package/cpp/amx/amx.h +20 -0
- package/cpp/amx/common.h +101 -0
- package/cpp/amx/mmq.cpp +2524 -0
- package/cpp/amx/mmq.h +16 -0
- package/cpp/common.cpp +1981 -1682
- package/cpp/common.h +636 -600
- package/cpp/ggml-aarch64.c +129 -129
- package/cpp/ggml-aarch64.h +19 -19
- package/cpp/ggml-alloc.c +1038 -1040
- package/cpp/ggml-alloc.h +76 -76
- package/cpp/ggml-backend-impl.h +238 -216
- package/cpp/ggml-backend-reg.cpp +423 -195
- package/cpp/ggml-backend.cpp +1999 -1997
- package/cpp/ggml-backend.h +351 -328
- package/cpp/ggml-common.h +1859 -1853
- package/cpp/ggml-cpp.h +38 -38
- package/cpp/ggml-cpu-aarch64.c +3823 -3560
- package/cpp/ggml-cpu-aarch64.h +32 -30
- package/cpp/ggml-cpu-impl.h +386 -371
- package/cpp/ggml-cpu-quants.c +10835 -10822
- package/cpp/ggml-cpu-quants.h +63 -63
- package/cpp/ggml-cpu.c +99 -103
- package/cpp/ggml-cpu.cpp +69 -17
- package/cpp/ggml-cpu.h +152 -177
- package/cpp/ggml-impl.h +556 -550
- package/cpp/ggml-metal.h +66 -66
- package/cpp/ggml-metal.m +4426 -4294
- package/cpp/ggml-quants.c +5247 -5247
- package/cpp/ggml-quants.h +100 -100
- package/cpp/ggml-threading.cpp +12 -12
- package/cpp/ggml-threading.h +12 -12
- package/cpp/ggml.c +7618 -8180
- package/cpp/ggml.h +2255 -2411
- package/cpp/json-schema-to-grammar.cpp +1045 -0
- package/cpp/json-schema-to-grammar.h +8 -0
- package/cpp/json.hpp +24766 -0
- package/cpp/llama-grammar.cpp +1138 -1138
- package/cpp/llama-grammar.h +144 -144
- package/cpp/llama-impl.h +181 -181
- package/cpp/llama-sampling.cpp +2348 -2348
- package/cpp/llama-sampling.h +48 -48
- package/cpp/llama-vocab.cpp +1984 -1984
- package/cpp/llama-vocab.h +170 -170
- package/cpp/llama.cpp +22332 -22132
- package/cpp/llama.h +1259 -1253
- package/cpp/log.cpp +401 -401
- package/cpp/log.h +121 -121
- package/cpp/rn-llama.hpp +6 -6
- package/cpp/sampling.cpp +505 -466
- package/cpp/sampling.h +22 -1
- package/cpp/sgemm.cpp +1884 -1884
- package/cpp/speculative.cpp +270 -0
- package/cpp/speculative.h +28 -0
- package/cpp/unicode.cpp +11 -0
- package/ios/RNLlamaContext.mm +13 -0
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/grammar.js +4 -2
- package/lib/commonjs/grammar.js.map +1 -1
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/grammar.js +2 -1
- package/lib/module/grammar.js.map +1 -1
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +94 -4
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/grammar.d.ts +5 -6
- package/lib/typescript/grammar.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +4 -2
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +2 -1
- package/src/NativeRNLlama.ts +97 -10
- package/src/grammar.ts +10 -8
- package/src/index.ts +22 -1
package/cpp/ggml-cpu-quants.h
CHANGED
@@ -1,63 +1,63 @@
|
|
1
|
-
#pragma once
|
2
|
-
|
3
|
-
#define LM_GGML_COMMON_DECL_C
|
4
|
-
#include "ggml-common.h"
|
5
|
-
|
6
|
-
#include "ggml.h"
|
7
|
-
|
8
|
-
// GGML CPU internal header
|
9
|
-
|
10
|
-
#ifdef __cplusplus
|
11
|
-
extern "C" {
|
12
|
-
#endif
|
13
|
-
|
14
|
-
// Quantization
|
15
|
-
void quantize_row_q4_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
16
|
-
void quantize_row_q4_1(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
17
|
-
void quantize_row_q5_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
18
|
-
void quantize_row_q5_1(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
19
|
-
void quantize_row_q8_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
20
|
-
void quantize_row_q8_1(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
21
|
-
|
22
|
-
void quantize_row_q2_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
23
|
-
void quantize_row_q3_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
24
|
-
void quantize_row_q4_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
25
|
-
void quantize_row_q5_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
26
|
-
void quantize_row_q6_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
27
|
-
void quantize_row_q8_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
28
|
-
|
29
|
-
void quantize_row_tq1_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
30
|
-
void quantize_row_tq2_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
31
|
-
|
32
|
-
void quantize_row_iq4_nl (const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
33
|
-
void quantize_row_iq4_xs (const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
34
|
-
|
35
|
-
// Dot product
|
36
|
-
void lm_ggml_vec_dot_q4_0_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
37
|
-
void lm_ggml_vec_dot_q4_1_q8_1(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
38
|
-
void lm_ggml_vec_dot_q5_0_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
39
|
-
void lm_ggml_vec_dot_q5_1_q8_1(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
40
|
-
void lm_ggml_vec_dot_q8_0_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
41
|
-
|
42
|
-
void lm_ggml_vec_dot_q2_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
43
|
-
void lm_ggml_vec_dot_q3_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
44
|
-
void lm_ggml_vec_dot_q4_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
45
|
-
void lm_ggml_vec_dot_q5_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
46
|
-
void lm_ggml_vec_dot_q6_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
47
|
-
|
48
|
-
void lm_ggml_vec_dot_tq1_0_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
49
|
-
void lm_ggml_vec_dot_tq2_0_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
50
|
-
|
51
|
-
void lm_ggml_vec_dot_iq2_xxs_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
52
|
-
void lm_ggml_vec_dot_iq2_xs_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
53
|
-
void lm_ggml_vec_dot_iq2_s_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
54
|
-
void lm_ggml_vec_dot_iq3_xxs_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
55
|
-
void lm_ggml_vec_dot_iq1_s_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
56
|
-
void lm_ggml_vec_dot_iq1_m_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
57
|
-
void lm_ggml_vec_dot_iq4_nl_q8_0 (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
58
|
-
void lm_ggml_vec_dot_iq4_xs_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
59
|
-
void lm_ggml_vec_dot_iq3_s_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
60
|
-
|
61
|
-
#ifdef __cplusplus
|
62
|
-
}
|
63
|
-
#endif
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#define LM_GGML_COMMON_DECL_C
|
4
|
+
#include "ggml-common.h"
|
5
|
+
|
6
|
+
#include "ggml.h"
|
7
|
+
|
8
|
+
// GGML CPU internal header
|
9
|
+
|
10
|
+
#ifdef __cplusplus
|
11
|
+
extern "C" {
|
12
|
+
#endif
|
13
|
+
|
14
|
+
// Quantization
|
15
|
+
void quantize_row_q4_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
16
|
+
void quantize_row_q4_1(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
17
|
+
void quantize_row_q5_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
18
|
+
void quantize_row_q5_1(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
19
|
+
void quantize_row_q8_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
20
|
+
void quantize_row_q8_1(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
21
|
+
|
22
|
+
void quantize_row_q2_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
23
|
+
void quantize_row_q3_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
24
|
+
void quantize_row_q4_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
25
|
+
void quantize_row_q5_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
26
|
+
void quantize_row_q6_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
27
|
+
void quantize_row_q8_K(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
28
|
+
|
29
|
+
void quantize_row_tq1_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
30
|
+
void quantize_row_tq2_0(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
31
|
+
|
32
|
+
void quantize_row_iq4_nl (const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
33
|
+
void quantize_row_iq4_xs (const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
|
34
|
+
|
35
|
+
// Dot product
|
36
|
+
void lm_ggml_vec_dot_q4_0_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
37
|
+
void lm_ggml_vec_dot_q4_1_q8_1(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
38
|
+
void lm_ggml_vec_dot_q5_0_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
39
|
+
void lm_ggml_vec_dot_q5_1_q8_1(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
40
|
+
void lm_ggml_vec_dot_q8_0_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
41
|
+
|
42
|
+
void lm_ggml_vec_dot_q2_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
43
|
+
void lm_ggml_vec_dot_q3_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
44
|
+
void lm_ggml_vec_dot_q4_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
45
|
+
void lm_ggml_vec_dot_q5_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
46
|
+
void lm_ggml_vec_dot_q6_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
47
|
+
|
48
|
+
void lm_ggml_vec_dot_tq1_0_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
49
|
+
void lm_ggml_vec_dot_tq2_0_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
50
|
+
|
51
|
+
void lm_ggml_vec_dot_iq2_xxs_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
52
|
+
void lm_ggml_vec_dot_iq2_xs_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
53
|
+
void lm_ggml_vec_dot_iq2_s_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
54
|
+
void lm_ggml_vec_dot_iq3_xxs_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
55
|
+
void lm_ggml_vec_dot_iq1_s_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
56
|
+
void lm_ggml_vec_dot_iq1_m_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
57
|
+
void lm_ggml_vec_dot_iq4_nl_q8_0 (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
58
|
+
void lm_ggml_vec_dot_iq4_xs_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
59
|
+
void lm_ggml_vec_dot_iq3_s_q8_K (int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc);
|
60
|
+
|
61
|
+
#ifdef __cplusplus
|
62
|
+
}
|
63
|
+
#endif
|
package/cpp/ggml-cpu.c
CHANGED
@@ -10,6 +10,7 @@
|
|
10
10
|
#include "ggml-quants.h"
|
11
11
|
#include "ggml-cpu-quants.h"
|
12
12
|
#include "ggml-threading.h"
|
13
|
+
#include "amx/amx.h"
|
13
14
|
#include "ggml.h"
|
14
15
|
|
15
16
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
@@ -109,10 +110,11 @@ static lm_ggml_fp16_t lm_ggml_table_gelu_quick_f16[1 << 16];
|
|
109
110
|
#if defined(__ARM_ARCH)
|
110
111
|
struct lm_ggml_arm_arch_features_type {
|
111
112
|
int has_neon;
|
113
|
+
int has_dotprod;
|
112
114
|
int has_i8mm;
|
113
115
|
int has_sve;
|
114
116
|
int sve_cnt;
|
115
|
-
} lm_ggml_arm_arch_features = {-1, -1, -1, 0};
|
117
|
+
} lm_ggml_arm_arch_features = {-1, -1, -1, -1, 0};
|
116
118
|
#endif
|
117
119
|
|
118
120
|
|
@@ -446,6 +448,15 @@ static const struct lm_ggml_type_traits_cpu type_traits_cpu[LM_GGML_TYPE_COUNT]
|
|
446
448
|
.vec_dot_type = LM_GGML_TYPE_Q8_K,
|
447
449
|
.nrows = 1,
|
448
450
|
},
|
451
|
+
[LM_GGML_TYPE_IQ4_NL_4_4] = {
|
452
|
+
.from_float = NULL,
|
453
|
+
.vec_dot = NULL,
|
454
|
+
.vec_dot_type = LM_GGML_TYPE_Q8_0,
|
455
|
+
.nrows = 1,
|
456
|
+
.ncols = 4,
|
457
|
+
.gemv = lm_ggml_gemv_iq4_nl_4x4_q8_0,
|
458
|
+
.gemm = lm_ggml_gemm_iq4_nl_4x4_q8_0,
|
459
|
+
},
|
449
460
|
};
|
450
461
|
|
451
462
|
const struct lm_ggml_type_traits_cpu * lm_ggml_get_type_traits_cpu(enum lm_ggml_type type) {
|
@@ -614,7 +625,7 @@ do { \
|
|
614
625
|
for (int i = 0; i < offset; ++i) { \
|
615
626
|
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
616
627
|
} \
|
617
|
-
res = _mm512_reduce_add_ps(x[0]); \
|
628
|
+
res = (lm_ggml_float) _mm512_reduce_add_ps(x[0]); \
|
618
629
|
} while (0)
|
619
630
|
|
620
631
|
// TODO: is this optimal ?
|
@@ -664,7 +675,7 @@ do { \
|
|
664
675
|
for (int i = 0; i < offset; ++i) { \
|
665
676
|
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
666
677
|
} \
|
667
|
-
res = _mm512_reduce_add_ps(x[0]); \
|
678
|
+
res = (lm_ggml_float) _mm512_reduce_add_ps(x[0]); \
|
668
679
|
} while (0)
|
669
680
|
|
670
681
|
#define LM_GGML_F16_VEC LM_GGML_F32Cx16
|
@@ -675,8 +686,8 @@ do { \
|
|
675
686
|
#define LM_GGML_F16_VEC_FMA LM_GGML_F32Cx16_FMA
|
676
687
|
#define LM_GGML_F16_VEC_ADD LM_GGML_F32Cx16_ADD
|
677
688
|
#define LM_GGML_F16_VEC_MUL LM_GGML_F32Cx16_MUL
|
678
|
-
#define LM_GGML_F16_VEC_REDUCE LM_GGML_F32Cx16_REDUCE
|
679
689
|
|
690
|
+
#define LM_GGML_F16_VEC_REDUCE LM_GGML_F32Cx16_REDUCE
|
680
691
|
#elif defined(__AVX__)
|
681
692
|
|
682
693
|
#define LM_GGML_SIMD
|
@@ -1168,28 +1179,28 @@ static inline void __lasx_f32cx8_store(lm_ggml_fp16_t * x, __m256 y) {
|
|
1168
1179
|
#define LM_GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
|
1169
1180
|
#define LM_GGML_F32x4_ADD __lsx_vfadd_s
|
1170
1181
|
#define LM_GGML_F32x4_MUL __lsx_vfmul_s
|
1171
|
-
#define LM_GGML_F32x4_REDUCE(res, x)
|
1172
|
-
{
|
1173
|
-
int offset = LM_GGML_F32_ARR >> 1;
|
1174
|
-
for (int i = 0; i < offset; ++i) {
|
1175
|
-
x[i] = __lsx_vfadd_s(x[i], x[offset+i]);
|
1176
|
-
}
|
1177
|
-
offset >>= 1;
|
1178
|
-
for (int i = 0; i < offset; ++i) {
|
1179
|
-
x[i] = __lsx_vfadd_s(x[i], x[offset+i]);
|
1180
|
-
}
|
1181
|
-
offset >>= 1;
|
1182
|
-
for (int i = 0; i < offset; ++i) {
|
1183
|
-
x[i] = __lsx_vfadd_s(x[i], x[offset+i]);
|
1184
|
-
}
|
1185
|
-
__m128i tmp
|
1186
|
-
tmp
|
1187
|
-
tmp
|
1188
|
-
const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88);
|
1189
|
-
tmp
|
1190
|
-
tmp
|
1191
|
-
tmp
|
1192
|
-
res
|
1182
|
+
#define LM_GGML_F32x4_REDUCE(res, x) \
|
1183
|
+
{ \
|
1184
|
+
int offset = LM_GGML_F32_ARR >> 1; \
|
1185
|
+
for (int i = 0; i < offset; ++i) { \
|
1186
|
+
x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
|
1187
|
+
} \
|
1188
|
+
offset >>= 1; \
|
1189
|
+
for (int i = 0; i < offset; ++i) { \
|
1190
|
+
x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
|
1191
|
+
} \
|
1192
|
+
offset >>= 1; \
|
1193
|
+
for (int i = 0; i < offset; ++i) { \
|
1194
|
+
x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
|
1195
|
+
} \
|
1196
|
+
__m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
|
1197
|
+
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
|
1198
|
+
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
|
1199
|
+
const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
|
1200
|
+
tmp = __lsx_vsrli_d((__m128i) t0, 32); \
|
1201
|
+
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
|
1202
|
+
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
|
1203
|
+
res = (lm_ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
|
1193
1204
|
}
|
1194
1205
|
|
1195
1206
|
#define LM_GGML_F32_VEC LM_GGML_F32x4
|
@@ -1357,31 +1368,15 @@ struct lm_ggml_compute_state {
|
|
1357
1368
|
int ith;
|
1358
1369
|
};
|
1359
1370
|
|
1360
|
-
struct lm_ggml_compute_params {
|
1361
|
-
// ith = thread index, nth = number of threads
|
1362
|
-
int ith, nth;
|
1363
|
-
|
1364
|
-
// work buffer for all threads
|
1365
|
-
size_t wsize;
|
1366
|
-
void * wdata;
|
1367
|
-
|
1368
|
-
struct lm_ggml_threadpool * threadpool;
|
1369
|
-
};
|
1370
|
-
|
1371
1371
|
//
|
1372
1372
|
// fundamental operations
|
1373
1373
|
//
|
1374
1374
|
|
1375
1375
|
inline static void lm_ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
|
1376
|
-
|
1377
1376
|
inline static void lm_ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
|
1378
|
-
|
1379
1377
|
inline static void lm_ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
|
1380
|
-
|
1381
1378
|
inline static void lm_ggml_vec_set_f16(const int n, lm_ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
|
1382
|
-
|
1383
1379
|
inline static void lm_ggml_vec_set_bf16(const int n, lm_ggml_bf16_t * x, const lm_ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
|
1384
|
-
|
1385
1380
|
inline static void lm_ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
|
1386
1381
|
inline static void lm_ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
|
1387
1382
|
inline static void lm_ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
|
@@ -2276,7 +2271,7 @@ struct lm_ggml_state {
|
|
2276
2271
|
|
2277
2272
|
static struct lm_ggml_state g_state = {0};
|
2278
2273
|
|
2279
|
-
|
2274
|
+
void lm_ggml_barrier(struct lm_ggml_threadpool * tp) {
|
2280
2275
|
int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
|
2281
2276
|
if (n_threads == 1) {
|
2282
2277
|
return;
|
@@ -2369,7 +2364,7 @@ void lm_ggml_numa_init(enum lm_ggml_numa_strategy numa_flag) {
|
|
2369
2364
|
// figure out which node we're on
|
2370
2365
|
uint current_cpu;
|
2371
2366
|
int getcpu_ret = 0;
|
2372
|
-
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >
|
2367
|
+
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 33) || defined(__COSMOPOLITAN__)
|
2373
2368
|
getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
|
2374
2369
|
#else
|
2375
2370
|
// old glibc doesn't have a wrapper for this call. Fall back on direct syscall
|
@@ -2439,6 +2434,7 @@ static void lm_ggml_init_arm_arch_features(void) {
|
|
2439
2434
|
uint32_t hwcap2 = getauxval(AT_HWCAP2);
|
2440
2435
|
|
2441
2436
|
lm_ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
|
2437
|
+
lm_ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
|
2442
2438
|
lm_ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
|
2443
2439
|
lm_ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
|
2444
2440
|
|
@@ -2453,6 +2449,11 @@ static void lm_ggml_init_arm_arch_features(void) {
|
|
2453
2449
|
}
|
2454
2450
|
lm_ggml_arm_arch_features.has_neon = oldp;
|
2455
2451
|
|
2452
|
+
if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) {
|
2453
|
+
oldp = 0;
|
2454
|
+
}
|
2455
|
+
lm_ggml_arm_arch_features.has_dotprod = oldp;
|
2456
|
+
|
2456
2457
|
if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
|
2457
2458
|
oldp = 0;
|
2458
2459
|
}
|
@@ -7439,6 +7440,13 @@ static void lm_ggml_compute_forward_mul_mat(
|
|
7439
7440
|
type = (enum lm_ggml_type)(intptr_t)src0->extra;
|
7440
7441
|
}
|
7441
7442
|
|
7443
|
+
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
|
7444
|
+
if (src0->buffer && lm_ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
|
7445
|
+
lm_ggml_backend_amx_mul_mat(params, dst);
|
7446
|
+
return;
|
7447
|
+
}
|
7448
|
+
#endif
|
7449
|
+
|
7442
7450
|
enum lm_ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
7443
7451
|
lm_ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
|
7444
7452
|
lm_ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat;
|
@@ -7560,14 +7568,6 @@ UseGgmlGemm2:;
|
|
7560
7568
|
// This is the size of the rest of the dimensions of the result
|
7561
7569
|
const int64_t nr1 = ne1 * ne2 * ne3;
|
7562
7570
|
|
7563
|
-
// dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
|
7564
|
-
int64_t num_rows_per_vec_dot = vec_dot_num_rows;
|
7565
|
-
// TODO: currently the mmla kernels support only even numbered rows/cols.
|
7566
|
-
// this check can be removed once they are extended to support odd numbered rows/cols too
|
7567
|
-
if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
|
7568
|
-
num_rows_per_vec_dot = 1;
|
7569
|
-
}
|
7570
|
-
|
7571
7571
|
// Now select a reasonable chunk size.
|
7572
7572
|
int chunk_size = 16;
|
7573
7573
|
|
@@ -7630,6 +7630,15 @@ UseGgmlGemm2:;
|
|
7630
7630
|
const int64_t ir1_start = dr1 * ith1;
|
7631
7631
|
const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
|
7632
7632
|
|
7633
|
+
// dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
|
7634
|
+
int64_t num_rows_per_vec_dot = vec_dot_num_rows;
|
7635
|
+
|
7636
|
+
// these checks are needed to avoid crossing dim1 boundaries
|
7637
|
+
// can be optimized, but the logic would become more complicated, so keeping it like this for simplicity
|
7638
|
+
if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
|
7639
|
+
num_rows_per_vec_dot = 1;
|
7640
|
+
}
|
7641
|
+
|
7633
7642
|
lm_ggml_compute_forward_mul_mat_one_chunk(params, dst, type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
|
7634
7643
|
|
7635
7644
|
if (nth >= nchunk0 * nchunk1) {
|
@@ -9133,6 +9142,7 @@ static void lm_ggml_compute_forward_clamp(
|
|
9133
9142
|
case LM_GGML_TYPE_Q4_0_4_4:
|
9134
9143
|
case LM_GGML_TYPE_Q4_0_4_8:
|
9135
9144
|
case LM_GGML_TYPE_Q4_0_8_8:
|
9145
|
+
case LM_GGML_TYPE_IQ4_NL_4_4:
|
9136
9146
|
case LM_GGML_TYPE_I8:
|
9137
9147
|
case LM_GGML_TYPE_I16:
|
9138
9148
|
case LM_GGML_TYPE_I32:
|
@@ -12216,11 +12226,16 @@ static void lm_ggml_compute_forward_opt_step_adamw_f32(
|
|
12216
12226
|
const struct lm_ggml_compute_params * params,
|
12217
12227
|
struct lm_ggml_tensor * dst) {
|
12218
12228
|
|
12219
|
-
const struct lm_ggml_tensor * src0
|
12220
|
-
const struct lm_ggml_tensor * src0_grad
|
12221
|
-
const struct lm_ggml_tensor * src0_grad_m
|
12222
|
-
const struct lm_ggml_tensor * src0_grad_v
|
12229
|
+
const struct lm_ggml_tensor * src0 = dst->src[0];
|
12230
|
+
const struct lm_ggml_tensor * src0_grad = dst->src[1];
|
12231
|
+
const struct lm_ggml_tensor * src0_grad_m = dst->src[2];
|
12232
|
+
const struct lm_ggml_tensor * src0_grad_v = dst->src[3];
|
12233
|
+
const struct lm_ggml_tensor * adamw_params = dst->src[4];
|
12234
|
+
|
12223
12235
|
LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src0_grad));
|
12236
|
+
LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src0_grad_m));
|
12237
|
+
LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src0_grad_v));
|
12238
|
+
LM_GGML_ASSERT(lm_ggml_nelements(adamw_params) == 7);
|
12224
12239
|
|
12225
12240
|
const int ith = params->ith;
|
12226
12241
|
const int nth = params->nth;
|
@@ -12237,16 +12252,14 @@ static void lm_ggml_compute_forward_opt_step_adamw_f32(
|
|
12237
12252
|
const int ir0 = dr*ith;
|
12238
12253
|
const int ir1 = MIN(ir0 + dr, nr);
|
12239
12254
|
|
12240
|
-
|
12241
|
-
|
12242
|
-
const float
|
12243
|
-
const float
|
12244
|
-
const float
|
12245
|
-
const float
|
12246
|
-
const float
|
12247
|
-
|
12248
|
-
const float beta1h = alpha/(1.0f - powf(beta1, iter));
|
12249
|
-
const float beta2h = 1.0f/(1.0f - powf(beta2, iter));
|
12255
|
+
const float * adamw_params_ptr = lm_ggml_get_data_f32(adamw_params);
|
12256
|
+
const float alpha = adamw_params_ptr[0];
|
12257
|
+
const float beta1 = adamw_params_ptr[1];
|
12258
|
+
const float beta2 = adamw_params_ptr[2];
|
12259
|
+
const float eps = adamw_params_ptr[3];
|
12260
|
+
const float wd = adamw_params_ptr[4];
|
12261
|
+
const float beta1h = adamw_params_ptr[5];
|
12262
|
+
const float beta2h = adamw_params_ptr[6];
|
12250
12263
|
|
12251
12264
|
for (int ir = ir0; ir < ir1; ++ir) {
|
12252
12265
|
const int64_t i03 = ir/(ne02*ne01);
|
@@ -12270,17 +12283,9 @@ static void lm_ggml_compute_forward_opt_step_adamw_f32(
|
|
12270
12283
|
// The weight decay is applied independently of the Adam momenta m and v.
|
12271
12284
|
// This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss.
|
12272
12285
|
// See: https://arxiv.org/pdf/1711.05101v3.pdf
|
12273
|
-
w[i00] = w[i00]*(1.0f - alpha*wd) - mh/vh;
|
12286
|
+
w[i00] = w[i00]*(1.0f - alpha*wd) - alpha*mh/vh;
|
12274
12287
|
}
|
12275
12288
|
}
|
12276
|
-
|
12277
|
-
lm_ggml_barrier(params->threadpool);
|
12278
|
-
if (ith != 0) {
|
12279
|
-
return;
|
12280
|
-
}
|
12281
|
-
|
12282
|
-
iter++;
|
12283
|
-
memcpy(&dst->op_params[0], &iter, sizeof(int64_t));
|
12284
12289
|
}
|
12285
12290
|
|
12286
12291
|
static void lm_ggml_compute_forward_opt_step_adamw(
|
@@ -13281,10 +13286,16 @@ struct lm_ggml_cplan lm_ggml_graph_plan(
|
|
13281
13286
|
} break;
|
13282
13287
|
case LM_GGML_OP_MUL_MAT:
|
13283
13288
|
{
|
13289
|
+
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
|
13290
|
+
if (node->src[0]->buffer && lm_ggml_backend_amx_buft_is_amx(node->src[0]->buffer->buft)) {
|
13291
|
+
cur = lm_ggml_backend_amx_desired_wsize(node);
|
13292
|
+
}
|
13293
|
+
#endif
|
13284
13294
|
const enum lm_ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
|
13285
13295
|
|
13286
13296
|
if (node->src[1]->type != vec_dot_type) {
|
13287
|
-
|
13297
|
+
size_t cur2 = lm_ggml_row_size(vec_dot_type, lm_ggml_nelements(node->src[1]));
|
13298
|
+
cur = MAX(cur, cur2);
|
13288
13299
|
}
|
13289
13300
|
} break;
|
13290
13301
|
case LM_GGML_OP_MUL_MAT_ID:
|
@@ -13583,29 +13594,6 @@ static void lm_ggml_graph_compute_kickoff(struct lm_ggml_threadpool * threadpool
|
|
13583
13594
|
|
13584
13595
|
#endif // LM_GGML_USE_OPENMP
|
13585
13596
|
|
13586
|
-
void lm_ggml_threadpool_params_init(struct lm_ggml_threadpool_params * p, int n_threads) {
|
13587
|
-
p->n_threads = n_threads;
|
13588
|
-
p->prio = 0; // default priority (usually means normal or inherited)
|
13589
|
-
p->poll = 50; // hybrid-polling enabled
|
13590
|
-
p->strict_cpu = false; // no strict placement (all threads share same cpumask)
|
13591
|
-
p->paused = false; // threads are ready to go
|
13592
|
-
memset(p->cpumask, 0, LM_GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
|
13593
|
-
}
|
13594
|
-
|
13595
|
-
struct lm_ggml_threadpool_params lm_ggml_threadpool_params_default(int n_threads) {
|
13596
|
-
struct lm_ggml_threadpool_params p;
|
13597
|
-
lm_ggml_threadpool_params_init(&p, n_threads);
|
13598
|
-
return p;
|
13599
|
-
}
|
13600
|
-
|
13601
|
-
bool lm_ggml_threadpool_params_match(const struct lm_ggml_threadpool_params * p0, const struct lm_ggml_threadpool_params * p1) {
|
13602
|
-
if (p0->n_threads != p1->n_threads ) return false;
|
13603
|
-
if (p0->prio != p1->prio ) return false;
|
13604
|
-
if (p0->poll != p1->poll ) return false;
|
13605
|
-
if (p0->strict_cpu != p1->strict_cpu ) return false;
|
13606
|
-
return memcmp(p0->cpumask, p1->cpumask, LM_GGML_MAX_N_THREADS) == 0;
|
13607
|
-
}
|
13608
|
-
|
13609
13597
|
static struct lm_ggml_threadpool * lm_ggml_threadpool_new_impl(
|
13610
13598
|
struct lm_ggml_threadpool_params * tpp,
|
13611
13599
|
struct lm_ggml_cgraph * cgraph,
|
@@ -13901,15 +13889,23 @@ int lm_ggml_cpu_has_vsx(void) {
|
|
13901
13889
|
}
|
13902
13890
|
|
13903
13891
|
int lm_ggml_cpu_has_neon(void) {
|
13904
|
-
#if defined(__ARM_ARCH)
|
13892
|
+
#if defined(__ARM_ARCH) && defined(__ARM_NEON)
|
13905
13893
|
return lm_ggml_arm_arch_features.has_neon;
|
13906
13894
|
#else
|
13907
13895
|
return 0;
|
13908
13896
|
#endif
|
13909
13897
|
}
|
13910
13898
|
|
13899
|
+
int lm_ggml_cpu_has_dotprod(void) {
|
13900
|
+
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
|
13901
|
+
return lm_ggml_arm_arch_features.has_dotprod;
|
13902
|
+
#else
|
13903
|
+
return 0;
|
13904
|
+
#endif
|
13905
|
+
}
|
13906
|
+
|
13911
13907
|
int lm_ggml_cpu_has_sve(void) {
|
13912
|
-
#if defined(__ARM_ARCH)
|
13908
|
+
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
|
13913
13909
|
return lm_ggml_arm_arch_features.has_sve;
|
13914
13910
|
#else
|
13915
13911
|
return 0;
|
@@ -13917,7 +13913,7 @@ int lm_ggml_cpu_has_sve(void) {
|
|
13917
13913
|
}
|
13918
13914
|
|
13919
13915
|
int lm_ggml_cpu_has_matmul_int8(void) {
|
13920
|
-
#if defined(__ARM_ARCH)
|
13916
|
+
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8)
|
13921
13917
|
return lm_ggml_arm_arch_features.has_i8mm;
|
13922
13918
|
#else
|
13923
13919
|
return 0;
|
@@ -13925,7 +13921,7 @@ int lm_ggml_cpu_has_matmul_int8(void) {
|
|
13925
13921
|
}
|
13926
13922
|
|
13927
13923
|
int lm_ggml_cpu_get_sve_cnt(void) {
|
13928
|
-
#if defined(__ARM_ARCH)
|
13924
|
+
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
|
13929
13925
|
return lm_ggml_arm_arch_features.sve_cnt;
|
13930
13926
|
#else
|
13931
13927
|
return 0;
|