cui-llama.rn 1.2.6 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/android/src/main/CMakeLists.txt +26 -6
- package/android/src/main/java/com/rnllama/LlamaContext.java +115 -27
- package/android/src/main/java/com/rnllama/RNLlama.java +40 -7
- package/android/src/main/jni.cpp +228 -40
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +9 -4
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +9 -4
- package/cpp/amx/amx.cpp +196 -0
- package/cpp/amx/amx.h +20 -0
- package/cpp/amx/common.h +101 -0
- package/cpp/amx/mmq.cpp +2524 -0
- package/cpp/amx/mmq.h +16 -0
- package/cpp/common.cpp +118 -251
- package/cpp/common.h +53 -30
- package/cpp/ggml-aarch64.c +46 -3395
- package/cpp/ggml-aarch64.h +0 -20
- package/cpp/ggml-alloc.c +6 -8
- package/cpp/ggml-backend-impl.h +33 -11
- package/cpp/ggml-backend-reg.cpp +423 -0
- package/cpp/ggml-backend.cpp +14 -676
- package/cpp/ggml-backend.h +46 -9
- package/cpp/ggml-common.h +6 -0
- package/cpp/ggml-cpu-aarch64.c +3823 -0
- package/cpp/ggml-cpu-aarch64.h +32 -0
- package/cpp/ggml-cpu-impl.h +14 -242
- package/cpp/ggml-cpu-quants.c +10835 -0
- package/cpp/ggml-cpu-quants.h +63 -0
- package/cpp/ggml-cpu.c +13971 -13720
- package/cpp/ggml-cpu.cpp +715 -0
- package/cpp/ggml-cpu.h +65 -63
- package/cpp/ggml-impl.h +285 -25
- package/cpp/ggml-metal.h +8 -8
- package/cpp/ggml-metal.m +1221 -728
- package/cpp/ggml-quants.c +189 -10681
- package/cpp/ggml-quants.h +78 -125
- package/cpp/ggml-threading.cpp +12 -0
- package/cpp/ggml-threading.h +12 -0
- package/cpp/ggml.c +688 -1460
- package/cpp/ggml.h +58 -244
- package/cpp/json-schema-to-grammar.cpp +1045 -1045
- package/cpp/json.hpp +24766 -24766
- package/cpp/llama-sampling.cpp +5 -2
- package/cpp/llama.cpp +409 -123
- package/cpp/llama.h +8 -4
- package/cpp/rn-llama.hpp +89 -25
- package/cpp/sampling.cpp +42 -3
- package/cpp/sampling.h +22 -1
- package/cpp/sgemm.cpp +608 -0
- package/cpp/speculative.cpp +270 -0
- package/cpp/speculative.h +28 -0
- package/cpp/unicode.cpp +11 -0
- package/ios/RNLlama.mm +43 -20
- package/ios/RNLlamaContext.h +9 -3
- package/ios/RNLlamaContext.mm +146 -33
- package/jest/mock.js +0 -1
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/grammar.js +4 -2
- package/lib/commonjs/grammar.js.map +1 -1
- package/lib/commonjs/index.js +52 -15
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/grammar.js +2 -1
- package/lib/module/grammar.js.map +1 -1
- package/lib/module/index.js +51 -15
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +122 -8
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/grammar.d.ts +5 -6
- package/lib/typescript/grammar.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +15 -6
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +2 -1
- package/src/NativeRNLlama.ts +135 -13
- package/src/grammar.ts +10 -8
- package/src/index.ts +104 -28
package/cpp/amx/amx.h
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#pragma once  // NOTE(review): added — this header had no include guard, unlike sibling common.h

#include "ggml-backend.h"
#include "ggml-cpu-impl.h"

#ifdef __cplusplus
extern "C" {
#endif

// The AMX entry points exist only when the compiler targets both AMX-INT8
// and AVX512-VNNI; otherwise this header declares nothing.
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)

// Buffer type backed by the AMX weight layout (defined in amx.cpp — not visible here).
lm_ggml_backend_buffer_type_t lm_ggml_backend_amx_buffer_type(void);

// Returns true when buft is the AMX buffer type above.
bool lm_ggml_backend_amx_buft_is_amx(lm_ggml_backend_buffer_type_t buft);

// Returns true when op can be handled by the AMX path — presumably a
// mul_mat whose type has AMX kernels; confirm against amx.cpp.
bool lm_ggml_backend_amx_device_supports_op(const struct lm_ggml_tensor * op);

// Computes dst using the AMX matrix-multiply kernels.
void lm_ggml_backend_amx_mul_mat(const struct lm_ggml_compute_params * params, struct lm_ggml_tensor * dst);

// Scratch-buffer size (bytes) the AMX mul_mat wants for dst.
size_t lm_ggml_backend_amx_desired_wsize(const struct lm_ggml_tensor * dst);

#endif

#ifdef __cplusplus
}
#endif
|
package/cpp/amx/common.h
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
#pragma once

#include "ggml.h"
#include "ggml-cpu-impl.h"

#include <algorithm>
#include <memory>
#include <type_traits>

#if defined(_OPENMP)
#include <omp.h>
#endif

// Tile blocking sizes used by the AMX matmul kernels: each kernel step
// produces a TILE_M x TILE_N output block over a TILE_K-deep reduction.
#define TILE_M 16
#define TILE_N 16
#define TILE_K 32
// VNNI packs 4 low-precision values per 32-bit lane — presumably int8;
// confirm against the mmq.cpp kernels.
#define VNNI_BLK 4

#define AMX_BLK_SIZE 32

// Symbolic indices for the eight AMX tile registers (tmm0..tmm7).
#define TMM0 0
#define TMM1 1
#define TMM2 2
#define TMM3 3
#define TMM4 4
#define TMM5 5
#define TMM6 6
#define TMM7 7
|
29
|
+
|
30
|
+
// parallel routines

// Ceiling integer division: how many den-sized chunks are needed to cover num.
// Restricted to integral types via SFINAE.
template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
inline T div_up(T num, T den) {
    return (num + den - 1) / den;
}
|
33
|
+
|
34
|
+
// Partition the range [0, n) across nth workers and return worker ith's
// sub-range [n_start, n_end). The active scheme hands every worker a fixed
// chunk of ceil(n/nth) elements, so the last worker(s) may get a short or
// empty range (n_end is clamped to n).
template <typename T>
inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
#if 0
    // onednn partition pattern (disabled — kept for reference)
    T& n_my = n_end;
    if (nth <= 1 || n == 0) {
        n_start = 0;
        n_my = n;
    } else {
        T n1 = div_up(n, nth);
        T n2 = n1 - 1;
        T T1 = n - n2 * nth;
        n_my = ith < T1 ? n1 : n2;
        n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
    }
    n_end += n_start;
#else
    // pytorch aten partition pattern
    const T chunk = div_up(n, nth);
    n_start = ith * chunk;
    n_end   = std::min(n_start + chunk, n);
#endif
}
|
57
|
+
|
58
|
+
template <typename func_t>
|
59
|
+
inline void parallel_for(int nth, int n, const func_t& f) {
|
60
|
+
#if defined(_OPENMP)
|
61
|
+
#pragma omp parallel num_threads(nth)
|
62
|
+
{
|
63
|
+
//int nth = omp_get_num_threads();
|
64
|
+
int ith = omp_get_thread_num();
|
65
|
+
int tbegin, tend;
|
66
|
+
balance211(n, nth, ith, tbegin, tend);
|
67
|
+
f(tbegin, tend);
|
68
|
+
}
|
69
|
+
#else
|
70
|
+
f(0, n);
|
71
|
+
|
72
|
+
LM_GGML_UNUSED(nth);
|
73
|
+
#endif
|
74
|
+
}
|
75
|
+
|
76
|
+
template <typename func_t>
|
77
|
+
inline void parallel_for_ggml(const lm_ggml_compute_params * params, int n, const func_t & f) {
|
78
|
+
int tbegin, tend;
|
79
|
+
balance211(n, params->nth, params->ith, tbegin, tend);
|
80
|
+
f(tbegin, tend);
|
81
|
+
lm_ggml_barrier(params->threadpool); // TODO: might not always be needed
|
82
|
+
}
|
83
|
+
|
84
|
+
// quantized types that have AMX support
|
85
|
+
inline bool qtype_has_amx_kernels(const enum lm_ggml_type type) {
|
86
|
+
// TODO: fix padding for vnni format
|
87
|
+
return (type == LM_GGML_TYPE_Q4_0) ||
|
88
|
+
(type == LM_GGML_TYPE_Q4_1) ||
|
89
|
+
(type == LM_GGML_TYPE_Q8_0) ||
|
90
|
+
(type == LM_GGML_TYPE_Q4_K) ||
|
91
|
+
(type == LM_GGML_TYPE_Q5_K) ||
|
92
|
+
(type == LM_GGML_TYPE_Q6_K) ||
|
93
|
+
(type == LM_GGML_TYPE_IQ4_XS);
|
94
|
+
}
|
95
|
+
|
96
|
+
// ggml backend context
struct lm_ggml_backend_amx_context {
    // number of threads the AMX backend schedules work across
    int n_threads = LM_GGML_DEFAULT_N_THREADS;
    // heap scratch area reused across compute calls; contents and layout
    // are managed by the AMX kernels (see amx.cpp — not visible here)
    std::unique_ptr<char[]> work_data;
    // current capacity of work_data in bytes (0 until first allocation)
    size_t work_size = 0;
};
|