@novastera-oss/llamarn 0.5.2 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/CMakeLists.txt +47 -21
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/PureCppImpl.cpp +80 -6
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/convert_hf_to_gguf.py +15 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +1 -2
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +99 -364
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +14 -13
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +15 -3
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +36 -25
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +80 -7
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +6 -0
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +19 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +22 -0
- package/cpp/llama.cpp/src/llama-arch.h +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +21 -1
- package/cpp/llama.cpp/src/models/models.h +4 -0
- package/cpp/llama.cpp/src/models/rnd1.cpp +126 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6403 -6395
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6366 -6358
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4815 -4809
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
|
@@ -66,6 +66,13 @@ static inline bool dma_queue_push(dma_queue * q,
|
|
|
66
66
|
desc->desctype = HEXAGON_UDMA_DESC_DESCTYPE_TYPE1;
|
|
67
67
|
desc->dstbypass = 1;
|
|
68
68
|
desc->srcbypass = 1;
|
|
69
|
+
#if __HVX_ARCH__ >= 73
|
|
70
|
+
desc->dstbypass = 1;
|
|
71
|
+
desc->srcbypass = 1;
|
|
72
|
+
#else
|
|
73
|
+
desc->dstbypass = 0;
|
|
74
|
+
desc->srcbypass = 1;
|
|
75
|
+
#endif
|
|
69
76
|
desc->order = 0;
|
|
70
77
|
desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE;
|
|
71
78
|
desc->src = (void *) src;
|
|
@@ -16,13 +16,8 @@
|
|
|
16
16
|
#include "hvx-utils.h"
|
|
17
17
|
#include "ops-utils.h"
|
|
18
18
|
|
|
19
|
-
static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec) {
|
|
20
|
-
|
|
21
|
-
static const float kMaxExp = 88.02f; // log(INF)
|
|
22
|
-
|
|
23
|
-
const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
|
|
24
|
-
const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
|
|
25
|
-
const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
|
|
19
|
+
static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec, HVX_Vector max_exp, HVX_Vector inf) {
|
|
20
|
+
const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
|
|
26
21
|
|
|
27
22
|
HVX_Vector out = hvx_vec_exp_fp32(in_vec);
|
|
28
23
|
|
|
@@ -47,6 +42,12 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
|
|
|
47
42
|
|
|
48
43
|
HVX_Vector vec_out = Q6_V_vzero();
|
|
49
44
|
|
|
45
|
+
static const float kInf = INFINITY;
|
|
46
|
+
static const float kMaxExp = 88.02f; // log(INF)
|
|
47
|
+
|
|
48
|
+
const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
|
|
49
|
+
const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
|
|
50
|
+
|
|
50
51
|
if (0 == unaligned_loop) {
|
|
51
52
|
HVX_Vector * p_vec_in1 = (HVX_Vector *) src;
|
|
52
53
|
HVX_Vector * p_vec_out = (HVX_Vector *) dst;
|
|
@@ -55,9 +56,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
|
|
|
55
56
|
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
|
56
57
|
if (true == negate) {
|
|
57
58
|
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++);
|
|
58
|
-
*p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in);
|
|
59
|
+
*p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
|
|
59
60
|
} else {
|
|
60
|
-
*p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++);
|
|
61
|
+
*p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++, max_exp, inf);
|
|
61
62
|
}
|
|
62
63
|
}
|
|
63
64
|
} else {
|
|
@@ -67,9 +68,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
|
|
|
67
68
|
|
|
68
69
|
if (true == negate) {
|
|
69
70
|
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
|
|
70
|
-
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in);
|
|
71
|
+
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
|
|
71
72
|
} else {
|
|
72
|
-
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in);
|
|
73
|
+
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in, max_exp, inf);
|
|
73
74
|
}
|
|
74
75
|
}
|
|
75
76
|
}
|
|
@@ -83,9 +84,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
|
|
|
83
84
|
if (true == negate) {
|
|
84
85
|
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
|
|
85
86
|
|
|
86
|
-
vec_out = hvx_vec_exp_fp32_guard(neg_vec_in);
|
|
87
|
+
vec_out = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
|
|
87
88
|
} else {
|
|
88
|
-
vec_out = hvx_vec_exp_fp32_guard(in);
|
|
89
|
+
vec_out = hvx_vec_exp_fp32_guard(in, max_exp, inf);
|
|
89
90
|
}
|
|
90
91
|
|
|
91
92
|
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out);
|
|
@@ -16,6 +16,15 @@
|
|
|
16
16
|
#include "hvx-utils.h"
|
|
17
17
|
#include "ops-utils.h"
|
|
18
18
|
|
|
19
|
+
static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) {
|
|
20
|
+
HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
|
|
21
|
+
|
|
22
|
+
HVX_Vector masked_out = Q6_V_vand_VV(out, nan_inf_mask);
|
|
23
|
+
const HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out);
|
|
24
|
+
|
|
25
|
+
return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out);
|
|
26
|
+
}
|
|
27
|
+
|
|
19
28
|
void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
|
|
20
29
|
int left_over = num_elems & (VLEN_FP32 - 1);
|
|
21
30
|
int num_elems_whole = num_elems - left_over;
|
|
@@ -32,19 +41,22 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
|
|
|
32
41
|
FARF(HIGH, "hvx_inverse_f32: unaligned loop in hvx op, possibly slower execution\n");
|
|
33
42
|
}
|
|
34
43
|
|
|
44
|
+
static const uint32_t kNanInfMask = 0x7f800000;
|
|
45
|
+
const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(kNanInfMask);
|
|
46
|
+
|
|
35
47
|
if (0 == unaligned_loop) {
|
|
36
48
|
HVX_Vector * p_vec_in = (HVX_Vector *) src;
|
|
37
49
|
HVX_Vector * p_vec_out = (HVX_Vector *) dst;
|
|
38
50
|
|
|
39
51
|
#pragma unroll(4)
|
|
40
52
|
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
|
41
|
-
*p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++);
|
|
53
|
+
*p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++, nan_inf_mask);
|
|
42
54
|
}
|
|
43
55
|
} else {
|
|
44
56
|
#pragma unroll(4)
|
|
45
57
|
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
|
46
58
|
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
|
|
47
|
-
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in);
|
|
59
|
+
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);
|
|
48
60
|
}
|
|
49
61
|
}
|
|
50
62
|
|
|
@@ -53,7 +65,7 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
|
|
|
53
65
|
float * dstf = (float *) dst + num_elems_whole;
|
|
54
66
|
|
|
55
67
|
HVX_Vector in = *(HVX_UVector *) srcf;
|
|
56
|
-
HVX_Vector out = hvx_vec_inverse_fp32_guard(in);
|
|
68
|
+
HVX_Vector out = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);
|
|
57
69
|
|
|
58
70
|
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
|
|
59
71
|
}
|
|
@@ -21,6 +21,26 @@ typedef union {
|
|
|
21
21
|
float fp32[VLEN_FP32];
|
|
22
22
|
} __attribute__((aligned(VLEN), packed)) HVX_VectorAlias;
|
|
23
23
|
|
|
24
|
+
/* Q6_Vsf_equals_Vw is only available on v73+.*/
|
|
25
|
+
#if __HVX_ARCH__ < 73
|
|
26
|
+
static inline HVX_Vector int32_to_qfloat(HVX_Vector const in)
|
|
27
|
+
{
|
|
28
|
+
HVX_Vector const vzero = Q6_V_vzero();
|
|
29
|
+
HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero);
|
|
30
|
+
HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in);
|
|
31
|
+
HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift);
|
|
32
|
+
HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift);
|
|
33
|
+
HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized);
|
|
34
|
+
HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp));
|
|
35
|
+
return ret;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in)
|
|
39
|
+
{
|
|
40
|
+
return Q6_Vsf_equals_Vqf32(int32_to_qfloat(in));
|
|
41
|
+
}
|
|
42
|
+
#endif
|
|
43
|
+
|
|
24
44
|
static inline HVX_Vector hvx_vec_splat_fp32(float i) {
|
|
25
45
|
union {
|
|
26
46
|
float f;
|
|
@@ -726,24 +746,6 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
|
|
|
726
746
|
return Q6_Vsf_equals_Vqf32(r_qf);
|
|
727
747
|
}
|
|
728
748
|
|
|
729
|
-
static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) {
|
|
730
|
-
static const float kInf = INFINITY;
|
|
731
|
-
static const uint32_t kNanMask = 0x7fffffff;
|
|
732
|
-
static const uint32_t kNanMin = 0x7f800000;
|
|
733
|
-
|
|
734
|
-
const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
|
|
735
|
-
const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf);
|
|
736
|
-
|
|
737
|
-
HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
|
|
738
|
-
|
|
739
|
-
const HVX_Vector nan_mask = Q6_V_vsplat_R(kNanMask);
|
|
740
|
-
const HVX_Vector nan_min = Q6_V_vsplat_R(kNanMin);
|
|
741
|
-
HVX_Vector masked_out = Q6_V_vand_VV(out, nan_mask);
|
|
742
|
-
const HVX_VectorPred pred = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, masked_out);
|
|
743
|
-
|
|
744
|
-
return Q6_V_vmux_QVV(pred, out, Q6_V_vzero());
|
|
745
|
-
}
|
|
746
|
-
|
|
747
749
|
#define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022
|
|
748
750
|
#define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777
|
|
749
751
|
#define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267
|
|
@@ -958,14 +960,16 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) {
|
|
|
958
960
|
return Q6_Vsf_equals_Vqf32(temp);
|
|
959
961
|
}
|
|
960
962
|
|
|
961
|
-
static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) {
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
const HVX_VectorPred
|
|
963
|
+
static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v,
|
|
964
|
+
HVX_Vector one,
|
|
965
|
+
HVX_Vector max_exp,
|
|
966
|
+
HVX_Vector min_exp) {
|
|
967
|
+
const HVX_VectorPred pred_max = Q6_Q_vcmp_gt_VsfVsf(max_exp, v);
|
|
968
|
+
const HVX_VectorPred pred_min = Q6_Q_vcmp_gt_VsfVsf(v, min_exp);
|
|
966
969
|
|
|
967
970
|
HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v);
|
|
968
|
-
|
|
971
|
+
out = Q6_V_vmux_QVV(pred_max, out, one);
|
|
972
|
+
return Q6_V_vmux_QVV(pred_min, out, Q6_V_vzero());
|
|
969
973
|
}
|
|
970
974
|
|
|
971
975
|
static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
|
|
@@ -977,9 +981,16 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
|
|
|
977
981
|
const HVX_Vector * restrict v_src = (HVX_Vector *) src;
|
|
978
982
|
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
|
|
979
983
|
|
|
984
|
+
static const float kMinExp = -87.f; // 0
|
|
985
|
+
static const float kMaxExp = 87.f; // 1
|
|
986
|
+
|
|
987
|
+
const HVX_Vector one = hvx_vec_splat_fp32(1.f);
|
|
988
|
+
const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
|
|
989
|
+
const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
|
|
990
|
+
|
|
980
991
|
#pragma unroll(4)
|
|
981
992
|
for (int i = 0; i < step_of_1; i++) {
|
|
982
|
-
v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i]);
|
|
993
|
+
v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp);
|
|
983
994
|
}
|
|
984
995
|
}
|
|
985
996
|
|
|
@@ -143,16 +143,25 @@ AEEResult htp_iface_disable_etm(remote_handle64 handle) {
|
|
|
143
143
|
}
|
|
144
144
|
|
|
145
145
|
static int vtcm_acquire(struct htp_context * ctx) {
|
|
146
|
+
int err;
|
|
146
147
|
if (!ctx->vtcm_valid) {
|
|
147
148
|
// Temporarily bump thread priority to make sure it's higher than other sessions.
|
|
148
149
|
// This way the resource manager will notify the other thread to release VTCM.
|
|
149
150
|
// Note that we need to reaquire VTCM at normal priority for this to work next time.
|
|
150
151
|
qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10);
|
|
151
|
-
HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
|
|
152
|
+
err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
|
|
153
|
+
if (err != 0) {
|
|
154
|
+
FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
|
|
155
|
+
abort();
|
|
156
|
+
}
|
|
152
157
|
HAP_compute_res_release_cached(ctx->vtcm_rctx);
|
|
153
158
|
qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio);
|
|
154
159
|
|
|
155
|
-
HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
|
|
160
|
+
err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
|
|
161
|
+
if (err != 0) {
|
|
162
|
+
FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
|
|
163
|
+
abort();
|
|
164
|
+
}
|
|
156
165
|
ctx->vtcm_valid = true;
|
|
157
166
|
}
|
|
158
167
|
|
|
@@ -201,7 +210,7 @@ static int vtcm_alloc(struct htp_context * ctx) {
|
|
|
201
210
|
HAP_compute_res_attr_init(&attr);
|
|
202
211
|
HAP_compute_res_attr_set_serialize(&attr, 0);
|
|
203
212
|
HAP_compute_res_attr_set_cache_mode(&attr, 1);
|
|
204
|
-
HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size,
|
|
213
|
+
HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, 0, vtcm_size);
|
|
205
214
|
HAP_compute_res_attr_set_release_callback(&attr, vtcm_release_callback, (void *) ctx);
|
|
206
215
|
HAP_compute_res_attr_set_hmx_param(&attr, 1);
|
|
207
216
|
|
|
@@ -24,6 +24,10 @@
|
|
|
24
24
|
#include "hvx-utils.h"
|
|
25
25
|
#include "ops-utils.h"
|
|
26
26
|
|
|
27
|
+
// Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we cant include ggml.h
|
|
28
|
+
#define HTP_ROPE_TYPE_NORMAL 0
|
|
29
|
+
#define HTP_ROPE_TYPE_NEOX 2
|
|
30
|
+
|
|
27
31
|
#define htp_rope_preamble \
|
|
28
32
|
const uint32_t ne00 = src0->ne[0]; \
|
|
29
33
|
const uint32_t ne01 = src0->ne[1]; \
|
|
@@ -146,6 +150,57 @@ static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context
|
|
|
146
150
|
rope_ctx->ext_factor, rope_ctx->theta_scale, rope_ctx->attn_factor);
|
|
147
151
|
}
|
|
148
152
|
|
|
153
|
+
static void hvx_calc_rope_neox_f32(const float * restrict src0,
|
|
154
|
+
float * restrict dst,
|
|
155
|
+
const int num_elems,
|
|
156
|
+
const float * restrict theta_cache) {
|
|
157
|
+
// for (int i = 0; i < num_elems; i += 2) {
|
|
158
|
+
//const float cos_theta = theta_cache[i + 0];
|
|
159
|
+
//const float sin_theta = theta_cache[i + 1];
|
|
160
|
+
|
|
161
|
+
//const float x0 = src[0];
|
|
162
|
+
//const float x1 = src[num_elems/2];
|
|
163
|
+
|
|
164
|
+
//dst[0] = x0*cos_theta - x1*sin_theta;
|
|
165
|
+
//dst[num_elems/2] = x0*sin_theta + x1*cos_theta;
|
|
166
|
+
|
|
167
|
+
//src += 1;
|
|
168
|
+
//dst += 1;
|
|
169
|
+
// }
|
|
170
|
+
|
|
171
|
+
const uint8_t * restrict src0_curr = (const uint8_t *) src0;
|
|
172
|
+
const uint8_t * restrict theta_curr = (const uint8_t *) theta_cache;
|
|
173
|
+
uint8_t * restrict dst_curr = (uint8_t *) dst;
|
|
174
|
+
|
|
175
|
+
int step_of_1 = num_elems >> 6; // 6 because we process two vectors at once
|
|
176
|
+
int half_size = (sizeof(float) * (num_elems / 2));
|
|
177
|
+
|
|
178
|
+
for (int i = 0; i < step_of_1; i++) {
|
|
179
|
+
HVX_Vector v0 = *(HVX_Vector *) src0_curr;
|
|
180
|
+
HVX_Vector v1 = *(HVX_Vector *) (src0_curr + half_size);
|
|
181
|
+
|
|
182
|
+
HVX_Vector v2 = *(HVX_Vector *) theta_curr;
|
|
183
|
+
HVX_Vector v3 = *(HVX_Vector *) (theta_curr + VLEN);
|
|
184
|
+
|
|
185
|
+
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta
|
|
186
|
+
|
|
187
|
+
HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_lo_W(vcos_sin));
|
|
188
|
+
HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_hi_W(vcos_sin));
|
|
189
|
+
HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(v1, Q6_V_lo_W(vcos_sin));
|
|
190
|
+
HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(v1, Q6_V_hi_W(vcos_sin));
|
|
191
|
+
|
|
192
|
+
HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
|
|
193
|
+
HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
|
|
194
|
+
|
|
195
|
+
*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4);
|
|
196
|
+
*(HVX_Vector *) (dst_curr + half_size) = Q6_Vsf_equals_Vqf32(v5);
|
|
197
|
+
|
|
198
|
+
src0_curr += VLEN;
|
|
199
|
+
theta_curr += 2 * VLEN;
|
|
200
|
+
dst_curr += VLEN;
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
|
|
149
204
|
static void hvx_calc_rope_f32(const float * restrict src0,
|
|
150
205
|
float * restrict dst,
|
|
151
206
|
const int num_elems,
|
|
@@ -212,6 +267,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
|
|
|
212
267
|
const struct htp_tensor * src2 = &octx->src2;
|
|
213
268
|
struct htp_tensor * dst = &octx->dst;
|
|
214
269
|
|
|
270
|
+
const int32_t mode = rope_ctx->mode;
|
|
271
|
+
const bool is_neox = mode & HTP_ROPE_TYPE_NEOX;
|
|
272
|
+
|
|
215
273
|
htp_rope_preamble;
|
|
216
274
|
|
|
217
275
|
const int32_t * pos = (const int32_t *) src1->data;
|
|
@@ -247,20 +305,35 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
|
|
|
247
305
|
float * dst_data_loc = dst_data;
|
|
248
306
|
|
|
249
307
|
if (1 == opt_path) {
|
|
250
|
-
|
|
308
|
+
if (is_neox) {
|
|
309
|
+
hvx_calc_rope_neox_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
|
|
310
|
+
} else {
|
|
311
|
+
hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
|
|
312
|
+
}
|
|
251
313
|
} else {
|
|
252
314
|
for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) {
|
|
253
315
|
const float cos_theta = wp0[i0 + 0];
|
|
254
316
|
const float sin_theta = wp0[i0 + 1];
|
|
255
317
|
|
|
256
|
-
|
|
257
|
-
|
|
318
|
+
if (is_neox) {
|
|
319
|
+
const float x0 = src_loc[0];
|
|
320
|
+
const float x1 = src_loc[rope_ctx->n_dims/2];
|
|
321
|
+
|
|
322
|
+
dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
|
|
323
|
+
dst_data_loc[rope_ctx->n_dims/2] = x0 * sin_theta + x1 * cos_theta;
|
|
324
|
+
|
|
325
|
+
src_loc += 1;
|
|
326
|
+
dst_data_loc += 1;
|
|
327
|
+
} else {
|
|
328
|
+
const float x0 = src_loc[0];
|
|
329
|
+
const float x1 = src_loc[1];
|
|
258
330
|
|
|
259
|
-
|
|
260
|
-
|
|
331
|
+
dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
|
|
332
|
+
dst_data_loc[1] = x0 * sin_theta + x1 * cos_theta;
|
|
261
333
|
|
|
262
|
-
|
|
263
|
-
|
|
334
|
+
src_loc += 2;
|
|
335
|
+
dst_data_loc += 2;
|
|
336
|
+
}
|
|
264
337
|
}
|
|
265
338
|
}
|
|
266
339
|
|
|
@@ -427,6 +427,7 @@ class MODEL_ARCH(IntEnum):
|
|
|
427
427
|
APERTUS = auto()
|
|
428
428
|
COGVLM = auto()
|
|
429
429
|
MINIMAXM2 = auto()
|
|
430
|
+
RND1 = auto()
|
|
430
431
|
PANGU_EMBED = auto()
|
|
431
432
|
|
|
432
433
|
|
|
@@ -797,6 +798,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|
|
797
798
|
MODEL_ARCH.APERTUS: "apertus",
|
|
798
799
|
MODEL_ARCH.MINIMAXM2: "minimax-m2",
|
|
799
800
|
MODEL_ARCH.COGVLM: "cogvlm",
|
|
801
|
+
MODEL_ARCH.RND1: "rnd1",
|
|
800
802
|
MODEL_ARCH.PANGU_EMBED: "pangu-embedded",
|
|
801
803
|
}
|
|
802
804
|
|
|
@@ -2991,6 +2993,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|
|
2991
2993
|
MODEL_TENSOR.VISEXP_UP,
|
|
2992
2994
|
MODEL_TENSOR.VISEXP_DOWN,
|
|
2993
2995
|
],
|
|
2996
|
+
MODEL_ARCH.RND1: [
|
|
2997
|
+
MODEL_TENSOR.TOKEN_EMBD,
|
|
2998
|
+
MODEL_TENSOR.OUTPUT_NORM,
|
|
2999
|
+
MODEL_TENSOR.OUTPUT,
|
|
3000
|
+
MODEL_TENSOR.ATTN_NORM,
|
|
3001
|
+
MODEL_TENSOR.ATTN_Q,
|
|
3002
|
+
MODEL_TENSOR.ATTN_Q_NORM,
|
|
3003
|
+
MODEL_TENSOR.ATTN_K,
|
|
3004
|
+
MODEL_TENSOR.ATTN_K_NORM,
|
|
3005
|
+
MODEL_TENSOR.ATTN_V,
|
|
3006
|
+
MODEL_TENSOR.ATTN_OUT,
|
|
3007
|
+
MODEL_TENSOR.FFN_NORM,
|
|
3008
|
+
MODEL_TENSOR.FFN_GATE_INP,
|
|
3009
|
+
MODEL_TENSOR.FFN_GATE_EXP,
|
|
3010
|
+
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
3011
|
+
MODEL_TENSOR.FFN_UP_EXP,
|
|
3012
|
+
],
|
|
2994
3013
|
MODEL_ARCH.PANGU_EMBED: [
|
|
2995
3014
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
2996
3015
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
@@ -108,6 +108,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|
|
108
108
|
{ LLM_ARCH_APERTUS, "apertus" },
|
|
109
109
|
{ LLM_ARCH_MINIMAX_M2, "minimax-m2" },
|
|
110
110
|
{ LLM_ARCH_COGVLM, "cogvlm" },
|
|
111
|
+
{ LLM_ARCH_RND1, "rnd1" },
|
|
111
112
|
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
|
|
112
113
|
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
|
113
114
|
};
|
|
@@ -2446,6 +2447,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|
|
2446
2447
|
{ LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" },
|
|
2447
2448
|
},
|
|
2448
2449
|
},
|
|
2450
|
+
{
|
|
2451
|
+
LLM_ARCH_RND1,
|
|
2452
|
+
{
|
|
2453
|
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
2454
|
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|
2455
|
+
{ LLM_TENSOR_OUTPUT, "output" },
|
|
2456
|
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
|
2457
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
|
2458
|
+
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
|
2459
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|
2460
|
+
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
|
2461
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
|
2462
|
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|
2463
|
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
|
2464
|
+
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
|
2465
|
+
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
|
2466
|
+
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
|
2467
|
+
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
|
2468
|
+
},
|
|
2469
|
+
},
|
|
2449
2470
|
{
|
|
2450
2471
|
LLM_ARCH_UNKNOWN,
|
|
2451
2472
|
{
|
|
@@ -2722,6 +2743,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
|
|
|
2722
2743
|
case LLM_ARCH_DREAM:
|
|
2723
2744
|
case LLM_ARCH_LLADA:
|
|
2724
2745
|
case LLM_ARCH_LLADA_MOE:
|
|
2746
|
+
case LLM_ARCH_RND1:
|
|
2725
2747
|
return true;
|
|
2726
2748
|
default:
|
|
2727
2749
|
return false;
|
|
@@ -1036,6 +1036,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1036
1036
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1037
1037
|
}
|
|
1038
1038
|
} break;
|
|
1039
|
+
case LLM_ARCH_RND1:
|
|
1040
|
+
{
|
|
1041
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
|
1042
|
+
|
|
1043
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1044
|
+
switch (hparams.n_layer) {
|
|
1045
|
+
case 48: type = LLM_TYPE_30B_A3B; break;
|
|
1046
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1047
|
+
}
|
|
1048
|
+
// Set non-causal attention for diffusion models
|
|
1049
|
+
hparams.causal_attn = false;
|
|
1050
|
+
} break;
|
|
1039
1051
|
case LLM_ARCH_QWEN2MOE:
|
|
1040
1052
|
{
|
|
1041
1053
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
|
@@ -3402,6 +3414,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3402
3414
|
} break;
|
|
3403
3415
|
case LLM_ARCH_QWEN3MOE:
|
|
3404
3416
|
case LLM_ARCH_QWEN3VLMOE:
|
|
3417
|
+
case LLM_ARCH_RND1:
|
|
3405
3418
|
{
|
|
3406
3419
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
3407
3420
|
|
|
@@ -6720,7 +6733,7 @@ void llama_model::print_info() const {
|
|
|
6720
6733
|
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
|
6721
6734
|
}
|
|
6722
6735
|
|
|
6723
|
-
if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE) {
|
|
6736
|
+
if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
|
|
6724
6737
|
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
|
6725
6738
|
}
|
|
6726
6739
|
|
|
@@ -6882,6 +6895,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|
|
6882
6895
|
case LLM_ARCH_DREAM:
|
|
6883
6896
|
case LLM_ARCH_LLADA:
|
|
6884
6897
|
case LLM_ARCH_LLADA_MOE:
|
|
6898
|
+
case LLM_ARCH_RND1:
|
|
6885
6899
|
{
|
|
6886
6900
|
res = nullptr;
|
|
6887
6901
|
} break;
|
|
@@ -7075,6 +7089,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|
|
7075
7089
|
llm = std::make_unique<llm_build_llada_moe>(*this, params);
|
|
7076
7090
|
}
|
|
7077
7091
|
break;
|
|
7092
|
+
case LLM_ARCH_RND1:
|
|
7093
|
+
{
|
|
7094
|
+
llm = std::make_unique<llm_build_rnd1>(*this, params);
|
|
7095
|
+
}
|
|
7096
|
+
break;
|
|
7078
7097
|
case LLM_ARCH_QWEN2VL:
|
|
7079
7098
|
{
|
|
7080
7099
|
llm = std::make_unique<llm_build_qwen2vl>(*this, params);
|
|
@@ -7595,6 +7614,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
7595
7614
|
case LLM_ARCH_QWEN3:
|
|
7596
7615
|
case LLM_ARCH_QWEN3MOE:
|
|
7597
7616
|
case LLM_ARCH_LLADA_MOE:
|
|
7617
|
+
case LLM_ARCH_RND1:
|
|
7598
7618
|
case LLM_ARCH_OLMO2:
|
|
7599
7619
|
case LLM_ARCH_OLMOE:
|
|
7600
7620
|
case LLM_ARCH_PHI2:
|
|
@@ -431,6 +431,10 @@ struct llm_build_refact : public llm_graph_context {
|
|
|
431
431
|
llm_build_refact(const llama_model & model, const llm_graph_params & params);
|
|
432
432
|
};
|
|
433
433
|
|
|
434
|
+
struct llm_build_rnd1 : public llm_graph_context {
|
|
435
|
+
llm_build_rnd1(const llama_model & model, const llm_graph_params & params);
|
|
436
|
+
};
|
|
437
|
+
|
|
434
438
|
struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
|
435
439
|
llm_build_rwkv6(const llama_model & model, const llm_graph_params & params);
|
|
436
440
|
};
|