@novastera-oss/llamarn 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/android/CMakeLists.txt +47 -21
  2. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  3. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  4. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  5. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  6. package/cpp/PureCppImpl.cpp +80 -6
  7. package/cpp/build-info.cpp +2 -2
  8. package/cpp/llama.cpp/convert_hf_to_gguf.py +15 -0
  9. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +1 -2
  10. package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +10 -0
  11. package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +99 -364
  12. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +7 -0
  13. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +14 -13
  14. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +15 -3
  15. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +36 -25
  16. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +12 -3
  17. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +80 -7
  18. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +6 -0
  19. package/cpp/llama.cpp/gguf-py/gguf/constants.py +19 -0
  20. package/cpp/llama.cpp/src/CMakeLists.txt +1 -0
  21. package/cpp/llama.cpp/src/llama-arch.cpp +22 -0
  22. package/cpp/llama.cpp/src/llama-arch.h +1 -0
  23. package/cpp/llama.cpp/src/llama-model.cpp +21 -1
  24. package/cpp/llama.cpp/src/models/models.h +4 -0
  25. package/cpp/llama.cpp/src/models/rnd1.cpp +126 -0
  26. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  27. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6403 -6395
  28. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  29. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  30. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6366 -6358
  31. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4815 -4809
  32. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  33. package/package.json +1 -1
@@ -66,6 +66,13 @@ static inline bool dma_queue_push(dma_queue * q,
66
66
  desc->desctype = HEXAGON_UDMA_DESC_DESCTYPE_TYPE1;
67
67
  desc->dstbypass = 1;
68
68
  desc->srcbypass = 1;
69
+ #if __HVX_ARCH__ >= 73
70
+ desc->dstbypass = 1;
71
+ desc->srcbypass = 1;
72
+ #else
73
+ desc->dstbypass = 0;
74
+ desc->srcbypass = 1;
75
+ #endif
69
76
  desc->order = 0;
70
77
  desc->dstate = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE;
71
78
  desc->src = (void *) src;
@@ -16,13 +16,8 @@
16
16
  #include "hvx-utils.h"
17
17
  #include "ops-utils.h"
18
18
 
19
- static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec) {
20
- static const float kInf = INFINITY;
21
- static const float kMaxExp = 88.02f; // log(INF)
22
-
23
- const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
24
- const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
25
- const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
19
+ static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec, HVX_Vector max_exp, HVX_Vector inf) {
20
+ const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
26
21
 
27
22
  HVX_Vector out = hvx_vec_exp_fp32(in_vec);
28
23
 
@@ -47,6 +42,12 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
47
42
 
48
43
  HVX_Vector vec_out = Q6_V_vzero();
49
44
 
45
+ static const float kInf = INFINITY;
46
+ static const float kMaxExp = 88.02f; // log(INF)
47
+
48
+ const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
49
+ const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
50
+
50
51
  if (0 == unaligned_loop) {
51
52
  HVX_Vector * p_vec_in1 = (HVX_Vector *) src;
52
53
  HVX_Vector * p_vec_out = (HVX_Vector *) dst;
@@ -55,9 +56,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
55
56
  for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
56
57
  if (true == negate) {
57
58
  HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++);
58
- *p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in);
59
+ *p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
59
60
  } else {
60
- *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++);
61
+ *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++, max_exp, inf);
61
62
  }
62
63
  }
63
64
  } else {
@@ -67,9 +68,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
67
68
 
68
69
  if (true == negate) {
69
70
  HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
70
- *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in);
71
+ *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
71
72
  } else {
72
- *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in);
73
+ *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in, max_exp, inf);
73
74
  }
74
75
  }
75
76
  }
@@ -83,9 +84,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
83
84
  if (true == negate) {
84
85
  HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
85
86
 
86
- vec_out = hvx_vec_exp_fp32_guard(neg_vec_in);
87
+ vec_out = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
87
88
  } else {
88
- vec_out = hvx_vec_exp_fp32_guard(in);
89
+ vec_out = hvx_vec_exp_fp32_guard(in, max_exp, inf);
89
90
  }
90
91
 
91
92
  hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out);
@@ -16,6 +16,15 @@
16
16
  #include "hvx-utils.h"
17
17
  #include "ops-utils.h"
18
18
 
19
+ static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) {
20
+ HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
21
+
22
+ HVX_Vector masked_out = Q6_V_vand_VV(out, nan_inf_mask);
23
+ const HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out);
24
+
25
+ return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out);
26
+ }
27
+
19
28
  void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
20
29
  int left_over = num_elems & (VLEN_FP32 - 1);
21
30
  int num_elems_whole = num_elems - left_over;
@@ -32,19 +41,22 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
32
41
  FARF(HIGH, "hvx_inverse_f32: unaligned loop in hvx op, possibly slower execution\n");
33
42
  }
34
43
 
44
+ static const uint32_t kNanInfMask = 0x7f800000;
45
+ const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(kNanInfMask);
46
+
35
47
  if (0 == unaligned_loop) {
36
48
  HVX_Vector * p_vec_in = (HVX_Vector *) src;
37
49
  HVX_Vector * p_vec_out = (HVX_Vector *) dst;
38
50
 
39
51
  #pragma unroll(4)
40
52
  for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
41
- *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++);
53
+ *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++, nan_inf_mask);
42
54
  }
43
55
  } else {
44
56
  #pragma unroll(4)
45
57
  for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
46
58
  HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
47
- *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in);
59
+ *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);
48
60
  }
49
61
  }
50
62
 
@@ -53,7 +65,7 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
53
65
  float * dstf = (float *) dst + num_elems_whole;
54
66
 
55
67
  HVX_Vector in = *(HVX_UVector *) srcf;
56
- HVX_Vector out = hvx_vec_inverse_fp32_guard(in);
68
+ HVX_Vector out = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);
57
69
 
58
70
  hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
59
71
  }
@@ -21,6 +21,26 @@ typedef union {
21
21
  float fp32[VLEN_FP32];
22
22
  } __attribute__((aligned(VLEN), packed)) HVX_VectorAlias;
23
23
 
24
+ /* Q6_Vsf_equals_Vw is only available on v73+.*/
25
+ #if __HVX_ARCH__ < 73
26
+ static inline HVX_Vector int32_to_qfloat(HVX_Vector const in)
27
+ {
28
+ HVX_Vector const vzero = Q6_V_vzero();
29
+ HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero);
30
+ HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in);
31
+ HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift);
32
+ HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift);
33
+ HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized);
34
+ HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp));
35
+ return ret;
36
+ }
37
+
38
+ static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in)
39
+ {
40
+ return Q6_Vsf_equals_Vqf32(int32_to_qfloat(in));
41
+ }
42
+ #endif
43
+
24
44
  static inline HVX_Vector hvx_vec_splat_fp32(float i) {
25
45
  union {
26
46
  float f;
@@ -726,24 +746,6 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
726
746
  return Q6_Vsf_equals_Vqf32(r_qf);
727
747
  }
728
748
 
729
- static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) {
730
- static const float kInf = INFINITY;
731
- static const uint32_t kNanMask = 0x7fffffff;
732
- static const uint32_t kNanMin = 0x7f800000;
733
-
734
- const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
735
- const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf);
736
-
737
- HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
738
-
739
- const HVX_Vector nan_mask = Q6_V_vsplat_R(kNanMask);
740
- const HVX_Vector nan_min = Q6_V_vsplat_R(kNanMin);
741
- HVX_Vector masked_out = Q6_V_vand_VV(out, nan_mask);
742
- const HVX_VectorPred pred = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, masked_out);
743
-
744
- return Q6_V_vmux_QVV(pred, out, Q6_V_vzero());
745
- }
746
-
747
749
  #define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022
748
750
  #define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777
749
751
  #define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267
@@ -958,14 +960,16 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) {
958
960
  return Q6_Vsf_equals_Vqf32(temp);
959
961
  }
960
962
 
961
- static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) {
962
- static const float kMaxExp = -88.02f; // log(INF)
963
-
964
- const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp));
965
- const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(v, max_exp);
963
+ static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v,
964
+ HVX_Vector one,
965
+ HVX_Vector max_exp,
966
+ HVX_Vector min_exp) {
967
+ const HVX_VectorPred pred_max = Q6_Q_vcmp_gt_VsfVsf(max_exp, v);
968
+ const HVX_VectorPred pred_min = Q6_Q_vcmp_gt_VsfVsf(v, min_exp);
966
969
 
967
970
  HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v);
968
- return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero());
971
+ out = Q6_V_vmux_QVV(pred_max, out, one);
972
+ return Q6_V_vmux_QVV(pred_min, out, Q6_V_vzero());
969
973
  }
970
974
 
971
975
  static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
@@ -977,9 +981,16 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
977
981
  const HVX_Vector * restrict v_src = (HVX_Vector *) src;
978
982
  HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
979
983
 
984
+ static const float kMinExp = -87.f; // 0
985
+ static const float kMaxExp = 87.f; // 1
986
+
987
+ const HVX_Vector one = hvx_vec_splat_fp32(1.f);
988
+ const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
989
+ const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
990
+
980
991
  #pragma unroll(4)
981
992
  for (int i = 0; i < step_of_1; i++) {
982
- v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i]);
993
+ v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp);
983
994
  }
984
995
  }
985
996
 
@@ -143,16 +143,25 @@ AEEResult htp_iface_disable_etm(remote_handle64 handle) {
143
143
  }
144
144
 
145
145
  static int vtcm_acquire(struct htp_context * ctx) {
146
+ int err;
146
147
  if (!ctx->vtcm_valid) {
147
148
  // Temporarily bump thread priority to make sure it's higher than other sessions.
148
149
  // This way the resource manager will notify the other thread to release VTCM.
149
150
  // Note that we need to reacquire VTCM at normal priority for this to work next time.
150
151
  qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10);
151
- HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
152
+ err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
153
+ if (err != 0) {
154
+ FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
155
+ abort();
156
+ }
152
157
  HAP_compute_res_release_cached(ctx->vtcm_rctx);
153
158
  qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio);
154
159
 
155
- HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
160
+ err = HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
161
+ if (err != 0) {
162
+ FARF(ERROR, "Failed to acquire VTCM: 0x%08x", (unsigned)err);
163
+ abort();
164
+ }
156
165
  ctx->vtcm_valid = true;
157
166
  }
158
167
 
@@ -201,7 +210,7 @@ static int vtcm_alloc(struct htp_context * ctx) {
201
210
  HAP_compute_res_attr_init(&attr);
202
211
  HAP_compute_res_attr_set_serialize(&attr, 0);
203
212
  HAP_compute_res_attr_set_cache_mode(&attr, 1);
204
- HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, vtcm_size, vtcm_size);
213
+ HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, 0, vtcm_size);
205
214
  HAP_compute_res_attr_set_release_callback(&attr, vtcm_release_callback, (void *) ctx);
206
215
  HAP_compute_res_attr_set_hmx_param(&attr, 1);
207
216
 
@@ -24,6 +24,10 @@
24
24
  #include "hvx-utils.h"
25
25
  #include "ops-utils.h"
26
26
 
27
+ // Redefine the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX here, as we can't include ggml.h
28
+ #define HTP_ROPE_TYPE_NORMAL 0
29
+ #define HTP_ROPE_TYPE_NEOX 2
30
+
27
31
  #define htp_rope_preamble \
28
32
  const uint32_t ne00 = src0->ne[0]; \
29
33
  const uint32_t ne01 = src0->ne[1]; \
@@ -146,6 +150,57 @@ static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context
146
150
  rope_ctx->ext_factor, rope_ctx->theta_scale, rope_ctx->attn_factor);
147
151
  }
148
152
 
153
+ static void hvx_calc_rope_neox_f32(const float * restrict src0,
154
+ float * restrict dst,
155
+ const int num_elems,
156
+ const float * restrict theta_cache) {
157
+ // for (int i = 0; i < num_elems; i += 2) {
158
+ //const float cos_theta = theta_cache[i + 0];
159
+ //const float sin_theta = theta_cache[i + 1];
160
+
161
+ //const float x0 = src[0];
162
+ //const float x1 = src[num_elems/2];
163
+
164
+ //dst[0] = x0*cos_theta - x1*sin_theta;
165
+ //dst[num_elems/2] = x0*sin_theta + x1*cos_theta;
166
+
167
+ //src += 1;
168
+ //dst += 1;
169
+ // }
170
+
171
+ const uint8_t * restrict src0_curr = (const uint8_t *) src0;
172
+ const uint8_t * restrict theta_curr = (const uint8_t *) theta_cache;
173
+ uint8_t * restrict dst_curr = (uint8_t *) dst;
174
+
175
+ int step_of_1 = num_elems >> 6; // 6 because we process two vectors at once
176
+ int half_size = (sizeof(float) * (num_elems / 2));
177
+
178
+ for (int i = 0; i < step_of_1; i++) {
179
+ HVX_Vector v0 = *(HVX_Vector *) src0_curr;
180
+ HVX_Vector v1 = *(HVX_Vector *) (src0_curr + half_size);
181
+
182
+ HVX_Vector v2 = *(HVX_Vector *) theta_curr;
183
+ HVX_Vector v3 = *(HVX_Vector *) (theta_curr + VLEN);
184
+
185
+ HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta
186
+
187
+ HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_lo_W(vcos_sin));
188
+ HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_hi_W(vcos_sin));
189
+ HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(v1, Q6_V_lo_W(vcos_sin));
190
+ HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(v1, Q6_V_hi_W(vcos_sin));
191
+
192
+ HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
193
+ HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
194
+
195
+ *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4);
196
+ *(HVX_Vector *) (dst_curr + half_size) = Q6_Vsf_equals_Vqf32(v5);
197
+
198
+ src0_curr += VLEN;
199
+ theta_curr += 2 * VLEN;
200
+ dst_curr += VLEN;
201
+ }
202
+ }
203
+
149
204
  static void hvx_calc_rope_f32(const float * restrict src0,
150
205
  float * restrict dst,
151
206
  const int num_elems,
@@ -212,6 +267,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
212
267
  const struct htp_tensor * src2 = &octx->src2;
213
268
  struct htp_tensor * dst = &octx->dst;
214
269
 
270
+ const int32_t mode = rope_ctx->mode;
271
+ const bool is_neox = mode & HTP_ROPE_TYPE_NEOX;
272
+
215
273
  htp_rope_preamble;
216
274
 
217
275
  const int32_t * pos = (const int32_t *) src1->data;
@@ -247,20 +305,35 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
247
305
  float * dst_data_loc = dst_data;
248
306
 
249
307
  if (1 == opt_path) {
250
- hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
308
+ if (is_neox) {
309
+ hvx_calc_rope_neox_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
310
+ } else {
311
+ hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
312
+ }
251
313
  } else {
252
314
  for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) {
253
315
  const float cos_theta = wp0[i0 + 0];
254
316
  const float sin_theta = wp0[i0 + 1];
255
317
 
256
- const float x0 = src_loc[0];
257
- const float x1 = src_loc[1];
318
+ if (is_neox) {
319
+ const float x0 = src_loc[0];
320
+ const float x1 = src_loc[rope_ctx->n_dims/2];
321
+
322
+ dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
323
+ dst_data_loc[rope_ctx->n_dims/2] = x0 * sin_theta + x1 * cos_theta;
324
+
325
+ src_loc += 1;
326
+ dst_data_loc += 1;
327
+ } else {
328
+ const float x0 = src_loc[0];
329
+ const float x1 = src_loc[1];
258
330
 
259
- dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
260
- dst_data_loc[1] = x0 * sin_theta + x1 * cos_theta;
331
+ dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
332
+ dst_data_loc[1] = x0 * sin_theta + x1 * cos_theta;
261
333
 
262
- src_loc += 2;
263
- dst_data_loc += 2;
334
+ src_loc += 2;
335
+ dst_data_loc += 2;
336
+ }
264
337
  }
265
338
  }
266
339
 
@@ -390,6 +390,12 @@ int get_hex_arch_ver(int domain, int * arch) {
390
390
  }
391
391
 
392
392
  switch (arch_ver.capability & 0xff) {
393
+ case 0x68:
394
+ *arch = 68;
395
+ return 0;
396
+ case 0x69:
397
+ *arch = 69;
398
+ return 0;
393
399
  case 0x73:
394
400
  *arch = 73;
395
401
  return 0;
@@ -427,6 +427,7 @@ class MODEL_ARCH(IntEnum):
427
427
  APERTUS = auto()
428
428
  COGVLM = auto()
429
429
  MINIMAXM2 = auto()
430
+ RND1 = auto()
430
431
  PANGU_EMBED = auto()
431
432
 
432
433
 
@@ -797,6 +798,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
797
798
  MODEL_ARCH.APERTUS: "apertus",
798
799
  MODEL_ARCH.MINIMAXM2: "minimax-m2",
799
800
  MODEL_ARCH.COGVLM: "cogvlm",
801
+ MODEL_ARCH.RND1: "rnd1",
800
802
  MODEL_ARCH.PANGU_EMBED: "pangu-embedded",
801
803
  }
802
804
 
@@ -2991,6 +2993,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
2991
2993
  MODEL_TENSOR.VISEXP_UP,
2992
2994
  MODEL_TENSOR.VISEXP_DOWN,
2993
2995
  ],
2996
+ MODEL_ARCH.RND1: [
2997
+ MODEL_TENSOR.TOKEN_EMBD,
2998
+ MODEL_TENSOR.OUTPUT_NORM,
2999
+ MODEL_TENSOR.OUTPUT,
3000
+ MODEL_TENSOR.ATTN_NORM,
3001
+ MODEL_TENSOR.ATTN_Q,
3002
+ MODEL_TENSOR.ATTN_Q_NORM,
3003
+ MODEL_TENSOR.ATTN_K,
3004
+ MODEL_TENSOR.ATTN_K_NORM,
3005
+ MODEL_TENSOR.ATTN_V,
3006
+ MODEL_TENSOR.ATTN_OUT,
3007
+ MODEL_TENSOR.FFN_NORM,
3008
+ MODEL_TENSOR.FFN_GATE_INP,
3009
+ MODEL_TENSOR.FFN_GATE_EXP,
3010
+ MODEL_TENSOR.FFN_DOWN_EXP,
3011
+ MODEL_TENSOR.FFN_UP_EXP,
3012
+ ],
2994
3013
  MODEL_ARCH.PANGU_EMBED: [
2995
3014
  MODEL_TENSOR.TOKEN_EMBD,
2996
3015
  MODEL_TENSOR.OUTPUT_NORM,
@@ -115,6 +115,7 @@ add_library(llama
115
115
  models/qwen3vl-moe.cpp
116
116
  models/qwen3moe.cpp
117
117
  models/refact.cpp
118
+ models/rnd1.cpp
118
119
  models/rwkv6-base.cpp
119
120
  models/rwkv6.cpp
120
121
  models/rwkv6qwen2.cpp
@@ -108,6 +108,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
108
108
  { LLM_ARCH_APERTUS, "apertus" },
109
109
  { LLM_ARCH_MINIMAX_M2, "minimax-m2" },
110
110
  { LLM_ARCH_COGVLM, "cogvlm" },
111
+ { LLM_ARCH_RND1, "rnd1" },
111
112
  { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
112
113
  { LLM_ARCH_UNKNOWN, "(unknown)" },
113
114
  };
@@ -2446,6 +2447,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
2446
2447
  { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" },
2447
2448
  },
2448
2449
  },
2450
+ {
2451
+ LLM_ARCH_RND1,
2452
+ {
2453
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
2454
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
2455
+ { LLM_TENSOR_OUTPUT, "output" },
2456
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
2457
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
2458
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
2459
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
2460
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
2461
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
2462
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
2463
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
2464
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
2465
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
2466
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
2467
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
2468
+ },
2469
+ },
2449
2470
  {
2450
2471
  LLM_ARCH_UNKNOWN,
2451
2472
  {
@@ -2722,6 +2743,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
2722
2743
  case LLM_ARCH_DREAM:
2723
2744
  case LLM_ARCH_LLADA:
2724
2745
  case LLM_ARCH_LLADA_MOE:
2746
+ case LLM_ARCH_RND1:
2725
2747
  return true;
2726
2748
  default:
2727
2749
  return false;
@@ -112,6 +112,7 @@ enum llm_arch {
112
112
  LLM_ARCH_APERTUS,
113
113
  LLM_ARCH_MINIMAX_M2,
114
114
  LLM_ARCH_COGVLM,
115
+ LLM_ARCH_RND1,
115
116
  LLM_ARCH_PANGU_EMBED,
116
117
  LLM_ARCH_UNKNOWN,
117
118
  };
@@ -1036,6 +1036,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
1036
1036
  default: type = LLM_TYPE_UNKNOWN;
1037
1037
  }
1038
1038
  } break;
1039
+ case LLM_ARCH_RND1:
1040
+ {
1041
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
1042
+
1043
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
1044
+ switch (hparams.n_layer) {
1045
+ case 48: type = LLM_TYPE_30B_A3B; break;
1046
+ default: type = LLM_TYPE_UNKNOWN;
1047
+ }
1048
+ // Set non-causal attention for diffusion models
1049
+ hparams.causal_attn = false;
1050
+ } break;
1039
1051
  case LLM_ARCH_QWEN2MOE:
1040
1052
  {
1041
1053
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
@@ -3402,6 +3414,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
3402
3414
  } break;
3403
3415
  case LLM_ARCH_QWEN3MOE:
3404
3416
  case LLM_ARCH_QWEN3VLMOE:
3417
+ case LLM_ARCH_RND1:
3405
3418
  {
3406
3419
  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
3407
3420
 
@@ -6720,7 +6733,7 @@ void llama_model::print_info() const {
6720
6733
  LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
6721
6734
  }
6722
6735
 
6723
- if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE) {
6736
+ if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
6724
6737
  LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
6725
6738
  }
6726
6739
 
@@ -6882,6 +6895,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
6882
6895
  case LLM_ARCH_DREAM:
6883
6896
  case LLM_ARCH_LLADA:
6884
6897
  case LLM_ARCH_LLADA_MOE:
6898
+ case LLM_ARCH_RND1:
6885
6899
  {
6886
6900
  res = nullptr;
6887
6901
  } break;
@@ -7075,6 +7089,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
7075
7089
  llm = std::make_unique<llm_build_llada_moe>(*this, params);
7076
7090
  }
7077
7091
  break;
7092
+ case LLM_ARCH_RND1:
7093
+ {
7094
+ llm = std::make_unique<llm_build_rnd1>(*this, params);
7095
+ }
7096
+ break;
7078
7097
  case LLM_ARCH_QWEN2VL:
7079
7098
  {
7080
7099
  llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -7595,6 +7614,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
7595
7614
  case LLM_ARCH_QWEN3:
7596
7615
  case LLM_ARCH_QWEN3MOE:
7597
7616
  case LLM_ARCH_LLADA_MOE:
7617
+ case LLM_ARCH_RND1:
7598
7618
  case LLM_ARCH_OLMO2:
7599
7619
  case LLM_ARCH_OLMOE:
7600
7620
  case LLM_ARCH_PHI2:
@@ -431,6 +431,10 @@ struct llm_build_refact : public llm_graph_context {
431
431
  llm_build_refact(const llama_model & model, const llm_graph_params & params);
432
432
  };
433
433
 
434
+ struct llm_build_rnd1 : public llm_graph_context {
435
+ llm_build_rnd1(const llama_model & model, const llm_graph_params & params);
436
+ };
437
+
434
438
  struct llm_build_rwkv6 : public llm_build_rwkv6_base {
435
439
  llm_build_rwkv6(const llama_model & model, const llm_graph_params & params);
436
440
  };