numkong 7.4.4 → 7.5.0

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (61)
  1. package/README.md +1 -0
  2. package/binding.gyp +81 -5
  3. package/c/dispatch_f16.c +23 -0
  4. package/c/numkong.c +0 -13
  5. package/include/numkong/attention/sme.h +34 -31
  6. package/include/numkong/capabilities.h +2 -15
  7. package/include/numkong/cast/neon.h +15 -0
  8. package/include/numkong/curved/smef64.h +82 -62
  9. package/include/numkong/dot/rvvbf16.h +1 -1
  10. package/include/numkong/dot/rvvhalf.h +1 -1
  11. package/include/numkong/dot/sve.h +6 -5
  12. package/include/numkong/dot/svebfdot.h +2 -1
  13. package/include/numkong/dot/svehalf.h +6 -5
  14. package/include/numkong/dot/svesdot.h +3 -2
  15. package/include/numkong/dots/graniteamx.h +733 -0
  16. package/include/numkong/dots/serial.h +11 -4
  17. package/include/numkong/dots/sme.h +172 -140
  18. package/include/numkong/dots/smebi32.h +14 -11
  19. package/include/numkong/dots/smef64.h +31 -26
  20. package/include/numkong/dots.h +29 -3
  21. package/include/numkong/each/serial.h +22 -0
  22. package/include/numkong/geospatial/haswell.h +1 -1
  23. package/include/numkong/geospatial/neon.h +1 -1
  24. package/include/numkong/geospatial/serial.h +1 -1
  25. package/include/numkong/geospatial/skylake.h +1 -1
  26. package/include/numkong/maxsim/sme.h +94 -55
  27. package/include/numkong/mesh/README.md +13 -27
  28. package/include/numkong/mesh/haswell.h +25 -122
  29. package/include/numkong/mesh/neon.h +21 -110
  30. package/include/numkong/mesh/neonbfdot.h +4 -43
  31. package/include/numkong/mesh/rvv.h +7 -82
  32. package/include/numkong/mesh/serial.h +48 -53
  33. package/include/numkong/mesh/skylake.h +7 -123
  34. package/include/numkong/mesh/v128relaxed.h +9 -93
  35. package/include/numkong/mesh.h +2 -2
  36. package/include/numkong/mesh.hpp +35 -96
  37. package/include/numkong/reduce/neon.h +29 -0
  38. package/include/numkong/reduce/neonbfdot.h +2 -2
  39. package/include/numkong/reduce/neonfhm.h +4 -4
  40. package/include/numkong/reduce/sve.h +52 -0
  41. package/include/numkong/reduce.h +4 -0
  42. package/include/numkong/set/sve.h +6 -5
  43. package/include/numkong/sets/smebi32.h +35 -30
  44. package/include/numkong/sparse/sve2.h +3 -2
  45. package/include/numkong/spatial/sve.h +7 -6
  46. package/include/numkong/spatial/svebfdot.h +7 -4
  47. package/include/numkong/spatial/svehalf.h +5 -4
  48. package/include/numkong/spatial/svesdot.h +9 -8
  49. package/include/numkong/spatials/graniteamx.h +173 -0
  50. package/include/numkong/spatials/serial.h +22 -0
  51. package/include/numkong/spatials/sme.h +391 -350
  52. package/include/numkong/spatials/smef64.h +79 -70
  53. package/include/numkong/spatials.h +37 -4
  54. package/include/numkong/types.h +59 -0
  55. package/javascript/dist/cjs/numkong.js +13 -0
  56. package/javascript/dist/esm/numkong.js +13 -0
  57. package/javascript/numkong.c +56 -12
  58. package/javascript/numkong.ts +13 -0
  59. package/package.json +7 -7
  60. package/probes/probe.js +2 -2
  61. package/wasm/numkong.wasm +0 -0
@@ -52,9 +52,10 @@
52
52
  #if NK_TARGET_SMEF64
53
53
 
54
54
  #include "numkong/types.h"
55
+ #include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
55
56
  #include "numkong/spatial/neon.h" // `nk_f64_sqrt_neon`
56
- #include "numkong/dots/sme.h" // nk_sme_zero_za64_tile_0_, etc. (for f32 FMOPA)
57
- #include "numkong/curved/serial.h" // `nk_bilinear_f64_serial`, etc.
57
+ #include "numkong/dots/sme.h" // `nk_sme_zero_za64_tile_0_`
58
+ #include "numkong/curved/serial.h" // `nk_bilinear_f64_serial`
58
59
 
59
60
  #if defined(__cplusplus)
60
61
  extern "C" {
@@ -90,8 +91,8 @@ NK_PUBLIC void nk_dot2_f64_sve_accumulate_(svbool_t predicate_b64x, svfloat64_t
90
91
  * @brief f32 bilinear: GEMV via FMOPA (widening f32→f64, exact accumulation).
91
92
  * ZA0.D = C staging, ZA1.D = GEMV accumulator.
92
93
  */
93
- __arm_locally_streaming __arm_new("za") static void nk_bilinear_f32_smef64_streaming_(
94
- nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, nk_size_t dimensions, nk_f64_t *result) {
94
+ __arm_new("za") static void nk_bilinear_f32_smef64_streaming_( //
95
+ nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, nk_size_t dimensions, nk_f64_t *result) NK_STREAMING_ {
95
96
  svbool_t predicate_body_b64x = svptrue_b64();
96
97
  nk_size_t tile_dimension = svcntd();
97
98
  nk_f64_t outer_sum_f64 = 0.0;
@@ -124,24 +125,25 @@ __arm_locally_streaming __arm_new("za") static void nk_bilinear_f32_smef64_strea
124
125
  svfloat64_t v_f64x = svread_ver_za64_f64_m(svdup_f64(0.0), row_predicate_b64x, 1, 0);
125
126
  svfloat64_t a_f64x = svcvt_f64_f32_x(
126
127
  row_predicate_b64x, svreinterpret_f32_u64(svld1uw_u64(row_predicate_b64x, (nk_u32_t const *)(a + row))));
127
- outer_sum_f64 += svaddv_f64(predicate_body_b64x, svmul_f64_x(row_predicate_b64x, a_f64x, v_f64x));
128
+ outer_sum_f64 += nk_svaddv_f64_(predicate_body_b64x, svmul_f64_x(row_predicate_b64x, a_f64x, v_f64x));
128
129
  }
129
130
 
130
131
  *result = outer_sum_f64;
131
132
  }
132
133
 
133
- NK_PUBLIC void nk_bilinear_f32_smef64(nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, nk_size_t dimensions,
134
- nk_f64_t *result) {
134
+ NK_PUBLIC void nk_bilinear_f32_smef64( //
135
+ nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, nk_size_t dimensions, nk_f64_t *result) {
136
+ nk_sme_start_streaming_();
135
137
  nk_bilinear_f32_smef64_streaming_(a, b, c, dimensions, result);
138
+ nk_sme_stop_streaming_();
136
139
  }
137
140
 
138
141
  /**
139
142
  * @brief f32 Mahalanobis: GEMV v = C×d via FMOPA, where d = a − b (exact in f64).
140
143
  * ZA0.D = C staging, ZA1.D = GEMV accumulator.
141
144
  */
142
- __arm_locally_streaming __arm_new("za") static nk_f64_t
143
- nk_mahalanobis_f32_smef64_streaming_(nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c,
144
- nk_size_t dimensions) {
145
+ __arm_new("za") static nk_f64_t nk_mahalanobis_f32_smef64_streaming_( //
146
+ nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, nk_size_t dimensions) NK_STREAMING_ {
145
147
 
146
148
  svbool_t predicate_body_b64x = svptrue_b64();
147
149
  nk_size_t tile_dimension = svcntd();
@@ -179,15 +181,17 @@ __arm_locally_streaming __arm_new("za") static nk_f64_t
179
181
  svfloat64_t b_f64x = svcvt_f64_f32_x(
180
182
  row_predicate_b64x, svreinterpret_f32_u64(svld1uw_u64(row_predicate_b64x, (nk_u32_t const *)(b + row))));
181
183
  svfloat64_t d_f64x = svsub_f64_x(row_predicate_b64x, a_f64x, b_f64x);
182
- outer_sum_f64 += svaddv_f64(predicate_body_b64x, svmul_f64_x(row_predicate_b64x, d_f64x, v_f64x));
184
+ outer_sum_f64 += nk_svaddv_f64_(predicate_body_b64x, svmul_f64_x(row_predicate_b64x, d_f64x, v_f64x));
183
185
  }
184
186
 
185
187
  return outer_sum_f64;
186
188
  }
187
189
 
188
- NK_PUBLIC void nk_mahalanobis_f32_smef64(nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, nk_size_t dimensions,
189
- nk_f64_t *result) {
190
+ NK_PUBLIC void nk_mahalanobis_f32_smef64( //
191
+ nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, nk_size_t dimensions, nk_f64_t *result) {
192
+ nk_sme_start_streaming_();
190
193
  nk_f64_t quadratic = nk_mahalanobis_f32_smef64_streaming_(a, b, c, dimensions);
194
+ nk_sme_stop_streaming_();
191
195
  *result = nk_f64_sqrt_neon(quadratic > 0 ? quadratic : 0);
192
196
  }
193
197
 
@@ -195,9 +199,8 @@ NK_PUBLIC void nk_mahalanobis_f32_smef64(nk_f32_t const *a, nk_f32_t const *b, n
195
199
  * @brief f64 bilinear: row-by-row streaming SVE with Dot2 compensation.
196
200
  * 4-row fast path shares b_f64x loads; 1-row tail for remainder.
197
201
  */
198
- __arm_locally_streaming static void nk_bilinear_f64_smef64_streaming_(nk_f64_t const *a, nk_f64_t const *b,
199
- nk_f64_t const *c, nk_size_t dimensions,
200
- nk_f64_t *result) {
202
+ static void nk_bilinear_f64_smef64_ssve_( //
203
+ nk_f64_t const *a, nk_f64_t const *b, nk_f64_t const *c, nk_size_t dimensions, nk_f64_t *result) NK_STREAMING_ {
201
204
  svbool_t predicate_all_b64x = svptrue_b64();
202
205
  nk_f64_t outer_sum = 0.0, outer_comp = 0.0;
203
206
  nk_size_t row = 0;
@@ -226,14 +229,18 @@ __arm_locally_streaming static void nk_bilinear_f64_smef64_streaming_(nk_f64_t c
226
229
  predicate_b64x = svwhilelt_b64(j, dimensions);
227
230
  }
228
231
 
229
- nk_f64_dot2_(&outer_sum, &outer_comp, a0,
230
- svaddv_f64(predicate_all_b64x, sum_0_f64x) + svaddv_f64(predicate_all_b64x, compensation_0_f64x));
231
- nk_f64_dot2_(&outer_sum, &outer_comp, a1,
232
- svaddv_f64(predicate_all_b64x, sum_1_f64x) + svaddv_f64(predicate_all_b64x, compensation_1_f64x));
233
- nk_f64_dot2_(&outer_sum, &outer_comp, a2,
234
- svaddv_f64(predicate_all_b64x, sum_2_f64x) + svaddv_f64(predicate_all_b64x, compensation_2_f64x));
235
- nk_f64_dot2_(&outer_sum, &outer_comp, a3,
236
- svaddv_f64(predicate_all_b64x, sum_3_f64x) + svaddv_f64(predicate_all_b64x, compensation_3_f64x));
232
+ nk_f64_dot2_(
233
+ &outer_sum, &outer_comp, a0,
234
+ nk_svaddv_f64_(predicate_all_b64x, sum_0_f64x) + nk_svaddv_f64_(predicate_all_b64x, compensation_0_f64x));
235
+ nk_f64_dot2_(
236
+ &outer_sum, &outer_comp, a1,
237
+ nk_svaddv_f64_(predicate_all_b64x, sum_1_f64x) + nk_svaddv_f64_(predicate_all_b64x, compensation_1_f64x));
238
+ nk_f64_dot2_(
239
+ &outer_sum, &outer_comp, a2,
240
+ nk_svaddv_f64_(predicate_all_b64x, sum_2_f64x) + nk_svaddv_f64_(predicate_all_b64x, compensation_2_f64x));
241
+ nk_f64_dot2_(
242
+ &outer_sum, &outer_comp, a3,
243
+ nk_svaddv_f64_(predicate_all_b64x, sum_3_f64x) + nk_svaddv_f64_(predicate_all_b64x, compensation_3_f64x));
237
244
  }
238
245
 
239
246
  // 1-row tail
@@ -250,24 +257,27 @@ __arm_locally_streaming static void nk_bilinear_f64_smef64_streaming_(nk_f64_t c
250
257
  predicate_b64x = svwhilelt_b64(j, dimensions);
251
258
  }
252
259
 
253
- nk_f64_t cb_j = svaddv_f64(predicate_all_b64x, sum_f64x) + svaddv_f64(predicate_all_b64x, compensation_f64x);
260
+ nk_f64_t cb_j = nk_svaddv_f64_(predicate_all_b64x, sum_f64x) +
261
+ nk_svaddv_f64_(predicate_all_b64x, compensation_f64x);
254
262
  nk_f64_dot2_(&outer_sum, &outer_comp, a[row], cb_j);
255
263
  }
256
264
 
257
265
  *result = outer_sum + outer_comp;
258
266
  }
259
267
 
260
- NK_PUBLIC void nk_bilinear_f64_smef64(nk_f64_t const *a, nk_f64_t const *b, nk_f64_t const *c, nk_size_t dimensions,
261
- nk_f64_t *result) {
262
- nk_bilinear_f64_smef64_streaming_(a, b, c, dimensions, result);
268
+ NK_PUBLIC void nk_bilinear_f64_smef64( //
269
+ nk_f64_t const *a, nk_f64_t const *b, nk_f64_t const *c, nk_size_t dimensions, nk_f64_t *result) {
270
+ nk_sme_start_streaming_();
271
+ nk_bilinear_f64_smef64_ssve_(a, b, c, dimensions, result);
272
+ nk_sme_stop_streaming_();
263
273
  }
264
274
 
265
275
  /**
266
276
  * @brief f64 Mahalanobis: row-by-row streaming SVE with Dot2 compensation.
267
277
  * 4-row fast path shares (a−b) column vector; 1-row tail for remainder.
268
278
  */
269
- __arm_locally_streaming static nk_f64_t nk_mahalanobis_f64_smef64_streaming_(nk_f64_t const *a, nk_f64_t const *b,
270
- nk_f64_t const *c, nk_size_t dimensions) {
279
+ static nk_f64_t nk_mahalanobis_f64_smef64_ssve_( //
280
+ nk_f64_t const *a, nk_f64_t const *b, nk_f64_t const *c, nk_size_t dimensions) NK_STREAMING_ {
271
281
  svbool_t predicate_all_b64x = svptrue_b64();
272
282
  nk_f64_t outer_sum = 0.0, outer_comp = 0.0;
273
283
  nk_size_t row = 0;
@@ -298,14 +308,18 @@ __arm_locally_streaming static nk_f64_t nk_mahalanobis_f64_smef64_streaming_(nk_
298
308
  predicate_b64x = svwhilelt_b64(j, dimensions);
299
309
  }
300
310
 
301
- nk_f64_dot2_(&outer_sum, &outer_comp, d0,
302
- svaddv_f64(predicate_all_b64x, sum_0_f64x) + svaddv_f64(predicate_all_b64x, compensation_0_f64x));
303
- nk_f64_dot2_(&outer_sum, &outer_comp, d1,
304
- svaddv_f64(predicate_all_b64x, sum_1_f64x) + svaddv_f64(predicate_all_b64x, compensation_1_f64x));
305
- nk_f64_dot2_(&outer_sum, &outer_comp, d2,
306
- svaddv_f64(predicate_all_b64x, sum_2_f64x) + svaddv_f64(predicate_all_b64x, compensation_2_f64x));
307
- nk_f64_dot2_(&outer_sum, &outer_comp, d3,
308
- svaddv_f64(predicate_all_b64x, sum_3_f64x) + svaddv_f64(predicate_all_b64x, compensation_3_f64x));
311
+ nk_f64_dot2_(
312
+ &outer_sum, &outer_comp, d0,
313
+ nk_svaddv_f64_(predicate_all_b64x, sum_0_f64x) + nk_svaddv_f64_(predicate_all_b64x, compensation_0_f64x));
314
+ nk_f64_dot2_(
315
+ &outer_sum, &outer_comp, d1,
316
+ nk_svaddv_f64_(predicate_all_b64x, sum_1_f64x) + nk_svaddv_f64_(predicate_all_b64x, compensation_1_f64x));
317
+ nk_f64_dot2_(
318
+ &outer_sum, &outer_comp, d2,
319
+ nk_svaddv_f64_(predicate_all_b64x, sum_2_f64x) + nk_svaddv_f64_(predicate_all_b64x, compensation_2_f64x));
320
+ nk_f64_dot2_(
321
+ &outer_sum, &outer_comp, d3,
322
+ nk_svaddv_f64_(predicate_all_b64x, sum_3_f64x) + nk_svaddv_f64_(predicate_all_b64x, compensation_3_f64x));
309
323
  }
310
324
 
311
325
  // 1-row tail
@@ -324,16 +338,19 @@ __arm_locally_streaming static nk_f64_t nk_mahalanobis_f64_smef64_streaming_(nk_
324
338
  predicate_b64x = svwhilelt_b64(j, dimensions);
325
339
  }
326
340
 
327
- nk_f64_t cb_j = svaddv_f64(predicate_all_b64x, sum_f64x) + svaddv_f64(predicate_all_b64x, compensation_f64x);
341
+ nk_f64_t cb_j = nk_svaddv_f64_(predicate_all_b64x, sum_f64x) +
342
+ nk_svaddv_f64_(predicate_all_b64x, compensation_f64x);
328
343
  nk_f64_dot2_(&outer_sum, &outer_comp, diff_row, cb_j);
329
344
  }
330
345
 
331
346
  return outer_sum + outer_comp;
332
347
  }
333
348
 
334
- NK_PUBLIC void nk_mahalanobis_f64_smef64(nk_f64_t const *a, nk_f64_t const *b, nk_f64_t const *c, nk_size_t dimensions,
335
- nk_f64_t *result) {
336
- nk_f64_t quadratic = nk_mahalanobis_f64_smef64_streaming_(a, b, c, dimensions);
349
+ NK_PUBLIC void nk_mahalanobis_f64_smef64( //
350
+ nk_f64_t const *a, nk_f64_t const *b, nk_f64_t const *c, nk_size_t dimensions, nk_f64_t *result) {
351
+ nk_sme_start_streaming_();
352
+ nk_f64_t quadratic = nk_mahalanobis_f64_smef64_ssve_(a, b, c, dimensions);
353
+ nk_sme_stop_streaming_();
337
354
  *result = nk_f64_sqrt_neon(quadratic > 0 ? quadratic : 0);
338
355
  }
339
356
 
@@ -341,11 +358,9 @@ NK_PUBLIC void nk_mahalanobis_f64_smef64(nk_f64_t const *a, nk_f64_t const *b, n
341
358
  * @brief f32c bilinear: complex GEMV via FMOPA (widening f32→f64).
342
359
  * ZA0.D = C staging, ZA1.D = v_real accumulator, ZA2.D = v_imag accumulator.
343
360
  */
344
- __arm_locally_streaming __arm_new("za") static void nk_bilinear_f32c_smef64_streaming_(nk_f32c_t const *a_pairs,
345
- nk_f32c_t const *b_pairs,
346
- nk_f32c_t const *c_pairs,
347
- nk_size_t dimensions,
348
- nk_f64c_t *results) {
361
+ __arm_new("za") static void nk_bilinear_f32c_smef64_streaming_( //
362
+ nk_f32c_t const *a_pairs, nk_f32c_t const *b_pairs, nk_f32c_t const *c_pairs, nk_size_t dimensions,
363
+ nk_f64c_t *results) NK_STREAMING_ {
349
364
  svbool_t predicate_body_b64x = svptrue_b64();
350
365
  nk_size_t tile_dimension = svcntd();
351
366
  nk_f64_t outer_sum_real_f64 = 0.0, outer_sum_imag_f64 = 0.0;
@@ -407,10 +422,10 @@ __arm_locally_streaming __arm_new("za") static void nk_bilinear_f32c_smef64_stre
407
422
  svfloat64_t a_im_f64x = svcvt_f64_f32_x(row_predicate_b64x, svtrn2_f32(a_f32x, a_f32x));
408
423
 
409
424
  // Complex dot: a × v
410
- outer_sum_real_f64 += svaddv_f64(
425
+ outer_sum_real_f64 += nk_svaddv_f64_(
411
426
  predicate_body_b64x, svsub_f64_x(row_predicate_b64x, svmul_f64_x(row_predicate_b64x, a_re_f64x, v_re_f64x),
412
427
  svmul_f64_x(row_predicate_b64x, a_im_f64x, v_im_f64x)));
413
- outer_sum_imag_f64 += svaddv_f64(
428
+ outer_sum_imag_f64 += nk_svaddv_f64_(
414
429
  predicate_body_b64x, svadd_f64_x(row_predicate_b64x, svmul_f64_x(row_predicate_b64x, a_re_f64x, v_im_f64x),
415
430
  svmul_f64_x(row_predicate_b64x, a_im_f64x, v_re_f64x)));
416
431
  }
@@ -419,19 +434,21 @@ __arm_locally_streaming __arm_new("za") static void nk_bilinear_f32c_smef64_stre
419
434
  results->imag = outer_sum_imag_f64;
420
435
  }
421
436
 
422
- NK_PUBLIC void nk_bilinear_f32c_smef64(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pairs, nk_f32c_t const *c_pairs,
423
- nk_size_t dimensions, nk_f64c_t *results) {
437
+ NK_PUBLIC void nk_bilinear_f32c_smef64( //
438
+ nk_f32c_t const *a_pairs, nk_f32c_t const *b_pairs, nk_f32c_t const *c_pairs, nk_size_t dimensions,
439
+ nk_f64c_t *results) {
440
+ nk_sme_start_streaming_();
424
441
  nk_bilinear_f32c_smef64_streaming_(a_pairs, b_pairs, c_pairs, dimensions, results);
442
+ nk_sme_stop_streaming_();
425
443
  }
426
444
 
427
445
  /**
428
446
  * @brief f64c bilinear: interleaved Dot2 with permute + deferred XOR sign-flip.
429
447
  * 2 accumulators instead of 4, halving inner loop work (~15 vs ~28 SVE ops).
430
448
  */
431
- __arm_locally_streaming static void nk_bilinear_f64c_smef64_streaming_(nk_f64c_t const *a_pairs,
432
- nk_f64c_t const *b_pairs,
433
- nk_f64c_t const *c_pairs, nk_size_t dimensions,
434
- nk_f64c_t *results) {
449
+ static void nk_bilinear_f64c_smef64_ssve_( //
450
+ nk_f64c_t const *a_pairs, nk_f64c_t const *b_pairs, nk_f64c_t const *c_pairs, nk_size_t dimensions,
451
+ nk_f64c_t *results) NK_STREAMING_ {
435
452
  svbool_t predicate_all_b64x = svptrue_b64();
436
453
  nk_f64_t outer_sum_real = 0.0, outer_comp_real = 0.0;
437
454
  nk_f64_t outer_sum_imag = 0.0, outer_comp_imag = 0.0;
@@ -474,10 +491,10 @@ __arm_locally_streaming static void nk_bilinear_f64c_smef64_streaming_(nk_f64c_t
474
491
  sveor_u64_x(predicate_all_b64x, svreinterpret_u64_f64(sum_real_f64x), sign_mask_u64x));
475
492
  comp_real_f64x = svreinterpret_f64_u64(
476
493
  sveor_u64_x(predicate_all_b64x, svreinterpret_u64_f64(comp_real_f64x), sign_mask_u64x));
477
- nk_f64_t inner_real = svaddv_f64(predicate_all_b64x,
478
- svadd_f64_x(predicate_all_b64x, sum_real_f64x, comp_real_f64x));
479
- nk_f64_t inner_imag = svaddv_f64(predicate_all_b64x,
480
- svadd_f64_x(predicate_all_b64x, sum_imag_f64x, comp_imag_f64x));
494
+ nk_f64_t inner_real = nk_svaddv_f64_(predicate_all_b64x,
495
+ svadd_f64_x(predicate_all_b64x, sum_real_f64x, comp_real_f64x));
496
+ nk_f64_t inner_imag = nk_svaddv_f64_(predicate_all_b64x,
497
+ svadd_f64_x(predicate_all_b64x, sum_imag_f64x, comp_imag_f64x));
481
498
 
482
499
  // Outer Dot2 complex multiply: a × inner
483
500
  nk_f64_dot2_(&outer_sum_real, &outer_comp_real, a_real, inner_real);
@@ -490,9 +507,12 @@ __arm_locally_streaming static void nk_bilinear_f64c_smef64_streaming_(nk_f64c_t
490
507
  results->imag = outer_sum_imag + outer_comp_imag;
491
508
  }
492
509
 
493
- NK_PUBLIC void nk_bilinear_f64c_smef64(nk_f64c_t const *a_pairs, nk_f64c_t const *b_pairs, nk_f64c_t const *c_pairs,
494
- nk_size_t dimensions, nk_f64c_t *results) {
495
- nk_bilinear_f64c_smef64_streaming_(a_pairs, b_pairs, c_pairs, dimensions, results);
510
+ NK_PUBLIC void nk_bilinear_f64c_smef64( //
511
+ nk_f64c_t const *a_pairs, nk_f64c_t const *b_pairs, nk_f64c_t const *c_pairs, nk_size_t dimensions,
512
+ nk_f64c_t *results) {
513
+ nk_sme_start_streaming_();
514
+ nk_bilinear_f64c_smef64_ssve_(a_pairs, b_pairs, c_pairs, dimensions, results);
515
+ nk_sme_stop_streaming_();
496
516
  }
497
517
 
498
518
  #if defined(__clang__)
@@ -22,7 +22,7 @@
22
22
  #if NK_TARGET_RVVBF16
23
23
 
24
24
  #include "numkong/types.h"
25
- #include "numkong/cast/rvv.h" // `nk_e4m3m1_to_bf16m2_rvv_`, `nk_e5m2m1_to_bf16m2_rvv_`, etc.
25
+ #include "numkong/cast/rvv.h" // `nk_e4m3m1_to_bf16m2_rvv_`, `nk_e5m2m1_to_bf16m2_rvv_`
26
26
 
27
27
  #if defined(__clang__)
28
28
  #pragma clang attribute push(__attribute__((target("arch=+v,+zvfbfwma"))), apply_to = function)
@@ -23,7 +23,7 @@
23
23
  #if NK_TARGET_RVVHALF
24
24
 
25
25
  #include "numkong/types.h"
26
- #include "numkong/cast/rvv.h" // `nk_e4m3m1_to_f16m2_rvv_`, `nk_e2m3m1_to_f16m2_rvv_`, etc.
26
+ #include "numkong/cast/rvv.h" // `nk_e4m3m1_to_f16m2_rvv_`, `nk_e2m3m1_to_f16m2_rvv_`
27
27
 
28
28
  #if defined(__clang__)
29
29
  #pragma clang attribute push(__attribute__((target("arch=+v,+zvfh"))), apply_to = function)
@@ -39,6 +39,7 @@
39
39
  #if NK_TARGET_SVE
40
40
 
41
41
  #include "numkong/types.h" // `nk_f32_t`
42
+ #include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
42
43
  #include "numkong/dot/serial.h" // `nk_u1x8_popcount_`
43
44
 
44
45
  #if defined(__cplusplus)
@@ -110,7 +111,7 @@ NK_PUBLIC void nk_dot_f32_sve(nk_f32_t const *a_scalars, nk_f32_t const *b_scala
110
111
  ab_f64x = svmla_f64_m(pred_odd_b64x, ab_f64x, svcvt_f64_f32_x(pred_odd_b64x, svext_f32(a_f32x, a_f32x, 1)),
111
112
  svcvt_f64_f32_x(pred_odd_b64x, svext_f32(b_f32x, b_f32x, 1)));
112
113
  }
113
- *result = svaddv_f64(svptrue_b64(), ab_f64x);
114
+ *result = nk_svaddv_f64_(svptrue_b64(), ab_f64x);
114
115
  }
115
116
 
116
117
  NK_PUBLIC void nk_dot_f32c_sve(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pairs, nk_size_t count_pairs,
@@ -149,8 +150,8 @@ NK_PUBLIC void nk_dot_f32c_sve(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pair
149
150
  ab_imag_f64x = svmla_f64_m(pred_odd_b64x, ab_imag_f64x, a_real_odd_f64x, b_imag_odd_f64x);
150
151
  ab_imag_f64x = svmla_f64_m(pred_odd_b64x, ab_imag_f64x, a_imag_odd_f64x, b_real_odd_f64x);
151
152
  }
152
- results->real = svaddv_f64(svptrue_b64(), ab_real_f64x);
153
- results->imag = svaddv_f64(svptrue_b64(), ab_imag_f64x);
153
+ results->real = nk_svaddv_f64_(svptrue_b64(), ab_real_f64x);
154
+ results->imag = nk_svaddv_f64_(svptrue_b64(), ab_imag_f64x);
154
155
  }
155
156
 
156
157
  NK_PUBLIC void nk_vdot_f32c_sve(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pairs, nk_size_t count_pairs,
@@ -189,8 +190,8 @@ NK_PUBLIC void nk_vdot_f32c_sve(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pai
189
190
  ab_imag_f64x = svmla_f64_m(pred_odd_b64x, ab_imag_f64x, a_real_odd_f64x, b_imag_odd_f64x);
190
191
  ab_imag_f64x = svmls_f64_m(pred_odd_b64x, ab_imag_f64x, a_imag_odd_f64x, b_real_odd_f64x);
191
192
  }
192
- results->real = svaddv_f64(svptrue_b64(), ab_real_f64x);
193
- results->imag = svaddv_f64(svptrue_b64(), ab_imag_f64x);
193
+ results->real = nk_svaddv_f64_(svptrue_b64(), ab_real_f64x);
194
+ results->imag = nk_svaddv_f64_(svptrue_b64(), ab_imag_f64x);
194
195
  }
195
196
 
196
197
  NK_PUBLIC void nk_dot_f64_sve(nk_f64_t const *a_scalars, nk_f64_t const *b_scalars, nk_size_t count_scalars,
@@ -31,6 +31,7 @@
31
31
  #if NK_TARGET_SVEBFDOT
32
32
 
33
33
  #include "numkong/types.h"
34
+ #include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
34
35
 
35
36
  #if defined(__cplusplus)
36
37
  extern "C" {
@@ -56,7 +57,7 @@ NK_PUBLIC void nk_dot_bf16_svebfdot(nk_bf16_t const *a_scalars, nk_bf16_t const
56
57
  sum_f32x = svbfdot_f32(sum_f32x, a_bf16x, b_bf16x);
57
58
  idx_scalars += svcnth();
58
59
  } while (idx_scalars < count_scalars);
59
- *result = svaddv_f32(svptrue_b32(), sum_f32x);
60
+ *result = nk_svaddv_f32_(svptrue_b32(), sum_f32x);
60
61
  }
61
62
 
62
63
  #if defined(__clang__)
@@ -33,6 +33,7 @@
33
33
  #if NK_TARGET_SVEHALF
34
34
 
35
35
  #include "numkong/types.h" // `nk_f16_t`
36
+ #include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
36
37
  #include "numkong/dot/serial.h" // `nk_u1x8_popcount_`
37
38
 
38
39
  #if defined(__cplusplus)
@@ -67,7 +68,7 @@ NK_PUBLIC void nk_dot_f16_svehalf(nk_f16_t const *a_scalars, nk_f16_t const *b_s
67
68
 
68
69
  idx_scalars += svcnth();
69
70
  } while (idx_scalars < count_scalars);
70
- *result = svaddv_f32(svptrue_b32(), ab_f32x);
71
+ *result = nk_svaddv_f32_(svptrue_b32(), ab_f32x);
71
72
  }
72
73
 
73
74
  NK_PUBLIC void nk_dot_f16c_svehalf(nk_f16c_t const *a_pairs, nk_f16c_t const *b_pairs, nk_size_t count_pairs,
@@ -107,8 +108,8 @@ NK_PUBLIC void nk_dot_f16c_svehalf(nk_f16c_t const *a_pairs, nk_f16c_t const *b_
107
108
 
108
109
  idx_scalars += svcnth();
109
110
  } while (idx_scalars < count_pairs);
110
- results->real = svaddv_f32(svptrue_b32(), ab_real_f32x);
111
- results->imag = svaddv_f32(svptrue_b32(), ab_imag_f32x);
111
+ results->real = nk_svaddv_f32_(svptrue_b32(), ab_real_f32x);
112
+ results->imag = nk_svaddv_f32_(svptrue_b32(), ab_imag_f32x);
112
113
  }
113
114
 
114
115
  NK_PUBLIC void nk_vdot_f16c_svehalf(nk_f16c_t const *a_pairs, nk_f16c_t const *b_pairs, nk_size_t count_pairs,
@@ -148,8 +149,8 @@ NK_PUBLIC void nk_vdot_f16c_svehalf(nk_f16c_t const *a_pairs, nk_f16c_t const *b
148
149
 
149
150
  idx_scalars += svcnth();
150
151
  } while (idx_scalars < count_pairs);
151
- results->real = svaddv_f32(svptrue_b32(), ab_real_f32x);
152
- results->imag = svaddv_f32(svptrue_b32(), ab_imag_f32x);
152
+ results->real = nk_svaddv_f32_(svptrue_b32(), ab_real_f32x);
153
+ results->imag = nk_svaddv_f32_(svptrue_b32(), ab_imag_f32x);
153
154
  }
154
155
 
155
156
  #if defined(__clang__)
@@ -34,6 +34,7 @@
34
34
  #if NK_TARGET_SVESDOT
35
35
 
36
36
  #include "numkong/types.h"
37
+ #include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
37
38
 
38
39
  #if defined(__cplusplus)
39
40
  extern "C" {
@@ -57,7 +58,7 @@ NK_PUBLIC void nk_dot_i8_svesdot(nk_i8_t const *a_scalars, nk_i8_t const *b_scal
57
58
  sum_i32x = svdot_s32(sum_i32x, a_i8x, b_i8x);
58
59
  idx_scalars += svcntb();
59
60
  } while (idx_scalars < count_scalars);
60
- *result = (nk_i32_t)svaddv_s32(svptrue_b32(), sum_i32x);
61
+ *result = (nk_i32_t)nk_svaddv_s32_(svptrue_b32(), sum_i32x);
61
62
  }
62
63
 
63
64
  NK_PUBLIC void nk_dot_u8_svesdot(nk_u8_t const *a_scalars, nk_u8_t const *b_scalars, nk_size_t count_scalars,
@@ -71,7 +72,7 @@ NK_PUBLIC void nk_dot_u8_svesdot(nk_u8_t const *a_scalars, nk_u8_t const *b_scal
71
72
  sum_u32x = svdot_u32(sum_u32x, a_u8x, b_u8x);
72
73
  idx_scalars += svcntb();
73
74
  } while (idx_scalars < count_scalars);
74
- *result = (nk_u32_t)svaddv_u32(svptrue_b32(), sum_u32x);
75
+ *result = (nk_u32_t)nk_svaddv_u32_(svptrue_b32(), sum_u32x);
75
76
  }
76
77
 
77
78
  #if defined(__clang__)