numkong 7.4.4 → 7.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/binding.gyp +81 -5
- package/c/dispatch_f16.c +23 -0
- package/c/numkong.c +0 -13
- package/include/numkong/attention/sme.h +34 -31
- package/include/numkong/capabilities.h +2 -15
- package/include/numkong/cast/neon.h +15 -0
- package/include/numkong/curved/smef64.h +82 -62
- package/include/numkong/dot/rvvbf16.h +1 -1
- package/include/numkong/dot/rvvhalf.h +1 -1
- package/include/numkong/dot/sve.h +6 -5
- package/include/numkong/dot/svebfdot.h +2 -1
- package/include/numkong/dot/svehalf.h +6 -5
- package/include/numkong/dot/svesdot.h +3 -2
- package/include/numkong/dots/graniteamx.h +733 -0
- package/include/numkong/dots/serial.h +11 -4
- package/include/numkong/dots/sme.h +172 -140
- package/include/numkong/dots/smebi32.h +14 -11
- package/include/numkong/dots/smef64.h +31 -26
- package/include/numkong/dots.h +29 -3
- package/include/numkong/each/serial.h +22 -0
- package/include/numkong/geospatial/haswell.h +1 -1
- package/include/numkong/geospatial/neon.h +1 -1
- package/include/numkong/geospatial/serial.h +1 -1
- package/include/numkong/geospatial/skylake.h +1 -1
- package/include/numkong/maxsim/sme.h +94 -55
- package/include/numkong/mesh/README.md +13 -27
- package/include/numkong/mesh/haswell.h +25 -122
- package/include/numkong/mesh/neon.h +21 -110
- package/include/numkong/mesh/neonbfdot.h +4 -43
- package/include/numkong/mesh/rvv.h +7 -82
- package/include/numkong/mesh/serial.h +48 -53
- package/include/numkong/mesh/skylake.h +7 -123
- package/include/numkong/mesh/v128relaxed.h +9 -93
- package/include/numkong/mesh.h +2 -2
- package/include/numkong/mesh.hpp +35 -96
- package/include/numkong/reduce/neon.h +29 -0
- package/include/numkong/reduce/neonbfdot.h +2 -2
- package/include/numkong/reduce/neonfhm.h +4 -4
- package/include/numkong/reduce/sve.h +52 -0
- package/include/numkong/reduce.h +4 -0
- package/include/numkong/set/sve.h +6 -5
- package/include/numkong/sets/smebi32.h +35 -30
- package/include/numkong/sparse/sve2.h +3 -2
- package/include/numkong/spatial/sve.h +7 -6
- package/include/numkong/spatial/svebfdot.h +7 -4
- package/include/numkong/spatial/svehalf.h +5 -4
- package/include/numkong/spatial/svesdot.h +9 -8
- package/include/numkong/spatials/graniteamx.h +173 -0
- package/include/numkong/spatials/serial.h +22 -0
- package/include/numkong/spatials/sme.h +391 -350
- package/include/numkong/spatials/smef64.h +79 -70
- package/include/numkong/spatials.h +37 -4
- package/include/numkong/types.h +59 -0
- package/javascript/dist/cjs/numkong.js +13 -0
- package/javascript/dist/esm/numkong.js +13 -0
- package/javascript/numkong.c +56 -12
- package/javascript/numkong.ts +13 -0
- package/package.json +7 -7
- package/probes/probe.js +2 -2
- package/wasm/numkong.wasm +0 -0
|
@@ -52,9 +52,10 @@
|
|
|
52
52
|
#if NK_TARGET_SMEF64
|
|
53
53
|
|
|
54
54
|
#include "numkong/types.h"
|
|
55
|
+
#include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
|
|
55
56
|
#include "numkong/spatial/neon.h" // `nk_f64_sqrt_neon`
|
|
56
|
-
#include "numkong/dots/sme.h" // nk_sme_zero_za64_tile_0_
|
|
57
|
-
#include "numkong/curved/serial.h" // `nk_bilinear_f64_serial
|
|
57
|
+
#include "numkong/dots/sme.h" // `nk_sme_zero_za64_tile_0_`
|
|
58
|
+
#include "numkong/curved/serial.h" // `nk_bilinear_f64_serial`
|
|
58
59
|
|
|
59
60
|
#if defined(__cplusplus)
|
|
60
61
|
extern "C" {
|
|
@@ -90,8 +91,8 @@ NK_PUBLIC void nk_dot2_f64_sve_accumulate_(svbool_t predicate_b64x, svfloat64_t
|
|
|
90
91
|
* @brief f32 bilinear: GEMV via FMOPA (widening f32→f64, exact accumulation).
|
|
91
92
|
* ZA0.D = C staging, ZA1.D = GEMV accumulator.
|
|
92
93
|
*/
|
|
93
|
-
|
|
94
|
-
nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, nk_size_t dimensions, nk_f64_t *result) {
|
|
94
|
+
__arm_new("za") static void nk_bilinear_f32_smef64_streaming_( //
|
|
95
|
+
nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, nk_size_t dimensions, nk_f64_t *result) NK_STREAMING_ {
|
|
95
96
|
svbool_t predicate_body_b64x = svptrue_b64();
|
|
96
97
|
nk_size_t tile_dimension = svcntd();
|
|
97
98
|
nk_f64_t outer_sum_f64 = 0.0;
|
|
@@ -124,24 +125,25 @@ __arm_locally_streaming __arm_new("za") static void nk_bilinear_f32_smef64_strea
|
|
|
124
125
|
svfloat64_t v_f64x = svread_ver_za64_f64_m(svdup_f64(0.0), row_predicate_b64x, 1, 0);
|
|
125
126
|
svfloat64_t a_f64x = svcvt_f64_f32_x(
|
|
126
127
|
row_predicate_b64x, svreinterpret_f32_u64(svld1uw_u64(row_predicate_b64x, (nk_u32_t const *)(a + row))));
|
|
127
|
-
outer_sum_f64 +=
|
|
128
|
+
outer_sum_f64 += nk_svaddv_f64_(predicate_body_b64x, svmul_f64_x(row_predicate_b64x, a_f64x, v_f64x));
|
|
128
129
|
}
|
|
129
130
|
|
|
130
131
|
*result = outer_sum_f64;
|
|
131
132
|
}
|
|
132
133
|
|
|
133
|
-
NK_PUBLIC void nk_bilinear_f32_smef64(
|
|
134
|
-
|
|
134
|
+
NK_PUBLIC void nk_bilinear_f32_smef64( //
|
|
135
|
+
nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, nk_size_t dimensions, nk_f64_t *result) {
|
|
136
|
+
nk_sme_start_streaming_();
|
|
135
137
|
nk_bilinear_f32_smef64_streaming_(a, b, c, dimensions, result);
|
|
138
|
+
nk_sme_stop_streaming_();
|
|
136
139
|
}
|
|
137
140
|
|
|
138
141
|
/**
|
|
139
142
|
* @brief f32 Mahalanobis: GEMV v = C×d via FMOPA, where d = a − b (exact in f64).
|
|
140
143
|
* ZA0.D = C staging, ZA1.D = GEMV accumulator.
|
|
141
144
|
*/
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
nk_size_t dimensions) {
|
|
145
|
+
__arm_new("za") static nk_f64_t nk_mahalanobis_f32_smef64_streaming_( //
|
|
146
|
+
nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, nk_size_t dimensions) NK_STREAMING_ {
|
|
145
147
|
|
|
146
148
|
svbool_t predicate_body_b64x = svptrue_b64();
|
|
147
149
|
nk_size_t tile_dimension = svcntd();
|
|
@@ -179,15 +181,17 @@ __arm_locally_streaming __arm_new("za") static nk_f64_t
|
|
|
179
181
|
svfloat64_t b_f64x = svcvt_f64_f32_x(
|
|
180
182
|
row_predicate_b64x, svreinterpret_f32_u64(svld1uw_u64(row_predicate_b64x, (nk_u32_t const *)(b + row))));
|
|
181
183
|
svfloat64_t d_f64x = svsub_f64_x(row_predicate_b64x, a_f64x, b_f64x);
|
|
182
|
-
outer_sum_f64 +=
|
|
184
|
+
outer_sum_f64 += nk_svaddv_f64_(predicate_body_b64x, svmul_f64_x(row_predicate_b64x, d_f64x, v_f64x));
|
|
183
185
|
}
|
|
184
186
|
|
|
185
187
|
return outer_sum_f64;
|
|
186
188
|
}
|
|
187
189
|
|
|
188
|
-
NK_PUBLIC void nk_mahalanobis_f32_smef64(
|
|
189
|
-
|
|
190
|
+
NK_PUBLIC void nk_mahalanobis_f32_smef64( //
|
|
191
|
+
nk_f32_t const *a, nk_f32_t const *b, nk_f32_t const *c, nk_size_t dimensions, nk_f64_t *result) {
|
|
192
|
+
nk_sme_start_streaming_();
|
|
190
193
|
nk_f64_t quadratic = nk_mahalanobis_f32_smef64_streaming_(a, b, c, dimensions);
|
|
194
|
+
nk_sme_stop_streaming_();
|
|
191
195
|
*result = nk_f64_sqrt_neon(quadratic > 0 ? quadratic : 0);
|
|
192
196
|
}
|
|
193
197
|
|
|
@@ -195,9 +199,8 @@ NK_PUBLIC void nk_mahalanobis_f32_smef64(nk_f32_t const *a, nk_f32_t const *b, n
|
|
|
195
199
|
* @brief f64 bilinear: row-by-row streaming SVE with Dot2 compensation.
|
|
196
200
|
* 4-row fast path shares b_f64x loads; 1-row tail for remainder.
|
|
197
201
|
*/
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
nk_f64_t *result) {
|
|
202
|
+
static void nk_bilinear_f64_smef64_ssve_( //
|
|
203
|
+
nk_f64_t const *a, nk_f64_t const *b, nk_f64_t const *c, nk_size_t dimensions, nk_f64_t *result) NK_STREAMING_ {
|
|
201
204
|
svbool_t predicate_all_b64x = svptrue_b64();
|
|
202
205
|
nk_f64_t outer_sum = 0.0, outer_comp = 0.0;
|
|
203
206
|
nk_size_t row = 0;
|
|
@@ -226,14 +229,18 @@ __arm_locally_streaming static void nk_bilinear_f64_smef64_streaming_(nk_f64_t c
|
|
|
226
229
|
predicate_b64x = svwhilelt_b64(j, dimensions);
|
|
227
230
|
}
|
|
228
231
|
|
|
229
|
-
nk_f64_dot2_(
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
nk_f64_dot2_(
|
|
236
|
-
|
|
232
|
+
nk_f64_dot2_(
|
|
233
|
+
&outer_sum, &outer_comp, a0,
|
|
234
|
+
nk_svaddv_f64_(predicate_all_b64x, sum_0_f64x) + nk_svaddv_f64_(predicate_all_b64x, compensation_0_f64x));
|
|
235
|
+
nk_f64_dot2_(
|
|
236
|
+
&outer_sum, &outer_comp, a1,
|
|
237
|
+
nk_svaddv_f64_(predicate_all_b64x, sum_1_f64x) + nk_svaddv_f64_(predicate_all_b64x, compensation_1_f64x));
|
|
238
|
+
nk_f64_dot2_(
|
|
239
|
+
&outer_sum, &outer_comp, a2,
|
|
240
|
+
nk_svaddv_f64_(predicate_all_b64x, sum_2_f64x) + nk_svaddv_f64_(predicate_all_b64x, compensation_2_f64x));
|
|
241
|
+
nk_f64_dot2_(
|
|
242
|
+
&outer_sum, &outer_comp, a3,
|
|
243
|
+
nk_svaddv_f64_(predicate_all_b64x, sum_3_f64x) + nk_svaddv_f64_(predicate_all_b64x, compensation_3_f64x));
|
|
237
244
|
}
|
|
238
245
|
|
|
239
246
|
// 1-row tail
|
|
@@ -250,24 +257,27 @@ __arm_locally_streaming static void nk_bilinear_f64_smef64_streaming_(nk_f64_t c
|
|
|
250
257
|
predicate_b64x = svwhilelt_b64(j, dimensions);
|
|
251
258
|
}
|
|
252
259
|
|
|
253
|
-
nk_f64_t cb_j =
|
|
260
|
+
nk_f64_t cb_j = nk_svaddv_f64_(predicate_all_b64x, sum_f64x) +
|
|
261
|
+
nk_svaddv_f64_(predicate_all_b64x, compensation_f64x);
|
|
254
262
|
nk_f64_dot2_(&outer_sum, &outer_comp, a[row], cb_j);
|
|
255
263
|
}
|
|
256
264
|
|
|
257
265
|
*result = outer_sum + outer_comp;
|
|
258
266
|
}
|
|
259
267
|
|
|
260
|
-
NK_PUBLIC void nk_bilinear_f64_smef64(
|
|
261
|
-
|
|
262
|
-
|
|
268
|
+
NK_PUBLIC void nk_bilinear_f64_smef64( //
|
|
269
|
+
nk_f64_t const *a, nk_f64_t const *b, nk_f64_t const *c, nk_size_t dimensions, nk_f64_t *result) {
|
|
270
|
+
nk_sme_start_streaming_();
|
|
271
|
+
nk_bilinear_f64_smef64_ssve_(a, b, c, dimensions, result);
|
|
272
|
+
nk_sme_stop_streaming_();
|
|
263
273
|
}
|
|
264
274
|
|
|
265
275
|
/**
|
|
266
276
|
* @brief f64 Mahalanobis: row-by-row streaming SVE with Dot2 compensation.
|
|
267
277
|
* 4-row fast path shares (a−b) column vector; 1-row tail for remainder.
|
|
268
278
|
*/
|
|
269
|
-
|
|
270
|
-
|
|
279
|
+
static nk_f64_t nk_mahalanobis_f64_smef64_ssve_( //
|
|
280
|
+
nk_f64_t const *a, nk_f64_t const *b, nk_f64_t const *c, nk_size_t dimensions) NK_STREAMING_ {
|
|
271
281
|
svbool_t predicate_all_b64x = svptrue_b64();
|
|
272
282
|
nk_f64_t outer_sum = 0.0, outer_comp = 0.0;
|
|
273
283
|
nk_size_t row = 0;
|
|
@@ -298,14 +308,18 @@ __arm_locally_streaming static nk_f64_t nk_mahalanobis_f64_smef64_streaming_(nk_
|
|
|
298
308
|
predicate_b64x = svwhilelt_b64(j, dimensions);
|
|
299
309
|
}
|
|
300
310
|
|
|
301
|
-
nk_f64_dot2_(
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
nk_f64_dot2_(
|
|
308
|
-
|
|
311
|
+
nk_f64_dot2_(
|
|
312
|
+
&outer_sum, &outer_comp, d0,
|
|
313
|
+
nk_svaddv_f64_(predicate_all_b64x, sum_0_f64x) + nk_svaddv_f64_(predicate_all_b64x, compensation_0_f64x));
|
|
314
|
+
nk_f64_dot2_(
|
|
315
|
+
&outer_sum, &outer_comp, d1,
|
|
316
|
+
nk_svaddv_f64_(predicate_all_b64x, sum_1_f64x) + nk_svaddv_f64_(predicate_all_b64x, compensation_1_f64x));
|
|
317
|
+
nk_f64_dot2_(
|
|
318
|
+
&outer_sum, &outer_comp, d2,
|
|
319
|
+
nk_svaddv_f64_(predicate_all_b64x, sum_2_f64x) + nk_svaddv_f64_(predicate_all_b64x, compensation_2_f64x));
|
|
320
|
+
nk_f64_dot2_(
|
|
321
|
+
&outer_sum, &outer_comp, d3,
|
|
322
|
+
nk_svaddv_f64_(predicate_all_b64x, sum_3_f64x) + nk_svaddv_f64_(predicate_all_b64x, compensation_3_f64x));
|
|
309
323
|
}
|
|
310
324
|
|
|
311
325
|
// 1-row tail
|
|
@@ -324,16 +338,19 @@ __arm_locally_streaming static nk_f64_t nk_mahalanobis_f64_smef64_streaming_(nk_
|
|
|
324
338
|
predicate_b64x = svwhilelt_b64(j, dimensions);
|
|
325
339
|
}
|
|
326
340
|
|
|
327
|
-
nk_f64_t cb_j =
|
|
341
|
+
nk_f64_t cb_j = nk_svaddv_f64_(predicate_all_b64x, sum_f64x) +
|
|
342
|
+
nk_svaddv_f64_(predicate_all_b64x, compensation_f64x);
|
|
328
343
|
nk_f64_dot2_(&outer_sum, &outer_comp, diff_row, cb_j);
|
|
329
344
|
}
|
|
330
345
|
|
|
331
346
|
return outer_sum + outer_comp;
|
|
332
347
|
}
|
|
333
348
|
|
|
334
|
-
NK_PUBLIC void nk_mahalanobis_f64_smef64(
|
|
335
|
-
|
|
336
|
-
|
|
349
|
+
NK_PUBLIC void nk_mahalanobis_f64_smef64( //
|
|
350
|
+
nk_f64_t const *a, nk_f64_t const *b, nk_f64_t const *c, nk_size_t dimensions, nk_f64_t *result) {
|
|
351
|
+
nk_sme_start_streaming_();
|
|
352
|
+
nk_f64_t quadratic = nk_mahalanobis_f64_smef64_ssve_(a, b, c, dimensions);
|
|
353
|
+
nk_sme_stop_streaming_();
|
|
337
354
|
*result = nk_f64_sqrt_neon(quadratic > 0 ? quadratic : 0);
|
|
338
355
|
}
|
|
339
356
|
|
|
@@ -341,11 +358,9 @@ NK_PUBLIC void nk_mahalanobis_f64_smef64(nk_f64_t const *a, nk_f64_t const *b, n
|
|
|
341
358
|
* @brief f32c bilinear: complex GEMV via FMOPA (widening f32→f64).
|
|
342
359
|
* ZA0.D = C staging, ZA1.D = v_real accumulator, ZA2.D = v_imag accumulator.
|
|
343
360
|
*/
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
nk_size_t dimensions,
|
|
348
|
-
nk_f64c_t *results) {
|
|
361
|
+
__arm_new("za") static void nk_bilinear_f32c_smef64_streaming_( //
|
|
362
|
+
nk_f32c_t const *a_pairs, nk_f32c_t const *b_pairs, nk_f32c_t const *c_pairs, nk_size_t dimensions,
|
|
363
|
+
nk_f64c_t *results) NK_STREAMING_ {
|
|
349
364
|
svbool_t predicate_body_b64x = svptrue_b64();
|
|
350
365
|
nk_size_t tile_dimension = svcntd();
|
|
351
366
|
nk_f64_t outer_sum_real_f64 = 0.0, outer_sum_imag_f64 = 0.0;
|
|
@@ -407,10 +422,10 @@ __arm_locally_streaming __arm_new("za") static void nk_bilinear_f32c_smef64_stre
|
|
|
407
422
|
svfloat64_t a_im_f64x = svcvt_f64_f32_x(row_predicate_b64x, svtrn2_f32(a_f32x, a_f32x));
|
|
408
423
|
|
|
409
424
|
// Complex dot: a × v
|
|
410
|
-
outer_sum_real_f64 +=
|
|
425
|
+
outer_sum_real_f64 += nk_svaddv_f64_(
|
|
411
426
|
predicate_body_b64x, svsub_f64_x(row_predicate_b64x, svmul_f64_x(row_predicate_b64x, a_re_f64x, v_re_f64x),
|
|
412
427
|
svmul_f64_x(row_predicate_b64x, a_im_f64x, v_im_f64x)));
|
|
413
|
-
outer_sum_imag_f64 +=
|
|
428
|
+
outer_sum_imag_f64 += nk_svaddv_f64_(
|
|
414
429
|
predicate_body_b64x, svadd_f64_x(row_predicate_b64x, svmul_f64_x(row_predicate_b64x, a_re_f64x, v_im_f64x),
|
|
415
430
|
svmul_f64_x(row_predicate_b64x, a_im_f64x, v_re_f64x)));
|
|
416
431
|
}
|
|
@@ -419,19 +434,21 @@ __arm_locally_streaming __arm_new("za") static void nk_bilinear_f32c_smef64_stre
|
|
|
419
434
|
results->imag = outer_sum_imag_f64;
|
|
420
435
|
}
|
|
421
436
|
|
|
422
|
-
NK_PUBLIC void nk_bilinear_f32c_smef64(
|
|
423
|
-
|
|
437
|
+
NK_PUBLIC void nk_bilinear_f32c_smef64( //
|
|
438
|
+
nk_f32c_t const *a_pairs, nk_f32c_t const *b_pairs, nk_f32c_t const *c_pairs, nk_size_t dimensions,
|
|
439
|
+
nk_f64c_t *results) {
|
|
440
|
+
nk_sme_start_streaming_();
|
|
424
441
|
nk_bilinear_f32c_smef64_streaming_(a_pairs, b_pairs, c_pairs, dimensions, results);
|
|
442
|
+
nk_sme_stop_streaming_();
|
|
425
443
|
}
|
|
426
444
|
|
|
427
445
|
/**
|
|
428
446
|
* @brief f64c bilinear: interleaved Dot2 with permute + deferred XOR sign-flip.
|
|
429
447
|
* 2 accumulators instead of 4, halving inner loop work (~15 vs ~28 SVE ops).
|
|
430
448
|
*/
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
nk_f64c_t *results) {
|
|
449
|
+
static void nk_bilinear_f64c_smef64_ssve_( //
|
|
450
|
+
nk_f64c_t const *a_pairs, nk_f64c_t const *b_pairs, nk_f64c_t const *c_pairs, nk_size_t dimensions,
|
|
451
|
+
nk_f64c_t *results) NK_STREAMING_ {
|
|
435
452
|
svbool_t predicate_all_b64x = svptrue_b64();
|
|
436
453
|
nk_f64_t outer_sum_real = 0.0, outer_comp_real = 0.0;
|
|
437
454
|
nk_f64_t outer_sum_imag = 0.0, outer_comp_imag = 0.0;
|
|
@@ -474,10 +491,10 @@ __arm_locally_streaming static void nk_bilinear_f64c_smef64_streaming_(nk_f64c_t
|
|
|
474
491
|
sveor_u64_x(predicate_all_b64x, svreinterpret_u64_f64(sum_real_f64x), sign_mask_u64x));
|
|
475
492
|
comp_real_f64x = svreinterpret_f64_u64(
|
|
476
493
|
sveor_u64_x(predicate_all_b64x, svreinterpret_u64_f64(comp_real_f64x), sign_mask_u64x));
|
|
477
|
-
nk_f64_t inner_real =
|
|
478
|
-
|
|
479
|
-
nk_f64_t inner_imag =
|
|
480
|
-
|
|
494
|
+
nk_f64_t inner_real = nk_svaddv_f64_(predicate_all_b64x,
|
|
495
|
+
svadd_f64_x(predicate_all_b64x, sum_real_f64x, comp_real_f64x));
|
|
496
|
+
nk_f64_t inner_imag = nk_svaddv_f64_(predicate_all_b64x,
|
|
497
|
+
svadd_f64_x(predicate_all_b64x, sum_imag_f64x, comp_imag_f64x));
|
|
481
498
|
|
|
482
499
|
// Outer Dot2 complex multiply: a × inner
|
|
483
500
|
nk_f64_dot2_(&outer_sum_real, &outer_comp_real, a_real, inner_real);
|
|
@@ -490,9 +507,12 @@ __arm_locally_streaming static void nk_bilinear_f64c_smef64_streaming_(nk_f64c_t
|
|
|
490
507
|
results->imag = outer_sum_imag + outer_comp_imag;
|
|
491
508
|
}
|
|
492
509
|
|
|
493
|
-
NK_PUBLIC void nk_bilinear_f64c_smef64(
|
|
494
|
-
|
|
495
|
-
|
|
510
|
+
NK_PUBLIC void nk_bilinear_f64c_smef64( //
|
|
511
|
+
nk_f64c_t const *a_pairs, nk_f64c_t const *b_pairs, nk_f64c_t const *c_pairs, nk_size_t dimensions,
|
|
512
|
+
nk_f64c_t *results) {
|
|
513
|
+
nk_sme_start_streaming_();
|
|
514
|
+
nk_bilinear_f64c_smef64_ssve_(a_pairs, b_pairs, c_pairs, dimensions, results);
|
|
515
|
+
nk_sme_stop_streaming_();
|
|
496
516
|
}
|
|
497
517
|
|
|
498
518
|
#if defined(__clang__)
|
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
#if NK_TARGET_RVVBF16
|
|
23
23
|
|
|
24
24
|
#include "numkong/types.h"
|
|
25
|
-
#include "numkong/cast/rvv.h" // `nk_e4m3m1_to_bf16m2_rvv_`, `nk_e5m2m1_to_bf16m2_rvv_
|
|
25
|
+
#include "numkong/cast/rvv.h" // `nk_e4m3m1_to_bf16m2_rvv_`, `nk_e5m2m1_to_bf16m2_rvv_`
|
|
26
26
|
|
|
27
27
|
#if defined(__clang__)
|
|
28
28
|
#pragma clang attribute push(__attribute__((target("arch=+v,+zvfbfwma"))), apply_to = function)
|
|
@@ -23,7 +23,7 @@
|
|
|
23
23
|
#if NK_TARGET_RVVHALF
|
|
24
24
|
|
|
25
25
|
#include "numkong/types.h"
|
|
26
|
-
#include "numkong/cast/rvv.h" // `nk_e4m3m1_to_f16m2_rvv_`, `nk_e2m3m1_to_f16m2_rvv_
|
|
26
|
+
#include "numkong/cast/rvv.h" // `nk_e4m3m1_to_f16m2_rvv_`, `nk_e2m3m1_to_f16m2_rvv_`
|
|
27
27
|
|
|
28
28
|
#if defined(__clang__)
|
|
29
29
|
#pragma clang attribute push(__attribute__((target("arch=+v,+zvfh"))), apply_to = function)
|
|
@@ -39,6 +39,7 @@
|
|
|
39
39
|
#if NK_TARGET_SVE
|
|
40
40
|
|
|
41
41
|
#include "numkong/types.h" // `nk_f32_t`
|
|
42
|
+
#include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
|
|
42
43
|
#include "numkong/dot/serial.h" // `nk_u1x8_popcount_`
|
|
43
44
|
|
|
44
45
|
#if defined(__cplusplus)
|
|
@@ -110,7 +111,7 @@ NK_PUBLIC void nk_dot_f32_sve(nk_f32_t const *a_scalars, nk_f32_t const *b_scala
|
|
|
110
111
|
ab_f64x = svmla_f64_m(pred_odd_b64x, ab_f64x, svcvt_f64_f32_x(pred_odd_b64x, svext_f32(a_f32x, a_f32x, 1)),
|
|
111
112
|
svcvt_f64_f32_x(pred_odd_b64x, svext_f32(b_f32x, b_f32x, 1)));
|
|
112
113
|
}
|
|
113
|
-
*result =
|
|
114
|
+
*result = nk_svaddv_f64_(svptrue_b64(), ab_f64x);
|
|
114
115
|
}
|
|
115
116
|
|
|
116
117
|
NK_PUBLIC void nk_dot_f32c_sve(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pairs, nk_size_t count_pairs,
|
|
@@ -149,8 +150,8 @@ NK_PUBLIC void nk_dot_f32c_sve(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pair
|
|
|
149
150
|
ab_imag_f64x = svmla_f64_m(pred_odd_b64x, ab_imag_f64x, a_real_odd_f64x, b_imag_odd_f64x);
|
|
150
151
|
ab_imag_f64x = svmla_f64_m(pred_odd_b64x, ab_imag_f64x, a_imag_odd_f64x, b_real_odd_f64x);
|
|
151
152
|
}
|
|
152
|
-
results->real =
|
|
153
|
-
results->imag =
|
|
153
|
+
results->real = nk_svaddv_f64_(svptrue_b64(), ab_real_f64x);
|
|
154
|
+
results->imag = nk_svaddv_f64_(svptrue_b64(), ab_imag_f64x);
|
|
154
155
|
}
|
|
155
156
|
|
|
156
157
|
NK_PUBLIC void nk_vdot_f32c_sve(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pairs, nk_size_t count_pairs,
|
|
@@ -189,8 +190,8 @@ NK_PUBLIC void nk_vdot_f32c_sve(nk_f32c_t const *a_pairs, nk_f32c_t const *b_pai
|
|
|
189
190
|
ab_imag_f64x = svmla_f64_m(pred_odd_b64x, ab_imag_f64x, a_real_odd_f64x, b_imag_odd_f64x);
|
|
190
191
|
ab_imag_f64x = svmls_f64_m(pred_odd_b64x, ab_imag_f64x, a_imag_odd_f64x, b_real_odd_f64x);
|
|
191
192
|
}
|
|
192
|
-
results->real =
|
|
193
|
-
results->imag =
|
|
193
|
+
results->real = nk_svaddv_f64_(svptrue_b64(), ab_real_f64x);
|
|
194
|
+
results->imag = nk_svaddv_f64_(svptrue_b64(), ab_imag_f64x);
|
|
194
195
|
}
|
|
195
196
|
|
|
196
197
|
NK_PUBLIC void nk_dot_f64_sve(nk_f64_t const *a_scalars, nk_f64_t const *b_scalars, nk_size_t count_scalars,
|
|
@@ -31,6 +31,7 @@
|
|
|
31
31
|
#if NK_TARGET_SVEBFDOT
|
|
32
32
|
|
|
33
33
|
#include "numkong/types.h"
|
|
34
|
+
#include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
|
|
34
35
|
|
|
35
36
|
#if defined(__cplusplus)
|
|
36
37
|
extern "C" {
|
|
@@ -56,7 +57,7 @@ NK_PUBLIC void nk_dot_bf16_svebfdot(nk_bf16_t const *a_scalars, nk_bf16_t const
|
|
|
56
57
|
sum_f32x = svbfdot_f32(sum_f32x, a_bf16x, b_bf16x);
|
|
57
58
|
idx_scalars += svcnth();
|
|
58
59
|
} while (idx_scalars < count_scalars);
|
|
59
|
-
*result =
|
|
60
|
+
*result = nk_svaddv_f32_(svptrue_b32(), sum_f32x);
|
|
60
61
|
}
|
|
61
62
|
|
|
62
63
|
#if defined(__clang__)
|
|
@@ -33,6 +33,7 @@
|
|
|
33
33
|
#if NK_TARGET_SVEHALF
|
|
34
34
|
|
|
35
35
|
#include "numkong/types.h" // `nk_f16_t`
|
|
36
|
+
#include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
|
|
36
37
|
#include "numkong/dot/serial.h" // `nk_u1x8_popcount_`
|
|
37
38
|
|
|
38
39
|
#if defined(__cplusplus)
|
|
@@ -67,7 +68,7 @@ NK_PUBLIC void nk_dot_f16_svehalf(nk_f16_t const *a_scalars, nk_f16_t const *b_s
|
|
|
67
68
|
|
|
68
69
|
idx_scalars += svcnth();
|
|
69
70
|
} while (idx_scalars < count_scalars);
|
|
70
|
-
*result =
|
|
71
|
+
*result = nk_svaddv_f32_(svptrue_b32(), ab_f32x);
|
|
71
72
|
}
|
|
72
73
|
|
|
73
74
|
NK_PUBLIC void nk_dot_f16c_svehalf(nk_f16c_t const *a_pairs, nk_f16c_t const *b_pairs, nk_size_t count_pairs,
|
|
@@ -107,8 +108,8 @@ NK_PUBLIC void nk_dot_f16c_svehalf(nk_f16c_t const *a_pairs, nk_f16c_t const *b_
|
|
|
107
108
|
|
|
108
109
|
idx_scalars += svcnth();
|
|
109
110
|
} while (idx_scalars < count_pairs);
|
|
110
|
-
results->real =
|
|
111
|
-
results->imag =
|
|
111
|
+
results->real = nk_svaddv_f32_(svptrue_b32(), ab_real_f32x);
|
|
112
|
+
results->imag = nk_svaddv_f32_(svptrue_b32(), ab_imag_f32x);
|
|
112
113
|
}
|
|
113
114
|
|
|
114
115
|
NK_PUBLIC void nk_vdot_f16c_svehalf(nk_f16c_t const *a_pairs, nk_f16c_t const *b_pairs, nk_size_t count_pairs,
|
|
@@ -148,8 +149,8 @@ NK_PUBLIC void nk_vdot_f16c_svehalf(nk_f16c_t const *a_pairs, nk_f16c_t const *b
|
|
|
148
149
|
|
|
149
150
|
idx_scalars += svcnth();
|
|
150
151
|
} while (idx_scalars < count_pairs);
|
|
151
|
-
results->real =
|
|
152
|
-
results->imag =
|
|
152
|
+
results->real = nk_svaddv_f32_(svptrue_b32(), ab_real_f32x);
|
|
153
|
+
results->imag = nk_svaddv_f32_(svptrue_b32(), ab_imag_f32x);
|
|
153
154
|
}
|
|
154
155
|
|
|
155
156
|
#if defined(__clang__)
|
|
@@ -34,6 +34,7 @@
|
|
|
34
34
|
#if NK_TARGET_SVESDOT
|
|
35
35
|
|
|
36
36
|
#include "numkong/types.h"
|
|
37
|
+
#include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
|
|
37
38
|
|
|
38
39
|
#if defined(__cplusplus)
|
|
39
40
|
extern "C" {
|
|
@@ -57,7 +58,7 @@ NK_PUBLIC void nk_dot_i8_svesdot(nk_i8_t const *a_scalars, nk_i8_t const *b_scal
|
|
|
57
58
|
sum_i32x = svdot_s32(sum_i32x, a_i8x, b_i8x);
|
|
58
59
|
idx_scalars += svcntb();
|
|
59
60
|
} while (idx_scalars < count_scalars);
|
|
60
|
-
*result = (nk_i32_t)
|
|
61
|
+
*result = (nk_i32_t)nk_svaddv_s32_(svptrue_b32(), sum_i32x);
|
|
61
62
|
}
|
|
62
63
|
|
|
63
64
|
NK_PUBLIC void nk_dot_u8_svesdot(nk_u8_t const *a_scalars, nk_u8_t const *b_scalars, nk_size_t count_scalars,
|
|
@@ -71,7 +72,7 @@ NK_PUBLIC void nk_dot_u8_svesdot(nk_u8_t const *a_scalars, nk_u8_t const *b_scal
|
|
|
71
72
|
sum_u32x = svdot_u32(sum_u32x, a_u8x, b_u8x);
|
|
72
73
|
idx_scalars += svcntb();
|
|
73
74
|
} while (idx_scalars < count_scalars);
|
|
74
|
-
*result = (nk_u32_t)
|
|
75
|
+
*result = (nk_u32_t)nk_svaddv_u32_(svptrue_b32(), sum_u32x);
|
|
75
76
|
}
|
|
76
77
|
|
|
77
78
|
#if defined(__clang__)
|