numkong 7.0.0 → 7.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +197 -124
- package/binding.gyp +34 -484
- package/c/dispatch_bf16.c +59 -1
- package/c/dispatch_e2m3.c +41 -8
- package/c/dispatch_e3m2.c +49 -8
- package/c/dispatch_e4m3.c +51 -9
- package/c/dispatch_e5m2.c +45 -1
- package/c/dispatch_f16.c +79 -26
- package/c/dispatch_f16c.c +5 -5
- package/c/dispatch_f32.c +56 -0
- package/c/dispatch_f64.c +52 -0
- package/c/dispatch_i4.c +3 -0
- package/c/dispatch_i8.c +62 -3
- package/c/dispatch_other.c +18 -0
- package/c/dispatch_u1.c +54 -9
- package/c/dispatch_u4.c +3 -0
- package/c/dispatch_u8.c +64 -3
- package/c/numkong.c +3 -0
- package/include/README.md +79 -9
- package/include/numkong/attention/sapphireamx.h +278 -276
- package/include/numkong/attention/sme.h +983 -977
- package/include/numkong/attention.h +1 -1
- package/include/numkong/capabilities.h +289 -94
- package/include/numkong/cast/README.md +40 -40
- package/include/numkong/cast/diamond.h +64 -0
- package/include/numkong/cast/haswell.h +42 -194
- package/include/numkong/cast/icelake.h +42 -37
- package/include/numkong/cast/loongsonasx.h +252 -0
- package/include/numkong/cast/neon.h +216 -249
- package/include/numkong/cast/powervsx.h +449 -0
- package/include/numkong/cast/rvv.h +223 -274
- package/include/numkong/cast/sapphire.h +18 -18
- package/include/numkong/cast/serial.h +1018 -944
- package/include/numkong/cast/skylake.h +82 -23
- package/include/numkong/cast/v128relaxed.h +462 -105
- package/include/numkong/cast.h +24 -0
- package/include/numkong/cast.hpp +44 -0
- package/include/numkong/curved/README.md +17 -17
- package/include/numkong/curved/neon.h +131 -7
- package/include/numkong/curved/neonbfdot.h +6 -7
- package/include/numkong/curved/rvv.h +26 -26
- package/include/numkong/curved/smef64.h +186 -182
- package/include/numkong/curved.h +14 -18
- package/include/numkong/dot/README.md +154 -137
- package/include/numkong/dot/alder.h +43 -43
- package/include/numkong/dot/diamond.h +158 -0
- package/include/numkong/dot/genoa.h +4 -30
- package/include/numkong/dot/haswell.h +215 -180
- package/include/numkong/dot/icelake.h +190 -76
- package/include/numkong/dot/loongsonasx.h +671 -0
- package/include/numkong/dot/neon.h +124 -73
- package/include/numkong/dot/neonbfdot.h +11 -12
- package/include/numkong/dot/neonfhm.h +44 -46
- package/include/numkong/dot/neonfp8.h +323 -0
- package/include/numkong/dot/neonsdot.h +190 -76
- package/include/numkong/dot/powervsx.h +752 -0
- package/include/numkong/dot/rvv.h +92 -84
- package/include/numkong/dot/rvvbf16.h +12 -12
- package/include/numkong/dot/rvvhalf.h +12 -12
- package/include/numkong/dot/sapphire.h +4 -4
- package/include/numkong/dot/serial.h +66 -30
- package/include/numkong/dot/sierra.h +31 -31
- package/include/numkong/dot/skylake.h +142 -110
- package/include/numkong/dot/sve.h +217 -177
- package/include/numkong/dot/svebfdot.h +10 -10
- package/include/numkong/dot/svehalf.h +85 -41
- package/include/numkong/dot/svesdot.h +89 -0
- package/include/numkong/dot/v128relaxed.h +124 -89
- package/include/numkong/dot.h +114 -48
- package/include/numkong/dots/README.md +203 -203
- package/include/numkong/dots/alder.h +12 -9
- package/include/numkong/dots/diamond.h +86 -0
- package/include/numkong/dots/genoa.h +10 -4
- package/include/numkong/dots/haswell.h +63 -48
- package/include/numkong/dots/icelake.h +27 -18
- package/include/numkong/dots/loongsonasx.h +176 -0
- package/include/numkong/dots/neon.h +14 -11
- package/include/numkong/dots/neonbfdot.h +4 -3
- package/include/numkong/dots/neonfhm.h +11 -9
- package/include/numkong/dots/neonfp8.h +99 -0
- package/include/numkong/dots/neonsdot.h +48 -12
- package/include/numkong/dots/powervsx.h +194 -0
- package/include/numkong/dots/rvv.h +451 -344
- package/include/numkong/dots/sapphireamx.h +1028 -984
- package/include/numkong/dots/serial.h +213 -197
- package/include/numkong/dots/sierra.h +10 -7
- package/include/numkong/dots/skylake.h +47 -36
- package/include/numkong/dots/sme.h +2001 -2364
- package/include/numkong/dots/smebi32.h +175 -162
- package/include/numkong/dots/smef64.h +328 -323
- package/include/numkong/dots/v128relaxed.h +64 -41
- package/include/numkong/dots.h +573 -293
- package/include/numkong/dots.hpp +45 -43
- package/include/numkong/each/README.md +133 -137
- package/include/numkong/each/haswell.h +6 -6
- package/include/numkong/each/icelake.h +7 -7
- package/include/numkong/each/neon.h +76 -42
- package/include/numkong/each/neonbfdot.h +11 -12
- package/include/numkong/each/neonhalf.h +24 -116
- package/include/numkong/each/rvv.h +28 -28
- package/include/numkong/each/sapphire.h +27 -161
- package/include/numkong/each/serial.h +6 -6
- package/include/numkong/each/skylake.h +7 -7
- package/include/numkong/each/v128relaxed.h +562 -0
- package/include/numkong/each.h +148 -62
- package/include/numkong/each.hpp +2 -2
- package/include/numkong/geospatial/README.md +18 -18
- package/include/numkong/geospatial/haswell.h +365 -325
- package/include/numkong/geospatial/neon.h +350 -306
- package/include/numkong/geospatial/rvv.h +4 -4
- package/include/numkong/geospatial/skylake.h +376 -340
- package/include/numkong/geospatial/v128relaxed.h +366 -327
- package/include/numkong/geospatial.h +17 -17
- package/include/numkong/matrix.hpp +4 -4
- package/include/numkong/maxsim/README.md +14 -14
- package/include/numkong/maxsim/alder.h +6 -6
- package/include/numkong/maxsim/genoa.h +4 -4
- package/include/numkong/maxsim/haswell.h +6 -6
- package/include/numkong/maxsim/icelake.h +18 -18
- package/include/numkong/maxsim/neonsdot.h +21 -21
- package/include/numkong/maxsim/sapphireamx.h +14 -14
- package/include/numkong/maxsim/serial.h +6 -6
- package/include/numkong/maxsim/sme.h +221 -196
- package/include/numkong/maxsim/v128relaxed.h +6 -6
- package/include/numkong/mesh/README.md +62 -56
- package/include/numkong/mesh/haswell.h +339 -464
- package/include/numkong/mesh/neon.h +1100 -519
- package/include/numkong/mesh/neonbfdot.h +36 -68
- package/include/numkong/mesh/rvv.h +530 -435
- package/include/numkong/mesh/serial.h +75 -91
- package/include/numkong/mesh/skylake.h +1627 -302
- package/include/numkong/mesh/v128relaxed.h +443 -330
- package/include/numkong/mesh.h +63 -49
- package/include/numkong/mesh.hpp +4 -4
- package/include/numkong/numkong.h +3 -3
- package/include/numkong/numkong.hpp +1 -0
- package/include/numkong/probability/README.md +23 -19
- package/include/numkong/probability/neon.h +82 -52
- package/include/numkong/probability/rvv.h +28 -23
- package/include/numkong/probability/serial.h +51 -39
- package/include/numkong/probability.h +20 -23
- package/include/numkong/random.h +1 -1
- package/include/numkong/reduce/README.md +143 -138
- package/include/numkong/reduce/alder.h +81 -77
- package/include/numkong/reduce/haswell.h +222 -220
- package/include/numkong/reduce/neon.h +629 -519
- package/include/numkong/reduce/neonbfdot.h +7 -218
- package/include/numkong/reduce/neonfhm.h +9 -381
- package/include/numkong/reduce/neonsdot.h +9 -9
- package/include/numkong/reduce/rvv.h +928 -802
- package/include/numkong/reduce/serial.h +23 -27
- package/include/numkong/reduce/sierra.h +20 -20
- package/include/numkong/reduce/skylake.h +326 -324
- package/include/numkong/reduce/v128relaxed.h +52 -52
- package/include/numkong/reduce.h +4 -23
- package/include/numkong/reduce.hpp +156 -11
- package/include/numkong/scalar/README.md +6 -6
- package/include/numkong/scalar/haswell.h +26 -17
- package/include/numkong/scalar/loongsonasx.h +74 -0
- package/include/numkong/scalar/neon.h +9 -9
- package/include/numkong/scalar/powervsx.h +96 -0
- package/include/numkong/scalar/rvv.h +2 -2
- package/include/numkong/scalar/sapphire.h +21 -10
- package/include/numkong/scalar/serial.h +21 -21
- package/include/numkong/scalar.h +13 -0
- package/include/numkong/set/README.md +28 -28
- package/include/numkong/set/haswell.h +12 -12
- package/include/numkong/set/icelake.h +14 -14
- package/include/numkong/set/loongsonasx.h +181 -0
- package/include/numkong/set/neon.h +17 -18
- package/include/numkong/set/powervsx.h +326 -0
- package/include/numkong/set/rvv.h +4 -4
- package/include/numkong/set/serial.h +6 -6
- package/include/numkong/set/sve.h +60 -59
- package/include/numkong/set/v128relaxed.h +6 -6
- package/include/numkong/set.h +21 -7
- package/include/numkong/sets/README.md +26 -26
- package/include/numkong/sets/loongsonasx.h +52 -0
- package/include/numkong/sets/powervsx.h +65 -0
- package/include/numkong/sets/smebi32.h +395 -364
- package/include/numkong/sets.h +83 -40
- package/include/numkong/sparse/README.md +4 -4
- package/include/numkong/sparse/icelake.h +101 -101
- package/include/numkong/sparse/serial.h +1 -1
- package/include/numkong/sparse/sve2.h +137 -141
- package/include/numkong/sparse/turin.h +12 -12
- package/include/numkong/sparse.h +10 -10
- package/include/numkong/spatial/README.md +230 -226
- package/include/numkong/spatial/alder.h +113 -116
- package/include/numkong/spatial/diamond.h +240 -0
- package/include/numkong/spatial/genoa.h +0 -68
- package/include/numkong/spatial/haswell.h +74 -55
- package/include/numkong/spatial/icelake.h +539 -58
- package/include/numkong/spatial/loongsonasx.h +483 -0
- package/include/numkong/spatial/neon.h +125 -52
- package/include/numkong/spatial/neonbfdot.h +8 -9
- package/include/numkong/spatial/neonfp8.h +258 -0
- package/include/numkong/spatial/neonsdot.h +180 -12
- package/include/numkong/spatial/powervsx.h +738 -0
- package/include/numkong/spatial/rvv.h +146 -139
- package/include/numkong/spatial/rvvbf16.h +17 -12
- package/include/numkong/spatial/rvvhalf.h +13 -10
- package/include/numkong/spatial/serial.h +13 -12
- package/include/numkong/spatial/sierra.h +232 -39
- package/include/numkong/spatial/skylake.h +73 -74
- package/include/numkong/spatial/sve.h +93 -72
- package/include/numkong/spatial/svebfdot.h +29 -29
- package/include/numkong/spatial/svehalf.h +52 -26
- package/include/numkong/spatial/svesdot.h +142 -0
- package/include/numkong/spatial/v128relaxed.h +293 -41
- package/include/numkong/spatial.h +338 -82
- package/include/numkong/spatials/README.md +194 -194
- package/include/numkong/spatials/diamond.h +82 -0
- package/include/numkong/spatials/haswell.h +2 -2
- package/include/numkong/spatials/loongsonasx.h +153 -0
- package/include/numkong/spatials/neonfp8.h +111 -0
- package/include/numkong/spatials/neonsdot.h +34 -0
- package/include/numkong/spatials/powervsx.h +153 -0
- package/include/numkong/spatials/rvv.h +259 -243
- package/include/numkong/spatials/sapphireamx.h +173 -173
- package/include/numkong/spatials/serial.h +2 -2
- package/include/numkong/spatials/skylake.h +2 -2
- package/include/numkong/spatials/sme.h +590 -605
- package/include/numkong/spatials/smef64.h +139 -130
- package/include/numkong/spatials/v128relaxed.h +2 -2
- package/include/numkong/spatials.h +820 -500
- package/include/numkong/spatials.hpp +49 -48
- package/include/numkong/tensor.hpp +406 -17
- package/include/numkong/trigonometry/README.md +19 -19
- package/include/numkong/trigonometry/haswell.h +402 -401
- package/include/numkong/trigonometry/neon.h +386 -387
- package/include/numkong/trigonometry/rvv.h +52 -51
- package/include/numkong/trigonometry/serial.h +13 -13
- package/include/numkong/trigonometry/skylake.h +373 -369
- package/include/numkong/trigonometry/v128relaxed.h +375 -374
- package/include/numkong/trigonometry.h +13 -13
- package/include/numkong/trigonometry.hpp +2 -2
- package/include/numkong/types.h +287 -49
- package/include/numkong/types.hpp +436 -12
- package/include/numkong/vector.hpp +82 -14
- package/javascript/dist/cjs/numkong-wasm.js +6 -12
- package/javascript/dist/cjs/numkong.d.ts +7 -1
- package/javascript/dist/cjs/numkong.js +37 -11
- package/javascript/dist/cjs/types.d.ts +9 -0
- package/javascript/dist/cjs/types.js +96 -0
- package/javascript/dist/esm/numkong-browser.d.ts +14 -0
- package/javascript/dist/esm/numkong-browser.js +23 -0
- package/javascript/dist/esm/numkong-wasm.js +6 -12
- package/javascript/dist/esm/numkong.d.ts +7 -1
- package/javascript/dist/esm/numkong.js +37 -11
- package/javascript/dist/esm/types.d.ts +9 -0
- package/javascript/dist/esm/types.js +96 -0
- package/javascript/node-gyp-build.d.ts +4 -1
- package/javascript/numkong-browser.ts +40 -0
- package/javascript/numkong-wasm.ts +7 -13
- package/javascript/numkong.c +5 -26
- package/javascript/numkong.ts +36 -11
- package/javascript/tsconfig-base.json +1 -0
- package/javascript/tsconfig-cjs.json +6 -1
- package/javascript/types.ts +110 -0
- package/numkong.gypi +101 -0
- package/package.json +34 -13
- package/probes/arm_neon.c +8 -0
- package/probes/arm_neon_bfdot.c +9 -0
- package/probes/arm_neon_fhm.c +9 -0
- package/probes/arm_neon_half.c +8 -0
- package/probes/arm_neon_sdot.c +9 -0
- package/probes/arm_neonfp8.c +9 -0
- package/probes/arm_sme.c +16 -0
- package/probes/arm_sme2.c +16 -0
- package/probes/arm_sme2p1.c +16 -0
- package/probes/arm_sme_bf16.c +16 -0
- package/probes/arm_sme_bi32.c +16 -0
- package/probes/arm_sme_f64.c +16 -0
- package/probes/arm_sme_fa64.c +14 -0
- package/probes/arm_sme_half.c +16 -0
- package/probes/arm_sme_lut2.c +15 -0
- package/probes/arm_sve.c +18 -0
- package/probes/arm_sve2.c +20 -0
- package/probes/arm_sve2p1.c +18 -0
- package/probes/arm_sve_bfdot.c +20 -0
- package/probes/arm_sve_half.c +18 -0
- package/probes/arm_sve_sdot.c +21 -0
- package/probes/loongarch_lasx.c +12 -0
- package/probes/power_vsx.c +12 -0
- package/probes/probe.js +127 -0
- package/probes/riscv_rvv.c +14 -0
- package/probes/riscv_rvv_bb.c +15 -0
- package/probes/riscv_rvv_bf16.c +17 -0
- package/probes/riscv_rvv_half.c +14 -0
- package/probes/wasm_v128relaxed.c +11 -0
- package/probes/x86_alder.c +17 -0
- package/probes/x86_diamond.c +17 -0
- package/probes/x86_genoa.c +17 -0
- package/probes/x86_graniteamx.c +19 -0
- package/probes/x86_haswell.c +11 -0
- package/probes/x86_icelake.c +17 -0
- package/probes/x86_sapphire.c +16 -0
- package/probes/x86_sapphireamx.c +18 -0
- package/probes/x86_sierra.c +17 -0
- package/probes/x86_skylake.c +15 -0
- package/probes/x86_turin.c +17 -0
- package/wasm/numkong-emscripten.js +2 -0
- package/wasm/numkong.d.ts +14 -0
- package/wasm/numkong.js +1124 -0
- package/wasm/numkong.wasm +0 -0
- package/include/numkong/curved/neonhalf.h +0 -212
- package/include/numkong/dot/neonhalf.h +0 -198
- package/include/numkong/dots/neonhalf.h +0 -57
- package/include/numkong/mesh/neonhalf.h +0 -616
- package/include/numkong/reduce/neonhalf.h +0 -157
- package/include/numkong/spatial/neonhalf.h +0 -118
- package/include/numkong/spatial/sapphire.h +0 -343
- package/include/numkong/spatials/neonhalf.h +0 -58
- package/javascript/README.md +0 -246
package/include/numkong/dot.h
CHANGED
|
@@ -73,14 +73,14 @@
|
|
|
73
73
|
* BF16 dot products (VDPBF16PS) are Genoa-only, accumulating bf16 pairs directly to f32.
|
|
74
74
|
* Genoa shows 40% faster integer multiply-add (3c vs 5c) than Ice Lake.
|
|
75
75
|
*
|
|
76
|
-
* Intrinsic
|
|
77
|
-
* _mm256_fmadd_ps
|
|
78
|
-
* _mm256_fmadd_pd
|
|
79
|
-
* _mm256_maddubs_epi16
|
|
80
|
-
* _mm256_madd_epi16
|
|
81
|
-
* _mm256_dpbusd_epi32
|
|
82
|
-
* _mm512_dpwssd_epi32
|
|
83
|
-
* _mm512_dpbf16_ps
|
|
76
|
+
* Intrinsic Instruction Haswell Icelake Genoa
|
|
77
|
+
* _mm256_fmadd_ps VFMADD231PS (YMM, YMM, YMM) 5cy @ p01 4cy @ p01 4cy @ p01
|
|
78
|
+
* _mm256_fmadd_pd VFMADD231PD (YMM, YMM, YMM) 5cy @ p01 4cy @ p01 4cy @ p01
|
|
79
|
+
* _mm256_maddubs_epi16 VPMADDUBSW (YMM, YMM, YMM) 5cy @ p0 5cy @ p01 3cy @ p01
|
|
80
|
+
* _mm256_madd_epi16 VPMADDWD (YMM, YMM, YMM) 5cy @ p0 5cy @ p01 3cy @ p01
|
|
81
|
+
* _mm256_dpbusd_epi32 VPDPBUSD (YMM, YMM, YMM) n/a 5cy @ p01 4cy @ p01
|
|
82
|
+
* _mm512_dpwssd_epi32 VPDPWSSD (ZMM, ZMM, ZMM) n/a 5cy @ p0 4cy @ p01
|
|
83
|
+
* _mm512_dpbf16_ps VDPBF16PS (ZMM, ZMM, ZMM) n/a n/a 6cy @ p01
|
|
84
84
|
*
|
|
85
85
|
* @section arm_neon_instructions Relevant ARM NEON Instructions
|
|
86
86
|
*
|
|
@@ -89,13 +89,13 @@
|
|
|
89
89
|
* provides native bf16 dot products on Graviton 3+. Complex dot products use LD2 for deinterleaved
|
|
90
90
|
* loads of real/imag pairs, though its L01+V throughput can bottleneck on memory-bound workloads.
|
|
91
91
|
*
|
|
92
|
-
* Intrinsic
|
|
93
|
-
* vfmaq_f32
|
|
94
|
-
* vfmaq_f64
|
|
95
|
-
* vdotq_s32
|
|
96
|
-
* vdotq_u32
|
|
97
|
-
* vbfdotq_f32
|
|
98
|
-
* vld2q_f32
|
|
92
|
+
* Intrinsic Instruction M1 Firestorm Graviton 3 Graviton 4
|
|
93
|
+
* vfmaq_f32 FMLA.S (vec) 4cy @ V0123 4cy @ V0123 4cy @ V0123
|
|
94
|
+
* vfmaq_f64 FMLA.D (vec) 4cy @ V0123 4cy @ V0123 4cy @ V0123
|
|
95
|
+
* vdotq_s32 SDOT (vec) 3cy @ V0123 3cy @ V0123 3cy @ V0123
|
|
96
|
+
* vdotq_u32 UDOT (vec) 3cy @ V0123 3cy @ V0123 3cy @ V0123
|
|
97
|
+
* vbfdotq_f32 BFDOT (vec) N/A 4cy @ V0123 5cy @ V0123
|
|
98
|
+
* vld2q_f32 LD2 (Q-form) 5cy @ L01+V 8cy @ L01+V 8cy @ L01+V
|
|
99
99
|
*
|
|
100
100
|
* @section arm_sve_instructions Relevant ARM SVE Instructions
|
|
101
101
|
*
|
|
@@ -103,12 +103,12 @@
|
|
|
103
103
|
* scalar cleanup loops. FADDV performs horizontal reduction; notably 45% faster on Graviton 4
|
|
104
104
|
* (6c) than Graviton 3 (11c). SVE complex dot products use svld2 for structure loads.
|
|
105
105
|
*
|
|
106
|
-
* Intrinsic
|
|
107
|
-
* svmla_f32_x
|
|
108
|
-
* svmls_f32_x
|
|
109
|
-
* svwhilelt_b32
|
|
110
|
-
* svld2_f32
|
|
111
|
-
* svaddv_f32
|
|
106
|
+
* Intrinsic Instruction Graviton 3 Graviton 4
|
|
107
|
+
* svmla_f32_x FMLA (pred) 4cy @ V0123 4cy @ V0123
|
|
108
|
+
* svmls_f32_x FMLS (pred) 4cy @ V0123 4cy @ V0123
|
|
109
|
+
* svwhilelt_b32 WHILELT 3cy @ M0 3cy @ M0
|
|
110
|
+
* svld2_f32 LD2 (SVE) 8cy @ L01+V 8cy @ L01+V
|
|
111
|
+
* svaddv_f32 FADDV 11cy @ V0123 6cy @ V0123
|
|
112
112
|
*
|
|
113
113
|
* @section complex_instructions Complex Number Optimizations
|
|
114
114
|
*
|
|
@@ -121,7 +121,7 @@
|
|
|
121
121
|
*
|
|
122
122
|
* @section references References
|
|
123
123
|
*
|
|
124
|
-
* - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/
|
|
124
|
+
* - x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
|
|
125
125
|
* - Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/
|
|
126
126
|
*
|
|
127
127
|
*/
|
|
@@ -293,16 +293,12 @@ NK_PUBLIC void nk_dot_u1_neon(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t
|
|
|
293
293
|
/** @copydoc nk_dot_f16 */
|
|
294
294
|
NK_PUBLIC void nk_dot_f16_neon(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
|
|
295
295
|
|
|
296
|
-
#endif // NK_TARGET_NEON
|
|
297
|
-
|
|
298
|
-
#if NK_TARGET_NEONHALF
|
|
299
|
-
/** @copydoc nk_dot_f16 */
|
|
300
|
-
NK_PUBLIC void nk_dot_f16_neonhalf(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
|
|
301
296
|
/** @copydoc nk_dot_f16c */
|
|
302
|
-
NK_PUBLIC void
|
|
297
|
+
NK_PUBLIC void nk_dot_f16c_neon(nk_f16c_t const *a, nk_f16c_t const *b, nk_size_t n, nk_f32c_t *result);
|
|
303
298
|
/** @copydoc nk_vdot_f16c */
|
|
304
|
-
NK_PUBLIC void
|
|
305
|
-
|
|
299
|
+
NK_PUBLIC void nk_vdot_f16c_neon(nk_f16c_t const *a, nk_f16c_t const *b, nk_size_t n, nk_f32c_t *result);
|
|
300
|
+
|
|
301
|
+
#endif // NK_TARGET_NEON
|
|
306
302
|
|
|
307
303
|
#if NK_TARGET_NEONFHM
|
|
308
304
|
/** @copydoc nk_dot_f16 */
|
|
@@ -332,6 +328,13 @@ NK_PUBLIC void nk_dot_e2m3_neonsdot(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_s
|
|
|
332
328
|
NK_PUBLIC void nk_dot_e3m2_neonsdot(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
333
329
|
#endif // NK_TARGET_NEONSDOT
|
|
334
330
|
|
|
331
|
+
#if NK_TARGET_SVESDOT
|
|
332
|
+
/** @copydoc nk_dot_i8 */
|
|
333
|
+
NK_PUBLIC void nk_dot_i8_svesdot(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_i32_t *result);
|
|
334
|
+
/** @copydoc nk_dot_u8 */
|
|
335
|
+
NK_PUBLIC void nk_dot_u8_svesdot(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result);
|
|
336
|
+
#endif // NK_TARGET_SVESDOT
|
|
337
|
+
|
|
335
338
|
#if NK_TARGET_NEONBFDOT
|
|
336
339
|
/** @copydoc nk_dot_bf16 */
|
|
337
340
|
NK_PUBLIC void nk_dot_bf16_neonbfdot(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result);
|
|
@@ -345,10 +348,16 @@ NK_PUBLIC void nk_dot_bf16c_neonbfdot(nk_bf16c_t const *a, nk_bf16c_t const *b,
|
|
|
345
348
|
NK_PUBLIC void nk_vdot_bf16c_neonbfdot(nk_bf16c_t const *a, nk_bf16c_t const *b, nk_size_t n, nk_f32c_t *result);
|
|
346
349
|
#endif // NK_TARGET_NEONBFDOT
|
|
347
350
|
|
|
348
|
-
#if
|
|
349
|
-
/** @copydoc
|
|
350
|
-
NK_PUBLIC void
|
|
351
|
-
|
|
351
|
+
#if NK_TARGET_NEONFP8
|
|
352
|
+
/** @copydoc nk_dot_e4m3 */
|
|
353
|
+
NK_PUBLIC void nk_dot_e4m3_neonfp8(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
354
|
+
/** @copydoc nk_dot_e5m2 */
|
|
355
|
+
NK_PUBLIC void nk_dot_e5m2_neonfp8(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
356
|
+
/** @copydoc nk_dot_e2m3 */
|
|
357
|
+
NK_PUBLIC void nk_dot_e2m3_neonfp8(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
358
|
+
/** @copydoc nk_dot_e3m2 */
|
|
359
|
+
NK_PUBLIC void nk_dot_e3m2_neonfp8(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
360
|
+
#endif // NK_TARGET_NEONFP8
|
|
352
361
|
|
|
353
362
|
#if NK_TARGET_SVE
|
|
354
363
|
/** @copydoc nk_dot_f32 */
|
|
@@ -374,6 +383,10 @@ NK_PUBLIC void nk_dot_f16c_svehalf(nk_f16c_t const *a, nk_f16c_t const *b, nk_si
|
|
|
374
383
|
NK_PUBLIC void nk_vdot_f16c_svehalf(nk_f16c_t const *a, nk_f16c_t const *b, nk_size_t n, nk_f32c_t *result);
|
|
375
384
|
#endif // NK_TARGET_SVEHALF
|
|
376
385
|
|
|
386
|
+
#if NK_TARGET_SVEBFDOT
|
|
387
|
+
/** @copydoc nk_dot_bf16 */
|
|
388
|
+
NK_PUBLIC void nk_dot_bf16_svebfdot(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result);
|
|
389
|
+
#endif // NK_TARGET_SVEBFDOT
|
|
377
390
|
#if NK_TARGET_HASWELL
|
|
378
391
|
/** @copydoc nk_dot_f32 */
|
|
379
392
|
NK_PUBLIC void nk_dot_f32_haswell(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t *result);
|
|
@@ -474,6 +487,8 @@ NK_PUBLIC void nk_dot_e2m3_icelake(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_si
|
|
|
474
487
|
NK_PUBLIC void nk_dot_e3m2_icelake(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
475
488
|
/** @copydoc nk_dot_u1 */
|
|
476
489
|
NK_PUBLIC void nk_dot_u1_icelake(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n_bits, nk_u32_t *result);
|
|
490
|
+
/** @copydoc nk_dot_e4m3 */
|
|
491
|
+
NK_PUBLIC void nk_dot_e4m3_icelake(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
477
492
|
#endif // NK_TARGET_ICELAKE
|
|
478
493
|
|
|
479
494
|
#if NK_TARGET_GENOA
|
|
@@ -484,12 +499,19 @@ NK_PUBLIC void nk_dot_bf16c_genoa(nk_bf16c_t const *a, nk_bf16c_t const *b, nk_s
|
|
|
484
499
|
/** @copydoc nk_vdot_bf16c */
|
|
485
500
|
NK_PUBLIC void nk_vdot_bf16c_genoa(nk_bf16c_t const *a, nk_bf16c_t const *b, nk_size_t n, nk_f32c_t *result);
|
|
486
501
|
|
|
487
|
-
/** @copydoc nk_dot_e4m3 */
|
|
488
|
-
NK_PUBLIC void nk_dot_e4m3_genoa(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
489
502
|
/** @copydoc nk_dot_e5m2 */
|
|
490
503
|
NK_PUBLIC void nk_dot_e5m2_genoa(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
491
504
|
#endif // NK_TARGET_GENOA
|
|
492
505
|
|
|
506
|
+
#if NK_TARGET_DIAMOND
|
|
507
|
+
/** @copydoc nk_dot_f16 */
|
|
508
|
+
NK_PUBLIC void nk_dot_f16_diamond(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result);
|
|
509
|
+
/** @copydoc nk_dot_e4m3 */
|
|
510
|
+
NK_PUBLIC void nk_dot_e4m3_diamond(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result);
|
|
511
|
+
/** @copydoc nk_dot_e5m2 */
|
|
512
|
+
NK_PUBLIC void nk_dot_e5m2_diamond(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result);
|
|
513
|
+
#endif // NK_TARGET_DIAMOND
|
|
514
|
+
|
|
493
515
|
#if NK_TARGET_ALDER
|
|
494
516
|
/** @copydoc nk_dot_i8 */
|
|
495
517
|
NK_PUBLIC void nk_dot_i8_alder(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_i32_t *result);
|
|
@@ -638,16 +660,18 @@ NK_INTERNAL nk_dtype_t nk_dot_output_dtype(nk_dtype_t dtype) {
|
|
|
638
660
|
#include "numkong/dot/serial.h"
|
|
639
661
|
#include "numkong/dot/neon.h"
|
|
640
662
|
#include "numkong/dot/neonsdot.h"
|
|
641
|
-
#include "numkong/dot/neonhalf.h"
|
|
642
663
|
#include "numkong/dot/neonfhm.h"
|
|
643
664
|
#include "numkong/dot/neonbfdot.h"
|
|
665
|
+
#include "numkong/dot/neonfp8.h"
|
|
644
666
|
#include "numkong/dot/sve.h"
|
|
645
667
|
#include "numkong/dot/svehalf.h"
|
|
646
668
|
#include "numkong/dot/svebfdot.h"
|
|
669
|
+
#include "numkong/dot/svesdot.h"
|
|
647
670
|
#include "numkong/dot/haswell.h"
|
|
648
671
|
#include "numkong/dot/skylake.h"
|
|
649
672
|
#include "numkong/dot/icelake.h"
|
|
650
673
|
#include "numkong/dot/genoa.h"
|
|
674
|
+
#include "numkong/dot/diamond.h"
|
|
651
675
|
#include "numkong/dot/sapphire.h"
|
|
652
676
|
#include "numkong/dot/alder.h"
|
|
653
677
|
#include "numkong/dot/sierra.h"
|
|
@@ -655,7 +679,9 @@ NK_INTERNAL nk_dtype_t nk_dot_output_dtype(nk_dtype_t dtype) {
|
|
|
655
679
|
#include "numkong/dot/rvvbb.h"
|
|
656
680
|
#include "numkong/dot/rvvhalf.h"
|
|
657
681
|
#include "numkong/dot/rvvbf16.h"
|
|
682
|
+
#include "numkong/dot/powervsx.h"
|
|
658
683
|
#include "numkong/dot/v128relaxed.h"
|
|
684
|
+
#include "numkong/dot/loongsonasx.h"
|
|
659
685
|
|
|
660
686
|
#if defined(__cplusplus)
|
|
661
687
|
extern "C" {
|
|
@@ -666,8 +692,14 @@ extern "C" {
|
|
|
666
692
|
NK_PUBLIC void nk_dot_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_i32_t *result) {
|
|
667
693
|
#if NK_TARGET_V128RELAXED
|
|
668
694
|
nk_dot_i8_v128relaxed(a, b, n, result);
|
|
695
|
+
#elif NK_TARGET_POWERVSX
|
|
696
|
+
nk_dot_i8_powervsx(a, b, n, result);
|
|
697
|
+
#elif NK_TARGET_LOONGSONASX
|
|
698
|
+
nk_dot_i8_loongsonasx(a, b, n, result);
|
|
669
699
|
#elif NK_TARGET_RVV
|
|
670
700
|
nk_dot_i8_rvv(a, b, n, result);
|
|
701
|
+
#elif NK_TARGET_SVESDOT
|
|
702
|
+
nk_dot_i8_svesdot(a, b, n, result);
|
|
671
703
|
#elif NK_TARGET_NEONSDOT
|
|
672
704
|
nk_dot_i8_neonsdot(a, b, n, result);
|
|
673
705
|
#elif NK_TARGET_ICELAKE
|
|
@@ -688,8 +720,14 @@ NK_PUBLIC void nk_dot_i8(nk_i8_t const *a, nk_i8_t const *b, nk_size_t n, nk_i32
|
|
|
688
720
|
NK_PUBLIC void nk_dot_u8(nk_u8_t const *a, nk_u8_t const *b, nk_size_t n, nk_u32_t *result) {
|
|
689
721
|
#if NK_TARGET_V128RELAXED
|
|
690
722
|
nk_dot_u8_v128relaxed(a, b, n, result);
|
|
723
|
+
#elif NK_TARGET_POWERVSX
|
|
724
|
+
nk_dot_u8_powervsx(a, b, n, result);
|
|
725
|
+
#elif NK_TARGET_LOONGSONASX
|
|
726
|
+
nk_dot_u8_loongsonasx(a, b, n, result);
|
|
691
727
|
#elif NK_TARGET_RVV
|
|
692
728
|
nk_dot_u8_rvv(a, b, n, result);
|
|
729
|
+
#elif NK_TARGET_SVESDOT
|
|
730
|
+
nk_dot_u8_svesdot(a, b, n, result);
|
|
693
731
|
#elif NK_TARGET_NEONSDOT
|
|
694
732
|
nk_dot_u8_neonsdot(a, b, n, result);
|
|
695
733
|
#elif NK_TARGET_ICELAKE
|
|
@@ -746,6 +784,8 @@ NK_PUBLIC void nk_dot_u1(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n_bit
|
|
|
746
784
|
nk_dot_u1_haswell(a, b, n_bits, result);
|
|
747
785
|
#elif NK_TARGET_V128RELAXED
|
|
748
786
|
nk_dot_u1_v128relaxed(a, b, n_bits, result);
|
|
787
|
+
#elif NK_TARGET_POWERVSX
|
|
788
|
+
nk_dot_u1_powervsx(a, b, n_bits, result);
|
|
749
789
|
#elif NK_TARGET_RVVBB
|
|
750
790
|
nk_dot_u1_rvvbb(a, b, n_bits, result);
|
|
751
791
|
#elif NK_TARGET_RVV
|
|
@@ -760,6 +800,8 @@ NK_PUBLIC void nk_dot_u1(nk_u1x8_t const *a, nk_u1x8_t const *b, nk_size_t n_bit
|
|
|
760
800
|
NK_PUBLIC void nk_dot_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
761
801
|
#if NK_TARGET_V128RELAXED
|
|
762
802
|
nk_dot_f16_v128relaxed(a, b, n, result);
|
|
803
|
+
#elif NK_TARGET_POWERVSX
|
|
804
|
+
nk_dot_f16_powervsx(a, b, n, result);
|
|
763
805
|
#elif NK_TARGET_RVVHALF
|
|
764
806
|
nk_dot_f16_rvvhalf(a, b, n, result);
|
|
765
807
|
#elif NK_TARGET_RVV
|
|
@@ -768,10 +810,10 @@ NK_PUBLIC void nk_dot_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_
|
|
|
768
810
|
nk_dot_f16_svehalf(a, b, n, result);
|
|
769
811
|
#elif NK_TARGET_NEONFHM
|
|
770
812
|
nk_dot_f16_neonfhm(a, b, n, result);
|
|
771
|
-
#elif NK_TARGET_NEONHALF
|
|
772
|
-
nk_dot_f16_neonhalf(a, b, n, result);
|
|
773
813
|
#elif NK_TARGET_NEON
|
|
774
814
|
nk_dot_f16_neon(a, b, n, result);
|
|
815
|
+
#elif NK_TARGET_DIAMOND
|
|
816
|
+
nk_dot_f16_diamond(a, b, n, result);
|
|
775
817
|
#elif NK_TARGET_SKYLAKE
|
|
776
818
|
nk_dot_f16_skylake(a, b, n, result);
|
|
777
819
|
#elif NK_TARGET_HASWELL
|
|
@@ -784,6 +826,10 @@ NK_PUBLIC void nk_dot_f16(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_
|
|
|
784
826
|
NK_PUBLIC void nk_dot_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
785
827
|
#if NK_TARGET_V128RELAXED
|
|
786
828
|
nk_dot_bf16_v128relaxed(a, b, n, result);
|
|
829
|
+
#elif NK_TARGET_POWERVSX
|
|
830
|
+
nk_dot_bf16_powervsx(a, b, n, result);
|
|
831
|
+
#elif NK_TARGET_LOONGSONASX
|
|
832
|
+
nk_dot_bf16_loongsonasx(a, b, n, result);
|
|
787
833
|
#elif NK_TARGET_GENOA
|
|
788
834
|
nk_dot_bf16_genoa(a, b, n, result);
|
|
789
835
|
#elif NK_TARGET_RVVBF16
|
|
@@ -806,8 +852,12 @@ NK_PUBLIC void nk_dot_bf16(nk_bf16_t const *a, nk_bf16_t const *b, nk_size_t n,
|
|
|
806
852
|
}
|
|
807
853
|
|
|
808
854
|
NK_PUBLIC void nk_dot_e4m3(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
809
|
-
#if
|
|
810
|
-
|
|
855
|
+
#if NK_TARGET_DIAMOND
|
|
856
|
+
nk_dot_e4m3_diamond(a, b, n, result);
|
|
857
|
+
#elif NK_TARGET_ICELAKE
|
|
858
|
+
nk_dot_e4m3_icelake(a, b, n, result);
|
|
859
|
+
#elif NK_TARGET_NEONFP8
|
|
860
|
+
nk_dot_e4m3_neonfp8(a, b, n, result);
|
|
811
861
|
#elif NK_TARGET_NEONBFDOT
|
|
812
862
|
nk_dot_e4m3_neonbfdot(a, b, n, result);
|
|
813
863
|
#elif NK_TARGET_NEONFHM
|
|
@@ -832,8 +882,12 @@ NK_PUBLIC void nk_dot_e4m3(nk_e4m3_t const *a, nk_e4m3_t const *b, nk_size_t n,
|
|
|
832
882
|
}
|
|
833
883
|
|
|
834
884
|
NK_PUBLIC void nk_dot_e5m2(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
835
|
-
#if
|
|
885
|
+
#if NK_TARGET_DIAMOND
|
|
886
|
+
nk_dot_e5m2_diamond(a, b, n, result);
|
|
887
|
+
#elif NK_TARGET_GENOA
|
|
836
888
|
nk_dot_e5m2_genoa(a, b, n, result);
|
|
889
|
+
#elif NK_TARGET_NEONFP8
|
|
890
|
+
nk_dot_e5m2_neonfp8(a, b, n, result);
|
|
837
891
|
#elif NK_TARGET_NEONBFDOT
|
|
838
892
|
nk_dot_e5m2_neonbfdot(a, b, n, result);
|
|
839
893
|
#elif NK_TARGET_NEONFHM
|
|
@@ -858,7 +912,9 @@ NK_PUBLIC void nk_dot_e5m2(nk_e5m2_t const *a, nk_e5m2_t const *b, nk_size_t n,
|
|
|
858
912
|
}
|
|
859
913
|
|
|
860
914
|
NK_PUBLIC void nk_dot_e2m3(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
861
|
-
#if
|
|
915
|
+
#if NK_TARGET_NEONFP8
|
|
916
|
+
nk_dot_e2m3_neonfp8(a, b, n, result);
|
|
917
|
+
#elif NK_TARGET_ICELAKE
|
|
862
918
|
nk_dot_e2m3_icelake(a, b, n, result);
|
|
863
919
|
#elif NK_TARGET_SKYLAKE
|
|
864
920
|
nk_dot_e2m3_skylake(a, b, n, result);
|
|
@@ -882,7 +938,9 @@ NK_PUBLIC void nk_dot_e2m3(nk_e2m3_t const *a, nk_e2m3_t const *b, nk_size_t n,
|
|
|
882
938
|
}
|
|
883
939
|
|
|
884
940
|
NK_PUBLIC void nk_dot_e3m2(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n, nk_f32_t *result) {
|
|
885
|
-
#if
|
|
941
|
+
#if NK_TARGET_NEONFP8
|
|
942
|
+
nk_dot_e3m2_neonfp8(a, b, n, result);
|
|
943
|
+
#elif NK_TARGET_ICELAKE
|
|
886
944
|
nk_dot_e3m2_icelake(a, b, n, result);
|
|
887
945
|
#elif NK_TARGET_NEONSDOT
|
|
888
946
|
nk_dot_e3m2_neonsdot(a, b, n, result);
|
|
@@ -904,6 +962,10 @@ NK_PUBLIC void nk_dot_e3m2(nk_e3m2_t const *a, nk_e3m2_t const *b, nk_size_t n,
|
|
|
904
962
|
NK_PUBLIC void nk_dot_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_f64_t *result) {
|
|
905
963
|
#if NK_TARGET_V128RELAXED
|
|
906
964
|
nk_dot_f32_v128relaxed(a, b, n, result);
|
|
965
|
+
#elif NK_TARGET_POWERVSX
|
|
966
|
+
nk_dot_f32_powervsx(a, b, n, result);
|
|
967
|
+
#elif NK_TARGET_LOONGSONASX
|
|
968
|
+
nk_dot_f32_loongsonasx(a, b, n, result);
|
|
907
969
|
#elif NK_TARGET_RVV
|
|
908
970
|
nk_dot_f32_rvv(a, b, n, result);
|
|
909
971
|
#elif NK_TARGET_SVE
|
|
@@ -922,6 +984,10 @@ NK_PUBLIC void nk_dot_f32(nk_f32_t const *a, nk_f32_t const *b, nk_size_t n, nk_
|
|
|
922
984
|
NK_PUBLIC void nk_dot_f64(nk_f64_t const *a, nk_f64_t const *b, nk_size_t n, nk_f64_t *result) {
|
|
923
985
|
#if NK_TARGET_V128RELAXED
|
|
924
986
|
nk_dot_f64_v128relaxed(a, b, n, result);
|
|
987
|
+
#elif NK_TARGET_POWERVSX
|
|
988
|
+
nk_dot_f64_powervsx(a, b, n, result);
|
|
989
|
+
#elif NK_TARGET_LOONGSONASX
|
|
990
|
+
nk_dot_f64_loongsonasx(a, b, n, result);
|
|
925
991
|
#elif NK_TARGET_RVV
|
|
926
992
|
nk_dot_f64_rvv(a, b, n, result);
|
|
927
993
|
#elif NK_TARGET_SVE
|
|
@@ -942,8 +1008,8 @@ NK_PUBLIC void nk_dot_f16c(nk_f16c_t const *a, nk_f16c_t const *b, nk_size_t n,
|
|
|
942
1008
|
nk_dot_f16c_svehalf(a, b, n, result);
|
|
943
1009
|
#elif NK_TARGET_NEONFHM
|
|
944
1010
|
nk_dot_f16c_neonfhm(a, b, n, result);
|
|
945
|
-
#elif
|
|
946
|
-
|
|
1011
|
+
#elif NK_TARGET_NEON
|
|
1012
|
+
nk_dot_f16c_neon(a, b, n, result);
|
|
947
1013
|
#elif NK_TARGET_HASWELL
|
|
948
1014
|
nk_dot_f16c_haswell(a, b, n, result);
|
|
949
1015
|
#else
|
|
@@ -1004,8 +1070,8 @@ NK_PUBLIC void nk_vdot_f16c(nk_f16c_t const *a, nk_f16c_t const *b, nk_size_t n,
|
|
|
1004
1070
|
nk_vdot_f16c_svehalf(a, b, n, result);
|
|
1005
1071
|
#elif NK_TARGET_NEONFHM
|
|
1006
1072
|
nk_vdot_f16c_neonfhm(a, b, n, result);
|
|
1007
|
-
#elif
|
|
1008
|
-
|
|
1073
|
+
#elif NK_TARGET_NEON
|
|
1074
|
+
nk_vdot_f16c_neon(a, b, n, result);
|
|
1009
1075
|
#elif NK_TARGET_HASWELL
|
|
1010
1076
|
nk_vdot_f16c_haswell(a, b, n, result);
|
|
1011
1077
|
#else
|