numkong 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (294) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +495 -0
  3. package/binding.gyp +540 -0
  4. package/c/dispatch.h +512 -0
  5. package/c/dispatch_bf16.c +389 -0
  6. package/c/dispatch_bf16c.c +52 -0
  7. package/c/dispatch_e2m3.c +263 -0
  8. package/c/dispatch_e3m2.c +243 -0
  9. package/c/dispatch_e4m3.c +276 -0
  10. package/c/dispatch_e5m2.c +272 -0
  11. package/c/dispatch_f16.c +376 -0
  12. package/c/dispatch_f16c.c +58 -0
  13. package/c/dispatch_f32.c +378 -0
  14. package/c/dispatch_f32c.c +99 -0
  15. package/c/dispatch_f64.c +296 -0
  16. package/c/dispatch_f64c.c +98 -0
  17. package/c/dispatch_i16.c +96 -0
  18. package/c/dispatch_i32.c +89 -0
  19. package/c/dispatch_i4.c +150 -0
  20. package/c/dispatch_i64.c +86 -0
  21. package/c/dispatch_i8.c +289 -0
  22. package/c/dispatch_other.c +330 -0
  23. package/c/dispatch_u1.c +148 -0
  24. package/c/dispatch_u16.c +124 -0
  25. package/c/dispatch_u32.c +118 -0
  26. package/c/dispatch_u4.c +150 -0
  27. package/c/dispatch_u64.c +102 -0
  28. package/c/dispatch_u8.c +303 -0
  29. package/c/numkong.c +950 -0
  30. package/include/README.md +573 -0
  31. package/include/module.modulemap +129 -0
  32. package/include/numkong/attention/sapphireamx.h +1361 -0
  33. package/include/numkong/attention/sme.h +2066 -0
  34. package/include/numkong/attention.h +49 -0
  35. package/include/numkong/capabilities.h +748 -0
  36. package/include/numkong/cast/README.md +262 -0
  37. package/include/numkong/cast/haswell.h +975 -0
  38. package/include/numkong/cast/icelake.h +470 -0
  39. package/include/numkong/cast/neon.h +1192 -0
  40. package/include/numkong/cast/rvv.h +1021 -0
  41. package/include/numkong/cast/sapphire.h +262 -0
  42. package/include/numkong/cast/serial.h +2262 -0
  43. package/include/numkong/cast/skylake.h +856 -0
  44. package/include/numkong/cast/v128relaxed.h +180 -0
  45. package/include/numkong/cast.h +230 -0
  46. package/include/numkong/curved/README.md +223 -0
  47. package/include/numkong/curved/genoa.h +182 -0
  48. package/include/numkong/curved/haswell.h +276 -0
  49. package/include/numkong/curved/neon.h +205 -0
  50. package/include/numkong/curved/neonbfdot.h +212 -0
  51. package/include/numkong/curved/neonhalf.h +212 -0
  52. package/include/numkong/curved/rvv.h +305 -0
  53. package/include/numkong/curved/serial.h +207 -0
  54. package/include/numkong/curved/skylake.h +457 -0
  55. package/include/numkong/curved/smef64.h +506 -0
  56. package/include/numkong/curved.h +517 -0
  57. package/include/numkong/curved.hpp +144 -0
  58. package/include/numkong/dot/README.md +425 -0
  59. package/include/numkong/dot/alder.h +563 -0
  60. package/include/numkong/dot/genoa.h +315 -0
  61. package/include/numkong/dot/haswell.h +1688 -0
  62. package/include/numkong/dot/icelake.h +883 -0
  63. package/include/numkong/dot/neon.h +818 -0
  64. package/include/numkong/dot/neonbfdot.h +244 -0
  65. package/include/numkong/dot/neonfhm.h +360 -0
  66. package/include/numkong/dot/neonhalf.h +198 -0
  67. package/include/numkong/dot/neonsdot.h +508 -0
  68. package/include/numkong/dot/rvv.h +714 -0
  69. package/include/numkong/dot/rvvbb.h +72 -0
  70. package/include/numkong/dot/rvvbf16.h +123 -0
  71. package/include/numkong/dot/rvvhalf.h +129 -0
  72. package/include/numkong/dot/sapphire.h +141 -0
  73. package/include/numkong/dot/serial.h +838 -0
  74. package/include/numkong/dot/sierra.h +405 -0
  75. package/include/numkong/dot/skylake.h +1084 -0
  76. package/include/numkong/dot/sve.h +379 -0
  77. package/include/numkong/dot/svebfdot.h +74 -0
  78. package/include/numkong/dot/svehalf.h +123 -0
  79. package/include/numkong/dot/v128relaxed.h +1258 -0
  80. package/include/numkong/dot.h +1070 -0
  81. package/include/numkong/dot.hpp +94 -0
  82. package/include/numkong/dots/README.md +496 -0
  83. package/include/numkong/dots/alder.h +114 -0
  84. package/include/numkong/dots/genoa.h +94 -0
  85. package/include/numkong/dots/haswell.h +295 -0
  86. package/include/numkong/dots/icelake.h +171 -0
  87. package/include/numkong/dots/neon.h +120 -0
  88. package/include/numkong/dots/neonbfdot.h +58 -0
  89. package/include/numkong/dots/neonfhm.h +94 -0
  90. package/include/numkong/dots/neonhalf.h +57 -0
  91. package/include/numkong/dots/neonsdot.h +108 -0
  92. package/include/numkong/dots/rvv.h +2486 -0
  93. package/include/numkong/dots/sapphireamx.h +3973 -0
  94. package/include/numkong/dots/serial.h +2844 -0
  95. package/include/numkong/dots/sierra.h +97 -0
  96. package/include/numkong/dots/skylake.h +196 -0
  97. package/include/numkong/dots/sme.h +5372 -0
  98. package/include/numkong/dots/smebi32.h +461 -0
  99. package/include/numkong/dots/smef64.h +1318 -0
  100. package/include/numkong/dots/smehalf.h +47 -0
  101. package/include/numkong/dots/v128relaxed.h +294 -0
  102. package/include/numkong/dots.h +2804 -0
  103. package/include/numkong/dots.hpp +639 -0
  104. package/include/numkong/each/README.md +469 -0
  105. package/include/numkong/each/haswell.h +1658 -0
  106. package/include/numkong/each/icelake.h +272 -0
  107. package/include/numkong/each/neon.h +1104 -0
  108. package/include/numkong/each/neonbfdot.h +212 -0
  109. package/include/numkong/each/neonhalf.h +410 -0
  110. package/include/numkong/each/rvv.h +1121 -0
  111. package/include/numkong/each/sapphire.h +477 -0
  112. package/include/numkong/each/serial.h +260 -0
  113. package/include/numkong/each/skylake.h +1562 -0
  114. package/include/numkong/each.h +2146 -0
  115. package/include/numkong/each.hpp +434 -0
  116. package/include/numkong/geospatial/README.md +147 -0
  117. package/include/numkong/geospatial/haswell.h +593 -0
  118. package/include/numkong/geospatial/neon.h +571 -0
  119. package/include/numkong/geospatial/rvv.h +701 -0
  120. package/include/numkong/geospatial/serial.h +309 -0
  121. package/include/numkong/geospatial/skylake.h +577 -0
  122. package/include/numkong/geospatial/v128relaxed.h +613 -0
  123. package/include/numkong/geospatial.h +453 -0
  124. package/include/numkong/geospatial.hpp +235 -0
  125. package/include/numkong/matrix.hpp +336 -0
  126. package/include/numkong/maxsim/README.md +187 -0
  127. package/include/numkong/maxsim/alder.h +511 -0
  128. package/include/numkong/maxsim/genoa.h +115 -0
  129. package/include/numkong/maxsim/haswell.h +553 -0
  130. package/include/numkong/maxsim/icelake.h +480 -0
  131. package/include/numkong/maxsim/neonsdot.h +394 -0
  132. package/include/numkong/maxsim/sapphireamx.h +877 -0
  133. package/include/numkong/maxsim/serial.h +490 -0
  134. package/include/numkong/maxsim/sme.h +929 -0
  135. package/include/numkong/maxsim/v128relaxed.h +280 -0
  136. package/include/numkong/maxsim.h +571 -0
  137. package/include/numkong/maxsim.hpp +133 -0
  138. package/include/numkong/mesh/README.md +227 -0
  139. package/include/numkong/mesh/haswell.h +2235 -0
  140. package/include/numkong/mesh/neon.h +1329 -0
  141. package/include/numkong/mesh/neonbfdot.h +842 -0
  142. package/include/numkong/mesh/neonhalf.h +616 -0
  143. package/include/numkong/mesh/rvv.h +916 -0
  144. package/include/numkong/mesh/serial.h +742 -0
  145. package/include/numkong/mesh/skylake.h +1135 -0
  146. package/include/numkong/mesh/v128relaxed.h +1052 -0
  147. package/include/numkong/mesh.h +652 -0
  148. package/include/numkong/mesh.hpp +762 -0
  149. package/include/numkong/numkong.h +78 -0
  150. package/include/numkong/numkong.hpp +57 -0
  151. package/include/numkong/probability/README.md +173 -0
  152. package/include/numkong/probability/haswell.h +267 -0
  153. package/include/numkong/probability/neon.h +225 -0
  154. package/include/numkong/probability/rvv.h +409 -0
  155. package/include/numkong/probability/serial.h +169 -0
  156. package/include/numkong/probability/skylake.h +324 -0
  157. package/include/numkong/probability.h +383 -0
  158. package/include/numkong/probability.hpp +120 -0
  159. package/include/numkong/random.h +50 -0
  160. package/include/numkong/random.hpp +285 -0
  161. package/include/numkong/reduce/README.md +547 -0
  162. package/include/numkong/reduce/alder.h +632 -0
  163. package/include/numkong/reduce/genoa.h +201 -0
  164. package/include/numkong/reduce/haswell.h +3783 -0
  165. package/include/numkong/reduce/icelake.h +549 -0
  166. package/include/numkong/reduce/neon.h +3841 -0
  167. package/include/numkong/reduce/neonbfdot.h +353 -0
  168. package/include/numkong/reduce/neonfhm.h +665 -0
  169. package/include/numkong/reduce/neonhalf.h +157 -0
  170. package/include/numkong/reduce/neonsdot.h +357 -0
  171. package/include/numkong/reduce/rvv.h +3407 -0
  172. package/include/numkong/reduce/serial.h +757 -0
  173. package/include/numkong/reduce/sierra.h +338 -0
  174. package/include/numkong/reduce/skylake.h +3792 -0
  175. package/include/numkong/reduce/v128relaxed.h +2302 -0
  176. package/include/numkong/reduce.h +1597 -0
  177. package/include/numkong/reduce.hpp +633 -0
  178. package/include/numkong/scalar/README.md +89 -0
  179. package/include/numkong/scalar/haswell.h +113 -0
  180. package/include/numkong/scalar/neon.h +122 -0
  181. package/include/numkong/scalar/neonhalf.h +70 -0
  182. package/include/numkong/scalar/rvv.h +211 -0
  183. package/include/numkong/scalar/sapphire.h +63 -0
  184. package/include/numkong/scalar/serial.h +332 -0
  185. package/include/numkong/scalar/v128relaxed.h +56 -0
  186. package/include/numkong/scalar.h +683 -0
  187. package/include/numkong/set/README.md +179 -0
  188. package/include/numkong/set/haswell.h +334 -0
  189. package/include/numkong/set/icelake.h +485 -0
  190. package/include/numkong/set/neon.h +364 -0
  191. package/include/numkong/set/rvv.h +226 -0
  192. package/include/numkong/set/rvvbb.h +117 -0
  193. package/include/numkong/set/serial.h +174 -0
  194. package/include/numkong/set/sve.h +185 -0
  195. package/include/numkong/set/v128relaxed.h +240 -0
  196. package/include/numkong/set.h +457 -0
  197. package/include/numkong/set.hpp +114 -0
  198. package/include/numkong/sets/README.md +149 -0
  199. package/include/numkong/sets/haswell.h +63 -0
  200. package/include/numkong/sets/icelake.h +66 -0
  201. package/include/numkong/sets/neon.h +61 -0
  202. package/include/numkong/sets/serial.h +43 -0
  203. package/include/numkong/sets/smebi32.h +1099 -0
  204. package/include/numkong/sets/v128relaxed.h +58 -0
  205. package/include/numkong/sets.h +339 -0
  206. package/include/numkong/sparse/README.md +156 -0
  207. package/include/numkong/sparse/icelake.h +463 -0
  208. package/include/numkong/sparse/neon.h +288 -0
  209. package/include/numkong/sparse/serial.h +117 -0
  210. package/include/numkong/sparse/sve2.h +507 -0
  211. package/include/numkong/sparse/turin.h +322 -0
  212. package/include/numkong/sparse.h +363 -0
  213. package/include/numkong/sparse.hpp +113 -0
  214. package/include/numkong/spatial/README.md +435 -0
  215. package/include/numkong/spatial/alder.h +607 -0
  216. package/include/numkong/spatial/genoa.h +290 -0
  217. package/include/numkong/spatial/haswell.h +960 -0
  218. package/include/numkong/spatial/icelake.h +586 -0
  219. package/include/numkong/spatial/neon.h +773 -0
  220. package/include/numkong/spatial/neonbfdot.h +165 -0
  221. package/include/numkong/spatial/neonhalf.h +118 -0
  222. package/include/numkong/spatial/neonsdot.h +261 -0
  223. package/include/numkong/spatial/rvv.h +984 -0
  224. package/include/numkong/spatial/rvvbf16.h +123 -0
  225. package/include/numkong/spatial/rvvhalf.h +117 -0
  226. package/include/numkong/spatial/sapphire.h +343 -0
  227. package/include/numkong/spatial/serial.h +346 -0
  228. package/include/numkong/spatial/sierra.h +323 -0
  229. package/include/numkong/spatial/skylake.h +606 -0
  230. package/include/numkong/spatial/sve.h +224 -0
  231. package/include/numkong/spatial/svebfdot.h +122 -0
  232. package/include/numkong/spatial/svehalf.h +109 -0
  233. package/include/numkong/spatial/v128relaxed.h +717 -0
  234. package/include/numkong/spatial.h +1425 -0
  235. package/include/numkong/spatial.hpp +183 -0
  236. package/include/numkong/spatials/README.md +580 -0
  237. package/include/numkong/spatials/alder.h +94 -0
  238. package/include/numkong/spatials/genoa.h +94 -0
  239. package/include/numkong/spatials/haswell.h +219 -0
  240. package/include/numkong/spatials/icelake.h +113 -0
  241. package/include/numkong/spatials/neon.h +109 -0
  242. package/include/numkong/spatials/neonbfdot.h +60 -0
  243. package/include/numkong/spatials/neonfhm.h +92 -0
  244. package/include/numkong/spatials/neonhalf.h +58 -0
  245. package/include/numkong/spatials/neonsdot.h +109 -0
  246. package/include/numkong/spatials/rvv.h +1960 -0
  247. package/include/numkong/spatials/sapphireamx.h +1149 -0
  248. package/include/numkong/spatials/serial.h +226 -0
  249. package/include/numkong/spatials/sierra.h +96 -0
  250. package/include/numkong/spatials/skylake.h +184 -0
  251. package/include/numkong/spatials/sme.h +1901 -0
  252. package/include/numkong/spatials/smef64.h +465 -0
  253. package/include/numkong/spatials/v128relaxed.h +240 -0
  254. package/include/numkong/spatials.h +3021 -0
  255. package/include/numkong/spatials.hpp +508 -0
  256. package/include/numkong/tensor.hpp +1592 -0
  257. package/include/numkong/trigonometry/README.md +184 -0
  258. package/include/numkong/trigonometry/haswell.h +652 -0
  259. package/include/numkong/trigonometry/neon.h +639 -0
  260. package/include/numkong/trigonometry/rvv.h +699 -0
  261. package/include/numkong/trigonometry/serial.h +703 -0
  262. package/include/numkong/trigonometry/skylake.h +721 -0
  263. package/include/numkong/trigonometry/v128relaxed.h +666 -0
  264. package/include/numkong/trigonometry.h +467 -0
  265. package/include/numkong/trigonometry.hpp +166 -0
  266. package/include/numkong/types.h +1384 -0
  267. package/include/numkong/types.hpp +5603 -0
  268. package/include/numkong/vector.hpp +698 -0
  269. package/javascript/README.md +246 -0
  270. package/javascript/dist/cjs/numkong-wasm.d.ts +166 -0
  271. package/javascript/dist/cjs/numkong-wasm.js +617 -0
  272. package/javascript/dist/cjs/numkong.d.ts +343 -0
  273. package/javascript/dist/cjs/numkong.js +523 -0
  274. package/javascript/dist/cjs/package.json +3 -0
  275. package/javascript/dist/cjs/types.d.ts +284 -0
  276. package/javascript/dist/cjs/types.js +653 -0
  277. package/javascript/dist/esm/numkong-wasm.d.ts +166 -0
  278. package/javascript/dist/esm/numkong-wasm.js +595 -0
  279. package/javascript/dist/esm/numkong.d.ts +343 -0
  280. package/javascript/dist/esm/numkong.js +452 -0
  281. package/javascript/dist/esm/package.json +3 -0
  282. package/javascript/dist/esm/types.d.ts +284 -0
  283. package/javascript/dist/esm/types.js +630 -0
  284. package/javascript/dist-package-cjs.json +3 -0
  285. package/javascript/dist-package-esm.json +3 -0
  286. package/javascript/node-gyp-build.d.ts +1 -0
  287. package/javascript/numkong-wasm.ts +756 -0
  288. package/javascript/numkong.c +689 -0
  289. package/javascript/numkong.ts +575 -0
  290. package/javascript/tsconfig-base.json +39 -0
  291. package/javascript/tsconfig-cjs.json +8 -0
  292. package/javascript/tsconfig-esm.json +8 -0
  293. package/javascript/types.ts +674 -0
  294. package/package.json +87 -0
@@ -0,0 +1,465 @@
1
+ /**
2
+ * @brief Batched Spatial Distances for ARM SME-F64.
3
+ * @file include/numkong/spatials/smef64.h
4
+ * @author Ash Vardanian
5
+ * @date February 23, 2026
6
+ *
7
+ * @sa include/numkong/spatials.h
8
+ */
9
+ #ifndef NK_SPATIALS_SMEF64_H
10
+ #define NK_SPATIALS_SMEF64_H
11
+
12
+ #if NK_TARGET_ARM_
13
+ #if NK_TARGET_SME
14
+
15
+ #include "numkong/dots/serial.h"
16
+ #include "numkong/dots/smef64.h"
17
+
18
+ #if defined(__cplusplus)
19
+ extern "C" {
20
+ #endif
21
+
22
+ #if defined(__clang__)
23
+ #pragma clang attribute push(__attribute__((target("sme,sve,sme-f64f64"))), apply_to = function)
24
+ #elif defined(__GNUC__)
25
+ #pragma GCC push_options
26
+ #pragma GCC target("+sme+sme-f64f64")
27
+ #endif
28
+
29
+ NK_PUBLIC nk_f64_t nk_dots_reduce_sumsq_f32_ssve_(nk_f32_t const *data, nk_size_t count) NK_STREAMING_ {
30
+ svfloat64_t accumulator_lo_f64x = svdup_f64(0.0);
31
+ svfloat64_t accumulator_hi_f64x = svdup_f64(0.0);
32
+ nk_size_t const vector_length = svcntw();
33
+ nk_size_t const half_vector_length = svcntd();
34
+ for (nk_size_t i = 0; i < count; i += vector_length) {
35
+ svbool_t predicate_f32x = svwhilelt_b32_u64(i, count);
36
+ svfloat32_t values_f32x = svld1_f32(predicate_f32x, data + i);
37
+
38
+ svbool_t predicate_lo_f64x = svwhilelt_b64_u64(i, count);
39
+ svfloat64_t values_lo_f64x = svcvt_f64_f32_x(predicate_lo_f64x, values_f32x);
40
+ accumulator_lo_f64x = svmla_f64_x(predicate_lo_f64x, accumulator_lo_f64x, values_lo_f64x, values_lo_f64x);
41
+
42
+ svbool_t predicate_hi_f64x = svwhilelt_b64_u64(i + half_vector_length, count);
43
+ svfloat64_t values_hi_f64x = svcvtlt_f64_f32_x(predicate_hi_f64x, values_f32x);
44
+ accumulator_hi_f64x = svmla_f64_x(predicate_hi_f64x, accumulator_hi_f64x, values_hi_f64x, values_hi_f64x);
45
+ }
46
+ return svaddv_f64(svptrue_b64(), accumulator_lo_f64x) + svaddv_f64(svptrue_b64(), accumulator_hi_f64x);
47
+ }
48
+
49
+ NK_PUBLIC nk_f64_t nk_dots_reduce_sumsq_f64_ssve_(nk_f64_t const *data, nk_size_t count) NK_STREAMING_COMPATIBLE_ {
50
+ svfloat64_t accumulator_f64x = svdup_f64(0.0);
51
+ nk_size_t const vector_length = svcntd();
52
+ for (nk_size_t i = 0; i < count; i += vector_length) {
53
+ svbool_t predicate_f64x = svwhilelt_b64_u64(i, count);
54
+ svfloat64_t values_f64x = svld1_f64(predicate_f64x, data + i);
55
+ accumulator_f64x = svmla_f64_x(predicate_f64x, accumulator_f64x, values_f64x, values_f64x);
56
+ }
57
+ return svaddv_f64(svptrue_b64(), accumulator_f64x);
58
+ }
59
+
60
+ NK_PUBLIC svfloat64_t nk_angulars_from_dot_f64x_ssvef64_(svbool_t predicate_f64x, svfloat64_t dots_f64x,
61
+ svfloat64_t query_norm_sq_f64x,
62
+ svfloat64_t target_norms_sq_f64x) NK_STREAMING_COMPATIBLE_ {
63
+ svfloat64_t norms_product_f64x = svmul_f64_x(predicate_f64x, query_norm_sq_f64x, target_norms_sq_f64x);
64
+ svbool_t positive_norms_f64x = svcmpgt_n_f64(predicate_f64x, norms_product_f64x, 0.0);
65
+ svfloat64_t denom_f64x = svsqrt_f64_x(positive_norms_f64x, norms_product_f64x);
66
+ svfloat64_t safe_denom_f64x = svsel_f64(positive_norms_f64x, denom_f64x, svdup_n_f64(1.0));
67
+ svfloat64_t normalized_f64x = svdiv_f64_x(predicate_f64x, dots_f64x, safe_denom_f64x);
68
+ svfloat64_t angular_f64x = svsub_f64_x(predicate_f64x, svdup_n_f64(1.0), normalized_f64x);
69
+ angular_f64x = svsel_f64(
70
+ positive_norms_f64x, angular_f64x,
71
+ svsel_f64(svcmpeq_n_f64(predicate_f64x, dots_f64x, 0.0), svdup_n_f64(0.0), svdup_n_f64(1.0)));
72
+ return svmax_f64_x(predicate_f64x, angular_f64x, svdup_n_f64(0.0));
73
+ }
74
+
75
+ NK_PUBLIC svfloat64_t nk_euclideans_from_dot_f64x_ssvef64_(svbool_t predicate_f64x, svfloat64_t dots_f64x,
76
+ svfloat64_t query_norm_sq_f64x,
77
+ svfloat64_t target_norms_sq_f64x) NK_STREAMING_COMPATIBLE_ {
78
+ svfloat64_t sum_sq_f64x = svadd_f64_x(predicate_f64x, query_norm_sq_f64x, target_norms_sq_f64x);
79
+ svfloat64_t dist_sq_f64x = svsub_f64_x(predicate_f64x, sum_sq_f64x,
80
+ svmul_f64_x(predicate_f64x, svdup_n_f64(2.0), dots_f64x));
81
+ dist_sq_f64x = svmax_f64_x(predicate_f64x, dist_sq_f64x, svdup_n_f64(0.0));
82
+ return svsqrt_f64_x(predicate_f64x, dist_sq_f64x);
83
+ }
84
+
85
+ #pragma region Single Precision Packed Angular
86
+
87
+ __arm_locally_streaming static void nk_angulars_packed_f32_smef64_finalize_streaming_( //
88
+ nk_f32_t const *a, void const *b_packed, nk_f64_t *c, //
89
+ nk_size_t rows, nk_size_t columns, nk_size_t depth, //
90
+ nk_size_t a_stride_elements, nk_size_t c_stride_elements) {
91
+
92
+ nk_dots_sme_packed_header_t const *header = (nk_dots_sme_packed_header_t const *)b_packed;
93
+ nk_f64_t const *b_norms = (nk_f64_t const *)((char const *)b_packed + header->norms_offset);
94
+
95
+ for (nk_size_t row_index = 0; row_index < rows; row_index++) {
96
+ nk_f32_t const *a_row = a + row_index * a_stride_elements;
97
+ nk_f64_t *c_row = c + row_index * c_stride_elements;
98
+ nk_f64_t query_norm_sq_f64 = nk_dots_reduce_sumsq_f32_ssve_(a_row, depth);
99
+ svfloat64_t query_norm_sq_f64x = svdup_n_f64(query_norm_sq_f64);
100
+
101
+ for (nk_size_t col_index = 0; col_index < columns; col_index += svcntd()) {
102
+ svbool_t predicate_f64x = svwhilelt_b64_u64(col_index, columns);
103
+ svfloat64_t dots_f64x = svld1_f64(predicate_f64x, c_row + col_index);
104
+ svfloat64_t target_norms_sq_f64x = svld1_f64(predicate_f64x, b_norms + col_index);
105
+ svst1_f64(predicate_f64x, c_row + col_index,
106
+ nk_angulars_from_dot_f64x_ssvef64_(predicate_f64x, dots_f64x, query_norm_sq_f64x,
107
+ target_norms_sq_f64x));
108
+ }
109
+ }
110
+ }
111
+
112
+ NK_PUBLIC void nk_angulars_packed_f32_smef64( //
113
+ nk_f32_t const *a, void const *b_packed, nk_f64_t *c, //
114
+ nk_size_t rows, nk_size_t columns, nk_size_t depth, //
115
+ nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
116
+
117
+ nk_size_t const a_stride_elements = a_stride_in_bytes / sizeof(nk_f32_t);
118
+ nk_size_t const c_stride_elements = c_stride_in_bytes / sizeof(nk_f64_t);
119
+
120
+ nk_dots_packed_f32_smef64_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements, c_stride_elements);
121
+ nk_angulars_packed_f32_smef64_finalize_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements,
122
+ c_stride_elements);
123
+ }
124
+
125
+ #pragma region Single Precision Packed Euclidean
126
+
127
+ __arm_locally_streaming static void nk_euclideans_packed_f32_smef64_finalize_streaming_( //
128
+ nk_f32_t const *a, void const *b_packed, nk_f64_t *c, //
129
+ nk_size_t rows, nk_size_t columns, nk_size_t depth, //
130
+ nk_size_t a_stride_elements, nk_size_t c_stride_elements) {
131
+
132
+ nk_dots_sme_packed_header_t const *header = (nk_dots_sme_packed_header_t const *)b_packed;
133
+ nk_f64_t const *b_norms = (nk_f64_t const *)((char const *)b_packed + header->norms_offset);
134
+
135
+ for (nk_size_t row_index = 0; row_index < rows; row_index++) {
136
+ nk_f32_t const *a_row = a + row_index * a_stride_elements;
137
+ nk_f64_t *c_row = c + row_index * c_stride_elements;
138
+ nk_f64_t query_norm_sq_f64 = nk_dots_reduce_sumsq_f32_ssve_(a_row, depth);
139
+ svfloat64_t query_norm_sq_f64x = svdup_n_f64(query_norm_sq_f64);
140
+
141
+ for (nk_size_t col_index = 0; col_index < columns; col_index += svcntd()) {
142
+ svbool_t predicate_f64x = svwhilelt_b64_u64(col_index, columns);
143
+ svfloat64_t dots_f64x = svld1_f64(predicate_f64x, c_row + col_index);
144
+ svfloat64_t target_norms_sq_f64x = svld1_f64(predicate_f64x, b_norms + col_index);
145
+ svst1_f64(predicate_f64x, c_row + col_index,
146
+ nk_euclideans_from_dot_f64x_ssvef64_(predicate_f64x, dots_f64x, query_norm_sq_f64x,
147
+ target_norms_sq_f64x));
148
+ }
149
+ }
150
+ }
151
+
152
+ NK_PUBLIC void nk_euclideans_packed_f32_smef64( //
153
+ nk_f32_t const *a, void const *b_packed, nk_f64_t *c, //
154
+ nk_size_t rows, nk_size_t columns, nk_size_t depth, //
155
+ nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
156
+
157
+ nk_size_t const a_stride_elements = a_stride_in_bytes / sizeof(nk_f32_t);
158
+ nk_size_t const c_stride_elements = c_stride_in_bytes / sizeof(nk_f64_t);
159
+
160
+ nk_dots_packed_f32_smef64_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements, c_stride_elements);
161
+ nk_euclideans_packed_f32_smef64_finalize_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements,
162
+ c_stride_elements);
163
+ }
164
+
165
+ #pragma region Single Precision Symmetric Angular
166
+
167
+ __arm_locally_streaming static void nk_angulars_symmetric_f32_smef64_finalize_streaming_( //
168
+ nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride_elements, //
169
+ nk_f64_t *result, nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) {
170
+ // Phase 1: cache row norms on diagonal
171
+ for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
172
+ nk_f32_t const *row_vector = vectors + row_index * stride_elements;
173
+ nk_f64_t *result_row = result + row_index * result_stride_elements;
174
+ result_row[row_index] = nk_dots_reduce_sumsq_f32_ssve_(row_vector, depth);
175
+ }
176
+ // Phase 2: column-chunked post-processing
177
+ nk_f64_t column_norms[256];
178
+ for (nk_size_t chunk_start = 0; chunk_start < n_vectors; chunk_start += 256) {
179
+ nk_size_t chunk_end = chunk_start + 256 < n_vectors ? chunk_start + 256 : n_vectors;
180
+ for (nk_size_t col = chunk_start; col < chunk_end; ++col) {
181
+ nk_f32_t const *col_vector = vectors + col * stride_elements;
182
+ column_norms[col - chunk_start] = nk_dots_reduce_sumsq_f32_ssve_(col_vector, depth);
183
+ }
184
+ for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
185
+ nk_size_t col_start = row_index + 1 > chunk_start ? row_index + 1 : chunk_start;
186
+ if (col_start >= chunk_end) continue;
187
+ nk_f64_t *result_row = result + row_index * result_stride_elements;
188
+ svfloat64_t query_norm_sq_f64x = svdup_n_f64(result_row[row_index]);
189
+ for (nk_size_t col_index = col_start; col_index < chunk_end; col_index += svcntd()) {
190
+ svbool_t predicate_f64x = svwhilelt_b64_u64(col_index, chunk_end);
191
+ svfloat64_t dots_f64x = svld1_f64(predicate_f64x, result_row + col_index);
192
+ svfloat64_t target_norms_sq_f64x = svld1_f64(predicate_f64x, column_norms + (col_index - chunk_start));
193
+ svst1_f64(predicate_f64x, result_row + col_index,
194
+ nk_angulars_from_dot_f64x_ssvef64_(predicate_f64x, dots_f64x, query_norm_sq_f64x,
195
+ target_norms_sq_f64x));
196
+ }
197
+ }
198
+ }
199
+ // Phase 3: zero diagonals
200
+ for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index)
201
+ result[row_index * result_stride_elements + row_index] = 0;
202
+ }
203
+
204
+ NK_PUBLIC void nk_angulars_symmetric_f32_smef64( //
205
+ nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride, //
206
+ nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start, nk_size_t row_count) {
207
+
208
+ nk_size_t const stride_elements = stride / sizeof(nk_f32_t);
209
+ nk_size_t const result_stride_elements = result_stride / sizeof(nk_f64_t);
210
+
211
+ nk_dots_symmetric_f32_smef64_streaming_(vectors, n_vectors, depth, stride_elements, result, result_stride_elements,
212
+ row_start, row_count);
213
+ nk_angulars_symmetric_f32_smef64_finalize_streaming_(vectors, n_vectors, depth, stride_elements, result,
214
+ result_stride_elements, row_start, row_count);
215
+ }
216
+
217
+ #pragma region Single Precision Symmetric Euclidean
218
+
219
+ __arm_locally_streaming static void nk_euclideans_symmetric_f32_smef64_finalize_streaming_( //
220
+ nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride_elements, //
221
+ nk_f64_t *result, nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) {
222
+ // Phase 1: cache row norms on diagonal
223
+ for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
224
+ nk_f32_t const *row_vector = vectors + row_index * stride_elements;
225
+ nk_f64_t *result_row = result + row_index * result_stride_elements;
226
+ result_row[row_index] = nk_dots_reduce_sumsq_f32_ssve_(row_vector, depth);
227
+ }
228
+ // Phase 2: column-chunked post-processing
229
+ nk_f64_t column_norms[256];
230
+ for (nk_size_t chunk_start = 0; chunk_start < n_vectors; chunk_start += 256) {
231
+ nk_size_t chunk_end = chunk_start + 256 < n_vectors ? chunk_start + 256 : n_vectors;
232
+ for (nk_size_t col = chunk_start; col < chunk_end; ++col) {
233
+ nk_f32_t const *col_vector = vectors + col * stride_elements;
234
+ column_norms[col - chunk_start] = nk_dots_reduce_sumsq_f32_ssve_(col_vector, depth);
235
+ }
236
+ for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
237
+ nk_size_t col_start = row_index + 1 > chunk_start ? row_index + 1 : chunk_start;
238
+ if (col_start >= chunk_end) continue;
239
+ nk_f64_t *result_row = result + row_index * result_stride_elements;
240
+ svfloat64_t query_norm_sq_f64x = svdup_n_f64(result_row[row_index]);
241
+ for (nk_size_t col_index = col_start; col_index < chunk_end; col_index += svcntd()) {
242
+ svbool_t predicate_f64x = svwhilelt_b64_u64(col_index, chunk_end);
243
+ svfloat64_t dots_f64x = svld1_f64(predicate_f64x, result_row + col_index);
244
+ svfloat64_t target_norms_sq_f64x = svld1_f64(predicate_f64x, column_norms + (col_index - chunk_start));
245
+ svst1_f64(predicate_f64x, result_row + col_index,
246
+ nk_euclideans_from_dot_f64x_ssvef64_(predicate_f64x, dots_f64x, query_norm_sq_f64x,
247
+ target_norms_sq_f64x));
248
+ }
249
+ }
250
+ }
251
+ // Phase 3: zero diagonals
252
+ for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index)
253
+ result[row_index * result_stride_elements + row_index] = 0;
254
+ }
255
+
256
+ NK_PUBLIC void nk_euclideans_symmetric_f32_smef64( //
257
+ nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride, //
258
+ nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start, nk_size_t row_count) {
259
+
260
+ nk_size_t const stride_elements = stride / sizeof(nk_f32_t);
261
+ nk_size_t const result_stride_elements = result_stride / sizeof(nk_f64_t);
262
+
263
+ nk_dots_symmetric_f32_smef64_streaming_(vectors, n_vectors, depth, stride_elements, result, result_stride_elements,
264
+ row_start, row_count);
265
+ nk_euclideans_symmetric_f32_smef64_finalize_streaming_(vectors, n_vectors, depth, stride_elements, result,
266
+ result_stride_elements, row_start, row_count);
267
+ }
268
+
269
+ #pragma region Double Precision Packed Angular
270
+
271
+ __arm_locally_streaming static void nk_angulars_packed_f64_smef64_finalize_streaming_( //
272
+ nk_f64_t const *a, void const *b_packed, nk_f64_t *c, //
273
+ nk_size_t rows, nk_size_t columns, nk_size_t depth, //
274
+ nk_size_t a_stride_elements, nk_size_t c_stride_elements) {
275
+
276
+ nk_dots_sme_packed_header_t const *header = (nk_dots_sme_packed_header_t const *)b_packed;
277
+ nk_f64_t const *b_norms = (nk_f64_t const *)((char const *)b_packed + header->norms_offset);
278
+
279
+ for (nk_size_t row_index = 0; row_index < rows; row_index++) {
280
+ nk_f64_t const *a_row = a + row_index * a_stride_elements;
281
+ nk_f64_t *c_row = c + row_index * c_stride_elements;
282
+ nk_f64_t query_norm_sq_f64 = nk_dots_reduce_sumsq_f64_ssve_(a_row, depth);
283
+ svfloat64_t query_norm_sq_f64x = svdup_n_f64(query_norm_sq_f64);
284
+
285
+ for (nk_size_t col_index = 0; col_index < columns; col_index += svcntd()) {
286
+ svbool_t predicate_f64x = svwhilelt_b64_u64(col_index, columns);
287
+ svfloat64_t dots_f64x = svld1_f64(predicate_f64x, c_row + col_index);
288
+ svfloat64_t target_norms_sq_f64x = svld1_f64(predicate_f64x, b_norms + col_index);
289
+ svst1_f64(predicate_f64x, c_row + col_index,
290
+ nk_angulars_from_dot_f64x_ssvef64_(predicate_f64x, dots_f64x, query_norm_sq_f64x,
291
+ target_norms_sq_f64x));
292
+ }
293
+ }
294
+ }
295
+
296
+ NK_PUBLIC void nk_angulars_packed_f64_smef64( //
297
+ nk_f64_t const *a, void const *b_packed, nk_f64_t *c, //
298
+ nk_size_t rows, nk_size_t columns, nk_size_t depth, //
299
+ nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
300
+
301
+ nk_size_t const a_stride_elements = a_stride_in_bytes / sizeof(nk_f64_t);
302
+ nk_size_t const c_stride_elements = c_stride_in_bytes / sizeof(nk_f64_t);
303
+
304
+ nk_dots_packed_f64_smef64_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements, c_stride_elements);
305
+ nk_angulars_packed_f64_smef64_finalize_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements,
306
+ c_stride_elements);
307
+ }
308
+
309
+ #pragma region Double Precision Packed Euclidean
310
+
311
+ __arm_locally_streaming static void nk_euclideans_packed_f64_smef64_finalize_streaming_( //
312
+ nk_f64_t const *a, void const *b_packed, nk_f64_t *c, //
313
+ nk_size_t rows, nk_size_t columns, nk_size_t depth, //
314
+ nk_size_t a_stride_elements, nk_size_t c_stride_elements) {
315
+
316
+ nk_dots_sme_packed_header_t const *header = (nk_dots_sme_packed_header_t const *)b_packed;
317
+ nk_f64_t const *b_norms = (nk_f64_t const *)((char const *)b_packed + header->norms_offset);
318
+
319
+ for (nk_size_t row_index = 0; row_index < rows; row_index++) {
320
+ nk_f64_t const *a_row = a + row_index * a_stride_elements;
321
+ nk_f64_t *c_row = c + row_index * c_stride_elements;
322
+ nk_f64_t query_norm_sq_f64 = nk_dots_reduce_sumsq_f64_ssve_(a_row, depth);
323
+ svfloat64_t query_norm_sq_f64x = svdup_n_f64(query_norm_sq_f64);
324
+
325
+ for (nk_size_t col_index = 0; col_index < columns; col_index += svcntd()) {
326
+ svbool_t predicate_f64x = svwhilelt_b64_u64(col_index, columns);
327
+ svfloat64_t dots_f64x = svld1_f64(predicate_f64x, c_row + col_index);
328
+ svfloat64_t target_norms_sq_f64x = svld1_f64(predicate_f64x, b_norms + col_index);
329
+ svst1_f64(predicate_f64x, c_row + col_index,
330
+ nk_euclideans_from_dot_f64x_ssvef64_(predicate_f64x, dots_f64x, query_norm_sq_f64x,
331
+ target_norms_sq_f64x));
332
+ }
333
+ }
334
+ }
335
+
336
+ NK_PUBLIC void nk_euclideans_packed_f64_smef64( //
337
+ nk_f64_t const *a, void const *b_packed, nk_f64_t *c, //
338
+ nk_size_t rows, nk_size_t columns, nk_size_t depth, //
339
+ nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
340
+
341
+ nk_size_t const a_stride_elements = a_stride_in_bytes / sizeof(nk_f64_t);
342
+ nk_size_t const c_stride_elements = c_stride_in_bytes / sizeof(nk_f64_t);
343
+
344
+ nk_dots_packed_f64_smef64_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements, c_stride_elements);
345
+ nk_euclideans_packed_f64_smef64_finalize_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements,
346
+ c_stride_elements);
347
+ }
348
+
349
+ #pragma region Double Precision Symmetric Angular
350
+
351
+ __arm_locally_streaming static void nk_angulars_symmetric_f64_smef64_finalize_streaming_( //
352
+ nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride_elements, //
353
+ nk_f64_t *result, nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) {
354
+ // Phase 1: cache row norms on diagonal
355
+ for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
356
+ nk_f64_t const *row_vector = vectors + row_index * stride_elements;
357
+ nk_f64_t *result_row = result + row_index * result_stride_elements;
358
+ result_row[row_index] = nk_dots_reduce_sumsq_f64_ssve_(row_vector, depth);
359
+ }
360
+ // Phase 2: column-chunked post-processing
361
+ nk_f64_t column_norms[256];
362
+ for (nk_size_t chunk_start = 0; chunk_start < n_vectors; chunk_start += 256) {
363
+ nk_size_t chunk_end = chunk_start + 256 < n_vectors ? chunk_start + 256 : n_vectors;
364
+ for (nk_size_t col = chunk_start; col < chunk_end; ++col) {
365
+ nk_f64_t const *col_vector = vectors + col * stride_elements;
366
+ column_norms[col - chunk_start] = nk_dots_reduce_sumsq_f64_ssve_(col_vector, depth);
367
+ }
368
+ for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
369
+ nk_size_t col_start = row_index + 1 > chunk_start ? row_index + 1 : chunk_start;
370
+ if (col_start >= chunk_end) continue;
371
+ nk_f64_t *result_row = result + row_index * result_stride_elements;
372
+ svfloat64_t query_norm_sq_f64x = svdup_n_f64(result_row[row_index]);
373
+ for (nk_size_t col_index = col_start; col_index < chunk_end; col_index += svcntd()) {
374
+ svbool_t predicate_f64x = svwhilelt_b64_u64(col_index, chunk_end);
375
+ svfloat64_t dots_f64x = svld1_f64(predicate_f64x, result_row + col_index);
376
+ svfloat64_t target_norms_sq_f64x = svld1_f64(predicate_f64x, column_norms + (col_index - chunk_start));
377
+ svst1_f64(predicate_f64x, result_row + col_index,
378
+ nk_angulars_from_dot_f64x_ssvef64_(predicate_f64x, dots_f64x, query_norm_sq_f64x,
379
+ target_norms_sq_f64x));
380
+ }
381
+ }
382
+ }
383
+ // Phase 3: zero diagonals
384
+ for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index)
385
+ result[row_index * result_stride_elements + row_index] = 0;
386
+ }
387
+
388
+ NK_PUBLIC void nk_angulars_symmetric_f64_smef64( //
389
+ nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride, //
390
+ nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start, nk_size_t row_count) {
391
+
392
+ nk_size_t const stride_elements = stride / sizeof(nk_f64_t);
393
+ nk_size_t const result_stride_elements = result_stride / sizeof(nk_f64_t);
394
+
395
+ nk_dots_symmetric_f64_smef64_streaming_(vectors, n_vectors, depth, stride_elements, result, result_stride_elements,
396
+ row_start, row_count);
397
+ nk_angulars_symmetric_f64_smef64_finalize_streaming_(vectors, n_vectors, depth, stride_elements, result,
398
+ result_stride_elements, row_start, row_count);
399
+ }
400
+
401
+ #pragma region Double Precision Symmetric Euclidean
402
+
403
+ __arm_locally_streaming static void nk_euclideans_symmetric_f64_smef64_finalize_streaming_( //
404
+ nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride_elements, //
405
+ nk_f64_t *result, nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) {
406
+ // Phase 1: cache row norms on diagonal
407
+ for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
408
+ nk_f64_t const *row_vector = vectors + row_index * stride_elements;
409
+ nk_f64_t *result_row = result + row_index * result_stride_elements;
410
+ result_row[row_index] = nk_dots_reduce_sumsq_f64_ssve_(row_vector, depth);
411
+ }
412
+ // Phase 2: column-chunked post-processing
413
+ nk_f64_t column_norms[256];
414
+ for (nk_size_t chunk_start = 0; chunk_start < n_vectors; chunk_start += 256) {
415
+ nk_size_t chunk_end = chunk_start + 256 < n_vectors ? chunk_start + 256 : n_vectors;
416
+ for (nk_size_t col = chunk_start; col < chunk_end; ++col) {
417
+ nk_f64_t const *col_vector = vectors + col * stride_elements;
418
+ column_norms[col - chunk_start] = nk_dots_reduce_sumsq_f64_ssve_(col_vector, depth);
419
+ }
420
+ for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
421
+ nk_size_t col_start = row_index + 1 > chunk_start ? row_index + 1 : chunk_start;
422
+ if (col_start >= chunk_end) continue;
423
+ nk_f64_t *result_row = result + row_index * result_stride_elements;
424
+ svfloat64_t query_norm_sq_f64x = svdup_n_f64(result_row[row_index]);
425
+ for (nk_size_t col_index = col_start; col_index < chunk_end; col_index += svcntd()) {
426
+ svbool_t predicate_f64x = svwhilelt_b64_u64(col_index, chunk_end);
427
+ svfloat64_t dots_f64x = svld1_f64(predicate_f64x, result_row + col_index);
428
+ svfloat64_t target_norms_sq_f64x = svld1_f64(predicate_f64x, column_norms + (col_index - chunk_start));
429
+ svst1_f64(predicate_f64x, result_row + col_index,
430
+ nk_euclideans_from_dot_f64x_ssvef64_(predicate_f64x, dots_f64x, query_norm_sq_f64x,
431
+ target_norms_sq_f64x));
432
+ }
433
+ }
434
+ }
435
+ // Phase 3: zero diagonals
436
+ for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index)
437
+ result[row_index * result_stride_elements + row_index] = 0;
438
+ }
439
+
440
+ NK_PUBLIC void nk_euclideans_symmetric_f64_smef64( //
441
+ nk_f64_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride, //
442
+ nk_f64_t *result, nk_size_t result_stride, nk_size_t row_start, nk_size_t row_count) {
443
+
444
+ nk_size_t const stride_elements = stride / sizeof(nk_f64_t);
445
+ nk_size_t const result_stride_elements = result_stride / sizeof(nk_f64_t);
446
+
447
+ nk_dots_symmetric_f64_smef64_streaming_(vectors, n_vectors, depth, stride_elements, result, result_stride_elements,
448
+ row_start, row_count);
449
+ nk_euclideans_symmetric_f64_smef64_finalize_streaming_(vectors, n_vectors, depth, stride_elements, result,
450
+ result_stride_elements, row_start, row_count);
451
+ }
452
+
453
+ #if defined(__clang__)
454
+ #pragma clang attribute pop
455
+ #elif defined(__GNUC__)
456
+ #pragma GCC pop_options
457
+ #endif
458
+
459
+ #if defined(__cplusplus)
460
+ } // extern "C"
461
+ #endif
462
+
463
+ #endif // NK_TARGET_SME
464
+ #endif // NK_TARGET_ARM_
465
+ #endif // NK_SPATIALS_SMEF64_H