numkong 7.5.0 → 7.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/binding.gyp +18 -0
  2. package/c/dispatch_e5m2.c +23 -3
  3. package/include/numkong/capabilities.h +1 -1
  4. package/include/numkong/cast/README.md +3 -0
  5. package/include/numkong/cast/haswell.h +28 -64
  6. package/include/numkong/cast/serial.h +17 -0
  7. package/include/numkong/cast/skylake.h +67 -52
  8. package/include/numkong/cast.h +1 -0
  9. package/include/numkong/dot/README.md +1 -0
  10. package/include/numkong/dot/haswell.h +92 -13
  11. package/include/numkong/dot/serial.h +15 -0
  12. package/include/numkong/dot/skylake.h +61 -14
  13. package/include/numkong/dots/README.md +2 -0
  14. package/include/numkong/dots/graniteamx.h +434 -0
  15. package/include/numkong/dots/haswell.h +28 -28
  16. package/include/numkong/dots/sapphireamx.h +1 -1
  17. package/include/numkong/dots/serial.h +23 -8
  18. package/include/numkong/dots/skylake.h +28 -23
  19. package/include/numkong/dots.h +12 -0
  20. package/include/numkong/each/serial.h +18 -1
  21. package/include/numkong/geospatial/serial.h +14 -3
  22. package/include/numkong/maxsim/serial.h +15 -0
  23. package/include/numkong/mesh/README.md +50 -44
  24. package/include/numkong/mesh/genoa.h +462 -0
  25. package/include/numkong/mesh/haswell.h +806 -933
  26. package/include/numkong/mesh/neon.h +871 -943
  27. package/include/numkong/mesh/neonbfdot.h +382 -522
  28. package/include/numkong/mesh/neonfhm.h +676 -0
  29. package/include/numkong/mesh/rvv.h +404 -319
  30. package/include/numkong/mesh/serial.h +204 -162
  31. package/include/numkong/mesh/skylake.h +1029 -1585
  32. package/include/numkong/mesh/v128relaxed.h +403 -377
  33. package/include/numkong/mesh.h +38 -0
  34. package/include/numkong/reduce/serial.h +15 -1
  35. package/include/numkong/sparse/serial.h +17 -2
  36. package/include/numkong/spatial/genoa.h +0 -68
  37. package/include/numkong/spatial/haswell.h +98 -56
  38. package/include/numkong/spatial/serial.h +15 -0
  39. package/include/numkong/spatial/skylake.h +114 -54
  40. package/include/numkong/spatial.h +0 -12
  41. package/include/numkong/spatials/graniteamx.h +128 -0
  42. package/include/numkong/spatials/serial.h +18 -1
  43. package/include/numkong/spatials/skylake.h +2 -2
  44. package/include/numkong/spatials.h +17 -0
  45. package/include/numkong/tensor.hpp +107 -23
  46. package/javascript/numkong.c +3 -2
  47. package/package.json +7 -7
  48. package/wasm/numkong.wasm +0 -0
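
The headline addition in this release is the new NEON FHM backend for mesh alignment, package/include/numkong/mesh/neonfhm.h, shown in full below. Its kernels accumulate FP16 products directly into FP32 registers with the FMLAL/FMLAL2 widening fused multiply-adds, keeping separate accumulators for the low and high halves of each `float16x8_t`. The following standalone sketch is illustrative only and not part of the package: the function name and scalar tail are made up for the example, and it assumes <arm_neon.h> plus a compiler targeting armv8.2-a+fp16+fp16fml. It shows the same low/high accumulator pattern on a plain sum of squared differences:

#include <arm_neon.h>
#include <stddef.h>

// Sum of squared differences over two fp16 arrays, accumulated in fp32.
// Lanes 0-3 and 4-7 of each float16x8_t get their own fp32 accumulator,
// and the two halves are only combined at reduction time.
static float sum_squared_diff_f16_fhm(float16_t const *a, float16_t const *b, size_t n) {
    float32x4_t acc_low = vdupq_n_f32(0), acc_high = vdupq_n_f32(0);
    size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        float16x8_t delta = vsubq_f16(vld1q_f16(a + i), vld1q_f16(b + i));
        acc_low = vfmlalq_low_f16(acc_low, delta, delta);    // widen lanes 0-3, FMA in fp32
        acc_high = vfmlalq_high_f16(acc_high, delta, delta); // widen lanes 4-7, FMA in fp32
    }
    float sum = vaddvq_f32(vaddq_f32(acc_low, acc_high));
    for (; i < n; ++i) { // scalar tail
        float d = (float)a[i] - (float)b[i];
        sum += d * d;
    }
    return sum;
}

Splitting the accumulator this way avoids any cross-half shuffles inside the hot loop; the kernels in the diff below apply the same idea to the covariance and norm-squared sums.
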
@@ -0,0 +1,676 @@
+ /**
+  * @brief SIMD-accelerated Point Cloud Alignment for NEON FP16 FHM (widening FMA).
+  * @file include/numkong/mesh/neonfhm.h
+  * @author Ash Vardanian
+  * @date April 15, 2026
+  *
+  * @sa include/numkong/mesh.h
+  *
+  * @section mesh_neonfhm_instructions ARM NEON FP16 Matrix Instructions (ARMv8.4-FHM)
+  *
+  *     Intrinsic               Instruction                     A76         M5
+  *     vld3q_u16               LD3 (V.8H x 3)                  6cy @ 1p    6cy @ 1p
+  *     vfmlalq_low_f16         FMLAL (V.4S, V.8H, V.8H)        4cy @ 2p    4cy @ 4p
+  *     vfmlalq_high_f16        FMLAL2 (V.4S, V.8H, V.8H)       4cy @ 2p    4cy @ 4p
+  *     vcvt_f32_f16            FCVTL (V.4S, V.4H)              4cy @ 2p    3cy @ 4p
+  *     vcvt_high_f32_f16       FCVTL2 (V.4S, V.8H)             4cy @ 2p    3cy @ 4p
+  *     vfmaq_f32               FMLA (V.4S, V.4S, V.4S)         4cy @ 2p    3cy @ 4p
+  *     vaddq_f32               FADD (V.4S, V.4S, V.4S)         2cy @ 2p    2cy @ 4p
+  *     vaddvq_f32              FADDP+FADDP (V.4S)              5cy @ 1p    8cy @ 1p
+  *
+  * The ARMv8.4-FHM extension (FEAT_FHM) provides FMLAL/FMLSL instructions that fuse FP16 to FP32
+  * widening with multiply-accumulate into a single operation. `vfmlalq_low_f16` operates on elements
+  * 0-3 of the FP16 inputs; `vfmlalq_high_f16` operates on elements 4-7 — together they process a
+  * full `float16x8_t` of data into two `float32x4_t` accumulators with full FP32 accumulator precision.
+  *
+  * For 3D mesh alignment (RMSD, Kabsch, Umeyama), this replaces the two-step FP16→FP32 widen
+  * (`vcvt_f32_f16` + `vcvt_high_f32_f16`) followed by FP32 FMA (`vfmaq_f32`) in the covariance and
+  * norm-squared accumulation, fusing widen + multiply-accumulate. The low/high halves are kept as
+  * separate `float32x4_t` accumulators and combined only at reduction time. Sums of raw coordinates
+  * (for centroids) still use a conventional widen-then-add path since there is no widening-add
+  * intrinsic for FP16 inputs.
+  */
+ #ifndef NK_MESH_NEONFHM_H
+ #define NK_MESH_NEONFHM_H
+
+ #if NK_TARGET_ARM64_
+ #if NK_TARGET_NEONFHM
+
+ #include "numkong/types.h"
+ #include "numkong/spatial/neon.h" // `nk_f32_sqrt_neon`
+
+ #if defined(__cplusplus)
+ extern "C" {
+ #endif
+
+ #if defined(__clang__)
+ #pragma clang attribute push(__attribute__((target("arch=armv8.2-a+simd+fp16+fp16fml"))), apply_to = function)
+ #elif defined(__GNUC__)
+ #pragma GCC push_options
+ #pragma GCC target("arch=armv8.2-a+simd+fp16+fp16fml")
+ #endif
+
+ /* Load 8 fp16 xyz triplets (24 fp16 values) → 3x float16x8_t.
+  * Uses vld3q_u16 to de-interleave, then reinterprets as f16 (avoids vld3q_f16 which is
+  * unavailable on MSVC for ARM64).
+  *
+  * Input: 24 contiguous fp16 [x0,y0,z0, ..., x7,y7,z7]
+  * Output: x_f16x8, y_f16x8, z_f16x8 channel vectors (8 lanes each)
+  */
+ NK_INTERNAL void nk_deinterleave_f16x8_to_f16x8x3_neonfhm_(nk_f16_t const *ptr, //
+                                                            float16x8_t *x_out, float16x8_t *y_out, float16x8_t *z_out) {
+     uint16x8x3_t xyz_u16x8x3 = vld3q_u16((nk_u16_t const *)ptr);
+     *x_out = vreinterpretq_f16_u16(xyz_u16x8x3.val[0]);
+     *y_out = vreinterpretq_f16_u16(xyz_u16x8x3.val[1]);
+     *z_out = vreinterpretq_f16_u16(xyz_u16x8x3.val[2]);
+ }
+
+ NK_INTERNAL void nk_partial_deinterleave_f16_to_f16x8x3_neonfhm_(nk_f16_t const *ptr, nk_size_t n_points, //
+                                                                  float16x8_t *x_out, float16x8_t *y_out,
+                                                                  float16x8_t *z_out) {
+     nk_u16_t buf[24] = {0};
+     nk_u16_t const *src = (nk_u16_t const *)ptr;
+     for (nk_size_t k = 0; k < n_points * 3; ++k) buf[k] = src[k];
+     nk_deinterleave_f16x8_to_f16x8x3_neonfhm_((nk_f16_t const *)buf, x_out, y_out, z_out);
+ }
+
+ /**
+  * @brief RMSD (Root Mean Square Deviation) using NEON FHM widening FMA.
+  * Matches the serial-RMSD contract: zero centroids, identity rotation, raw √(Σ‖a-b‖² / n).
+  */
+ NK_PUBLIC void nk_rmsd_f16_neonfhm(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+                                    nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
+     if (rotation)
+         rotation[0] = 1, rotation[1] = 0, rotation[2] = 0, rotation[3] = 0, rotation[4] = 1, rotation[5] = 0,
+         rotation[6] = 0, rotation[7] = 0, rotation[8] = 1;
+     if (scale) *scale = 1.0f;
+     if (a_centroid) a_centroid[0] = 0, a_centroid[1] = 0, a_centroid[2] = 0;
+     if (b_centroid) b_centroid[0] = 0, b_centroid[1] = 0, b_centroid[2] = 0;
+
+     float32x4_t const zeros_f32x4 = vdupq_n_f32(0);
+     // Squared-delta accumulators split into low (elements 0-3) and high (4-7) halves for FHM
+     float32x4_t sum_squared_x_low_f32x4 = zeros_f32x4, sum_squared_x_high_f32x4 = zeros_f32x4;
+     float32x4_t sum_squared_y_low_f32x4 = zeros_f32x4, sum_squared_y_high_f32x4 = zeros_f32x4;
+     float32x4_t sum_squared_z_low_f32x4 = zeros_f32x4, sum_squared_z_high_f32x4 = zeros_f32x4;
+
+     float16x8_t a_x_f16x8, a_y_f16x8, a_z_f16x8;
+     float16x8_t b_x_f16x8, b_y_f16x8, b_z_f16x8;
+     nk_size_t i = 0;
+
+     for (; i + 8 <= n; i += 8) {
+         nk_deinterleave_f16x8_to_f16x8x3_neonfhm_(a + i * 3, &a_x_f16x8, &a_y_f16x8, &a_z_f16x8);
+         nk_deinterleave_f16x8_to_f16x8x3_neonfhm_(b + i * 3, &b_x_f16x8, &b_y_f16x8, &b_z_f16x8);
+
+         float16x8_t delta_x_f16x8 = vsubq_f16(a_x_f16x8, b_x_f16x8);
+         float16x8_t delta_y_f16x8 = vsubq_f16(a_y_f16x8, b_y_f16x8);
+         float16x8_t delta_z_f16x8 = vsubq_f16(a_z_f16x8, b_z_f16x8);
+
+         sum_squared_x_low_f32x4 = vfmlalq_low_f16(sum_squared_x_low_f32x4, delta_x_f16x8, delta_x_f16x8);
+         sum_squared_x_high_f32x4 = vfmlalq_high_f16(sum_squared_x_high_f32x4, delta_x_f16x8, delta_x_f16x8);
+         sum_squared_y_low_f32x4 = vfmlalq_low_f16(sum_squared_y_low_f32x4, delta_y_f16x8, delta_y_f16x8);
+         sum_squared_y_high_f32x4 = vfmlalq_high_f16(sum_squared_y_high_f32x4, delta_y_f16x8, delta_y_f16x8);
+         sum_squared_z_low_f32x4 = vfmlalq_low_f16(sum_squared_z_low_f32x4, delta_z_f16x8, delta_z_f16x8);
+         sum_squared_z_high_f32x4 = vfmlalq_high_f16(sum_squared_z_high_f32x4, delta_z_f16x8, delta_z_f16x8);
+     }
+
+     if (i < n) {
+         nk_partial_deinterleave_f16_to_f16x8x3_neonfhm_(a + i * 3, n - i, &a_x_f16x8, &a_y_f16x8, &a_z_f16x8);
+         nk_partial_deinterleave_f16_to_f16x8x3_neonfhm_(b + i * 3, n - i, &b_x_f16x8, &b_y_f16x8, &b_z_f16x8);
+
+         float16x8_t delta_x_f16x8 = vsubq_f16(a_x_f16x8, b_x_f16x8);
+         float16x8_t delta_y_f16x8 = vsubq_f16(a_y_f16x8, b_y_f16x8);
+         float16x8_t delta_z_f16x8 = vsubq_f16(a_z_f16x8, b_z_f16x8);
+
+         sum_squared_x_low_f32x4 = vfmlalq_low_f16(sum_squared_x_low_f32x4, delta_x_f16x8, delta_x_f16x8);
+         sum_squared_x_high_f32x4 = vfmlalq_high_f16(sum_squared_x_high_f32x4, delta_x_f16x8, delta_x_f16x8);
+         sum_squared_y_low_f32x4 = vfmlalq_low_f16(sum_squared_y_low_f32x4, delta_y_f16x8, delta_y_f16x8);
+         sum_squared_y_high_f32x4 = vfmlalq_high_f16(sum_squared_y_high_f32x4, delta_y_f16x8, delta_y_f16x8);
+         sum_squared_z_low_f32x4 = vfmlalq_low_f16(sum_squared_z_low_f32x4, delta_z_f16x8, delta_z_f16x8);
+         sum_squared_z_high_f32x4 = vfmlalq_high_f16(sum_squared_z_high_f32x4, delta_z_f16x8, delta_z_f16x8);
+     }
+
+     nk_f32_t sum_squared = vaddvq_f32(vaddq_f32(sum_squared_x_low_f32x4, sum_squared_x_high_f32x4)) +
+                            vaddvq_f32(vaddq_f32(sum_squared_y_low_f32x4, sum_squared_y_high_f32x4)) +
+                            vaddvq_f32(vaddq_f32(sum_squared_z_low_f32x4, sum_squared_z_high_f32x4));
+     *result = nk_f32_sqrt_neon(sum_squared / (nk_f32_t)n);
+ }
+
+ /**
+  * @brief Kabsch algorithm for optimal rigid body superposition using NEON FHM widening FMA.
+  * Finds the rotation matrix R that minimizes RMSD between two point sets.
+  */
+ NK_PUBLIC void nk_kabsch_f16_neonfhm(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+                                      nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
+     float32x4_t const zeros_f32x4 = vdupq_n_f32(0);
+
+     // Centroid sums (widen-add path)
+     float32x4_t sum_a_x_f32x4 = zeros_f32x4, sum_a_y_f32x4 = zeros_f32x4, sum_a_z_f32x4 = zeros_f32x4;
+     float32x4_t sum_b_x_f32x4 = zeros_f32x4, sum_b_y_f32x4 = zeros_f32x4, sum_b_z_f32x4 = zeros_f32x4;
+
+     // Covariance matrix H: 9 cells, each split into low/high f32x4 FHM accumulators.
+     float32x4_t covariance_xx_low_f32x4 = zeros_f32x4, covariance_xx_high_f32x4 = zeros_f32x4;
+     float32x4_t covariance_xy_low_f32x4 = zeros_f32x4, covariance_xy_high_f32x4 = zeros_f32x4;
+     float32x4_t covariance_xz_low_f32x4 = zeros_f32x4, covariance_xz_high_f32x4 = zeros_f32x4;
+     float32x4_t covariance_yx_low_f32x4 = zeros_f32x4, covariance_yx_high_f32x4 = zeros_f32x4;
+     float32x4_t covariance_yy_low_f32x4 = zeros_f32x4, covariance_yy_high_f32x4 = zeros_f32x4;
+     float32x4_t covariance_yz_low_f32x4 = zeros_f32x4, covariance_yz_high_f32x4 = zeros_f32x4;
+     float32x4_t covariance_zx_low_f32x4 = zeros_f32x4, covariance_zx_high_f32x4 = zeros_f32x4;
+     float32x4_t covariance_zy_low_f32x4 = zeros_f32x4, covariance_zy_high_f32x4 = zeros_f32x4;
+     float32x4_t covariance_zz_low_f32x4 = zeros_f32x4, covariance_zz_high_f32x4 = zeros_f32x4;
+     float32x4_t norm_squared_a_low_f32x4 = zeros_f32x4, norm_squared_a_high_f32x4 = zeros_f32x4;
+     float32x4_t norm_squared_b_low_f32x4 = zeros_f32x4, norm_squared_b_high_f32x4 = zeros_f32x4;
+
+     float16x8_t a_x_f16x8, a_y_f16x8, a_z_f16x8;
+     float16x8_t b_x_f16x8, b_y_f16x8, b_z_f16x8;
+     nk_size_t i = 0;
+
+     for (; i + 8 <= n; i += 8) {
+         nk_deinterleave_f16x8_to_f16x8x3_neonfhm_(a + i * 3, &a_x_f16x8, &a_y_f16x8, &a_z_f16x8);
+         nk_deinterleave_f16x8_to_f16x8x3_neonfhm_(b + i * 3, &b_x_f16x8, &b_y_f16x8, &b_z_f16x8);
+
+         // Centroid sums via widen-then-add
+         sum_a_x_f32x4 = vaddq_f32(sum_a_x_f32x4, vcvt_f32_f16(vget_low_f16(a_x_f16x8)));
+         sum_a_x_f32x4 = vaddq_f32(sum_a_x_f32x4, vcvt_high_f32_f16(a_x_f16x8));
+         sum_a_y_f32x4 = vaddq_f32(sum_a_y_f32x4, vcvt_f32_f16(vget_low_f16(a_y_f16x8)));
+         sum_a_y_f32x4 = vaddq_f32(sum_a_y_f32x4, vcvt_high_f32_f16(a_y_f16x8));
+         sum_a_z_f32x4 = vaddq_f32(sum_a_z_f32x4, vcvt_f32_f16(vget_low_f16(a_z_f16x8)));
+         sum_a_z_f32x4 = vaddq_f32(sum_a_z_f32x4, vcvt_high_f32_f16(a_z_f16x8));
+         sum_b_x_f32x4 = vaddq_f32(sum_b_x_f32x4, vcvt_f32_f16(vget_low_f16(b_x_f16x8)));
+         sum_b_x_f32x4 = vaddq_f32(sum_b_x_f32x4, vcvt_high_f32_f16(b_x_f16x8));
+         sum_b_y_f32x4 = vaddq_f32(sum_b_y_f32x4, vcvt_f32_f16(vget_low_f16(b_y_f16x8)));
+         sum_b_y_f32x4 = vaddq_f32(sum_b_y_f32x4, vcvt_high_f32_f16(b_y_f16x8));
+         sum_b_z_f32x4 = vaddq_f32(sum_b_z_f32x4, vcvt_f32_f16(vget_low_f16(b_z_f16x8)));
+         sum_b_z_f32x4 = vaddq_f32(sum_b_z_f32x4, vcvt_high_f32_f16(b_z_f16x8));
+
+         // Covariance H = sum a * bᵀ via FHM widening FMA (9 cells × 2 halves)
+         covariance_xx_low_f32x4 = vfmlalq_low_f16(covariance_xx_low_f32x4, a_x_f16x8, b_x_f16x8);
+         covariance_xx_high_f32x4 = vfmlalq_high_f16(covariance_xx_high_f32x4, a_x_f16x8, b_x_f16x8);
+         covariance_xy_low_f32x4 = vfmlalq_low_f16(covariance_xy_low_f32x4, a_x_f16x8, b_y_f16x8);
+         covariance_xy_high_f32x4 = vfmlalq_high_f16(covariance_xy_high_f32x4, a_x_f16x8, b_y_f16x8);
+         covariance_xz_low_f32x4 = vfmlalq_low_f16(covariance_xz_low_f32x4, a_x_f16x8, b_z_f16x8);
+         covariance_xz_high_f32x4 = vfmlalq_high_f16(covariance_xz_high_f32x4, a_x_f16x8, b_z_f16x8);
+         covariance_yx_low_f32x4 = vfmlalq_low_f16(covariance_yx_low_f32x4, a_y_f16x8, b_x_f16x8);
+         covariance_yx_high_f32x4 = vfmlalq_high_f16(covariance_yx_high_f32x4, a_y_f16x8, b_x_f16x8);
+         covariance_yy_low_f32x4 = vfmlalq_low_f16(covariance_yy_low_f32x4, a_y_f16x8, b_y_f16x8);
+         covariance_yy_high_f32x4 = vfmlalq_high_f16(covariance_yy_high_f32x4, a_y_f16x8, b_y_f16x8);
+         covariance_yz_low_f32x4 = vfmlalq_low_f16(covariance_yz_low_f32x4, a_y_f16x8, b_z_f16x8);
+         covariance_yz_high_f32x4 = vfmlalq_high_f16(covariance_yz_high_f32x4, a_y_f16x8, b_z_f16x8);
+         covariance_zx_low_f32x4 = vfmlalq_low_f16(covariance_zx_low_f32x4, a_z_f16x8, b_x_f16x8);
+         covariance_zx_high_f32x4 = vfmlalq_high_f16(covariance_zx_high_f32x4, a_z_f16x8, b_x_f16x8);
+         covariance_zy_low_f32x4 = vfmlalq_low_f16(covariance_zy_low_f32x4, a_z_f16x8, b_y_f16x8);
+         covariance_zy_high_f32x4 = vfmlalq_high_f16(covariance_zy_high_f32x4, a_z_f16x8, b_y_f16x8);
+         covariance_zz_low_f32x4 = vfmlalq_low_f16(covariance_zz_low_f32x4, a_z_f16x8, b_z_f16x8);
+         covariance_zz_high_f32x4 = vfmlalq_high_f16(covariance_zz_high_f32x4, a_z_f16x8, b_z_f16x8);
+
+         // Norm-squared of both point sets via FHM
+         norm_squared_a_low_f32x4 = vfmlalq_low_f16(norm_squared_a_low_f32x4, a_x_f16x8, a_x_f16x8);
+         norm_squared_a_high_f32x4 = vfmlalq_high_f16(norm_squared_a_high_f32x4, a_x_f16x8, a_x_f16x8);
+         norm_squared_a_low_f32x4 = vfmlalq_low_f16(norm_squared_a_low_f32x4, a_y_f16x8, a_y_f16x8);
+         norm_squared_a_high_f32x4 = vfmlalq_high_f16(norm_squared_a_high_f32x4, a_y_f16x8, a_y_f16x8);
+         norm_squared_a_low_f32x4 = vfmlalq_low_f16(norm_squared_a_low_f32x4, a_z_f16x8, a_z_f16x8);
+         norm_squared_a_high_f32x4 = vfmlalq_high_f16(norm_squared_a_high_f32x4, a_z_f16x8, a_z_f16x8);
+         norm_squared_b_low_f32x4 = vfmlalq_low_f16(norm_squared_b_low_f32x4, b_x_f16x8, b_x_f16x8);
+         norm_squared_b_high_f32x4 = vfmlalq_high_f16(norm_squared_b_high_f32x4, b_x_f16x8, b_x_f16x8);
+         norm_squared_b_low_f32x4 = vfmlalq_low_f16(norm_squared_b_low_f32x4, b_y_f16x8, b_y_f16x8);
+         norm_squared_b_high_f32x4 = vfmlalq_high_f16(norm_squared_b_high_f32x4, b_y_f16x8, b_y_f16x8);
+         norm_squared_b_low_f32x4 = vfmlalq_low_f16(norm_squared_b_low_f32x4, b_z_f16x8, b_z_f16x8);
+         norm_squared_b_high_f32x4 = vfmlalq_high_f16(norm_squared_b_high_f32x4, b_z_f16x8, b_z_f16x8);
+     }
+
+     if (i < n) {
+         nk_partial_deinterleave_f16_to_f16x8x3_neonfhm_(a + i * 3, n - i, &a_x_f16x8, &a_y_f16x8, &a_z_f16x8);
+         nk_partial_deinterleave_f16_to_f16x8x3_neonfhm_(b + i * 3, n - i, &b_x_f16x8, &b_y_f16x8, &b_z_f16x8);
+
+         sum_a_x_f32x4 = vaddq_f32(sum_a_x_f32x4, vcvt_f32_f16(vget_low_f16(a_x_f16x8)));
+         sum_a_x_f32x4 = vaddq_f32(sum_a_x_f32x4, vcvt_high_f32_f16(a_x_f16x8));
+         sum_a_y_f32x4 = vaddq_f32(sum_a_y_f32x4, vcvt_f32_f16(vget_low_f16(a_y_f16x8)));
+         sum_a_y_f32x4 = vaddq_f32(sum_a_y_f32x4, vcvt_high_f32_f16(a_y_f16x8));
+         sum_a_z_f32x4 = vaddq_f32(sum_a_z_f32x4, vcvt_f32_f16(vget_low_f16(a_z_f16x8)));
+         sum_a_z_f32x4 = vaddq_f32(sum_a_z_f32x4, vcvt_high_f32_f16(a_z_f16x8));
+         sum_b_x_f32x4 = vaddq_f32(sum_b_x_f32x4, vcvt_f32_f16(vget_low_f16(b_x_f16x8)));
+         sum_b_x_f32x4 = vaddq_f32(sum_b_x_f32x4, vcvt_high_f32_f16(b_x_f16x8));
+         sum_b_y_f32x4 = vaddq_f32(sum_b_y_f32x4, vcvt_f32_f16(vget_low_f16(b_y_f16x8)));
+         sum_b_y_f32x4 = vaddq_f32(sum_b_y_f32x4, vcvt_high_f32_f16(b_y_f16x8));
+         sum_b_z_f32x4 = vaddq_f32(sum_b_z_f32x4, vcvt_f32_f16(vget_low_f16(b_z_f16x8)));
+         sum_b_z_f32x4 = vaddq_f32(sum_b_z_f32x4, vcvt_high_f32_f16(b_z_f16x8));
+
+         covariance_xx_low_f32x4 = vfmlalq_low_f16(covariance_xx_low_f32x4, a_x_f16x8, b_x_f16x8);
+         covariance_xx_high_f32x4 = vfmlalq_high_f16(covariance_xx_high_f32x4, a_x_f16x8, b_x_f16x8);
+         covariance_xy_low_f32x4 = vfmlalq_low_f16(covariance_xy_low_f32x4, a_x_f16x8, b_y_f16x8);
+         covariance_xy_high_f32x4 = vfmlalq_high_f16(covariance_xy_high_f32x4, a_x_f16x8, b_y_f16x8);
+         covariance_xz_low_f32x4 = vfmlalq_low_f16(covariance_xz_low_f32x4, a_x_f16x8, b_z_f16x8);
+         covariance_xz_high_f32x4 = vfmlalq_high_f16(covariance_xz_high_f32x4, a_x_f16x8, b_z_f16x8);
+         covariance_yx_low_f32x4 = vfmlalq_low_f16(covariance_yx_low_f32x4, a_y_f16x8, b_x_f16x8);
+         covariance_yx_high_f32x4 = vfmlalq_high_f16(covariance_yx_high_f32x4, a_y_f16x8, b_x_f16x8);
+         covariance_yy_low_f32x4 = vfmlalq_low_f16(covariance_yy_low_f32x4, a_y_f16x8, b_y_f16x8);
+         covariance_yy_high_f32x4 = vfmlalq_high_f16(covariance_yy_high_f32x4, a_y_f16x8, b_y_f16x8);
+         covariance_yz_low_f32x4 = vfmlalq_low_f16(covariance_yz_low_f32x4, a_y_f16x8, b_z_f16x8);
+         covariance_yz_high_f32x4 = vfmlalq_high_f16(covariance_yz_high_f32x4, a_y_f16x8, b_z_f16x8);
+         covariance_zx_low_f32x4 = vfmlalq_low_f16(covariance_zx_low_f32x4, a_z_f16x8, b_x_f16x8);
+         covariance_zx_high_f32x4 = vfmlalq_high_f16(covariance_zx_high_f32x4, a_z_f16x8, b_x_f16x8);
+         covariance_zy_low_f32x4 = vfmlalq_low_f16(covariance_zy_low_f32x4, a_z_f16x8, b_y_f16x8);
+         covariance_zy_high_f32x4 = vfmlalq_high_f16(covariance_zy_high_f32x4, a_z_f16x8, b_y_f16x8);
+         covariance_zz_low_f32x4 = vfmlalq_low_f16(covariance_zz_low_f32x4, a_z_f16x8, b_z_f16x8);
+         covariance_zz_high_f32x4 = vfmlalq_high_f16(covariance_zz_high_f32x4, a_z_f16x8, b_z_f16x8);
+
+         norm_squared_a_low_f32x4 = vfmlalq_low_f16(norm_squared_a_low_f32x4, a_x_f16x8, a_x_f16x8);
+         norm_squared_a_high_f32x4 = vfmlalq_high_f16(norm_squared_a_high_f32x4, a_x_f16x8, a_x_f16x8);
+         norm_squared_a_low_f32x4 = vfmlalq_low_f16(norm_squared_a_low_f32x4, a_y_f16x8, a_y_f16x8);
+         norm_squared_a_high_f32x4 = vfmlalq_high_f16(norm_squared_a_high_f32x4, a_y_f16x8, a_y_f16x8);
+         norm_squared_a_low_f32x4 = vfmlalq_low_f16(norm_squared_a_low_f32x4, a_z_f16x8, a_z_f16x8);
+         norm_squared_a_high_f32x4 = vfmlalq_high_f16(norm_squared_a_high_f32x4, a_z_f16x8, a_z_f16x8);
+         norm_squared_b_low_f32x4 = vfmlalq_low_f16(norm_squared_b_low_f32x4, b_x_f16x8, b_x_f16x8);
+         norm_squared_b_high_f32x4 = vfmlalq_high_f16(norm_squared_b_high_f32x4, b_x_f16x8, b_x_f16x8);
+         norm_squared_b_low_f32x4 = vfmlalq_low_f16(norm_squared_b_low_f32x4, b_y_f16x8, b_y_f16x8);
+         norm_squared_b_high_f32x4 = vfmlalq_high_f16(norm_squared_b_high_f32x4, b_y_f16x8, b_y_f16x8);
+         norm_squared_b_low_f32x4 = vfmlalq_low_f16(norm_squared_b_low_f32x4, b_z_f16x8, b_z_f16x8);
+         norm_squared_b_high_f32x4 = vfmlalq_high_f16(norm_squared_b_high_f32x4, b_z_f16x8, b_z_f16x8);
+     }
+
+     // Combine low+high halves
+     float32x4_t covariance_xx_f32x4 = vaddq_f32(covariance_xx_low_f32x4, covariance_xx_high_f32x4);
+     float32x4_t covariance_xy_f32x4 = vaddq_f32(covariance_xy_low_f32x4, covariance_xy_high_f32x4);
+     float32x4_t covariance_xz_f32x4 = vaddq_f32(covariance_xz_low_f32x4, covariance_xz_high_f32x4);
+     float32x4_t covariance_yx_f32x4 = vaddq_f32(covariance_yx_low_f32x4, covariance_yx_high_f32x4);
+     float32x4_t covariance_yy_f32x4 = vaddq_f32(covariance_yy_low_f32x4, covariance_yy_high_f32x4);
+     float32x4_t covariance_yz_f32x4 = vaddq_f32(covariance_yz_low_f32x4, covariance_yz_high_f32x4);
+     float32x4_t covariance_zx_f32x4 = vaddq_f32(covariance_zx_low_f32x4, covariance_zx_high_f32x4);
+     float32x4_t covariance_zy_f32x4 = vaddq_f32(covariance_zy_low_f32x4, covariance_zy_high_f32x4);
+     float32x4_t covariance_zz_f32x4 = vaddq_f32(covariance_zz_low_f32x4, covariance_zz_high_f32x4);
+     float32x4_t norm_squared_a_f32x4 = vaddq_f32(norm_squared_a_low_f32x4, norm_squared_a_high_f32x4);
+     float32x4_t norm_squared_b_f32x4 = vaddq_f32(norm_squared_b_low_f32x4, norm_squared_b_high_f32x4);
+
+     // Reduce vector accumulators
+     nk_f32_t sum_a_x = vaddvq_f32(sum_a_x_f32x4);
+     nk_f32_t sum_a_y = vaddvq_f32(sum_a_y_f32x4);
+     nk_f32_t sum_a_z = vaddvq_f32(sum_a_z_f32x4);
+     nk_f32_t sum_b_x = vaddvq_f32(sum_b_x_f32x4);
+     nk_f32_t sum_b_y = vaddvq_f32(sum_b_y_f32x4);
+     nk_f32_t sum_b_z = vaddvq_f32(sum_b_z_f32x4);
+
+     nk_f32_t covariance_x_x = vaddvq_f32(covariance_xx_f32x4);
+     nk_f32_t covariance_x_y = vaddvq_f32(covariance_xy_f32x4);
+     nk_f32_t covariance_x_z = vaddvq_f32(covariance_xz_f32x4);
+     nk_f32_t covariance_y_x = vaddvq_f32(covariance_yx_f32x4);
+     nk_f32_t covariance_y_y = vaddvq_f32(covariance_yy_f32x4);
+     nk_f32_t covariance_y_z = vaddvq_f32(covariance_yz_f32x4);
+     nk_f32_t covariance_z_x = vaddvq_f32(covariance_zx_f32x4);
+     nk_f32_t covariance_z_y = vaddvq_f32(covariance_zy_f32x4);
+     nk_f32_t covariance_z_z = vaddvq_f32(covariance_zz_f32x4);
+     nk_f32_t norm_squared_a = vaddvq_f32(norm_squared_a_f32x4);
+     nk_f32_t norm_squared_b = vaddvq_f32(norm_squared_b_f32x4);
+
+     // Compute centroids
+     nk_f32_t inv_n = 1.0f / (nk_f32_t)n;
+     nk_f32_t centroid_a_x = sum_a_x * inv_n;
+     nk_f32_t centroid_a_y = sum_a_y * inv_n;
+     nk_f32_t centroid_a_z = sum_a_z * inv_n;
+     nk_f32_t centroid_b_x = sum_b_x * inv_n;
+     nk_f32_t centroid_b_y = sum_b_y * inv_n;
+     nk_f32_t centroid_b_z = sum_b_z * inv_n;
+
+     if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
+     if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
+
+     // Apply centering: H_centered = H − n · centroid_a · centroid_bᵀ
+     nk_f32_t cross_covariance[9];
+     cross_covariance[0] = covariance_x_x - (nk_f32_t)n * centroid_a_x * centroid_b_x;
+     cross_covariance[1] = covariance_x_y - (nk_f32_t)n * centroid_a_x * centroid_b_y;
+     cross_covariance[2] = covariance_x_z - (nk_f32_t)n * centroid_a_x * centroid_b_z;
+     cross_covariance[3] = covariance_y_x - (nk_f32_t)n * centroid_a_y * centroid_b_x;
+     cross_covariance[4] = covariance_y_y - (nk_f32_t)n * centroid_a_y * centroid_b_y;
+     cross_covariance[5] = covariance_y_z - (nk_f32_t)n * centroid_a_y * centroid_b_z;
+     cross_covariance[6] = covariance_z_x - (nk_f32_t)n * centroid_a_z * centroid_b_x;
+     cross_covariance[7] = covariance_z_y - (nk_f32_t)n * centroid_a_z * centroid_b_y;
+     cross_covariance[8] = covariance_z_z - (nk_f32_t)n * centroid_a_z * centroid_b_z;
+
+     // Identity-dominant short-circuit: if H ≈ diag(positive entries), R = I and trace(R·H) = trace(H).
+     nk_f32_t covariance_diagonal_norm_squared = cross_covariance[0] * cross_covariance[0] +
+                                                 cross_covariance[4] * cross_covariance[4] +
+                                                 cross_covariance[8] * cross_covariance[8];
+     nk_f32_t covariance_offdiagonal_norm_squared =
+         cross_covariance[1] * cross_covariance[1] + cross_covariance[2] * cross_covariance[2] +
+         cross_covariance[3] * cross_covariance[3] + cross_covariance[5] * cross_covariance[5] +
+         cross_covariance[6] * cross_covariance[6] + cross_covariance[7] * cross_covariance[7];
+     nk_f32_t optimal_rotation[9];
+     nk_f32_t trace_rotation_covariance;
+     if (covariance_offdiagonal_norm_squared < 1e-12f * covariance_diagonal_norm_squared && cross_covariance[0] > 0.0f &&
+         cross_covariance[4] > 0.0f && cross_covariance[8] > 0.0f) {
+         optimal_rotation[0] = 1, optimal_rotation[1] = 0, optimal_rotation[2] = 0, optimal_rotation[3] = 0,
+         optimal_rotation[4] = 1, optimal_rotation[5] = 0, optimal_rotation[6] = 0, optimal_rotation[7] = 0,
+         optimal_rotation[8] = 1;
+         trace_rotation_covariance = cross_covariance[0] + cross_covariance[4] + cross_covariance[8];
+     }
+     else {
+         // SVD of H = U · S · Vᵀ
+         nk_f32_t svd_left[9], svd_diagonal[9], svd_right[9];
+         nk_svd3x3_f32_(cross_covariance, svd_left, svd_diagonal, svd_right);
+
+         // R = V · Uᵀ
+         optimal_rotation[0] = svd_right[0] * svd_left[0] + svd_right[1] * svd_left[1] + svd_right[2] * svd_left[2];
+         optimal_rotation[1] = svd_right[0] * svd_left[3] + svd_right[1] * svd_left[4] + svd_right[2] * svd_left[5];
+         optimal_rotation[2] = svd_right[0] * svd_left[6] + svd_right[1] * svd_left[7] + svd_right[2] * svd_left[8];
+         optimal_rotation[3] = svd_right[3] * svd_left[0] + svd_right[4] * svd_left[1] + svd_right[5] * svd_left[2];
+         optimal_rotation[4] = svd_right[3] * svd_left[3] + svd_right[4] * svd_left[4] + svd_right[5] * svd_left[5];
+         optimal_rotation[5] = svd_right[3] * svd_left[6] + svd_right[4] * svd_left[7] + svd_right[5] * svd_left[8];
+         optimal_rotation[6] = svd_right[6] * svd_left[0] + svd_right[7] * svd_left[1] + svd_right[8] * svd_left[2];
+         optimal_rotation[7] = svd_right[6] * svd_left[3] + svd_right[7] * svd_left[4] + svd_right[8] * svd_left[5];
+         optimal_rotation[8] = svd_right[6] * svd_left[6] + svd_right[7] * svd_left[7] + svd_right[8] * svd_left[8];
+
+         // Handle reflection: if det(R) < 0, negate third column of V and recompute
+         nk_f32_t rotation_determinant = nk_det3x3_f32_(optimal_rotation);
+         if (rotation_determinant < 0) {
+             svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8];
+             optimal_rotation[0] = svd_right[0] * svd_left[0] + svd_right[1] * svd_left[1] + svd_right[2] * svd_left[2];
+             optimal_rotation[1] = svd_right[0] * svd_left[3] + svd_right[1] * svd_left[4] + svd_right[2] * svd_left[5];
+             optimal_rotation[2] = svd_right[0] * svd_left[6] + svd_right[1] * svd_left[7] + svd_right[2] * svd_left[8];
+             optimal_rotation[3] = svd_right[3] * svd_left[0] + svd_right[4] * svd_left[1] + svd_right[5] * svd_left[2];
+             optimal_rotation[4] = svd_right[3] * svd_left[3] + svd_right[4] * svd_left[4] + svd_right[5] * svd_left[5];
+             optimal_rotation[5] = svd_right[3] * svd_left[6] + svd_right[4] * svd_left[7] + svd_right[5] * svd_left[8];
+             optimal_rotation[6] = svd_right[6] * svd_left[0] + svd_right[7] * svd_left[1] + svd_right[8] * svd_left[2];
+             optimal_rotation[7] = svd_right[6] * svd_left[3] + svd_right[7] * svd_left[4] + svd_right[8] * svd_left[5];
+             optimal_rotation[8] = svd_right[6] * svd_left[6] + svd_right[7] * svd_left[7] + svd_right[8] * svd_left[8];
+         }
+
+         trace_rotation_covariance =
+             optimal_rotation[0] * cross_covariance[0] + optimal_rotation[1] * cross_covariance[3] +
+             optimal_rotation[2] * cross_covariance[6] + //
+             optimal_rotation[3] * cross_covariance[1] + optimal_rotation[4] * cross_covariance[4] +
+             optimal_rotation[5] * cross_covariance[7] + //
+             optimal_rotation[6] * cross_covariance[2] + optimal_rotation[7] * cross_covariance[5] +
+             optimal_rotation[8] * cross_covariance[8];
+     }
+
+     if (rotation)
+         for (int j = 0; j < 9; ++j) rotation[j] = optimal_rotation[j];
+     if (scale) *scale = 1.0f;
+
+     // Folded SSD via trace identity: SSD = ‖a-ā‖² + ‖b-b̄‖² − 2·trace(R · H_centered).
+     nk_f32_t centered_norm_squared_a = norm_squared_a -
+                                        (nk_f32_t)n * (centroid_a_x * centroid_a_x + centroid_a_y * centroid_a_y +
+                                                       centroid_a_z * centroid_a_z);
+     nk_f32_t centered_norm_squared_b = norm_squared_b -
+                                        (nk_f32_t)n * (centroid_b_x * centroid_b_x + centroid_b_y * centroid_b_y +
+                                                       centroid_b_z * centroid_b_z);
+     if (centered_norm_squared_a < 0.0f) centered_norm_squared_a = 0.0f;
+     if (centered_norm_squared_b < 0.0f) centered_norm_squared_b = 0.0f;
+     nk_f32_t sum_squared = centered_norm_squared_a + centered_norm_squared_b - 2.0f * trace_rotation_covariance;
+     if (sum_squared < 0.0f) sum_squared = 0.0f;
+     *result = nk_f32_sqrt_neon(sum_squared * inv_n);
+ }
+
+ /**
+  * @brief Umeyama algorithm (Kabsch with uniform scale) using NEON FHM widening FMA.
+  * Finds rotation R and scale c minimizing ‖c·R·a − b‖² after centroid alignment.
+  */
+ NK_PUBLIC void nk_umeyama_f16_neonfhm(nk_f16_t const *a, nk_f16_t const *b, nk_size_t n, nk_f32_t *a_centroid,
+                                       nk_f32_t *b_centroid, nk_f32_t *rotation, nk_f32_t *scale, nk_f32_t *result) {
+     float32x4_t const zeros_f32x4 = vdupq_n_f32(0);
+
+     float32x4_t sum_a_x_f32x4 = zeros_f32x4, sum_a_y_f32x4 = zeros_f32x4, sum_a_z_f32x4 = zeros_f32x4;
+     float32x4_t sum_b_x_f32x4 = zeros_f32x4, sum_b_y_f32x4 = zeros_f32x4, sum_b_z_f32x4 = zeros_f32x4;
+
+     float32x4_t covariance_xx_low_f32x4 = zeros_f32x4, covariance_xx_high_f32x4 = zeros_f32x4;
+     float32x4_t covariance_xy_low_f32x4 = zeros_f32x4, covariance_xy_high_f32x4 = zeros_f32x4;
+     float32x4_t covariance_xz_low_f32x4 = zeros_f32x4, covariance_xz_high_f32x4 = zeros_f32x4;
+     float32x4_t covariance_yx_low_f32x4 = zeros_f32x4, covariance_yx_high_f32x4 = zeros_f32x4;
+     float32x4_t covariance_yy_low_f32x4 = zeros_f32x4, covariance_yy_high_f32x4 = zeros_f32x4;
+     float32x4_t covariance_yz_low_f32x4 = zeros_f32x4, covariance_yz_high_f32x4 = zeros_f32x4;
+     float32x4_t covariance_zx_low_f32x4 = zeros_f32x4, covariance_zx_high_f32x4 = zeros_f32x4;
+     float32x4_t covariance_zy_low_f32x4 = zeros_f32x4, covariance_zy_high_f32x4 = zeros_f32x4;
+     float32x4_t covariance_zz_low_f32x4 = zeros_f32x4, covariance_zz_high_f32x4 = zeros_f32x4;
+     float32x4_t norm_squared_a_low_f32x4 = zeros_f32x4, norm_squared_a_high_f32x4 = zeros_f32x4;
+     float32x4_t norm_squared_b_low_f32x4 = zeros_f32x4, norm_squared_b_high_f32x4 = zeros_f32x4;
+
+     float16x8_t a_x_f16x8, a_y_f16x8, a_z_f16x8;
+     float16x8_t b_x_f16x8, b_y_f16x8, b_z_f16x8;
+     nk_size_t i = 0;
+
+     for (; i + 8 <= n; i += 8) {
+         nk_deinterleave_f16x8_to_f16x8x3_neonfhm_(a + i * 3, &a_x_f16x8, &a_y_f16x8, &a_z_f16x8);
+         nk_deinterleave_f16x8_to_f16x8x3_neonfhm_(b + i * 3, &b_x_f16x8, &b_y_f16x8, &b_z_f16x8);
+
+         sum_a_x_f32x4 = vaddq_f32(sum_a_x_f32x4, vcvt_f32_f16(vget_low_f16(a_x_f16x8)));
+         sum_a_x_f32x4 = vaddq_f32(sum_a_x_f32x4, vcvt_high_f32_f16(a_x_f16x8));
+         sum_a_y_f32x4 = vaddq_f32(sum_a_y_f32x4, vcvt_f32_f16(vget_low_f16(a_y_f16x8)));
+         sum_a_y_f32x4 = vaddq_f32(sum_a_y_f32x4, vcvt_high_f32_f16(a_y_f16x8));
+         sum_a_z_f32x4 = vaddq_f32(sum_a_z_f32x4, vcvt_f32_f16(vget_low_f16(a_z_f16x8)));
+         sum_a_z_f32x4 = vaddq_f32(sum_a_z_f32x4, vcvt_high_f32_f16(a_z_f16x8));
+         sum_b_x_f32x4 = vaddq_f32(sum_b_x_f32x4, vcvt_f32_f16(vget_low_f16(b_x_f16x8)));
+         sum_b_x_f32x4 = vaddq_f32(sum_b_x_f32x4, vcvt_high_f32_f16(b_x_f16x8));
+         sum_b_y_f32x4 = vaddq_f32(sum_b_y_f32x4, vcvt_f32_f16(vget_low_f16(b_y_f16x8)));
+         sum_b_y_f32x4 = vaddq_f32(sum_b_y_f32x4, vcvt_high_f32_f16(b_y_f16x8));
+         sum_b_z_f32x4 = vaddq_f32(sum_b_z_f32x4, vcvt_f32_f16(vget_low_f16(b_z_f16x8)));
+         sum_b_z_f32x4 = vaddq_f32(sum_b_z_f32x4, vcvt_high_f32_f16(b_z_f16x8));
+
+         covariance_xx_low_f32x4 = vfmlalq_low_f16(covariance_xx_low_f32x4, a_x_f16x8, b_x_f16x8);
+         covariance_xx_high_f32x4 = vfmlalq_high_f16(covariance_xx_high_f32x4, a_x_f16x8, b_x_f16x8);
+         covariance_xy_low_f32x4 = vfmlalq_low_f16(covariance_xy_low_f32x4, a_x_f16x8, b_y_f16x8);
+         covariance_xy_high_f32x4 = vfmlalq_high_f16(covariance_xy_high_f32x4, a_x_f16x8, b_y_f16x8);
+         covariance_xz_low_f32x4 = vfmlalq_low_f16(covariance_xz_low_f32x4, a_x_f16x8, b_z_f16x8);
+         covariance_xz_high_f32x4 = vfmlalq_high_f16(covariance_xz_high_f32x4, a_x_f16x8, b_z_f16x8);
+         covariance_yx_low_f32x4 = vfmlalq_low_f16(covariance_yx_low_f32x4, a_y_f16x8, b_x_f16x8);
+         covariance_yx_high_f32x4 = vfmlalq_high_f16(covariance_yx_high_f32x4, a_y_f16x8, b_x_f16x8);
+         covariance_yy_low_f32x4 = vfmlalq_low_f16(covariance_yy_low_f32x4, a_y_f16x8, b_y_f16x8);
+         covariance_yy_high_f32x4 = vfmlalq_high_f16(covariance_yy_high_f32x4, a_y_f16x8, b_y_f16x8);
+         covariance_yz_low_f32x4 = vfmlalq_low_f16(covariance_yz_low_f32x4, a_y_f16x8, b_z_f16x8);
+         covariance_yz_high_f32x4 = vfmlalq_high_f16(covariance_yz_high_f32x4, a_y_f16x8, b_z_f16x8);
+         covariance_zx_low_f32x4 = vfmlalq_low_f16(covariance_zx_low_f32x4, a_z_f16x8, b_x_f16x8);
+         covariance_zx_high_f32x4 = vfmlalq_high_f16(covariance_zx_high_f32x4, a_z_f16x8, b_x_f16x8);
+         covariance_zy_low_f32x4 = vfmlalq_low_f16(covariance_zy_low_f32x4, a_z_f16x8, b_y_f16x8);
+         covariance_zy_high_f32x4 = vfmlalq_high_f16(covariance_zy_high_f32x4, a_z_f16x8, b_y_f16x8);
+         covariance_zz_low_f32x4 = vfmlalq_low_f16(covariance_zz_low_f32x4, a_z_f16x8, b_z_f16x8);
+         covariance_zz_high_f32x4 = vfmlalq_high_f16(covariance_zz_high_f32x4, a_z_f16x8, b_z_f16x8);
+
+         norm_squared_a_low_f32x4 = vfmlalq_low_f16(norm_squared_a_low_f32x4, a_x_f16x8, a_x_f16x8);
+         norm_squared_a_high_f32x4 = vfmlalq_high_f16(norm_squared_a_high_f32x4, a_x_f16x8, a_x_f16x8);
+         norm_squared_a_low_f32x4 = vfmlalq_low_f16(norm_squared_a_low_f32x4, a_y_f16x8, a_y_f16x8);
+         norm_squared_a_high_f32x4 = vfmlalq_high_f16(norm_squared_a_high_f32x4, a_y_f16x8, a_y_f16x8);
+         norm_squared_a_low_f32x4 = vfmlalq_low_f16(norm_squared_a_low_f32x4, a_z_f16x8, a_z_f16x8);
+         norm_squared_a_high_f32x4 = vfmlalq_high_f16(norm_squared_a_high_f32x4, a_z_f16x8, a_z_f16x8);
+         norm_squared_b_low_f32x4 = vfmlalq_low_f16(norm_squared_b_low_f32x4, b_x_f16x8, b_x_f16x8);
+         norm_squared_b_high_f32x4 = vfmlalq_high_f16(norm_squared_b_high_f32x4, b_x_f16x8, b_x_f16x8);
+         norm_squared_b_low_f32x4 = vfmlalq_low_f16(norm_squared_b_low_f32x4, b_y_f16x8, b_y_f16x8);
+         norm_squared_b_high_f32x4 = vfmlalq_high_f16(norm_squared_b_high_f32x4, b_y_f16x8, b_y_f16x8);
+         norm_squared_b_low_f32x4 = vfmlalq_low_f16(norm_squared_b_low_f32x4, b_z_f16x8, b_z_f16x8);
+         norm_squared_b_high_f32x4 = vfmlalq_high_f16(norm_squared_b_high_f32x4, b_z_f16x8, b_z_f16x8);
+     }
+
+     if (i < n) {
+         nk_partial_deinterleave_f16_to_f16x8x3_neonfhm_(a + i * 3, n - i, &a_x_f16x8, &a_y_f16x8, &a_z_f16x8);
+         nk_partial_deinterleave_f16_to_f16x8x3_neonfhm_(b + i * 3, n - i, &b_x_f16x8, &b_y_f16x8, &b_z_f16x8);
+
+         sum_a_x_f32x4 = vaddq_f32(sum_a_x_f32x4, vcvt_f32_f16(vget_low_f16(a_x_f16x8)));
+         sum_a_x_f32x4 = vaddq_f32(sum_a_x_f32x4, vcvt_high_f32_f16(a_x_f16x8));
+         sum_a_y_f32x4 = vaddq_f32(sum_a_y_f32x4, vcvt_f32_f16(vget_low_f16(a_y_f16x8)));
+         sum_a_y_f32x4 = vaddq_f32(sum_a_y_f32x4, vcvt_high_f32_f16(a_y_f16x8));
+         sum_a_z_f32x4 = vaddq_f32(sum_a_z_f32x4, vcvt_f32_f16(vget_low_f16(a_z_f16x8)));
+         sum_a_z_f32x4 = vaddq_f32(sum_a_z_f32x4, vcvt_high_f32_f16(a_z_f16x8));
+         sum_b_x_f32x4 = vaddq_f32(sum_b_x_f32x4, vcvt_f32_f16(vget_low_f16(b_x_f16x8)));
+         sum_b_x_f32x4 = vaddq_f32(sum_b_x_f32x4, vcvt_high_f32_f16(b_x_f16x8));
+         sum_b_y_f32x4 = vaddq_f32(sum_b_y_f32x4, vcvt_f32_f16(vget_low_f16(b_y_f16x8)));
+         sum_b_y_f32x4 = vaddq_f32(sum_b_y_f32x4, vcvt_high_f32_f16(b_y_f16x8));
+         sum_b_z_f32x4 = vaddq_f32(sum_b_z_f32x4, vcvt_f32_f16(vget_low_f16(b_z_f16x8)));
+         sum_b_z_f32x4 = vaddq_f32(sum_b_z_f32x4, vcvt_high_f32_f16(b_z_f16x8));
+
+         covariance_xx_low_f32x4 = vfmlalq_low_f16(covariance_xx_low_f32x4, a_x_f16x8, b_x_f16x8);
+         covariance_xx_high_f32x4 = vfmlalq_high_f16(covariance_xx_high_f32x4, a_x_f16x8, b_x_f16x8);
+         covariance_xy_low_f32x4 = vfmlalq_low_f16(covariance_xy_low_f32x4, a_x_f16x8, b_y_f16x8);
+         covariance_xy_high_f32x4 = vfmlalq_high_f16(covariance_xy_high_f32x4, a_x_f16x8, b_y_f16x8);
+         covariance_xz_low_f32x4 = vfmlalq_low_f16(covariance_xz_low_f32x4, a_x_f16x8, b_z_f16x8);
+         covariance_xz_high_f32x4 = vfmlalq_high_f16(covariance_xz_high_f32x4, a_x_f16x8, b_z_f16x8);
+         covariance_yx_low_f32x4 = vfmlalq_low_f16(covariance_yx_low_f32x4, a_y_f16x8, b_x_f16x8);
+         covariance_yx_high_f32x4 = vfmlalq_high_f16(covariance_yx_high_f32x4, a_y_f16x8, b_x_f16x8);
+         covariance_yy_low_f32x4 = vfmlalq_low_f16(covariance_yy_low_f32x4, a_y_f16x8, b_y_f16x8);
+         covariance_yy_high_f32x4 = vfmlalq_high_f16(covariance_yy_high_f32x4, a_y_f16x8, b_y_f16x8);
+         covariance_yz_low_f32x4 = vfmlalq_low_f16(covariance_yz_low_f32x4, a_y_f16x8, b_z_f16x8);
+         covariance_yz_high_f32x4 = vfmlalq_high_f16(covariance_yz_high_f32x4, a_y_f16x8, b_z_f16x8);
+         covariance_zx_low_f32x4 = vfmlalq_low_f16(covariance_zx_low_f32x4, a_z_f16x8, b_x_f16x8);
+         covariance_zx_high_f32x4 = vfmlalq_high_f16(covariance_zx_high_f32x4, a_z_f16x8, b_x_f16x8);
+         covariance_zy_low_f32x4 = vfmlalq_low_f16(covariance_zy_low_f32x4, a_z_f16x8, b_y_f16x8);
+         covariance_zy_high_f32x4 = vfmlalq_high_f16(covariance_zy_high_f32x4, a_z_f16x8, b_y_f16x8);
+         covariance_zz_low_f32x4 = vfmlalq_low_f16(covariance_zz_low_f32x4, a_z_f16x8, b_z_f16x8);
+         covariance_zz_high_f32x4 = vfmlalq_high_f16(covariance_zz_high_f32x4, a_z_f16x8, b_z_f16x8);
+
+         norm_squared_a_low_f32x4 = vfmlalq_low_f16(norm_squared_a_low_f32x4, a_x_f16x8, a_x_f16x8);
+         norm_squared_a_high_f32x4 = vfmlalq_high_f16(norm_squared_a_high_f32x4, a_x_f16x8, a_x_f16x8);
+         norm_squared_a_low_f32x4 = vfmlalq_low_f16(norm_squared_a_low_f32x4, a_y_f16x8, a_y_f16x8);
+         norm_squared_a_high_f32x4 = vfmlalq_high_f16(norm_squared_a_high_f32x4, a_y_f16x8, a_y_f16x8);
+         norm_squared_a_low_f32x4 = vfmlalq_low_f16(norm_squared_a_low_f32x4, a_z_f16x8, a_z_f16x8);
+         norm_squared_a_high_f32x4 = vfmlalq_high_f16(norm_squared_a_high_f32x4, a_z_f16x8, a_z_f16x8);
+         norm_squared_b_low_f32x4 = vfmlalq_low_f16(norm_squared_b_low_f32x4, b_x_f16x8, b_x_f16x8);
+         norm_squared_b_high_f32x4 = vfmlalq_high_f16(norm_squared_b_high_f32x4, b_x_f16x8, b_x_f16x8);
+         norm_squared_b_low_f32x4 = vfmlalq_low_f16(norm_squared_b_low_f32x4, b_y_f16x8, b_y_f16x8);
+         norm_squared_b_high_f32x4 = vfmlalq_high_f16(norm_squared_b_high_f32x4, b_y_f16x8, b_y_f16x8);
+         norm_squared_b_low_f32x4 = vfmlalq_low_f16(norm_squared_b_low_f32x4, b_z_f16x8, b_z_f16x8);
+         norm_squared_b_high_f32x4 = vfmlalq_high_f16(norm_squared_b_high_f32x4, b_z_f16x8, b_z_f16x8);
+     }
+
+     // Combine low+high halves
+     float32x4_t covariance_xx_f32x4 = vaddq_f32(covariance_xx_low_f32x4, covariance_xx_high_f32x4);
+     float32x4_t covariance_xy_f32x4 = vaddq_f32(covariance_xy_low_f32x4, covariance_xy_high_f32x4);
+     float32x4_t covariance_xz_f32x4 = vaddq_f32(covariance_xz_low_f32x4, covariance_xz_high_f32x4);
+     float32x4_t covariance_yx_f32x4 = vaddq_f32(covariance_yx_low_f32x4, covariance_yx_high_f32x4);
+     float32x4_t covariance_yy_f32x4 = vaddq_f32(covariance_yy_low_f32x4, covariance_yy_high_f32x4);
+     float32x4_t covariance_yz_f32x4 = vaddq_f32(covariance_yz_low_f32x4, covariance_yz_high_f32x4);
+     float32x4_t covariance_zx_f32x4 = vaddq_f32(covariance_zx_low_f32x4, covariance_zx_high_f32x4);
+     float32x4_t covariance_zy_f32x4 = vaddq_f32(covariance_zy_low_f32x4, covariance_zy_high_f32x4);
+     float32x4_t covariance_zz_f32x4 = vaddq_f32(covariance_zz_low_f32x4, covariance_zz_high_f32x4);
+     float32x4_t norm_squared_a_f32x4 = vaddq_f32(norm_squared_a_low_f32x4, norm_squared_a_high_f32x4);
+     float32x4_t norm_squared_b_f32x4 = vaddq_f32(norm_squared_b_low_f32x4, norm_squared_b_high_f32x4);
+
+     // Reduce vector accumulators
+     nk_f32_t sum_a_x = vaddvq_f32(sum_a_x_f32x4);
+     nk_f32_t sum_a_y = vaddvq_f32(sum_a_y_f32x4);
+     nk_f32_t sum_a_z = vaddvq_f32(sum_a_z_f32x4);
+     nk_f32_t sum_b_x = vaddvq_f32(sum_b_x_f32x4);
+     nk_f32_t sum_b_y = vaddvq_f32(sum_b_y_f32x4);
+     nk_f32_t sum_b_z = vaddvq_f32(sum_b_z_f32x4);
+     nk_f32_t covariance_x_x = vaddvq_f32(covariance_xx_f32x4);
+     nk_f32_t covariance_x_y = vaddvq_f32(covariance_xy_f32x4);
+     nk_f32_t covariance_x_z = vaddvq_f32(covariance_xz_f32x4);
+     nk_f32_t covariance_y_x = vaddvq_f32(covariance_yx_f32x4);
+     nk_f32_t covariance_y_y = vaddvq_f32(covariance_yy_f32x4);
+     nk_f32_t covariance_y_z = vaddvq_f32(covariance_yz_f32x4);
+     nk_f32_t covariance_z_x = vaddvq_f32(covariance_zx_f32x4);
+     nk_f32_t covariance_z_y = vaddvq_f32(covariance_zy_f32x4);
+     nk_f32_t covariance_z_z = vaddvq_f32(covariance_zz_f32x4);
+     nk_f32_t norm_squared_a = vaddvq_f32(norm_squared_a_f32x4);
+     nk_f32_t norm_squared_b = vaddvq_f32(norm_squared_b_f32x4);
+
+     // Compute centroids
+     nk_f32_t inv_n = 1.0f / (nk_f32_t)n;
+     nk_f32_t centroid_a_x = sum_a_x * inv_n, centroid_a_y = sum_a_y * inv_n, centroid_a_z = sum_a_z * inv_n;
+     nk_f32_t centroid_b_x = sum_b_x * inv_n, centroid_b_y = sum_b_y * inv_n, centroid_b_z = sum_b_z * inv_n;
+
+     if (a_centroid) a_centroid[0] = centroid_a_x, a_centroid[1] = centroid_a_y, a_centroid[2] = centroid_a_z;
+     if (b_centroid) b_centroid[0] = centroid_b_x, b_centroid[1] = centroid_b_y, b_centroid[2] = centroid_b_z;
+
+     // Centered norm-squared via parallel-axis identity; clamp at zero for numeric safety.
+     nk_f32_t centered_norm_squared_a = norm_squared_a -
+                                        (nk_f32_t)n * (centroid_a_x * centroid_a_x + centroid_a_y * centroid_a_y +
+                                                       centroid_a_z * centroid_a_z);
+     nk_f32_t centered_norm_squared_b = norm_squared_b -
+                                        (nk_f32_t)n * (centroid_b_x * centroid_b_x + centroid_b_y * centroid_b_y +
+                                                       centroid_b_z * centroid_b_z);
+     if (centered_norm_squared_a < 0.0f) centered_norm_squared_a = 0.0f;
+     if (centered_norm_squared_b < 0.0f) centered_norm_squared_b = 0.0f;
+
+     nk_f32_t cross_covariance[9];
+     cross_covariance[0] = covariance_x_x - (nk_f32_t)n * centroid_a_x * centroid_b_x;
+     cross_covariance[1] = covariance_x_y - (nk_f32_t)n * centroid_a_x * centroid_b_y;
+     cross_covariance[2] = covariance_x_z - (nk_f32_t)n * centroid_a_x * centroid_b_z;
+     cross_covariance[3] = covariance_y_x - (nk_f32_t)n * centroid_a_y * centroid_b_x;
+     cross_covariance[4] = covariance_y_y - (nk_f32_t)n * centroid_a_y * centroid_b_y;
+     cross_covariance[5] = covariance_y_z - (nk_f32_t)n * centroid_a_y * centroid_b_z;
+     cross_covariance[6] = covariance_z_x - (nk_f32_t)n * centroid_a_z * centroid_b_x;
+     cross_covariance[7] = covariance_z_y - (nk_f32_t)n * centroid_a_z * centroid_b_y;
+     cross_covariance[8] = covariance_z_z - (nk_f32_t)n * centroid_a_z * centroid_b_z;
+
+     // Identity-dominant short-circuit: if H ≈ diag(positive entries), R = I and trace(R·H) = trace(H).
+     nk_f32_t covariance_diagonal_norm_squared = cross_covariance[0] * cross_covariance[0] +
+                                                 cross_covariance[4] * cross_covariance[4] +
+                                                 cross_covariance[8] * cross_covariance[8];
+     nk_f32_t covariance_offdiagonal_norm_squared =
+         cross_covariance[1] * cross_covariance[1] + cross_covariance[2] * cross_covariance[2] +
+         cross_covariance[3] * cross_covariance[3] + cross_covariance[5] * cross_covariance[5] +
+         cross_covariance[6] * cross_covariance[6] + cross_covariance[7] * cross_covariance[7];
+     nk_f32_t optimal_rotation[9];
+     nk_f32_t trace_rotation_covariance;
+     nk_f32_t scale_factor;
+     if (covariance_offdiagonal_norm_squared < 1e-12f * covariance_diagonal_norm_squared && cross_covariance[0] > 0.0f &&
+         cross_covariance[4] > 0.0f && cross_covariance[8] > 0.0f) {
+         optimal_rotation[0] = 1, optimal_rotation[1] = 0, optimal_rotation[2] = 0, optimal_rotation[3] = 0,
+         optimal_rotation[4] = 1, optimal_rotation[5] = 0, optimal_rotation[6] = 0, optimal_rotation[7] = 0,
+         optimal_rotation[8] = 1;
+         trace_rotation_covariance = cross_covariance[0] + cross_covariance[4] + cross_covariance[8];
+         scale_factor = centered_norm_squared_a > 0.0f ? trace_rotation_covariance / centered_norm_squared_a : 0.0f;
+     }
+     else {
+         // SVD of H = U · S · Vᵀ
+         nk_f32_t svd_left[9], svd_diagonal[9], svd_right[9];
+         nk_svd3x3_f32_(cross_covariance, svd_left, svd_diagonal, svd_right);
+
+         // R = V · Uᵀ
+         optimal_rotation[0] = svd_right[0] * svd_left[0] + svd_right[1] * svd_left[1] + svd_right[2] * svd_left[2];
+         optimal_rotation[1] = svd_right[0] * svd_left[3] + svd_right[1] * svd_left[4] + svd_right[2] * svd_left[5];
+         optimal_rotation[2] = svd_right[0] * svd_left[6] + svd_right[1] * svd_left[7] + svd_right[2] * svd_left[8];
+         optimal_rotation[3] = svd_right[3] * svd_left[0] + svd_right[4] * svd_left[1] + svd_right[5] * svd_left[2];
+         optimal_rotation[4] = svd_right[3] * svd_left[3] + svd_right[4] * svd_left[4] + svd_right[5] * svd_left[5];
+         optimal_rotation[5] = svd_right[3] * svd_left[6] + svd_right[4] * svd_left[7] + svd_right[5] * svd_left[8];
+         optimal_rotation[6] = svd_right[6] * svd_left[0] + svd_right[7] * svd_left[1] + svd_right[8] * svd_left[2];
+         optimal_rotation[7] = svd_right[6] * svd_left[3] + svd_right[7] * svd_left[4] + svd_right[8] * svd_left[5];
+         optimal_rotation[8] = svd_right[6] * svd_left[6] + svd_right[7] * svd_left[7] + svd_right[8] * svd_left[8];
+
+         // Handle reflection and compute scale: c = trace(D · S) / ‖a-ā‖²
+         nk_f32_t rotation_determinant = nk_det3x3_f32_(optimal_rotation);
+         nk_f32_t sign_det = rotation_determinant < 0 ? -1.0f : 1.0f;
+         nk_f32_t trace_scaled_s = svd_diagonal[0] + svd_diagonal[4] + sign_det * svd_diagonal[8];
+         scale_factor = centered_norm_squared_a > 0.0f ? trace_scaled_s / centered_norm_squared_a : 0.0f;
+
+         if (rotation_determinant < 0) {
+             svd_right[2] = -svd_right[2], svd_right[5] = -svd_right[5], svd_right[8] = -svd_right[8];
+             optimal_rotation[0] = svd_right[0] * svd_left[0] + svd_right[1] * svd_left[1] + svd_right[2] * svd_left[2];
+             optimal_rotation[1] = svd_right[0] * svd_left[3] + svd_right[1] * svd_left[4] + svd_right[2] * svd_left[5];
+             optimal_rotation[2] = svd_right[0] * svd_left[6] + svd_right[1] * svd_left[7] + svd_right[2] * svd_left[8];
+             optimal_rotation[3] = svd_right[3] * svd_left[0] + svd_right[4] * svd_left[1] + svd_right[5] * svd_left[2];
+             optimal_rotation[4] = svd_right[3] * svd_left[3] + svd_right[4] * svd_left[4] + svd_right[5] * svd_left[5];
+             optimal_rotation[5] = svd_right[3] * svd_left[6] + svd_right[4] * svd_left[7] + svd_right[5] * svd_left[8];
+             optimal_rotation[6] = svd_right[6] * svd_left[0] + svd_right[7] * svd_left[1] + svd_right[8] * svd_left[2];
+             optimal_rotation[7] = svd_right[6] * svd_left[3] + svd_right[7] * svd_left[4] + svd_right[8] * svd_left[5];
+             optimal_rotation[8] = svd_right[6] * svd_left[6] + svd_right[7] * svd_left[7] + svd_right[8] * svd_left[8];
+         }
+
+         trace_rotation_covariance =
+             optimal_rotation[0] * cross_covariance[0] + optimal_rotation[1] * cross_covariance[3] +
+             optimal_rotation[2] * cross_covariance[6] + //
+             optimal_rotation[3] * cross_covariance[1] + optimal_rotation[4] * cross_covariance[4] +
+             optimal_rotation[5] * cross_covariance[7] + //
+             optimal_rotation[6] * cross_covariance[2] + optimal_rotation[7] * cross_covariance[5] +
+             optimal_rotation[8] * cross_covariance[8];
+     }
+     if (scale) *scale = scale_factor;
+
+     if (rotation)
+         for (int j = 0; j < 9; ++j) rotation[j] = optimal_rotation[j];
+
+     // Folded SSD with scale: c²·‖a-ā‖² + ‖b-b̄‖² − 2c·trace(R · H_centered).
+     nk_f32_t sum_squared = scale_factor * scale_factor * centered_norm_squared_a + centered_norm_squared_b -
+                            2.0f * scale_factor * trace_rotation_covariance;
+     if (sum_squared < 0.0f) sum_squared = 0.0f;
+     *result = nk_f32_sqrt_neon(sum_squared * inv_n);
+ }
+
+ #if defined(__clang__)
+ #pragma clang attribute pop
+ #elif defined(__GNUC__)
+ #pragma GCC pop_options
+ #endif
+
+ #if defined(__cplusplus)
+ } // extern "C"
+ #endif
+
+ #endif // NK_TARGET_NEONFHM
+ #endif // NK_TARGET_ARM64_
+ #endif // NK_MESH_NEONFHM_H
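
For reference, a hypothetical call site for the three new f16 kernels, based only on the signatures in the diff above. It assumes the umbrella header numkong/mesh.h pulls in the NEON-FHM backend on supported builds; the buffer names and the wrapper function are illustrative, not part of the package.

#include "numkong/mesh.h"

// Aligns two interleaved xyz fp16 point clouds of `n_points` points each.
void align_example(nk_f16_t const *cloud_a, nk_f16_t const *cloud_b, nk_size_t n_points) {
    nk_f32_t a_centroid[3], b_centroid[3], rotation[9], scale, rmsd;

    // Raw RMSD: no centering, no rotation fitting (identity rotation, zero centroids).
    nk_rmsd_f16_neonfhm(cloud_a, cloud_b, n_points, a_centroid, b_centroid, rotation, &scale, &rmsd);

    // Kabsch: optimal rigid-body superposition (rotation + centroids, unit scale).
    nk_kabsch_f16_neonfhm(cloud_a, cloud_b, n_points, a_centroid, b_centroid, rotation, &scale, &rmsd);

    // Umeyama: same, but also fits a uniform scale factor.
    nk_umeyama_f16_neonfhm(cloud_a, cloud_b, n_points, a_centroid, b_centroid, rotation, &scale, &rmsd);
}

All three kernels share one signature: the centroid, rotation, and scale outputs are guarded with null checks in the code above, so they can be passed as NULL when only the residual is needed, while the final result pointer is always written.
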