numkong 7.4.5 → 7.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/README.md +1 -0
  2. package/binding.gyp +81 -5
  3. package/c/dispatch_f16.c +23 -0
  4. package/c/numkong.c +0 -13
  5. package/include/numkong/attention/sme.h +34 -31
  6. package/include/numkong/capabilities.h +2 -15
  7. package/include/numkong/cast/neon.h +15 -0
  8. package/include/numkong/curved/smef64.h +82 -62
  9. package/include/numkong/dot/rvvbf16.h +1 -1
  10. package/include/numkong/dot/rvvhalf.h +1 -1
  11. package/include/numkong/dot/sve.h +6 -5
  12. package/include/numkong/dot/svebfdot.h +2 -1
  13. package/include/numkong/dot/svehalf.h +6 -5
  14. package/include/numkong/dot/svesdot.h +3 -2
  15. package/include/numkong/dots/graniteamx.h +733 -0
  16. package/include/numkong/dots/serial.h +11 -4
  17. package/include/numkong/dots/sme.h +172 -140
  18. package/include/numkong/dots/smebi32.h +14 -11
  19. package/include/numkong/dots/smef64.h +31 -26
  20. package/include/numkong/dots.h +29 -3
  21. package/include/numkong/each/serial.h +22 -0
  22. package/include/numkong/geospatial/haswell.h +1 -1
  23. package/include/numkong/geospatial/neon.h +1 -1
  24. package/include/numkong/geospatial/serial.h +1 -1
  25. package/include/numkong/geospatial/skylake.h +1 -1
  26. package/include/numkong/maxsim/sme.h +34 -33
  27. package/include/numkong/mesh/serial.h +22 -0
  28. package/include/numkong/reduce/neon.h +29 -0
  29. package/include/numkong/reduce/neonbfdot.h +2 -2
  30. package/include/numkong/reduce/neonfhm.h +4 -4
  31. package/include/numkong/reduce/sve.h +52 -0
  32. package/include/numkong/reduce.h +4 -0
  33. package/include/numkong/set/sve.h +6 -5
  34. package/include/numkong/sets/smebi32.h +35 -30
  35. package/include/numkong/sparse/sve2.h +3 -2
  36. package/include/numkong/spatial/sve.h +7 -6
  37. package/include/numkong/spatial/svebfdot.h +7 -4
  38. package/include/numkong/spatial/svehalf.h +5 -4
  39. package/include/numkong/spatial/svesdot.h +9 -8
  40. package/include/numkong/spatials/graniteamx.h +173 -0
  41. package/include/numkong/spatials/serial.h +22 -0
  42. package/include/numkong/spatials/sme.h +391 -350
  43. package/include/numkong/spatials/smef64.h +79 -70
  44. package/include/numkong/spatials.h +37 -4
  45. package/include/numkong/types.h +59 -0
  46. package/javascript/dist/cjs/numkong.js +13 -0
  47. package/javascript/dist/esm/numkong.js +13 -0
  48. package/javascript/numkong.c +56 -12
  49. package/javascript/numkong.ts +13 -0
  50. package/package.json +7 -7
  51. package/probes/probe.js +2 -2
  52. package/wasm/numkong.wasm +0 -0
@@ -13,6 +13,7 @@
13
13
  #if NK_TARGET_SME
14
14
 
15
15
  #include "numkong/dots/serial.h"
16
+ #include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
16
17
  #include "numkong/dots/smef64.h"
17
18
 
18
19
  #if defined(__cplusplus)
@@ -44,7 +45,7 @@ NK_PUBLIC nk_f64_t nk_dots_reduce_sumsq_f32_ssve_(nk_f32_t const *data, nk_size_
44
45
  svfloat64_t values_odd_f64x = svcvtlt_f64_f32_x(predicate_odd_b64x, values_f32x);
45
46
  accumulator_odd_f64x = svmla_f64_m(predicate_odd_b64x, accumulator_odd_f64x, values_odd_f64x, values_odd_f64x);
46
47
  }
47
- return svaddv_f64(svptrue_b64(), accumulator_even_f64x) + svaddv_f64(svptrue_b64(), accumulator_odd_f64x);
48
+ return nk_svaddv_f64_(svptrue_b64(), accumulator_even_f64x) + nk_svaddv_f64_(svptrue_b64(), accumulator_odd_f64x);
48
49
  }
49
50
 
50
51
  NK_PUBLIC nk_f64_t nk_dots_reduce_sumsq_f64_ssve_(nk_f64_t const *data, nk_size_t count) NK_STREAMING_ {
@@ -55,7 +56,7 @@ NK_PUBLIC nk_f64_t nk_dots_reduce_sumsq_f64_ssve_(nk_f64_t const *data, nk_size_
55
56
  svfloat64_t values_f64x = svld1_f64(predicate_b64x, data + i);
56
57
  accumulator_f64x = svmla_f64_m(predicate_b64x, accumulator_f64x, values_f64x, values_f64x);
57
58
  }
58
- return svaddv_f64(svptrue_b64(), accumulator_f64x);
59
+ return nk_svaddv_f64_(svptrue_b64(), accumulator_f64x);
59
60
  }
60
61
 
61
62
  NK_PUBLIC svfloat64_t nk_angulars_from_dot_f64x_ssvef64_(svbool_t predicate_b64x, svfloat64_t dots_f64x,
@@ -85,10 +86,9 @@ NK_PUBLIC svfloat64_t nk_euclideans_from_dot_f64x_ssvef64_(svbool_t predicate_b6
85
86
 
86
87
  #pragma region F32 Packed Angular
87
88
 
88
- __arm_locally_streaming static void nk_angulars_packed_f32_smef64_finalize_streaming_( //
89
- nk_f32_t const *a, void const *b_packed, nk_f64_t *c, //
90
- nk_size_t rows, nk_size_t columns, nk_size_t depth, //
91
- nk_size_t a_stride_elements, nk_size_t c_stride_elements) {
89
+ static void nk_angulars_packed_f32_smef64_finalize_ssve_( //
90
+ nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
91
+ nk_size_t a_stride_elements, nk_size_t c_stride_elements) NK_STREAMING_ {
92
92
 
93
93
  nk_dots_sme_packed_header_t const *header = (nk_dots_sme_packed_header_t const *)b_packed;
94
94
  nk_f64_t const *b_norms = (nk_f64_t const *)((char const *)b_packed + header->norms_offset);
@@ -110,26 +110,26 @@ __arm_locally_streaming static void nk_angulars_packed_f32_smef64_finalize_strea
110
110
  }
111
111
  }
112
112
 
113
- NK_PUBLIC void nk_angulars_packed_f32_smef64( //
114
- nk_f32_t const *a, void const *b_packed, nk_f64_t *c, //
115
- nk_size_t rows, nk_size_t columns, nk_size_t depth, //
113
+ NK_PUBLIC void nk_angulars_packed_f32_smef64( //
114
+ nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
116
115
  nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
117
116
 
118
117
  nk_size_t const a_stride_elements = a_stride_in_bytes / sizeof(nk_f32_t);
119
118
  nk_size_t const c_stride_elements = c_stride_in_bytes / sizeof(nk_f64_t);
120
119
 
120
+ nk_sme_start_streaming_();
121
121
  nk_dots_packed_f32_smef64_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements, c_stride_elements);
122
- nk_angulars_packed_f32_smef64_finalize_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements,
123
- c_stride_elements);
122
+ nk_angulars_packed_f32_smef64_finalize_ssve_(a, b_packed, c, rows, columns, depth, a_stride_elements,
123
+ c_stride_elements);
124
+ nk_sme_stop_streaming_();
124
125
  }
125
126
 
126
127
  #pragma endregion F32 Packed Angular
127
128
  #pragma region F32 Packed Euclidean
128
129
 
129
- __arm_locally_streaming static void nk_euclideans_packed_f32_smef64_finalize_streaming_( //
130
- nk_f32_t const *a, void const *b_packed, nk_f64_t *c, //
131
- nk_size_t rows, nk_size_t columns, nk_size_t depth, //
132
- nk_size_t a_stride_elements, nk_size_t c_stride_elements) {
130
+ static void nk_euclideans_packed_f32_smef64_finalize_ssve_( //
131
+ nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
132
+ nk_size_t a_stride_elements, nk_size_t c_stride_elements) NK_STREAMING_ {
133
133
 
134
134
  nk_dots_sme_packed_header_t const *header = (nk_dots_sme_packed_header_t const *)b_packed;
135
135
  nk_f64_t const *b_norms = (nk_f64_t const *)((char const *)b_packed + header->norms_offset);
@@ -151,25 +151,26 @@ __arm_locally_streaming static void nk_euclideans_packed_f32_smef64_finalize_str
151
151
  }
152
152
  }
153
153
 
154
- NK_PUBLIC void nk_euclideans_packed_f32_smef64( //
155
- nk_f32_t const *a, void const *b_packed, nk_f64_t *c, //
156
- nk_size_t rows, nk_size_t columns, nk_size_t depth, //
154
+ NK_PUBLIC void nk_euclideans_packed_f32_smef64( //
155
+ nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
157
156
  nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
158
157
 
159
158
  nk_size_t const a_stride_elements = a_stride_in_bytes / sizeof(nk_f32_t);
160
159
  nk_size_t const c_stride_elements = c_stride_in_bytes / sizeof(nk_f64_t);
161
160
 
161
+ nk_sme_start_streaming_();
162
162
  nk_dots_packed_f32_smef64_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements, c_stride_elements);
163
- nk_euclideans_packed_f32_smef64_finalize_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements,
164
- c_stride_elements);
163
+ nk_euclideans_packed_f32_smef64_finalize_ssve_(a, b_packed, c, rows, columns, depth, a_stride_elements,
164
+ c_stride_elements);
165
+ nk_sme_stop_streaming_();
165
166
  }
166
167
 
167
168
  #pragma endregion F32 Packed Euclidean
168
169
  #pragma region F32 Symmetric Angular
169
170
 
170
- __arm_locally_streaming static void nk_angulars_symmetric_f32_smef64_finalize_streaming_( //
171
- nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, //
172
- nk_f64_t *result, nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) {
171
+ static void nk_angulars_symmetric_f32_smef64_finalize_ssve_( //
172
+ nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, nk_f64_t *result,
173
+ nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) NK_STREAMING_ {
173
174
  // Phase 1: cache row norms on diagonal
174
175
  for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
175
176
  nk_f32_t const *row_vector = vectors + row_index * stride_elements;
@@ -204,25 +205,27 @@ __arm_locally_streaming static void nk_angulars_symmetric_f32_smef64_finalize_st
204
205
  result[row_index * result_stride_elements + row_index] = 0;
205
206
  }
206
207
 
207
- NK_PUBLIC void nk_angulars_symmetric_f32_smef64( //
208
- nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, //
209
- nk_f64_t *result, nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
208
+ NK_PUBLIC void nk_angulars_symmetric_f32_smef64( //
209
+ nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, nk_f64_t *result,
210
+ nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
210
211
 
211
212
  nk_size_t const stride_elements = stride_in_bytes / sizeof(nk_f32_t);
212
213
  nk_size_t const result_stride_elements = result_stride_in_bytes / sizeof(nk_f64_t);
213
214
 
215
+ nk_sme_start_streaming_();
214
216
  nk_dots_symmetric_f32_smef64_streaming_(vectors, vectors_count, depth, stride_elements, result,
215
217
  result_stride_elements, row_start, row_count);
216
- nk_angulars_symmetric_f32_smef64_finalize_streaming_(vectors, vectors_count, depth, stride_elements, result,
217
- result_stride_elements, row_start, row_count);
218
+ nk_angulars_symmetric_f32_smef64_finalize_ssve_(vectors, vectors_count, depth, stride_elements, result,
219
+ result_stride_elements, row_start, row_count);
220
+ nk_sme_stop_streaming_();
218
221
  }
219
222
 
220
223
  #pragma endregion F32 Symmetric Angular
221
224
  #pragma region F32 Symmetric Euclidean
222
225
 
223
- __arm_locally_streaming static void nk_euclideans_symmetric_f32_smef64_finalize_streaming_( //
224
- nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, //
225
- nk_f64_t *result, nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) {
226
+ static void nk_euclideans_symmetric_f32_smef64_finalize_ssve_( //
227
+ nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, nk_f64_t *result,
228
+ nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) NK_STREAMING_ {
226
229
  // Phase 1: cache row norms on diagonal
227
230
  for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
228
231
  nk_f32_t const *row_vector = vectors + row_index * stride_elements;
@@ -257,26 +260,27 @@ __arm_locally_streaming static void nk_euclideans_symmetric_f32_smef64_finalize_
257
260
  result[row_index * result_stride_elements + row_index] = 0;
258
261
  }
259
262
 
260
- NK_PUBLIC void nk_euclideans_symmetric_f32_smef64( //
261
- nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, //
262
- nk_f64_t *result, nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
263
+ NK_PUBLIC void nk_euclideans_symmetric_f32_smef64( //
264
+ nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, nk_f64_t *result,
265
+ nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
263
266
 
264
267
  nk_size_t const stride_elements = stride_in_bytes / sizeof(nk_f32_t);
265
268
  nk_size_t const result_stride_elements = result_stride_in_bytes / sizeof(nk_f64_t);
266
269
 
270
+ nk_sme_start_streaming_();
267
271
  nk_dots_symmetric_f32_smef64_streaming_(vectors, vectors_count, depth, stride_elements, result,
268
272
  result_stride_elements, row_start, row_count);
269
- nk_euclideans_symmetric_f32_smef64_finalize_streaming_(vectors, vectors_count, depth, stride_elements, result,
270
- result_stride_elements, row_start, row_count);
273
+ nk_euclideans_symmetric_f32_smef64_finalize_ssve_(vectors, vectors_count, depth, stride_elements, result,
274
+ result_stride_elements, row_start, row_count);
275
+ nk_sme_stop_streaming_();
271
276
  }
272
277
 
273
278
  #pragma endregion F32 Symmetric Euclidean
274
279
  #pragma region F64 Packed Angular
275
280
 
276
- __arm_locally_streaming static void nk_angulars_packed_f64_smef64_finalize_streaming_( //
277
- nk_f64_t const *a, void const *b_packed, nk_f64_t *c, //
278
- nk_size_t rows, nk_size_t columns, nk_size_t depth, //
279
- nk_size_t a_stride_elements, nk_size_t c_stride_elements) {
281
+ static void nk_angulars_packed_f64_smef64_finalize_ssve_( //
282
+ nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
283
+ nk_size_t a_stride_elements, nk_size_t c_stride_elements) NK_STREAMING_ {
280
284
 
281
285
  nk_dots_sme_packed_header_t const *header = (nk_dots_sme_packed_header_t const *)b_packed;
282
286
  nk_f64_t const *b_norms = (nk_f64_t const *)((char const *)b_packed + header->norms_offset);
@@ -298,26 +302,26 @@ __arm_locally_streaming static void nk_angulars_packed_f64_smef64_finalize_strea
298
302
  }
299
303
  }
300
304
 
301
- NK_PUBLIC void nk_angulars_packed_f64_smef64( //
302
- nk_f64_t const *a, void const *b_packed, nk_f64_t *c, //
303
- nk_size_t rows, nk_size_t columns, nk_size_t depth, //
305
+ NK_PUBLIC void nk_angulars_packed_f64_smef64( //
306
+ nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
304
307
  nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
305
308
 
306
309
  nk_size_t const a_stride_elements = a_stride_in_bytes / sizeof(nk_f64_t);
307
310
  nk_size_t const c_stride_elements = c_stride_in_bytes / sizeof(nk_f64_t);
308
311
 
312
+ nk_sme_start_streaming_();
309
313
  nk_dots_packed_f64_smef64_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements, c_stride_elements);
310
- nk_angulars_packed_f64_smef64_finalize_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements,
311
- c_stride_elements);
314
+ nk_angulars_packed_f64_smef64_finalize_ssve_(a, b_packed, c, rows, columns, depth, a_stride_elements,
315
+ c_stride_elements);
316
+ nk_sme_stop_streaming_();
312
317
  }
313
318
 
314
319
  #pragma endregion F64 Packed Angular
315
320
  #pragma region F64 Packed Euclidean
316
321
 
317
- __arm_locally_streaming static void nk_euclideans_packed_f64_smef64_finalize_streaming_( //
318
- nk_f64_t const *a, void const *b_packed, nk_f64_t *c, //
319
- nk_size_t rows, nk_size_t columns, nk_size_t depth, //
320
- nk_size_t a_stride_elements, nk_size_t c_stride_elements) {
322
+ static void nk_euclideans_packed_f64_smef64_finalize_ssve_( //
323
+ nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
324
+ nk_size_t a_stride_elements, nk_size_t c_stride_elements) NK_STREAMING_ {
321
325
 
322
326
  nk_dots_sme_packed_header_t const *header = (nk_dots_sme_packed_header_t const *)b_packed;
323
327
  nk_f64_t const *b_norms = (nk_f64_t const *)((char const *)b_packed + header->norms_offset);
@@ -339,25 +343,26 @@ __arm_locally_streaming static void nk_euclideans_packed_f64_smef64_finalize_str
339
343
  }
340
344
  }
341
345
 
342
- NK_PUBLIC void nk_euclideans_packed_f64_smef64( //
343
- nk_f64_t const *a, void const *b_packed, nk_f64_t *c, //
344
- nk_size_t rows, nk_size_t columns, nk_size_t depth, //
346
+ NK_PUBLIC void nk_euclideans_packed_f64_smef64( //
347
+ nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
345
348
  nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
346
349
 
347
350
  nk_size_t const a_stride_elements = a_stride_in_bytes / sizeof(nk_f64_t);
348
351
  nk_size_t const c_stride_elements = c_stride_in_bytes / sizeof(nk_f64_t);
349
352
 
353
+ nk_sme_start_streaming_();
350
354
  nk_dots_packed_f64_smef64_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements, c_stride_elements);
351
- nk_euclideans_packed_f64_smef64_finalize_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements,
352
- c_stride_elements);
355
+ nk_euclideans_packed_f64_smef64_finalize_ssve_(a, b_packed, c, rows, columns, depth, a_stride_elements,
356
+ c_stride_elements);
357
+ nk_sme_stop_streaming_();
353
358
  }
354
359
 
355
360
  #pragma endregion F64 Packed Euclidean
356
361
  #pragma region F64 Symmetric Angular
357
362
 
358
- __arm_locally_streaming static void nk_angulars_symmetric_f64_smef64_finalize_streaming_( //
359
- nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, //
360
- nk_f64_t *result, nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) {
363
+ static void nk_angulars_symmetric_f64_smef64_finalize_ssve_( //
364
+ nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, nk_f64_t *result,
365
+ nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) NK_STREAMING_ {
361
366
  // Phase 1: cache row norms on diagonal
362
367
  for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
363
368
  nk_f64_t const *row_vector = vectors + row_index * stride_elements;
@@ -392,25 +397,27 @@ __arm_locally_streaming static void nk_angulars_symmetric_f64_smef64_finalize_st
392
397
  result[row_index * result_stride_elements + row_index] = 0;
393
398
  }
394
399
 
395
- NK_PUBLIC void nk_angulars_symmetric_f64_smef64( //
396
- nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, //
397
- nk_f64_t *result, nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
400
+ NK_PUBLIC void nk_angulars_symmetric_f64_smef64( //
401
+ nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, nk_f64_t *result,
402
+ nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
398
403
 
399
404
  nk_size_t const stride_elements = stride_in_bytes / sizeof(nk_f64_t);
400
405
  nk_size_t const result_stride_elements = result_stride_in_bytes / sizeof(nk_f64_t);
401
406
 
407
+ nk_sme_start_streaming_();
402
408
  nk_dots_symmetric_f64_smef64_streaming_(vectors, vectors_count, depth, stride_elements, result,
403
409
  result_stride_elements, row_start, row_count);
404
- nk_angulars_symmetric_f64_smef64_finalize_streaming_(vectors, vectors_count, depth, stride_elements, result,
405
- result_stride_elements, row_start, row_count);
410
+ nk_angulars_symmetric_f64_smef64_finalize_ssve_(vectors, vectors_count, depth, stride_elements, result,
411
+ result_stride_elements, row_start, row_count);
412
+ nk_sme_stop_streaming_();
406
413
  }
407
414
 
408
415
  #pragma endregion F64 Symmetric Angular
409
416
  #pragma region F64 Symmetric Euclidean
410
417
 
411
- __arm_locally_streaming static void nk_euclideans_symmetric_f64_smef64_finalize_streaming_( //
412
- nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, //
413
- nk_f64_t *result, nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) {
418
+ static void nk_euclideans_symmetric_f64_smef64_finalize_ssve_( //
419
+ nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, nk_f64_t *result,
420
+ nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) NK_STREAMING_ {
414
421
  // Phase 1: cache row norms on diagonal
415
422
  for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
416
423
  nk_f64_t const *row_vector = vectors + row_index * stride_elements;
@@ -445,17 +452,19 @@ __arm_locally_streaming static void nk_euclideans_symmetric_f64_smef64_finalize_
445
452
  result[row_index * result_stride_elements + row_index] = 0;
446
453
  }
447
454
 
448
- NK_PUBLIC void nk_euclideans_symmetric_f64_smef64( //
449
- nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, //
450
- nk_f64_t *result, nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
455
+ NK_PUBLIC void nk_euclideans_symmetric_f64_smef64( //
456
+ nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, nk_f64_t *result,
457
+ nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
451
458
 
452
459
  nk_size_t const stride_elements = stride_in_bytes / sizeof(nk_f64_t);
453
460
  nk_size_t const result_stride_elements = result_stride_in_bytes / sizeof(nk_f64_t);
454
461
 
462
+ nk_sme_start_streaming_();
455
463
  nk_dots_symmetric_f64_smef64_streaming_(vectors, vectors_count, depth, stride_elements, result,
456
464
  result_stride_elements, row_start, row_count);
457
- nk_euclideans_symmetric_f64_smef64_finalize_streaming_(vectors, vectors_count, depth, stride_elements, result,
458
- result_stride_elements, row_start, row_count);
465
+ nk_euclideans_symmetric_f64_smef64_finalize_ssve_(vectors, vectors_count, depth, stride_elements, result,
466
+ result_stride_elements, row_start, row_count);
467
+ nk_sme_stop_streaming_();
459
468
  }
460
469
 
461
470
  #pragma endregion F64 Symmetric Euclidean
@@ -739,6 +739,28 @@ NK_PUBLIC void nk_euclideans_symmetric_u8_sapphireamx(nk_u8_t const *vectors, nk
739
739
  nk_size_t row_start, nk_size_t row_count);
740
740
  #endif // NK_TARGET_SAPPHIREAMX
741
741
 
742
+ /* Granite Rapids backends using Intel AMX-FP16.
743
+ * Native FP16 spatial kernels.
744
+ */
745
+ #if NK_TARGET_GRANITEAMX
746
+ /** @copydoc nk_angulars_packed_f16 */
747
+ NK_PUBLIC void nk_angulars_packed_f16_graniteamx(nk_f16_t const *a, void const *b_packed, nk_f32_t *result,
748
+ nk_size_t rows, nk_size_t cols, nk_size_t depth,
749
+ nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
750
+ /** @copydoc nk_angulars_symmetric_f16 */
751
+ NK_PUBLIC void nk_angulars_symmetric_f16_graniteamx(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
752
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
753
+ nk_size_t row_start, nk_size_t row_count);
754
+ /** @copydoc nk_euclideans_packed_f16 */
755
+ NK_PUBLIC void nk_euclideans_packed_f16_graniteamx(nk_f16_t const *a, void const *b_packed, nk_f32_t *result,
756
+ nk_size_t rows, nk_size_t cols, nk_size_t depth,
757
+ nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
758
+ /** @copydoc nk_euclideans_symmetric_f16 */
759
+ NK_PUBLIC void nk_euclideans_symmetric_f16_graniteamx(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
760
+ nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
761
+ nk_size_t row_start, nk_size_t row_count);
762
+ #endif // NK_TARGET_GRANITEAMX
763
+
742
764
  /* ARM SME backends using Scalable Matrix Extension.
743
765
  * SME provides ZA tile registers for outer product operations.
744
766
  * F16/BF16/I8/U8/E4M3 use ZA32 tiles, F32/F64 use ZA64 tiles (FEAT_SME_F64F64).
@@ -2078,6 +2100,7 @@ NK_PUBLIC void nk_euclideans_symmetric_u8_rvv(nk_u8_t const *vectors, nk_size_t
2078
2100
  #include "numkong/spatials/alder.h"
2079
2101
  #include "numkong/spatials/sierra.h"
2080
2102
  #include "numkong/spatials/sapphireamx.h"
2103
+ #include "numkong/spatials/graniteamx.h"
2081
2104
  #include "numkong/spatials/rvv.h"
2082
2105
  #include "numkong/spatials/v128relaxed.h"
2083
2106
  #include "numkong/spatials/sme.h"
@@ -2290,7 +2313,9 @@ NK_PUBLIC void nk_euclideans_symmetric_f32(nk_f32_t const *vectors, nk_size_t ve
2290
2313
  NK_PUBLIC void nk_angulars_packed_f16(nk_f16_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
2291
2314
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
2292
2315
  nk_size_t r_stride_in_bytes) {
2293
- #if NK_TARGET_SME
2316
+ #if NK_TARGET_GRANITEAMX
2317
+ nk_angulars_packed_f16_graniteamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2318
+ #elif NK_TARGET_SME
2294
2319
  nk_angulars_packed_f16_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2295
2320
  #elif NK_TARGET_NEONFHM
2296
2321
  nk_angulars_packed_f16_neonfhm(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
@@ -2311,7 +2336,10 @@ NK_PUBLIC void nk_angulars_packed_f16(nk_f16_t const *a, void const *b_packed, n
2311
2336
  NK_PUBLIC void nk_angulars_symmetric_f16(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2312
2337
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
2313
2338
  nk_size_t row_start, nk_size_t row_count) {
2314
- #if NK_TARGET_SME
2339
+ #if NK_TARGET_GRANITEAMX
2340
+ nk_angulars_symmetric_f16_graniteamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2341
+ row_count);
2342
+ #elif NK_TARGET_SME
2315
2343
  nk_angulars_symmetric_f16_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2316
2344
  #elif NK_TARGET_NEONFHM
2317
2345
  nk_angulars_symmetric_f16_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start,
@@ -2337,7 +2365,9 @@ NK_PUBLIC void nk_angulars_symmetric_f16(nk_f16_t const *vectors, nk_size_t vect
2337
2365
  NK_PUBLIC void nk_euclideans_packed_f16(nk_f16_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
2338
2366
  nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
2339
2367
  nk_size_t r_stride_in_bytes) {
2340
- #if NK_TARGET_SME
2368
+ #if NK_TARGET_GRANITEAMX
2369
+ nk_euclideans_packed_f16_graniteamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2370
+ #elif NK_TARGET_SME
2341
2371
  nk_euclideans_packed_f16_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
2342
2372
  #elif NK_TARGET_NEONFHM
2343
2373
  nk_euclideans_packed_f16_neonfhm(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
@@ -2358,7 +2388,10 @@ NK_PUBLIC void nk_euclideans_packed_f16(nk_f16_t const *a, void const *b_packed,
2358
2388
  NK_PUBLIC void nk_euclideans_symmetric_f16(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
2359
2389
  nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
2360
2390
  nk_size_t row_start, nk_size_t row_count) {
2361
- #if NK_TARGET_SME
2391
+ #if NK_TARGET_GRANITEAMX
2392
+ nk_euclideans_symmetric_f16_graniteamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
2393
+ row_count);
2394
+ #elif NK_TARGET_SME
2362
2395
  nk_euclideans_symmetric_f16_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
2363
2396
  #elif NK_TARGET_NEONFHM
2364
2397
  nk_euclideans_symmetric_f16_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start,
@@ -69,6 +69,20 @@
69
69
  #define _GNU_SOURCE
70
70
  #endif
71
71
 
72
+ // MSan (MemorySanitizer) cannot track data flow through SVE horizontal reductions
73
+ // like `svaddv`, which move data from vector registers to scalar registers via
74
+ // architecture-specific paths invisible to the compiler. `nk_unpoison_` marks the
75
+ // resulting scalar as initialized so MSan does not report false positives.
76
+ #if defined(__has_feature)
77
+ #if __has_feature(memory_sanitizer)
78
+ #include <sanitizer/msan_interface.h>
79
+ #define nk_unpoison_(ptr, size) __msan_unpoison((ptr), (size))
80
+ #endif
81
+ #endif
82
+ #ifndef nk_unpoison_
83
+ #define nk_unpoison_(ptr, size) (void)(ptr), (void)(size)
84
+ #endif
85
+
72
86
  // Inferring target OS: Windows, macOS, Linux, or FreeBSD
73
87
  #if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
74
88
  #define NK_DEFINED_WINDOWS_ 1
@@ -1627,6 +1641,51 @@ NK_INTERNAL nk_size_t nk_sme_cntd_(void) {
1627
1641
  __asm__ __volatile__("smstart sm\n\t" "cntd %0\n\t" "smstop sm" : "=r"(r));
1628
1642
  return (nk_size_t)r;
1629
1643
  }
1644
+
1645
+ /** @brief Enter streaming SVE mode (PSTATE.SM = 1). Caller is responsible for smstop. */
1646
+ NK_INTERNAL void nk_sme_start_streaming_(void) { __asm__ __volatile__("smstart sm" ::: "memory"); }
1647
+ /** @brief Exit streaming SVE mode (PSTATE.SM = 0). Must pair with nk_sme_start_streaming_. */
1648
+ NK_INTERNAL void nk_sme_stop_streaming_(void) { __asm__ __volatile__("smstop sm" ::: "memory"); }
1649
+
1650
+ /**
1651
+ * SME runtime stubs — weak definitions for symbols the compiler may reference
1652
+ * from __arm_streaming or __arm_new("za") functions. Every TU that includes
1653
+ * this header emits a weak copy; the linker deduplicates to one.
1654
+ *
1655
+ * - __arm_tpidr2_save / __arm_tpidr2_restore: lazy ZA save/restore protocol
1656
+ * used in __arm_new("za") prologues. Always no-ops in NumKong because no
1657
+ * NK_PUBLIC function carries ZA state (TPIDR2_EL0 is always null at entry).
1658
+ *
1659
+ * - __arm_sc_memset / __arm_sc_memcpy / __arm_sc_memmove: streaming-compatible
1660
+ * memory routines the compiler may emit inside __arm_streaming functions.
1661
+ * Apple Clang provides these in its runtime; upstream LLVM does not.
1662
+ */
1663
+ __attribute__((weak)) void __arm_tpidr2_save(void) {}
1664
+ __attribute__((weak)) void __arm_tpidr2_restore(void *blk) { nk_unused_(blk); }
1665
+ __attribute__((weak, target("+sme"))) void *__arm_sc_memset(void *d, int c, __SIZE_TYPE__ n) __arm_streaming_compatible {
1666
+ unsigned char *p = (unsigned char *)d;
1667
+ for (__SIZE_TYPE__ i = 0; i < n; i++) p[i] = (unsigned char)c;
1668
+ return d;
1669
+ }
1670
+ __attribute__((weak, target("+sme"))) void *__arm_sc_memcpy(void *d, void const *s,
1671
+ __SIZE_TYPE__ n) __arm_streaming_compatible {
1672
+ unsigned char *dp = (unsigned char *)d;
1673
+ unsigned char const *sp = (unsigned char const *)s;
1674
+ for (__SIZE_TYPE__ i = 0; i < n; i++) dp[i] = sp[i];
1675
+ return d;
1676
+ }
1677
+ __attribute__((weak, target("+sme"))) void *__arm_sc_memmove(void *d, void const *s,
1678
+ __SIZE_TYPE__ n) __arm_streaming_compatible {
1679
+ unsigned char *dp = (unsigned char *)d;
1680
+ unsigned char const *sp = (unsigned char const *)s;
1681
+ if (dp < sp) {
1682
+ for (__SIZE_TYPE__ i = 0; i < n; i++) dp[i] = sp[i];
1683
+ }
1684
+ else {
1685
+ for (__SIZE_TYPE__ i = n; i > 0; i--) dp[i - 1] = sp[i - 1];
1686
+ }
1687
+ return d;
1688
+ }
1630
1689
  #endif
1631
1690
 
1632
1691
  #ifdef __cplusplus
@@ -99,6 +99,19 @@ Object.defineProperty(exports, "PackedMatrix", { enumerable: true, get: function
99
99
  Object.defineProperty(exports, "DType", { enumerable: true, get: function () { return types_js_1.DType; } });
100
100
  Object.defineProperty(exports, "outputDtype", { enumerable: true, get: function () { return types_js_1.outputDtype; } });
101
101
  function loadNativeAddon() {
102
+ var _a;
103
+ // Duplicate-libomp guard. We ship our own `libomp.dylib` next to
104
+ // `numkong.node` in each `@numkong/darwin-*` package, but another OpenMP
105
+ // runtime (e.g. one loaded by another native addon) may already be
106
+ // resident. `KMP_DUPLICATE_LIB_OK=TRUE` tells LLVM libomp / Intel
107
+ // libiomp5 to coexist; it must be in `process.env` before the `require()`
108
+ // below triggers the addon's `dlopen`, since libomp's constructor reads
109
+ // the env during dependency resolution; setting it afterwards is too late
110
+ // to have any effect. Applied without a platform check because the variable is harmless on
111
+ // platforms / runtimes (GCC libgomp) that don't recognize it, and a user
112
+ // who set it to something else is respected by `??=`. See
113
+ // `python/numkong/__init__.py` for the Python analog.
114
+ (_a = process.env).KMP_DUPLICATE_LIB_OK ?? (_a.KMP_DUPLICATE_LIB_OK = "TRUE");
102
115
  // Tier 1: platform-specific optional dependency (@numkong/<os>-<arch>)
103
116
  try {
104
117
  const req = (0, node_module_1.createRequire)(path.join(getDirName(), "noop.js"));
@@ -31,6 +31,19 @@ import { existsSync } from "node:fs";
31
31
  import { getFileName, getRoot } from "bindings";
32
32
  import { setConversionFunctions, Float16Array, BFloat16Array, E4M3Array, E5M2Array, BinaryArray, TensorBase, VectorBase, VectorView, Vector, MatrixBase, Matrix, PackedMatrix, DType, dtypeToString, outputDtype } from "./types.js";
33
33
  function loadNativeAddon() {
34
+ var _a;
35
+ // Duplicate-libomp guard. We ship our own `libomp.dylib` next to
36
+ // `numkong.node` in each `@numkong/darwin-*` package, but another OpenMP
37
+ // runtime (e.g. one loaded by another native addon) may already be
38
+ // resident. `KMP_DUPLICATE_LIB_OK=TRUE` tells LLVM libomp / Intel
39
+ // libiomp5 to coexist; it must be in `process.env` before the `require()`
40
+ // below triggers the addon's `dlopen`, since libomp's constructor reads
41
+ // the env during dependency resolution; setting it afterwards is too late
42
+ // to have any effect. Applied without a platform check because the variable is harmless on
43
+ // platforms / runtimes (GCC libgomp) that don't recognize it, and a user
44
+ // who set it to something else is respected by `??=`. See
45
+ // `python/numkong/__init__.py` for the Python analog.
46
+ (_a = process.env).KMP_DUPLICATE_LIB_OK ?? (_a.KMP_DUPLICATE_LIB_OK = "TRUE");
34
47
  // Tier 1: platform-specific optional dependency (@numkong/<os>-<arch>)
35
48
  try {
36
49
  const req = createRequire(path.join(getDirName(), "noop.js"));
@@ -9,10 +9,17 @@
9
9
 
10
10
  #include <string.h> // `strcmp` function
11
11
 
12
+ #if defined(NK_USE_OPENMP)
13
+ #include <omp.h>
14
+ #endif
15
+
12
16
  #include <node_api.h> // `napi_*` functions — N-API v6+ for BigInt (Node ≥ 10.20)
13
17
 
14
18
  #include <numkong/numkong.h> // `nk_*` functions — must be first to bring `_GNU_SOURCE`
15
19
 
20
+ #define NK_PARALLEL_PACKED_TILE 64
21
+ #define NK_PARALLEL_SYMMETRIC_TILE 32
22
+
16
23
  /** @brief Global variable that caches the CPU capabilities, and is computed just once, when the module is loaded. */
17
24
  nk_capability_t static_capabilities = nk_cap_serial_k;
18
25
 
@@ -482,11 +489,11 @@ static napi_value api_dots_pack(napi_env env, napi_callback_info info) {
482
489
  * dtype
483
490
  */
484
491
  static napi_value api_packed_common(napi_env env, napi_callback_info info, nk_kernel_kind_t kernel_kind) {
485
- size_t argc = 9;
486
- napi_value args[9];
492
+ size_t argc = 10;
493
+ napi_value args[10];
487
494
  napi_get_cb_info(env, info, &argc, args, NULL, NULL);
488
- if (argc != 9) {
489
- napi_throw_error(env, NULL, "Packed operation requires 9 arguments");
495
+ if (argc < 9 || argc > 10) {
496
+ napi_throw_error(env, NULL, "Packed operation requires 9-10 arguments (last is optional threads)");
490
497
  return NULL;
491
498
  }
492
499
 
@@ -533,8 +540,26 @@ static napi_value api_packed_common(napi_env env, napi_callback_info info, nk_ke
533
540
  return NULL;
534
541
  }
535
542
 
536
- kernel(a_data, packed_data, result_data, (nk_size_t)height, (nk_size_t)width, (nk_size_t)depth, (nk_size_t)a_stride,
537
- (nk_size_t)result_stride);
543
+ uint32_t threads = 1;
544
+ if (argc == 10) napi_get_value_uint32(env, args[9], &threads);
545
+
546
+ #if defined(NK_USE_OPENMP)
547
+ if (threads == 0) threads = (uint32_t)omp_get_max_threads();
548
+ omp_set_num_threads((int)threads);
549
+ #endif
550
+
551
+ // `int` loop counter pre-declared: MSVC's OpenMP stays at 2.0 canonical
552
+ // form, which forbids in-init declarations and rejects 64-bit iterators
553
+ // — either would trip C3015.
554
+ int const tile_count = (int)nk_size_divide_round_up_(height, NK_PARALLEL_PACKED_TILE);
555
+ int tile_idx;
556
+ #pragma omp parallel for schedule(dynamic, 1) if (threads > 1)
557
+ for (tile_idx = 0; tile_idx < tile_count; tile_idx++) {
558
+ nk_size_t row = (nk_size_t)tile_idx * NK_PARALLEL_PACKED_TILE;
559
+ nk_size_t chunk = (row + NK_PARALLEL_PACKED_TILE <= height) ? NK_PARALLEL_PACKED_TILE : (height - row);
560
+ kernel((char const *)a_data + row * a_stride, packed_data, (char *)result_data + row * result_stride, chunk,
561
+ (nk_size_t)width, (nk_size_t)depth, (nk_size_t)a_stride, (nk_size_t)result_stride);
562
+ }
538
563
  return NULL;
539
564
  }
540
565
 
@@ -554,11 +579,11 @@ static napi_value api_euclideans_packed(napi_env env, napi_callback_info info) {
554
579
  * string dtype
555
580
  */
556
581
  static napi_value api_symmetric_common(napi_env env, napi_callback_info info, nk_kernel_kind_t kernel_kind) {
557
- size_t argc = 9;
558
- napi_value args[9];
582
+ size_t argc = 10;
583
+ napi_value args[10];
559
584
  napi_get_cb_info(env, info, &argc, args, NULL, NULL);
560
- if (argc != 9) {
561
- napi_throw_error(env, NULL, "Symmetric operation requires 9 arguments");
585
+ if (argc < 9 || argc > 10) {
586
+ napi_throw_error(env, NULL, "Symmetric operation requires 9-10 arguments (last is optional threads)");
562
587
  return NULL;
563
588
  }
564
589
 
@@ -601,8 +626,27 @@ static napi_value api_symmetric_common(napi_env env, napi_callback_info info, nk
601
626
  return NULL;
602
627
  }
603
628
 
604
- kernel(vectors_data, (nk_size_t)n_vectors, (nk_size_t)depth, (nk_size_t)vectors_stride, result_data,
605
- (nk_size_t)result_stride, (nk_size_t)row_start, (nk_size_t)row_count);
629
+ uint32_t threads = 1;
630
+ if (argc == 10) napi_get_value_uint32(env, args[9], &threads);
631
+
632
+ #if defined(NK_USE_OPENMP)
633
+ if (threads == 0) threads = (uint32_t)omp_get_max_threads();
634
+ omp_set_num_threads((int)threads);
635
+ #endif
636
+
637
+ // `int` loop counter pre-declared: see note at `api_packed_common`.
638
+ int const tile_count = (int)nk_size_divide_round_up_(row_count, NK_PARALLEL_SYMMETRIC_TILE);
639
+ int tile_idx;
640
+ #pragma omp parallel for schedule(dynamic, 1) if (threads > 1)
641
+ for (tile_idx = 0; tile_idx < tile_count; tile_idx++) {
642
+ nk_size_t tile_start = (nk_size_t)row_start + (nk_size_t)tile_idx * NK_PARALLEL_SYMMETRIC_TILE;
643
+ nk_size_t tile_rows = (tile_start + NK_PARALLEL_SYMMETRIC_TILE <= (nk_size_t)row_start + row_count)
644
+ ? NK_PARALLEL_SYMMETRIC_TILE
645
+ : ((nk_size_t)row_start + row_count - tile_start);
646
+ kernel(vectors_data, (nk_size_t)n_vectors, (nk_size_t)depth, (nk_size_t)vectors_stride, result_data,
647
+ (nk_size_t)result_stride, tile_start, tile_rows);
648
+ }
649
+
606
650
  return NULL;
607
651
  }
608
652
 
@@ -33,6 +33,19 @@ import { getFileName, getRoot } from "bindings";
33
33
  import { setConversionFunctions, Float16Array, BFloat16Array, E4M3Array, E5M2Array, BinaryArray, TensorBase, VectorBase, VectorView, Vector, MatrixBase, Matrix, PackedMatrix, DType, dtypeToString, outputDtype, KernelFamily } from "./types.js";
34
34
 
35
35
  function loadNativeAddon(): any {
36
+ // Duplicate-libomp guard. We ship our own `libomp.dylib` next to
37
+ // `numkong.node` in each `@numkong/darwin-*` package, but another OpenMP
38
+ // runtime (e.g. one loaded by another native addon) may already be
39
+ // resident. `KMP_DUPLICATE_LIB_OK=TRUE` tells LLVM libomp / Intel
40
+ // libiomp5 to coexist; it must be in `process.env` before the `require()`
41
+ // below triggers the addon's `dlopen`, since libomp's constructor reads
42
+ // the env during dependency resolution and is too late to influence
43
+ // afterwards. Left unguarded because the variable is harmless on
44
+ // platforms / runtimes (GCC libgomp) that don't recognize it, and a user
45
+ // who set it to something else is respected by `??=`. See
46
+ // `python/numkong/__init__.py` for the Python analog.
47
+ process.env.KMP_DUPLICATE_LIB_OK ??= "TRUE";
48
+
36
49
  // Tier 1: platform-specific optional dependency (@numkong/<os>-<arch>)
37
50
  try {
38
51
  const req = createRequire(path.join(getDirName(), "noop.js"));