numkong 7.4.5 → 7.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/binding.gyp +81 -5
- package/c/dispatch_f16.c +23 -0
- package/c/numkong.c +0 -13
- package/include/numkong/attention/sme.h +34 -31
- package/include/numkong/capabilities.h +2 -15
- package/include/numkong/cast/neon.h +15 -0
- package/include/numkong/curved/smef64.h +82 -62
- package/include/numkong/dot/rvvbf16.h +1 -1
- package/include/numkong/dot/rvvhalf.h +1 -1
- package/include/numkong/dot/sve.h +6 -5
- package/include/numkong/dot/svebfdot.h +2 -1
- package/include/numkong/dot/svehalf.h +6 -5
- package/include/numkong/dot/svesdot.h +3 -2
- package/include/numkong/dots/graniteamx.h +733 -0
- package/include/numkong/dots/serial.h +11 -4
- package/include/numkong/dots/sme.h +172 -140
- package/include/numkong/dots/smebi32.h +14 -11
- package/include/numkong/dots/smef64.h +31 -26
- package/include/numkong/dots.h +29 -3
- package/include/numkong/each/serial.h +22 -0
- package/include/numkong/geospatial/haswell.h +1 -1
- package/include/numkong/geospatial/neon.h +1 -1
- package/include/numkong/geospatial/serial.h +1 -1
- package/include/numkong/geospatial/skylake.h +1 -1
- package/include/numkong/maxsim/sme.h +34 -33
- package/include/numkong/mesh/serial.h +22 -0
- package/include/numkong/reduce/neon.h +29 -0
- package/include/numkong/reduce/neonbfdot.h +2 -2
- package/include/numkong/reduce/neonfhm.h +4 -4
- package/include/numkong/reduce/sve.h +52 -0
- package/include/numkong/reduce.h +4 -0
- package/include/numkong/set/sve.h +6 -5
- package/include/numkong/sets/smebi32.h +35 -30
- package/include/numkong/sparse/sve2.h +3 -2
- package/include/numkong/spatial/sve.h +7 -6
- package/include/numkong/spatial/svebfdot.h +7 -4
- package/include/numkong/spatial/svehalf.h +5 -4
- package/include/numkong/spatial/svesdot.h +9 -8
- package/include/numkong/spatials/graniteamx.h +173 -0
- package/include/numkong/spatials/serial.h +22 -0
- package/include/numkong/spatials/sme.h +391 -350
- package/include/numkong/spatials/smef64.h +79 -70
- package/include/numkong/spatials.h +37 -4
- package/include/numkong/types.h +59 -0
- package/javascript/dist/cjs/numkong.js +13 -0
- package/javascript/dist/esm/numkong.js +13 -0
- package/javascript/numkong.c +56 -12
- package/javascript/numkong.ts +13 -0
- package/package.json +7 -7
- package/probes/probe.js +2 -2
- package/wasm/numkong.wasm +0 -0
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
#if NK_TARGET_SME
|
|
14
14
|
|
|
15
15
|
#include "numkong/dots/serial.h"
|
|
16
|
+
#include "numkong/reduce/sve.h" // `nk_svaddv_f64_`
|
|
16
17
|
#include "numkong/dots/smef64.h"
|
|
17
18
|
|
|
18
19
|
#if defined(__cplusplus)
|
|
@@ -44,7 +45,7 @@ NK_PUBLIC nk_f64_t nk_dots_reduce_sumsq_f32_ssve_(nk_f32_t const *data, nk_size_
|
|
|
44
45
|
svfloat64_t values_odd_f64x = svcvtlt_f64_f32_x(predicate_odd_b64x, values_f32x);
|
|
45
46
|
accumulator_odd_f64x = svmla_f64_m(predicate_odd_b64x, accumulator_odd_f64x, values_odd_f64x, values_odd_f64x);
|
|
46
47
|
}
|
|
47
|
-
return
|
|
48
|
+
return nk_svaddv_f64_(svptrue_b64(), accumulator_even_f64x) + nk_svaddv_f64_(svptrue_b64(), accumulator_odd_f64x);
|
|
48
49
|
}
|
|
49
50
|
|
|
50
51
|
NK_PUBLIC nk_f64_t nk_dots_reduce_sumsq_f64_ssve_(nk_f64_t const *data, nk_size_t count) NK_STREAMING_ {
|
|
@@ -55,7 +56,7 @@ NK_PUBLIC nk_f64_t nk_dots_reduce_sumsq_f64_ssve_(nk_f64_t const *data, nk_size_
|
|
|
55
56
|
svfloat64_t values_f64x = svld1_f64(predicate_b64x, data + i);
|
|
56
57
|
accumulator_f64x = svmla_f64_m(predicate_b64x, accumulator_f64x, values_f64x, values_f64x);
|
|
57
58
|
}
|
|
58
|
-
return
|
|
59
|
+
return nk_svaddv_f64_(svptrue_b64(), accumulator_f64x);
|
|
59
60
|
}
|
|
60
61
|
|
|
61
62
|
NK_PUBLIC svfloat64_t nk_angulars_from_dot_f64x_ssvef64_(svbool_t predicate_b64x, svfloat64_t dots_f64x,
|
|
@@ -85,10 +86,9 @@ NK_PUBLIC svfloat64_t nk_euclideans_from_dot_f64x_ssvef64_(svbool_t predicate_b6
|
|
|
85
86
|
|
|
86
87
|
#pragma region F32 Packed Angular
|
|
87
88
|
|
|
88
|
-
|
|
89
|
-
nk_f32_t const *a, void const *b_packed, nk_f64_t *c,
|
|
90
|
-
nk_size_t
|
|
91
|
-
nk_size_t a_stride_elements, nk_size_t c_stride_elements) {
|
|
89
|
+
static void nk_angulars_packed_f32_smef64_finalize_ssve_( //
|
|
90
|
+
nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
|
|
91
|
+
nk_size_t a_stride_elements, nk_size_t c_stride_elements) NK_STREAMING_ {
|
|
92
92
|
|
|
93
93
|
nk_dots_sme_packed_header_t const *header = (nk_dots_sme_packed_header_t const *)b_packed;
|
|
94
94
|
nk_f64_t const *b_norms = (nk_f64_t const *)((char const *)b_packed + header->norms_offset);
|
|
@@ -110,26 +110,26 @@ __arm_locally_streaming static void nk_angulars_packed_f32_smef64_finalize_strea
|
|
|
110
110
|
}
|
|
111
111
|
}
|
|
112
112
|
|
|
113
|
-
NK_PUBLIC void nk_angulars_packed_f32_smef64(
|
|
114
|
-
nk_f32_t const *a, void const *b_packed, nk_f64_t *c,
|
|
115
|
-
nk_size_t rows, nk_size_t columns, nk_size_t depth, //
|
|
113
|
+
NK_PUBLIC void nk_angulars_packed_f32_smef64( //
|
|
114
|
+
nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
|
|
116
115
|
nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
|
|
117
116
|
|
|
118
117
|
nk_size_t const a_stride_elements = a_stride_in_bytes / sizeof(nk_f32_t);
|
|
119
118
|
nk_size_t const c_stride_elements = c_stride_in_bytes / sizeof(nk_f64_t);
|
|
120
119
|
|
|
120
|
+
nk_sme_start_streaming_();
|
|
121
121
|
nk_dots_packed_f32_smef64_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements, c_stride_elements);
|
|
122
|
-
|
|
123
|
-
|
|
122
|
+
nk_angulars_packed_f32_smef64_finalize_ssve_(a, b_packed, c, rows, columns, depth, a_stride_elements,
|
|
123
|
+
c_stride_elements);
|
|
124
|
+
nk_sme_stop_streaming_();
|
|
124
125
|
}
|
|
125
126
|
|
|
126
127
|
#pragma endregion F32 Packed Angular
|
|
127
128
|
#pragma region F32 Packed Euclidean
|
|
128
129
|
|
|
129
|
-
|
|
130
|
-
nk_f32_t const *a, void const *b_packed, nk_f64_t *c,
|
|
131
|
-
nk_size_t
|
|
132
|
-
nk_size_t a_stride_elements, nk_size_t c_stride_elements) {
|
|
130
|
+
static void nk_euclideans_packed_f32_smef64_finalize_ssve_( //
|
|
131
|
+
nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
|
|
132
|
+
nk_size_t a_stride_elements, nk_size_t c_stride_elements) NK_STREAMING_ {
|
|
133
133
|
|
|
134
134
|
nk_dots_sme_packed_header_t const *header = (nk_dots_sme_packed_header_t const *)b_packed;
|
|
135
135
|
nk_f64_t const *b_norms = (nk_f64_t const *)((char const *)b_packed + header->norms_offset);
|
|
@@ -151,25 +151,26 @@ __arm_locally_streaming static void nk_euclideans_packed_f32_smef64_finalize_str
|
|
|
151
151
|
}
|
|
152
152
|
}
|
|
153
153
|
|
|
154
|
-
NK_PUBLIC void nk_euclideans_packed_f32_smef64(
|
|
155
|
-
nk_f32_t const *a, void const *b_packed, nk_f64_t *c,
|
|
156
|
-
nk_size_t rows, nk_size_t columns, nk_size_t depth, //
|
|
154
|
+
NK_PUBLIC void nk_euclideans_packed_f32_smef64( //
|
|
155
|
+
nk_f32_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
|
|
157
156
|
nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
|
|
158
157
|
|
|
159
158
|
nk_size_t const a_stride_elements = a_stride_in_bytes / sizeof(nk_f32_t);
|
|
160
159
|
nk_size_t const c_stride_elements = c_stride_in_bytes / sizeof(nk_f64_t);
|
|
161
160
|
|
|
161
|
+
nk_sme_start_streaming_();
|
|
162
162
|
nk_dots_packed_f32_smef64_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements, c_stride_elements);
|
|
163
|
-
|
|
164
|
-
|
|
163
|
+
nk_euclideans_packed_f32_smef64_finalize_ssve_(a, b_packed, c, rows, columns, depth, a_stride_elements,
|
|
164
|
+
c_stride_elements);
|
|
165
|
+
nk_sme_stop_streaming_();
|
|
165
166
|
}
|
|
166
167
|
|
|
167
168
|
#pragma endregion F32 Packed Euclidean
|
|
168
169
|
#pragma region F32 Symmetric Angular
|
|
169
170
|
|
|
170
|
-
|
|
171
|
-
nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements,
|
|
172
|
-
|
|
171
|
+
static void nk_angulars_symmetric_f32_smef64_finalize_ssve_( //
|
|
172
|
+
nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, nk_f64_t *result,
|
|
173
|
+
nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) NK_STREAMING_ {
|
|
173
174
|
// Phase 1: cache row norms on diagonal
|
|
174
175
|
for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
|
|
175
176
|
nk_f32_t const *row_vector = vectors + row_index * stride_elements;
|
|
@@ -204,25 +205,27 @@ __arm_locally_streaming static void nk_angulars_symmetric_f32_smef64_finalize_st
|
|
|
204
205
|
result[row_index * result_stride_elements + row_index] = 0;
|
|
205
206
|
}
|
|
206
207
|
|
|
207
|
-
NK_PUBLIC void nk_angulars_symmetric_f32_smef64(
|
|
208
|
-
nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes,
|
|
209
|
-
|
|
208
|
+
NK_PUBLIC void nk_angulars_symmetric_f32_smef64( //
|
|
209
|
+
nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, nk_f64_t *result,
|
|
210
|
+
nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
|
|
210
211
|
|
|
211
212
|
nk_size_t const stride_elements = stride_in_bytes / sizeof(nk_f32_t);
|
|
212
213
|
nk_size_t const result_stride_elements = result_stride_in_bytes / sizeof(nk_f64_t);
|
|
213
214
|
|
|
215
|
+
nk_sme_start_streaming_();
|
|
214
216
|
nk_dots_symmetric_f32_smef64_streaming_(vectors, vectors_count, depth, stride_elements, result,
|
|
215
217
|
result_stride_elements, row_start, row_count);
|
|
216
|
-
|
|
217
|
-
|
|
218
|
+
nk_angulars_symmetric_f32_smef64_finalize_ssve_(vectors, vectors_count, depth, stride_elements, result,
|
|
219
|
+
result_stride_elements, row_start, row_count);
|
|
220
|
+
nk_sme_stop_streaming_();
|
|
218
221
|
}
|
|
219
222
|
|
|
220
223
|
#pragma endregion F32 Symmetric Angular
|
|
221
224
|
#pragma region F32 Symmetric Euclidean
|
|
222
225
|
|
|
223
|
-
|
|
224
|
-
nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements,
|
|
225
|
-
|
|
226
|
+
static void nk_euclideans_symmetric_f32_smef64_finalize_ssve_( //
|
|
227
|
+
nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, nk_f64_t *result,
|
|
228
|
+
nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) NK_STREAMING_ {
|
|
226
229
|
// Phase 1: cache row norms on diagonal
|
|
227
230
|
for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
|
|
228
231
|
nk_f32_t const *row_vector = vectors + row_index * stride_elements;
|
|
@@ -257,26 +260,27 @@ __arm_locally_streaming static void nk_euclideans_symmetric_f32_smef64_finalize_
|
|
|
257
260
|
result[row_index * result_stride_elements + row_index] = 0;
|
|
258
261
|
}
|
|
259
262
|
|
|
260
|
-
NK_PUBLIC void nk_euclideans_symmetric_f32_smef64(
|
|
261
|
-
nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes,
|
|
262
|
-
|
|
263
|
+
NK_PUBLIC void nk_euclideans_symmetric_f32_smef64( //
|
|
264
|
+
nk_f32_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, nk_f64_t *result,
|
|
265
|
+
nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
|
|
263
266
|
|
|
264
267
|
nk_size_t const stride_elements = stride_in_bytes / sizeof(nk_f32_t);
|
|
265
268
|
nk_size_t const result_stride_elements = result_stride_in_bytes / sizeof(nk_f64_t);
|
|
266
269
|
|
|
270
|
+
nk_sme_start_streaming_();
|
|
267
271
|
nk_dots_symmetric_f32_smef64_streaming_(vectors, vectors_count, depth, stride_elements, result,
|
|
268
272
|
result_stride_elements, row_start, row_count);
|
|
269
|
-
|
|
270
|
-
|
|
273
|
+
nk_euclideans_symmetric_f32_smef64_finalize_ssve_(vectors, vectors_count, depth, stride_elements, result,
|
|
274
|
+
result_stride_elements, row_start, row_count);
|
|
275
|
+
nk_sme_stop_streaming_();
|
|
271
276
|
}
|
|
272
277
|
|
|
273
278
|
#pragma endregion F32 Symmetric Euclidean
|
|
274
279
|
#pragma region F64 Packed Angular
|
|
275
280
|
|
|
276
|
-
|
|
277
|
-
nk_f64_t const *a, void const *b_packed, nk_f64_t *c,
|
|
278
|
-
nk_size_t
|
|
279
|
-
nk_size_t a_stride_elements, nk_size_t c_stride_elements) {
|
|
281
|
+
static void nk_angulars_packed_f64_smef64_finalize_ssve_( //
|
|
282
|
+
nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
|
|
283
|
+
nk_size_t a_stride_elements, nk_size_t c_stride_elements) NK_STREAMING_ {
|
|
280
284
|
|
|
281
285
|
nk_dots_sme_packed_header_t const *header = (nk_dots_sme_packed_header_t const *)b_packed;
|
|
282
286
|
nk_f64_t const *b_norms = (nk_f64_t const *)((char const *)b_packed + header->norms_offset);
|
|
@@ -298,26 +302,26 @@ __arm_locally_streaming static void nk_angulars_packed_f64_smef64_finalize_strea
|
|
|
298
302
|
}
|
|
299
303
|
}
|
|
300
304
|
|
|
301
|
-
NK_PUBLIC void nk_angulars_packed_f64_smef64(
|
|
302
|
-
nk_f64_t const *a, void const *b_packed, nk_f64_t *c,
|
|
303
|
-
nk_size_t rows, nk_size_t columns, nk_size_t depth, //
|
|
305
|
+
NK_PUBLIC void nk_angulars_packed_f64_smef64( //
|
|
306
|
+
nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
|
|
304
307
|
nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
|
|
305
308
|
|
|
306
309
|
nk_size_t const a_stride_elements = a_stride_in_bytes / sizeof(nk_f64_t);
|
|
307
310
|
nk_size_t const c_stride_elements = c_stride_in_bytes / sizeof(nk_f64_t);
|
|
308
311
|
|
|
312
|
+
nk_sme_start_streaming_();
|
|
309
313
|
nk_dots_packed_f64_smef64_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements, c_stride_elements);
|
|
310
|
-
|
|
311
|
-
|
|
314
|
+
nk_angulars_packed_f64_smef64_finalize_ssve_(a, b_packed, c, rows, columns, depth, a_stride_elements,
|
|
315
|
+
c_stride_elements);
|
|
316
|
+
nk_sme_stop_streaming_();
|
|
312
317
|
}
|
|
313
318
|
|
|
314
319
|
#pragma endregion F64 Packed Angular
|
|
315
320
|
#pragma region F64 Packed Euclidean
|
|
316
321
|
|
|
317
|
-
|
|
318
|
-
nk_f64_t const *a, void const *b_packed, nk_f64_t *c,
|
|
319
|
-
nk_size_t
|
|
320
|
-
nk_size_t a_stride_elements, nk_size_t c_stride_elements) {
|
|
322
|
+
static void nk_euclideans_packed_f64_smef64_finalize_ssve_( //
|
|
323
|
+
nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
|
|
324
|
+
nk_size_t a_stride_elements, nk_size_t c_stride_elements) NK_STREAMING_ {
|
|
321
325
|
|
|
322
326
|
nk_dots_sme_packed_header_t const *header = (nk_dots_sme_packed_header_t const *)b_packed;
|
|
323
327
|
nk_f64_t const *b_norms = (nk_f64_t const *)((char const *)b_packed + header->norms_offset);
|
|
@@ -339,25 +343,26 @@ __arm_locally_streaming static void nk_euclideans_packed_f64_smef64_finalize_str
|
|
|
339
343
|
}
|
|
340
344
|
}
|
|
341
345
|
|
|
342
|
-
NK_PUBLIC void nk_euclideans_packed_f64_smef64(
|
|
343
|
-
nk_f64_t const *a, void const *b_packed, nk_f64_t *c,
|
|
344
|
-
nk_size_t rows, nk_size_t columns, nk_size_t depth, //
|
|
346
|
+
NK_PUBLIC void nk_euclideans_packed_f64_smef64( //
|
|
347
|
+
nk_f64_t const *a, void const *b_packed, nk_f64_t *c, nk_size_t rows, nk_size_t columns, nk_size_t depth,
|
|
345
348
|
nk_size_t a_stride_in_bytes, nk_size_t c_stride_in_bytes) {
|
|
346
349
|
|
|
347
350
|
nk_size_t const a_stride_elements = a_stride_in_bytes / sizeof(nk_f64_t);
|
|
348
351
|
nk_size_t const c_stride_elements = c_stride_in_bytes / sizeof(nk_f64_t);
|
|
349
352
|
|
|
353
|
+
nk_sme_start_streaming_();
|
|
350
354
|
nk_dots_packed_f64_smef64_streaming_(a, b_packed, c, rows, columns, depth, a_stride_elements, c_stride_elements);
|
|
351
|
-
|
|
352
|
-
|
|
355
|
+
nk_euclideans_packed_f64_smef64_finalize_ssve_(a, b_packed, c, rows, columns, depth, a_stride_elements,
|
|
356
|
+
c_stride_elements);
|
|
357
|
+
nk_sme_stop_streaming_();
|
|
353
358
|
}
|
|
354
359
|
|
|
355
360
|
#pragma endregion F64 Packed Euclidean
|
|
356
361
|
#pragma region F64 Symmetric Angular
|
|
357
362
|
|
|
358
|
-
|
|
359
|
-
nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements,
|
|
360
|
-
|
|
363
|
+
static void nk_angulars_symmetric_f64_smef64_finalize_ssve_( //
|
|
364
|
+
nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, nk_f64_t *result,
|
|
365
|
+
nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) NK_STREAMING_ {
|
|
361
366
|
// Phase 1: cache row norms on diagonal
|
|
362
367
|
for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
|
|
363
368
|
nk_f64_t const *row_vector = vectors + row_index * stride_elements;
|
|
@@ -392,25 +397,27 @@ __arm_locally_streaming static void nk_angulars_symmetric_f64_smef64_finalize_st
|
|
|
392
397
|
result[row_index * result_stride_elements + row_index] = 0;
|
|
393
398
|
}
|
|
394
399
|
|
|
395
|
-
NK_PUBLIC void nk_angulars_symmetric_f64_smef64(
|
|
396
|
-
nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes,
|
|
397
|
-
|
|
400
|
+
NK_PUBLIC void nk_angulars_symmetric_f64_smef64( //
|
|
401
|
+
nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, nk_f64_t *result,
|
|
402
|
+
nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
|
|
398
403
|
|
|
399
404
|
nk_size_t const stride_elements = stride_in_bytes / sizeof(nk_f64_t);
|
|
400
405
|
nk_size_t const result_stride_elements = result_stride_in_bytes / sizeof(nk_f64_t);
|
|
401
406
|
|
|
407
|
+
nk_sme_start_streaming_();
|
|
402
408
|
nk_dots_symmetric_f64_smef64_streaming_(vectors, vectors_count, depth, stride_elements, result,
|
|
403
409
|
result_stride_elements, row_start, row_count);
|
|
404
|
-
|
|
405
|
-
|
|
410
|
+
nk_angulars_symmetric_f64_smef64_finalize_ssve_(vectors, vectors_count, depth, stride_elements, result,
|
|
411
|
+
result_stride_elements, row_start, row_count);
|
|
412
|
+
nk_sme_stop_streaming_();
|
|
406
413
|
}
|
|
407
414
|
|
|
408
415
|
#pragma endregion F64 Symmetric Angular
|
|
409
416
|
#pragma region F64 Symmetric Euclidean
|
|
410
417
|
|
|
411
|
-
|
|
412
|
-
nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements,
|
|
413
|
-
|
|
418
|
+
static void nk_euclideans_symmetric_f64_smef64_finalize_ssve_( //
|
|
419
|
+
nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_elements, nk_f64_t *result,
|
|
420
|
+
nk_size_t result_stride_elements, nk_size_t row_start, nk_size_t row_count) NK_STREAMING_ {
|
|
414
421
|
// Phase 1: cache row norms on diagonal
|
|
415
422
|
for (nk_size_t row_index = row_start; row_index < row_start + row_count; ++row_index) {
|
|
416
423
|
nk_f64_t const *row_vector = vectors + row_index * stride_elements;
|
|
@@ -445,17 +452,19 @@ __arm_locally_streaming static void nk_euclideans_symmetric_f64_smef64_finalize_
|
|
|
445
452
|
result[row_index * result_stride_elements + row_index] = 0;
|
|
446
453
|
}
|
|
447
454
|
|
|
448
|
-
NK_PUBLIC void nk_euclideans_symmetric_f64_smef64(
|
|
449
|
-
nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes,
|
|
450
|
-
|
|
455
|
+
NK_PUBLIC void nk_euclideans_symmetric_f64_smef64( //
|
|
456
|
+
nk_f64_t const *vectors, nk_size_t vectors_count, nk_size_t depth, nk_size_t stride_in_bytes, nk_f64_t *result,
|
|
457
|
+
nk_size_t result_stride_in_bytes, nk_size_t row_start, nk_size_t row_count) {
|
|
451
458
|
|
|
452
459
|
nk_size_t const stride_elements = stride_in_bytes / sizeof(nk_f64_t);
|
|
453
460
|
nk_size_t const result_stride_elements = result_stride_in_bytes / sizeof(nk_f64_t);
|
|
454
461
|
|
|
462
|
+
nk_sme_start_streaming_();
|
|
455
463
|
nk_dots_symmetric_f64_smef64_streaming_(vectors, vectors_count, depth, stride_elements, result,
|
|
456
464
|
result_stride_elements, row_start, row_count);
|
|
457
|
-
|
|
458
|
-
|
|
465
|
+
nk_euclideans_symmetric_f64_smef64_finalize_ssve_(vectors, vectors_count, depth, stride_elements, result,
|
|
466
|
+
result_stride_elements, row_start, row_count);
|
|
467
|
+
nk_sme_stop_streaming_();
|
|
459
468
|
}
|
|
460
469
|
|
|
461
470
|
#pragma endregion F64 Symmetric Euclidean
|
|
@@ -739,6 +739,28 @@ NK_PUBLIC void nk_euclideans_symmetric_u8_sapphireamx(nk_u8_t const *vectors, nk
|
|
|
739
739
|
nk_size_t row_start, nk_size_t row_count);
|
|
740
740
|
#endif // NK_TARGET_SAPPHIREAMX
|
|
741
741
|
|
|
742
|
+
/* Granite Rapids backends using Intel AMX-FP16.
|
|
743
|
+
* Native FP16 spatial kernels.
|
|
744
|
+
*/
|
|
745
|
+
#if NK_TARGET_GRANITEAMX
|
|
746
|
+
/** @copydoc nk_angulars_packed_f16 */
|
|
747
|
+
NK_PUBLIC void nk_angulars_packed_f16_graniteamx(nk_f16_t const *a, void const *b_packed, nk_f32_t *result,
|
|
748
|
+
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
749
|
+
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
750
|
+
/** @copydoc nk_angulars_symmetric_f16 */
|
|
751
|
+
NK_PUBLIC void nk_angulars_symmetric_f16_graniteamx(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
752
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
753
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
754
|
+
/** @copydoc nk_euclideans_packed_f16 */
|
|
755
|
+
NK_PUBLIC void nk_euclideans_packed_f16_graniteamx(nk_f16_t const *a, void const *b_packed, nk_f32_t *result,
|
|
756
|
+
nk_size_t rows, nk_size_t cols, nk_size_t depth,
|
|
757
|
+
nk_size_t a_stride_in_bytes, nk_size_t r_stride_in_bytes);
|
|
758
|
+
/** @copydoc nk_euclideans_symmetric_f16 */
|
|
759
|
+
NK_PUBLIC void nk_euclideans_symmetric_f16_graniteamx(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
760
|
+
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
761
|
+
nk_size_t row_start, nk_size_t row_count);
|
|
762
|
+
#endif // NK_TARGET_GRANITEAMX
|
|
763
|
+
|
|
742
764
|
/* ARM SME backends using Scalable Matrix Extension.
|
|
743
765
|
* SME provides ZA tile registers for outer product operations.
|
|
744
766
|
* F16/BF16/I8/U8/E4M3 use ZA32 tiles, F32/F64 use ZA64 tiles (FEAT_SME_F64F64).
|
|
@@ -2078,6 +2100,7 @@ NK_PUBLIC void nk_euclideans_symmetric_u8_rvv(nk_u8_t const *vectors, nk_size_t
|
|
|
2078
2100
|
#include "numkong/spatials/alder.h"
|
|
2079
2101
|
#include "numkong/spatials/sierra.h"
|
|
2080
2102
|
#include "numkong/spatials/sapphireamx.h"
|
|
2103
|
+
#include "numkong/spatials/graniteamx.h"
|
|
2081
2104
|
#include "numkong/spatials/rvv.h"
|
|
2082
2105
|
#include "numkong/spatials/v128relaxed.h"
|
|
2083
2106
|
#include "numkong/spatials/sme.h"
|
|
@@ -2290,7 +2313,9 @@ NK_PUBLIC void nk_euclideans_symmetric_f32(nk_f32_t const *vectors, nk_size_t ve
|
|
|
2290
2313
|
NK_PUBLIC void nk_angulars_packed_f16(nk_f16_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
|
|
2291
2314
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
2292
2315
|
nk_size_t r_stride_in_bytes) {
|
|
2293
|
-
#if
|
|
2316
|
+
#if NK_TARGET_GRANITEAMX
|
|
2317
|
+
nk_angulars_packed_f16_graniteamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2318
|
+
#elif NK_TARGET_SME
|
|
2294
2319
|
nk_angulars_packed_f16_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2295
2320
|
#elif NK_TARGET_NEONFHM
|
|
2296
2321
|
nk_angulars_packed_f16_neonfhm(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
@@ -2311,7 +2336,10 @@ NK_PUBLIC void nk_angulars_packed_f16(nk_f16_t const *a, void const *b_packed, n
|
|
|
2311
2336
|
NK_PUBLIC void nk_angulars_symmetric_f16(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2312
2337
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
2313
2338
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2314
|
-
#if
|
|
2339
|
+
#if NK_TARGET_GRANITEAMX
|
|
2340
|
+
nk_angulars_symmetric_f16_graniteamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2341
|
+
row_count);
|
|
2342
|
+
#elif NK_TARGET_SME
|
|
2315
2343
|
nk_angulars_symmetric_f16_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2316
2344
|
#elif NK_TARGET_NEONFHM
|
|
2317
2345
|
nk_angulars_symmetric_f16_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
@@ -2337,7 +2365,9 @@ NK_PUBLIC void nk_angulars_symmetric_f16(nk_f16_t const *vectors, nk_size_t vect
|
|
|
2337
2365
|
NK_PUBLIC void nk_euclideans_packed_f16(nk_f16_t const *a, void const *b_packed, nk_f32_t *result, nk_size_t rows,
|
|
2338
2366
|
nk_size_t cols, nk_size_t depth, nk_size_t a_stride_in_bytes,
|
|
2339
2367
|
nk_size_t r_stride_in_bytes) {
|
|
2340
|
-
#if
|
|
2368
|
+
#if NK_TARGET_GRANITEAMX
|
|
2369
|
+
nk_euclideans_packed_f16_graniteamx(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2370
|
+
#elif NK_TARGET_SME
|
|
2341
2371
|
nk_euclideans_packed_f16_sme(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
2342
2372
|
#elif NK_TARGET_NEONFHM
|
|
2343
2373
|
nk_euclideans_packed_f16_neonfhm(a, b_packed, result, rows, cols, depth, a_stride_in_bytes, r_stride_in_bytes);
|
|
@@ -2358,7 +2388,10 @@ NK_PUBLIC void nk_euclideans_packed_f16(nk_f16_t const *a, void const *b_packed,
|
|
|
2358
2388
|
NK_PUBLIC void nk_euclideans_symmetric_f16(nk_f16_t const *vectors, nk_size_t vectors_count, nk_size_t depth,
|
|
2359
2389
|
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
|
|
2360
2390
|
nk_size_t row_start, nk_size_t row_count) {
|
|
2361
|
-
#if
|
|
2391
|
+
#if NK_TARGET_GRANITEAMX
|
|
2392
|
+
nk_euclideans_symmetric_f16_graniteamx(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
|
2393
|
+
row_count);
|
|
2394
|
+
#elif NK_TARGET_SME
|
|
2362
2395
|
nk_euclideans_symmetric_f16_sme(vectors, vectors_count, depth, stride, result, result_stride, row_start, row_count);
|
|
2363
2396
|
#elif NK_TARGET_NEONFHM
|
|
2364
2397
|
nk_euclideans_symmetric_f16_neonfhm(vectors, vectors_count, depth, stride, result, result_stride, row_start,
|
package/include/numkong/types.h
CHANGED
|
@@ -69,6 +69,20 @@
|
|
|
69
69
|
#define _GNU_SOURCE
|
|
70
70
|
#endif
|
|
71
71
|
|
|
72
|
+
// MSan (MemorySanitizer) cannot track data flow through SVE horizontal reductions
|
|
73
|
+
// like `svaddv`, which move data from vector registers to scalar registers via
|
|
74
|
+
// architecture-specific paths invisible to the compiler. `nk_unpoison_` marks the
|
|
75
|
+
// resulting scalar as initialized so MSan does not report false positives.
|
|
76
|
+
#if defined(__has_feature)
|
|
77
|
+
#if __has_feature(memory_sanitizer)
|
|
78
|
+
#include <sanitizer/msan_interface.h>
|
|
79
|
+
#define nk_unpoison_(ptr, size) __msan_unpoison((ptr), (size))
|
|
80
|
+
#endif
|
|
81
|
+
#endif
|
|
82
|
+
#ifndef nk_unpoison_
|
|
83
|
+
// No-op fallback used when MSan is not active: evaluates both arguments once and discards them.
// The expansion is wrapped in parentheses so it stays a single expression in any context
// (e.g. as the last operand of `?:`), matching the single-call shape of the MSan variant.
#define nk_unpoison_(ptr, size) ((void)(ptr), (void)(size))
|
|
84
|
+
#endif
|
|
85
|
+
|
|
72
86
|
// Inferring target OS: Windows, macOS, Linux, or FreeBSD
|
|
73
87
|
#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)
|
|
74
88
|
#define NK_DEFINED_WINDOWS_ 1
|
|
@@ -1627,6 +1641,51 @@ NK_INTERNAL nk_size_t nk_sme_cntd_(void) {
|
|
|
1627
1641
|
__asm__ __volatile__("smstart sm\n\t" "cntd %0\n\t" "smstop sm" : "=r"(r));
|
|
1628
1642
|
return (nk_size_t)r;
|
|
1629
1643
|
}
|
|
1644
|
+
|
|
1645
|
+
/** @brief Enter streaming SVE mode (PSTATE.SM = 1). Caller is responsible for smstop. */
|
|
1646
|
+
NK_INTERNAL void nk_sme_start_streaming_(void) { __asm__ __volatile__("smstart sm" ::: "memory"); }
|
|
1647
|
+
/** @brief Exit streaming SVE mode (PSTATE.SM = 0). Must pair with nk_sme_start_streaming_. */
|
|
1648
|
+
NK_INTERNAL void nk_sme_stop_streaming_(void) { __asm__ __volatile__("smstop sm" ::: "memory"); }
|
|
1649
|
+
|
|
1650
|
+
/**
|
|
1651
|
+
* SME runtime stubs — weak definitions for symbols the compiler may reference
|
|
1652
|
+
* from __arm_streaming or __arm_new("za") functions. Every TU that includes
|
|
1653
|
+
* this header emits a weak copy; the linker deduplicates to one.
|
|
1654
|
+
*
|
|
1655
|
+
* - __arm_tpidr2_save / __arm_tpidr2_restore: lazy ZA save/restore protocol
|
|
1656
|
+
* used in __arm_new("za") prologues. Always no-ops in NumKong because no
|
|
1657
|
+
* NK_PUBLIC function carries ZA state (TPIDR2_EL0 is always null at entry).
|
|
1658
|
+
*
|
|
1659
|
+
* - __arm_sc_memset / __arm_sc_memcpy / __arm_sc_memmove: streaming-compatible
|
|
1660
|
+
* memory routines the compiler may emit inside __arm_streaming functions.
|
|
1661
|
+
* Apple Clang provides these in its runtime; upstream LLVM does not.
|
|
1662
|
+
*/
|
|
1663
|
+
__attribute__((weak)) void __arm_tpidr2_save(void) {}
|
|
1664
|
+
__attribute__((weak)) void __arm_tpidr2_restore(void *blk) { nk_unused_(blk); }
|
|
1665
|
+
__attribute__((weak, target("+sme"))) void *__arm_sc_memset(void *d, int c, __SIZE_TYPE__ n) __arm_streaming_compatible {
|
|
1666
|
+
unsigned char *p = (unsigned char *)d;
|
|
1667
|
+
for (__SIZE_TYPE__ i = 0; i < n; i++) p[i] = (unsigned char)c;
|
|
1668
|
+
return d;
|
|
1669
|
+
}
|
|
1670
|
+
__attribute__((weak, target("+sme"))) void *__arm_sc_memcpy(void *d, void const *s,
|
|
1671
|
+
__SIZE_TYPE__ n) __arm_streaming_compatible {
|
|
1672
|
+
unsigned char *dp = (unsigned char *)d;
|
|
1673
|
+
unsigned char const *sp = (unsigned char const *)s;
|
|
1674
|
+
for (__SIZE_TYPE__ i = 0; i < n; i++) dp[i] = sp[i];
|
|
1675
|
+
return d;
|
|
1676
|
+
}
|
|
1677
|
+
__attribute__((weak, target("+sme"))) void *__arm_sc_memmove(void *d, void const *s,
|
|
1678
|
+
__SIZE_TYPE__ n) __arm_streaming_compatible {
|
|
1679
|
+
unsigned char *dp = (unsigned char *)d;
|
|
1680
|
+
unsigned char const *sp = (unsigned char const *)s;
|
|
1681
|
+
if (dp < sp) {
|
|
1682
|
+
for (__SIZE_TYPE__ i = 0; i < n; i++) dp[i] = sp[i];
|
|
1683
|
+
}
|
|
1684
|
+
else {
|
|
1685
|
+
for (__SIZE_TYPE__ i = n; i > 0; i--) dp[i - 1] = sp[i - 1];
|
|
1686
|
+
}
|
|
1687
|
+
return d;
|
|
1688
|
+
}
|
|
1630
1689
|
#endif
|
|
1631
1690
|
|
|
1632
1691
|
#ifdef __cplusplus
|
|
@@ -99,6 +99,19 @@ Object.defineProperty(exports, "PackedMatrix", { enumerable: true, get: function
|
|
|
99
99
|
Object.defineProperty(exports, "DType", { enumerable: true, get: function () { return types_js_1.DType; } });
|
|
100
100
|
Object.defineProperty(exports, "outputDtype", { enumerable: true, get: function () { return types_js_1.outputDtype; } });
|
|
101
101
|
function loadNativeAddon() {
|
|
102
|
+
var _a;
|
|
103
|
+
// Duplicate-libomp guard. We ship our own `libomp.dylib` next to
|
|
104
|
+
// `numkong.node` in each `@numkong/darwin-*` package, but another OpenMP
|
|
105
|
+
// runtime (e.g. one loaded by another native addon) may already be
|
|
106
|
+
// resident. `KMP_DUPLICATE_LIB_OK=TRUE` tells LLVM libomp / Intel
|
|
107
|
+
// libiomp5 to coexist; it must be in `process.env` before the `require()`
|
|
108
|
+
// below triggers the addon's `dlopen`, since libomp's constructor reads
|
|
109
|
+
// the env during dependency resolution and cannot be influenced
|
|
110
|
+
// afterwards. Left unguarded because the variable is harmless on
|
|
111
|
+
// platforms / runtimes (GCC libgomp) that don't recognize it, and a user
|
|
112
|
+
// who set it to something else is respected by `??=`. See
|
|
113
|
+
// `python/numkong/__init__.py` for the Python analog.
|
|
114
|
+
(_a = process.env).KMP_DUPLICATE_LIB_OK ?? (_a.KMP_DUPLICATE_LIB_OK = "TRUE");
|
|
102
115
|
// Tier 1: platform-specific optional dependency (@numkong/<os>-<arch>)
|
|
103
116
|
try {
|
|
104
117
|
const req = (0, node_module_1.createRequire)(path.join(getDirName(), "noop.js"));
|
|
@@ -31,6 +31,19 @@ import { existsSync } from "node:fs";
|
|
|
31
31
|
import { getFileName, getRoot } from "bindings";
|
|
32
32
|
import { setConversionFunctions, Float16Array, BFloat16Array, E4M3Array, E5M2Array, BinaryArray, TensorBase, VectorBase, VectorView, Vector, MatrixBase, Matrix, PackedMatrix, DType, dtypeToString, outputDtype } from "./types.js";
|
|
33
33
|
function loadNativeAddon() {
|
|
34
|
+
var _a;
|
|
35
|
+
// Duplicate-libomp guard. We ship our own `libomp.dylib` next to
|
|
36
|
+
// `numkong.node` in each `@numkong/darwin-*` package, but another OpenMP
|
|
37
|
+
// runtime (e.g. one loaded by another native addon) may already be
|
|
38
|
+
// resident. `KMP_DUPLICATE_LIB_OK=TRUE` tells LLVM libomp / Intel
|
|
39
|
+
// libiomp5 to coexist; it must be in `process.env` before the `require()`
|
|
40
|
+
// below triggers the addon's `dlopen`, since libomp's constructor reads
|
|
41
|
+
// the env during dependency resolution, so setting it any later has no
|
|
42
|
+
// effect. Left unguarded because the variable is harmless on
|
|
43
|
+
// platforms / runtimes (GCC libgomp) that don't recognize it, and a user
|
|
44
|
+
// who set it to something else is respected by `??=`. See
|
|
45
|
+
// `python/numkong/__init__.py` for the Python analog.
|
|
46
|
+
(_a = process.env).KMP_DUPLICATE_LIB_OK ?? (_a.KMP_DUPLICATE_LIB_OK = "TRUE");
|
|
34
47
|
// Tier 1: platform-specific optional dependency (@numkong/<os>-<arch>)
|
|
35
48
|
try {
|
|
36
49
|
const req = createRequire(path.join(getDirName(), "noop.js"));
|
package/javascript/numkong.c
CHANGED
|
@@ -9,10 +9,17 @@
|
|
|
9
9
|
|
|
10
10
|
#include <string.h> // `strcmp` function
|
|
11
11
|
|
|
12
|
+
#if defined(NK_USE_OPENMP)
|
|
13
|
+
#include <omp.h>
|
|
14
|
+
#endif
|
|
15
|
+
|
|
12
16
|
#include <node_api.h> // `napi_*` functions — N-API v6+ for BigInt (Node ≥ 10.20)
|
|
13
17
|
|
|
14
18
|
#include <numkong/numkong.h> // `nk_*` functions — must be first to bring `_GNU_SOURCE`
|
|
15
19
|
|
|
20
|
+
#define NK_PARALLEL_PACKED_TILE 64
|
|
21
|
+
#define NK_PARALLEL_SYMMETRIC_TILE 32
|
|
22
|
+
|
|
16
23
|
/** @brief Global variable that caches the CPU capabilities, and is computed just once, when the module is loaded. */
|
|
17
24
|
nk_capability_t static_capabilities = nk_cap_serial_k;
|
|
18
25
|
|
|
@@ -482,11 +489,11 @@ static napi_value api_dots_pack(napi_env env, napi_callback_info info) {
|
|
|
482
489
|
* dtype
|
|
483
490
|
*/
|
|
484
491
|
static napi_value api_packed_common(napi_env env, napi_callback_info info, nk_kernel_kind_t kernel_kind) {
|
|
485
|
-
size_t argc =
|
|
486
|
-
napi_value args[
|
|
492
|
+
size_t argc = 10;
|
|
493
|
+
napi_value args[10];
|
|
487
494
|
napi_get_cb_info(env, info, &argc, args, NULL, NULL);
|
|
488
|
-
if (argc
|
|
489
|
-
napi_throw_error(env, NULL, "Packed operation requires 9 arguments");
|
|
495
|
+
if (argc < 9 || argc > 10) {
|
|
496
|
+
napi_throw_error(env, NULL, "Packed operation requires 9-10 arguments (last is optional threads)");
|
|
490
497
|
return NULL;
|
|
491
498
|
}
|
|
492
499
|
|
|
@@ -533,8 +540,26 @@ static napi_value api_packed_common(napi_env env, napi_callback_info info, nk_ke
|
|
|
533
540
|
return NULL;
|
|
534
541
|
}
|
|
535
542
|
|
|
536
|
-
|
|
537
|
-
|
|
543
|
+
uint32_t threads = 1;
|
|
544
|
+
if (argc == 10) napi_get_value_uint32(env, args[9], &threads);
|
|
545
|
+
|
|
546
|
+
#if defined(NK_USE_OPENMP)
|
|
547
|
+
if (threads == 0) threads = (uint32_t)omp_get_max_threads();
|
|
548
|
+
omp_set_num_threads((int)threads);
|
|
549
|
+
#endif
|
|
550
|
+
|
|
551
|
+
// `int` loop counter pre-declared: MSVC's OpenMP stays at 2.0 canonical
|
|
552
|
+
// form, which forbids in-init declarations and rejects 64-bit iterators
|
|
553
|
+
// — either would trip C3015.
|
|
554
|
+
int const tile_count = (int)nk_size_divide_round_up_(height, NK_PARALLEL_PACKED_TILE);
|
|
555
|
+
int tile_idx;
|
|
556
|
+
#pragma omp parallel for schedule(dynamic, 1) if (threads > 1)
|
|
557
|
+
for (tile_idx = 0; tile_idx < tile_count; tile_idx++) {
|
|
558
|
+
nk_size_t row = (nk_size_t)tile_idx * NK_PARALLEL_PACKED_TILE;
|
|
559
|
+
nk_size_t chunk = (row + NK_PARALLEL_PACKED_TILE <= height) ? NK_PARALLEL_PACKED_TILE : (height - row);
|
|
560
|
+
kernel((char const *)a_data + row * a_stride, packed_data, (char *)result_data + row * result_stride, chunk,
|
|
561
|
+
(nk_size_t)width, (nk_size_t)depth, (nk_size_t)a_stride, (nk_size_t)result_stride);
|
|
562
|
+
}
|
|
538
563
|
return NULL;
|
|
539
564
|
}
|
|
540
565
|
|
|
@@ -554,11 +579,11 @@ static napi_value api_euclideans_packed(napi_env env, napi_callback_info info) {
|
|
|
554
579
|
* string dtype
|
|
555
580
|
*/
|
|
556
581
|
static napi_value api_symmetric_common(napi_env env, napi_callback_info info, nk_kernel_kind_t kernel_kind) {
|
|
557
|
-
size_t argc =
|
|
558
|
-
napi_value args[
|
|
582
|
+
size_t argc = 10;
|
|
583
|
+
napi_value args[10];
|
|
559
584
|
napi_get_cb_info(env, info, &argc, args, NULL, NULL);
|
|
560
|
-
if (argc
|
|
561
|
-
napi_throw_error(env, NULL, "Symmetric operation requires 9 arguments");
|
|
585
|
+
if (argc < 9 || argc > 10) {
|
|
586
|
+
napi_throw_error(env, NULL, "Symmetric operation requires 9-10 arguments (last is optional threads)");
|
|
562
587
|
return NULL;
|
|
563
588
|
}
|
|
564
589
|
|
|
@@ -601,8 +626,27 @@ static napi_value api_symmetric_common(napi_env env, napi_callback_info info, nk
|
|
|
601
626
|
return NULL;
|
|
602
627
|
}
|
|
603
628
|
|
|
604
|
-
|
|
605
|
-
|
|
629
|
+
uint32_t threads = 1;
|
|
630
|
+
if (argc == 10) napi_get_value_uint32(env, args[9], &threads);
|
|
631
|
+
|
|
632
|
+
#if defined(NK_USE_OPENMP)
|
|
633
|
+
if (threads == 0) threads = (uint32_t)omp_get_max_threads();
|
|
634
|
+
omp_set_num_threads((int)threads);
|
|
635
|
+
#endif
|
|
636
|
+
|
|
637
|
+
// `int` loop counter pre-declared: see note at `api_packed_common`.
|
|
638
|
+
int const tile_count = (int)nk_size_divide_round_up_(row_count, NK_PARALLEL_SYMMETRIC_TILE);
|
|
639
|
+
int tile_idx;
|
|
640
|
+
#pragma omp parallel for schedule(dynamic, 1) if (threads > 1)
|
|
641
|
+
for (tile_idx = 0; tile_idx < tile_count; tile_idx++) {
|
|
642
|
+
nk_size_t tile_start = (nk_size_t)row_start + (nk_size_t)tile_idx * NK_PARALLEL_SYMMETRIC_TILE;
|
|
643
|
+
nk_size_t tile_rows = (tile_start + NK_PARALLEL_SYMMETRIC_TILE <= (nk_size_t)row_start + row_count)
|
|
644
|
+
? NK_PARALLEL_SYMMETRIC_TILE
|
|
645
|
+
: ((nk_size_t)row_start + row_count - tile_start);
|
|
646
|
+
kernel(vectors_data, (nk_size_t)n_vectors, (nk_size_t)depth, (nk_size_t)vectors_stride, result_data,
|
|
647
|
+
(nk_size_t)result_stride, tile_start, tile_rows);
|
|
648
|
+
}
|
|
649
|
+
|
|
606
650
|
return NULL;
|
|
607
651
|
}
|
|
608
652
|
|
package/javascript/numkong.ts
CHANGED
|
@@ -33,6 +33,19 @@ import { getFileName, getRoot } from "bindings";
|
|
|
33
33
|
import { setConversionFunctions, Float16Array, BFloat16Array, E4M3Array, E5M2Array, BinaryArray, TensorBase, VectorBase, VectorView, Vector, MatrixBase, Matrix, PackedMatrix, DType, dtypeToString, outputDtype, KernelFamily } from "./types.js";
|
|
34
34
|
|
|
35
35
|
function loadNativeAddon(): any {
|
|
36
|
+
// Duplicate-libomp guard. We ship our own `libomp.dylib` next to
|
|
37
|
+
// `numkong.node` in each `@numkong/darwin-*` package, but another OpenMP
|
|
38
|
+
// runtime (e.g. one loaded by another native addon) may already be
|
|
39
|
+
// resident. `KMP_DUPLICATE_LIB_OK=TRUE` tells LLVM libomp / Intel
|
|
40
|
+
// libiomp5 to coexist; it must be in `process.env` before the `require()`
|
|
41
|
+
// below triggers the addon's `dlopen`, since libomp's constructor reads
|
|
42
|
+
// the env during dependency resolution, so setting it any later has no
|
|
43
|
+
// effect. Left unguarded because the variable is harmless on
|
|
44
|
+
// platforms / runtimes (GCC libgomp) that don't recognize it, and a user
|
|
45
|
+
// who set it to something else is respected by `??=`. See
|
|
46
|
+
// `python/numkong/__init__.py` for the Python analog.
|
|
47
|
+
process.env.KMP_DUPLICATE_LIB_OK ??= "TRUE";
|
|
48
|
+
|
|
36
49
|
// Tier 1: platform-specific optional dependency (@numkong/<os>-<arch>)
|
|
37
50
|
try {
|
|
38
51
|
const req = createRequire(path.join(getDirName(), "noop.js"));
|