bblean 0.6.0b1__cp313-cp313-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bblean/csrc/README.md ADDED
@@ -0,0 +1 @@
1
+ # C++ extensions for accelerated similarity calculations
@@ -0,0 +1,521 @@
1
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <limits>
#include <numeric>
#include <optional>
#include <stdexcept>
#include <type_traits>
#include <vector>
14
+
15
// Scalar popcount intrinsics:
#if defined(__SSE4_2__) || defined(_M_SSE4_2)
// Compiler-portable, but *not available in systems that do not have SSE*
// (which should be almost no CPUs nowadays)
// Not actually vector instructions, they just live in the SSE header
// Should be *exactly as fast* as __(builtin_)popcnt(ll) (compile to the same
// code)
//
// nmmintrin.h is the SSE4.2 intrinsics (only) header for all compilers
// NOTE: This ifdef is probably overkill, almost all cases should be covered by
// the GCC|Clang|MSVC ifdefs, but it doesn't hurt to add it
// NOTE(fix): the previous spelling __SSE_4_2__ is not a macro any compiler
// defines; GCC/Clang define __SSE4_2__ when SSE4.2 codegen is enabled, so
// this branch was unreachable before.
#include <nmmintrin.h>
#define POPCOUNT_32 _mm_popcnt_u32
#define POPCOUNT_64 _mm_popcnt_u64
#elif defined(_MSC_VER)
// Windows (MSVC compiler)
#include <intrin.h>
#define POPCOUNT_32 __popcnt
#define POPCOUNT_64 __popcnt64
#elif defined(__GNUC__) || defined(__clang__)
// GCC | Clang
#define POPCOUNT_32 __builtin_popcount
#define POPCOUNT_64 __builtin_popcountll
#else
// If popcnt is not hardware supported numpy rolls out its own hand-coded
// version, fail for simplicity since it is not worth it to support those archs
#error "Popcount not supported in target architecture"
#endif
43
+
44
// TODO: See if worth it to use vector popcount intrinsics (AVX-512, only some
// CPU) like jt_sim_packed
namespace py = pybind11;

// Numpy array guaranteed to be C-contiguous; `forcecast` converts (and may
// copy) inputs of a different dtype/layout, so the raw-pointer loops in this
// file can always assume dense row-major data of type T.
template <typename T>
using CArrayForcecast =
    py::array_t<T, py::array::c_style | py::array::forcecast>;
51
+
52
+ auto is_8byte_aligned(const py::array_t<uint8_t>& a) -> bool {
53
+ // Convert between ptr and integer requires reinterpret
54
+ return reinterpret_cast<std::uintptr_t>(a.data()) % alignof(uint64_t) == 0;
55
+ }
56
+
57
+ auto print_8byte_alignment_check(const py::array_t<uint8_t>& arr) -> void {
58
+ py::print("arr buf addr: ", reinterpret_cast<std::uintptr_t>(arr.data()));
59
+ py::print("uint64_t alignment requirement: ", alignof(uint64_t));
60
+ py::print("Is 8-byte aligned: ", is_8byte_aligned(arr));
61
+ }
62
+
63
+ uint32_t _popcount_1d(const py::array_t<uint8_t>& arr) {
64
+ if (arr.ndim() != 1) {
65
+ throw std::runtime_error("Input array must be 1-dimensional");
66
+ }
67
+ #ifdef DEBUG_LOGS
68
+ print_8byte_alignment_check(arr);
69
+ #endif
70
+ uint32_t count{0}; // Output scalar
71
+ py::ssize_t steps = arr.shape(0);
72
+ if (is_8byte_aligned(arr) and (steps % 64 == 0)) {
73
+ #ifdef DEBUG_LOGS
74
+ py::print("DEBUG: _popcount_1d fn triggered uint64 + popcount 64");
75
+ #endif
76
+ // Aligned to 64-bit boundary, interpret as uint64_t
77
+ steps /= sizeof(uint64_t);
78
+ auto in_cptr = static_cast<const uint64_t*>(arr.request().ptr);
79
+ for (py::ssize_t i{0}; i != steps; ++i) { // not auto-vec by GCC
80
+ count += POPCOUNT_64(in_cptr[i]);
81
+ }
82
+ return count;
83
+ }
84
+
85
+ #ifdef DEBUG_LOGS
86
+ py::print("DEBUG: _popcount_1d fn triggered uint8 + popcount 32");
87
+ #endif
88
+ // Misaligned, loop over bytes
89
+ auto in_cptr = arr.data();
90
+ for (py::ssize_t i{0}; i != steps; ++i) { // not auto-vec by GCC
91
+ count += POPCOUNT_32(in_cptr[i]); // uint8 promoted to uint32
92
+ }
93
+ return count;
94
+ }
95
+
96
+ // TODO: Currently this is pretty slow unless hitting the "uint64_t" branch,
97
+ // maybe two pass approach? first compute all popcounts, then sum (Numpy does
98
+ // this). Maybe the additions could be auto-vec?
99
+ py::array_t<uint32_t> _popcount_2d(const CArrayForcecast<uint8_t>& arr) {
100
+ if (arr.ndim() != 2) {
101
+ throw std::runtime_error("Input array must be 2-dimensional");
102
+ }
103
+ const py::ssize_t n_samples = arr.shape(0);
104
+
105
+ auto out = py::array_t<uint32_t>(n_samples);
106
+ auto out_ptr = out.mutable_data();
107
+ std::memset(out_ptr, 0, out.nbytes());
108
+
109
+ #ifdef DEBUG_LOGS
110
+ print_8byte_alignment_check(arr);
111
+ #endif
112
+ py::ssize_t steps = arr.shape(1);
113
+ if (is_8byte_aligned(arr) and (steps % 64 == 0)) {
114
+ #ifdef DEBUG_LOGS
115
+ py::print("DEBUG: _popcount_2d fn triggered uint64 + popcount 64");
116
+ #endif
117
+ // Aligned to 64-bit boundary, interpret as uint64_t
118
+ steps /= sizeof(uint64_t);
119
+ auto in_cptr = static_cast<const uint64_t*>(arr.request().ptr);
120
+ for (py::ssize_t i{0}; i != n_samples; ++i) { // not auto-vec by GCC
121
+ const uint64_t* row_cptr = in_cptr + i * steps;
122
+ for (py::ssize_t j{0}; j != steps; ++j) { // not auto-vec by GCC
123
+ out_ptr[i] += POPCOUNT_64(row_cptr[j]);
124
+ }
125
+ }
126
+ return out;
127
+ }
128
+
129
+ #ifdef DEBUG_LOGS
130
+ py::print("DEBUG: _popcount_2d fn triggered uint8 + popcount 32");
131
+ #endif
132
+ // Misaligned, loop over bytes
133
+ auto in_cptr = arr.data();
134
+ for (py::ssize_t i{0}; i != n_samples; ++i) { // not auto-vec by GCC
135
+ const uint8_t* row_cptr = in_cptr + i * steps;
136
+ for (py::ssize_t j{0}; j != steps; ++j) { // not auto-vec by GCC
137
+ out_ptr[i] += POPCOUNT_32(row_cptr[j]);
138
+ }
139
+ }
140
+ return out;
141
+ }
142
+
143
// Lookup table with shape (256, 8): row `v` holds the 8 bits of byte value
// `v`, most-significant bit first, each expanded to a uint8_t (0 or 1).
constexpr std::array<std::array<uint8_t, 8>, 256> makeByteToBitsLookupTable() {
    std::array<std::array<uint8_t, 8>, 256> table{};
    for (int value{0}; value != 256; ++value) {
        for (int bit{0}; bit != 8; ++bit) {
            // Extract bit `bit` (counting from the LSB) and store it at the
            // mirrored index so each row is laid out MSB-first
            table[value][7 - bit] = static_cast<uint8_t>((value >> bit) & 1);
        }
    }
    return table;
}

constexpr auto BYTE_TO_BITS = makeByteToBitsLookupTable();
158
+
159
+ py::array_t<uint8_t> _nochecks_unpack_fingerprints_1d(
160
+ const CArrayForcecast<uint8_t>& packed_fps,
161
+ std::optional<py::ssize_t> n_features_opt = std::nullopt) {
162
+ py::ssize_t n_bytes = packed_fps.shape(0);
163
+ py::ssize_t n_features = n_features_opt.value_or(n_bytes * 8);
164
+ if (n_features % 8 != 0) {
165
+ throw std::runtime_error("Only n_features divisible by 8 is supported");
166
+ }
167
+ auto out = py::array_t<uint8_t>(n_features);
168
+ auto out_ptr = out.mutable_data();
169
+ auto in_cptr = packed_fps.data();
170
+ for (py::ssize_t j{0}; j != n_features; j += 8) { // not auto-vec by GCC
171
+ // Copy the next 8 uint8 values in one go
172
+ std::memcpy(out_ptr + j, BYTE_TO_BITS[in_cptr[j / 8]].data(), 8);
173
+ }
174
+ return out;
175
+ }
176
+
177
+ py::array_t<uint8_t> _nochecks_unpack_fingerprints_2d(
178
+ const CArrayForcecast<uint8_t>& packed_fps,
179
+ std::optional<py::ssize_t> n_features_opt = std::nullopt) {
180
+ py::ssize_t n_samples = packed_fps.shape(0);
181
+ py::ssize_t n_bytes = packed_fps.shape(1);
182
+ py::ssize_t n_features = n_features_opt.value_or(n_bytes * 8);
183
+ if (n_features % 8 != 0) {
184
+ throw std::runtime_error("Only features divisible by 8 is supported");
185
+ }
186
+ auto out = py::array_t<uint8_t>({n_samples, n_features});
187
+ // Unchecked accessors (benchmarked and there is no real advantage to using
188
+ // ptrs)
189
+ auto acc_in = packed_fps.unchecked<2>();
190
+ auto acc_out = out.mutable_unchecked<2>();
191
+
192
+ for (py::ssize_t i{0}; i != n_samples; ++i) { // not auto-vec by GCC
193
+ for (py::ssize_t j{0}; j != n_features;
194
+ j += 8) { // not auto-vec by GCC
195
+ // Copy the next 8 uint8 values in one go
196
+ std::memcpy(&acc_out(i, j), BYTE_TO_BITS[acc_in(i, j / 8)].data(),
197
+ 8);
198
+ }
199
+ }
200
+ return out;
201
+ }
202
+
203
+ // Wrapper over _nochecks_unpack_fingerprints that performs ndim checks
204
+ py::array_t<uint8_t> unpack_fingerprints(
205
+ const CArrayForcecast<uint8_t>& packed_fps,
206
+ std::optional<py::ssize_t> n_features_opt = std::nullopt) {
207
+ if (packed_fps.ndim() == 1) {
208
+ return _nochecks_unpack_fingerprints_1d(packed_fps, n_features_opt);
209
+ }
210
+ if (packed_fps.ndim() == 2) {
211
+ return _nochecks_unpack_fingerprints_2d(packed_fps, n_features_opt);
212
+ }
213
+ throw std::runtime_error("Input array must be 1- or 2-dimensional");
214
+ }
215
+
216
+ template <typename T>
217
+ py::array_t<uint8_t> centroid_from_sum(const CArrayForcecast<T>& linear_sum,
218
+ int64_t n_samples, bool pack = true) {
219
+ if (linear_sum.ndim() != 1) {
220
+ throw std::runtime_error("linear_sum must be 1-dimensional");
221
+ }
222
+
223
+ py::ssize_t n_features = linear_sum.shape(0);
224
+ auto linear_sum_cptr = linear_sum.data();
225
+
226
+ py::array_t<uint8_t> centroid_unpacked(n_features);
227
+ auto centroid_unpacked_ptr = centroid_unpacked.mutable_data();
228
+ if (n_samples <= 1) {
229
+ for (int i{0}; i != n_features;
230
+ ++i) { // yes auto-vec by GCC (versioned due to possible alias)
231
+ // Cast not required, but added for clarity since this is a
232
+ // narrowing conversion. if n_samples <= 1 then linear_sum is
233
+ // guaranteed to have a value that a uint8_t can hold (it should be
234
+ // 0 or 1)
235
+ // memcpy not possible due to the required cast
236
+ centroid_unpacked_ptr[i] = static_cast<uint8_t>(linear_sum_cptr[i]);
237
+ }
238
+ } else {
239
+ auto threshold = n_samples * 0.5;
240
+ for (int i{0}; i != n_features; ++i) { // not auto-vec by GCC
241
+ centroid_unpacked_ptr[i] =
242
+ (linear_sum_cptr[i] >= threshold) ? 1 : 0;
243
+ }
244
+ }
245
+
246
+ if (not pack) {
247
+ return centroid_unpacked;
248
+ }
249
+
250
+ auto centroid_unpacked_cptr = centroid_unpacked.data();
251
+ int n_bytes = (n_features + 7) / 8;
252
+ auto centroid_packed = py::array_t<uint8_t>(n_bytes);
253
+ auto centroid_packed_ptr = centroid_packed.mutable_data();
254
+ std::memset(centroid_packed_ptr, 0, centroid_packed.nbytes());
255
+
256
+ // Slower than numpy, due to lack of SIMD
257
+ // The following loop is *marginally slower* (benchmkd') than the
258
+ // implemented one: for (int i{0}; i != n_features; ++i) {
259
+ // if (centroid_unpacked_cptr[i]) {
260
+ // centroid_packed_ptr[i / 8] |= (1 << (7 - (i % 8)));
261
+ // }
262
+ // }
263
+ // TODO: Check if GCC is auto-vectorizing
264
+ for (int i{0}, stride{0}; i != n_bytes; i++, stride += 8) {
265
+ for (int b{0}; b != 8; ++b) {
266
+ centroid_packed_ptr[i] <<= 1;
267
+ centroid_packed_ptr[i] |= centroid_unpacked_cptr[stride + b];
268
+ }
269
+ }
270
+ return centroid_packed;
271
+ }
272
+
273
+ double jt_isim_from_sum(const CArrayForcecast<uint64_t>& linear_sum,
274
+ int64_t n_objects) {
275
+ if (n_objects < 2) {
276
+ PyErr_WarnEx(PyExc_RuntimeWarning,
277
+ "Invalid n_objects in isim. Expected n_objects >= 2", 1);
278
+ return std::numeric_limits<double>::quiet_NaN();
279
+ }
280
+ if (linear_sum.ndim() != 1) {
281
+ throw std::runtime_error("linear_sum must be a 1D array");
282
+ }
283
+ py::ssize_t n_features = linear_sum.shape(0);
284
+
285
+ auto in_cptr = linear_sum.data();
286
+ uint64_t sum_kq{0};
287
+ for (py::ssize_t i{0}; i != n_features; ++i) { // yes auto-vec by GCC
288
+ sum_kq += in_cptr[i];
289
+ }
290
+
291
+ if (sum_kq == 0) {
292
+ return 1.0;
293
+ }
294
+
295
+ uint64_t sum_kqsq{0};
296
+ for (py::ssize_t i{0}; i != n_features; ++i) { // yes auto-vec by GCC
297
+ sum_kqsq += in_cptr[i] * in_cptr[i];
298
+ }
299
+ auto a = (sum_kqsq - sum_kq) / 2.0;
300
+ return a / ((a + (n_objects * sum_kq)) - sum_kqsq);
301
+ }
302
+
303
// Constraint: T must be uint64_t or uint8_t.
// Core Tanimoto kernel: for every row of the packed fp matrix `arr`,
// computes |row AND vec| / (|row| + |vec| - |row AND vec|) and stores it in
// `out`. `cardinalities[i]` must hold the precomputed popcount of row i and
// `vec_popcount` the popcount of `vec`. The caller guarantees that the
// buffers are valid when reinterpreted as T (alignment + size divisibility).
template <typename T>
void _calc_arr_vec_jt(const py::array_t<uint8_t>& arr,
                      const py::array_t<uint8_t>& vec,
                      const py::ssize_t n_samples, const py::ssize_t n_features,
                      const uint32_t vec_popcount,
                      const py::array_t<uint32_t>& cardinalities,
                      py::array_t<double>& out) {
    // Number of T-sized words per row
    const py::ssize_t steps = n_features / sizeof(T);
    auto arr_cptr = static_cast<const T*>(arr.request().ptr);
    auto vec_cptr = static_cast<const T*>(vec.request().ptr);
    auto card_cptr = cardinalities.data();
    auto out_ptr = out.mutable_data();

    for (py::ssize_t i{0}; i != n_samples; ++i) { // not auto-vec by GCC
        const T* arr_row_cptr = arr_cptr + i * steps;
        // Popcount of (row AND vec): bits set in both fingerprints
        uint32_t intersection{0};
        for (py::ssize_t j{0}; j != steps; ++j) { // not auto-vec by GCC
            // Compile-time dispatch to the matching-width popcount
            if constexpr (std::is_same_v<T, uint64_t>) {
                intersection += POPCOUNT_64(arr_row_cptr[j] & vec_cptr[j]);
            } else {
                intersection += POPCOUNT_32(arr_row_cptr[j] & vec_cptr[j]);
            }
        }
        // Tanimoto denominator (union size); clamped to 1.0 below so two
        // all-zero fingerprints yield 0 instead of dividing by zero
        auto denominator = card_cptr[i] + vec_popcount - intersection;
        // Cast is technically unnecessary since std::max promotes to double,
        // but added here for clarity (should compile to nop)
        out_ptr[i] =
            intersection / std::max(static_cast<double>(denominator), 1.0);
    }
}
334
+
335
+ // # NOTE: This function is the bottleneck for bb compute calculations
336
+ // In this function, _popcount_2d takes around ~25% of the time, _popcount_1d
337
+ // around 5%. The internal loop with the popcounts is also quite heavy.
338
+ // TODO: Investigate simple SIMD vectorization of these loops
339
+ // TODO: Does this function return a copy?
340
+ py::array_t<double> jt_sim_packed_precalc_cardinalities(
341
+ const py::array_t<uint8_t>& arr, const py::array_t<uint8_t>& vec,
342
+ const py::array_t<uint32_t>& cardinalities) {
343
+ py::ssize_t n_samples = arr.shape(0);
344
+ py::ssize_t n_features = arr.shape(1);
345
+ if (arr.ndim() != 2 || vec.ndim() != 1) {
346
+ throw std::runtime_error("arr must be 2D, vec must be 1D");
347
+ }
348
+ if (n_features != vec.shape(0)) {
349
+ throw std::runtime_error(
350
+ "Shapes should be (N, F) for arr and (F,) for vec");
351
+ }
352
+ auto out = py::array_t<double>(n_samples);
353
+
354
+ if (is_8byte_aligned(arr) and is_8byte_aligned(vec) and
355
+ (n_features % 64 == 0)) {
356
+ #ifdef DEBUG_LOGS
357
+ py::print("DEBUG: jt_sim_packed fn triggered uint64 + popcount 64");
358
+ #endif
359
+ // Aligned to 64-bit boundary, interpret as uint64_t
360
+ _calc_arr_vec_jt<uint64_t>(arr, vec, n_samples, n_features,
361
+ _popcount_1d(vec), cardinalities, out);
362
+ return out;
363
+ }
364
+
365
+ #ifdef DEBUG_LOGS
366
+ py::print("DEBUG: jt_sim_packed fn triggered uint8 + popcount 32");
367
+ #endif
368
+ // Misaligned, loop over bytes
369
+ _calc_arr_vec_jt<uint8_t>(arr, vec, n_samples, n_features,
370
+ _popcount_1d(vec), cardinalities, out);
371
+ return out;
372
+ }
373
+
374
+ py::array_t<double> _jt_sim_arr_vec_packed(const py::array_t<uint8_t>& arr,
375
+ const py::array_t<uint8_t>& vec) {
376
+ return jt_sim_packed_precalc_cardinalities(arr, vec, _popcount_2d(arr));
377
+ }
378
+
379
+ // NOTE: This is only *slightly* faster for C++ than numpy, **only if the
380
+ // array is uint8_t** if the array is uint64 already, it is slower
381
+ template <typename T>
382
+ py::array_t<uint64_t> add_rows(const CArrayForcecast<T>& arr) {
383
+ if (arr.ndim() != 2) {
384
+ throw std::runtime_error("Input array must be 2-dimensional");
385
+ }
386
+ auto arr_ptr = arr.data();
387
+ auto out = py::array_t<uint64_t>(arr.shape(1));
388
+ auto out_ptr = out.mutable_data();
389
+ std::memset(out_ptr, 0, out.nbytes());
390
+ py::ssize_t n_samples = arr.shape(0);
391
+ py::ssize_t n_features = arr.shape(1);
392
+ // Check GCC / CLang vectorize this
393
+ for (py::ssize_t i = 0; i < n_samples; ++i) {
394
+ const uint8_t* arr_row_ptr = arr_ptr + i * n_features;
395
+ for (py::ssize_t j = 0; j < n_features; ++j) {
396
+ out_ptr[j] += static_cast<uint64_t>(arr_row_ptr[j]);
397
+ }
398
+ }
399
+ return out;
400
+ }
401
+
402
+ double jt_isim_unpacked_u8(const CArrayForcecast<uint8_t>& arr) {
403
+ return jt_isim_from_sum(add_rows<uint8_t>(arr), arr.shape(0));
404
+ }
405
+
406
+ double jt_isim_packed_u8(
407
+ const CArrayForcecast<uint8_t>& arr,
408
+ std::optional<py::ssize_t> n_features_opt = std::nullopt) {
409
+ return jt_isim_from_sum(add_rows<uint8_t>(unpack_fingerprints(arr, n_features_opt)),
410
+ arr.shape(0));
411
+ }
412
+
413
// Finds two fingerprints in a packed fp array that are (heuristically) the
// most Tanimoto-dissimilar: fp1 is the fp least similar to the set centroid,
// and fp2 is the fp least similar to fp1.
// Returns (fp1_idx, fp2_idx, sims_to_fp1, sims_to_fp2).
// Throws std::runtime_error if the input is not 2-dimensional.
py::tuple jt_most_dissimilar_packed(
    CArrayForcecast<uint8_t> fps_packed,
    std::optional<py::ssize_t> n_features_opt = std::nullopt) {
    if (fps_packed.ndim() != 2) {
        throw std::runtime_error("Input array must be 2-dimensional");
    }
    py::ssize_t n_samples = fps_packed.shape(0);
    py::ssize_t n_features_packed = fps_packed.shape(1);

    auto fps_unpacked =
        _nochecks_unpack_fingerprints_2d(fps_packed, n_features_opt);
    py::ssize_t n_features_unpacked = fps_unpacked.shape(1);

    // Column-wise sum of the unpacked fps, used to build the centroid
    auto linear_sum = py::array_t<uint64_t>(n_features_unpacked);
    auto linear_sum_ptr = linear_sum.mutable_data();
    std::memset(linear_sum_ptr, 0, linear_sum.nbytes());

    // TODO: This sum could be vectorized manually or automatically
    auto fps_unpacked_ptr = fps_unpacked.data();
    for (py::ssize_t i{0}; i != n_samples; ++i) {
        const uint8_t* row_cptr = fps_unpacked_ptr + i * n_features_unpacked;
        for (py::ssize_t j{0}; j != n_features_unpacked;
             ++j) { // yes auto-vec by GCC (versioned due to possible alias)
            linear_sum_ptr[j] += row_cptr[j];
        }
    }

    // Majority-vote centroid, packed back into bit form
    auto centroid_packed =
        centroid_from_sum<uint64_t>(linear_sum, n_samples, true);
    // Per-row popcounts, reused across all three similarity passes below
    auto cardinalities = _popcount_2d(fps_packed);

    auto sims_cent = jt_sim_packed_precalc_cardinalities(
        fps_packed, centroid_packed, cardinalities);
    auto sims_cent_ptr = sims_cent.data();

    auto fps_packed_cptr = fps_packed.data();

    // argmin: fp1 is the fp least similar to the centroid
    py::ssize_t fp1_idx = std::distance(
        sims_cent_ptr,
        std::min_element(sims_cent_ptr, sims_cent_ptr + n_samples));
    // Copy row fp1_idx into its own 1D array
    auto fp1_packed = py::array_t<uint8_t>(
        n_features_packed, fps_packed_cptr + fp1_idx * n_features_packed);

    auto sims_fp1 = jt_sim_packed_precalc_cardinalities(fps_packed, fp1_packed,
                                                        cardinalities);
    auto sims_fp1_ptr = sims_fp1.data();

    // argmin: fp2 is the fp least similar to fp1
    py::ssize_t fp2_idx = std::distance(
        sims_fp1_ptr, std::min_element(sims_fp1_ptr, sims_fp1_ptr + n_samples));
    auto fp2_packed = py::array_t<uint8_t>(
        n_features_packed, fps_packed_cptr + fp2_idx * n_features_packed);

    auto sims_fp2 = jt_sim_packed_precalc_cardinalities(fps_packed, fp2_packed,
                                                        cardinalities);

    return py::make_tuple(fp1_idx, fp2_idx, sims_fp1, sims_fp2);
}
472
+
473
// Python bindings for the _cpp_similarity extension module.
PYBIND11_MODULE(_cpp_similarity, m) {
    m.doc() = "Optimized molecular similarity calculators (C++ extensions)";

    // Only bound for debugging purposes
    m.def("_nochecks_unpack_fingerprints_2d", &_nochecks_unpack_fingerprints_2d,
          "Unpack packed fingerprints", py::arg("a"),
          py::arg("n_features") = std::nullopt);
    m.def("_nochecks_unpack_fingerprints_1d", &_nochecks_unpack_fingerprints_1d,
          "Unpack packed fingerprints", py::arg("a"),
          py::arg("n_features") = std::nullopt);

    // NOTE: There are some gains from using this fn but only ~3%, so don't warn
    // for now if this fails, and don't expose it
    m.def("unpack_fingerprints", &unpack_fingerprints,
          "Unpack packed fingerprints", py::arg("a"),
          py::arg("n_features") = std::nullopt);

    // NOTE: pybind11's dynamic dispatch is *significantly* more
    // expensive than casting to uint64_t always
    // still this function is *barely* faster than python if no casts are
    // needed, and slightly slower if casts are needed so it is not useful
    // outside the C++ code, and it should not be exposed by default in any
    // module (only for internal use and debugging)
    m.def("centroid_from_sum", &centroid_from_sum<uint64_t>,
          "centroid calculation", py::arg("linear_sum"), py::arg("n_samples"),
          py::arg("pack") = true);

    // Internal helpers, exposed for testing/benchmarking
    m.def("_popcount_2d", &_popcount_2d, "2D popcount", py::arg("a"));
    m.def("_popcount_1d", &_popcount_1d, "1D popcount", py::arg("a"));
    m.def("add_rows", &add_rows<uint8_t>, "add_rows", py::arg("arr"));

    // API
    m.def("jt_isim_from_sum", &jt_isim_from_sum,
          "iSIM Tanimoto calculation from sum", py::arg("c_total"),
          py::arg("n_objects"));
    m.def("jt_isim_packed_u8", &jt_isim_packed_u8, "iSIM Tanimoto calculation",
          py::arg("arr"), py::arg("n_features") = std::nullopt);
    m.def("jt_isim_unpacked_u8", &jt_isim_unpacked_u8,
          "iSIM Tanimoto calculation", py::arg("arr"));

    m.def("_jt_sim_arr_vec_packed", &_jt_sim_arr_vec_packed,
          "Tanimoto similarity between a matrix of packed fps and a single "
          "packed fp",
          py::arg("arr"), py::arg("vec"));
    m.def("jt_most_dissimilar_packed", &jt_most_dissimilar_packed,
          "Finds two fps in a packed fp array that are the most "
          "Tanimoto-dissimilar",
          py::arg("Y"), py::arg("n_features") = std::nullopt);
}