arraykit 1.5.0__tar.gz → 1.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. {arraykit-1.5.0/arraykit.egg-info → arraykit-1.7.0}/PKG-INFO +14 -3
  2. {arraykit-1.5.0 → arraykit-1.7.0}/README.rst +13 -2
  3. arraykit-1.7.0/VERSION +2 -0
  4. {arraykit-1.5.0 → arraykit-1.7.0/arraykit.egg-info}/PKG-INFO +14 -3
  5. {arraykit-1.5.0 → arraykit-1.7.0}/arraykit.egg-info/SOURCES.txt +2 -0
  6. {arraykit-1.5.0 → arraykit-1.7.0}/src/__init__.py +2 -0
  7. {arraykit-1.5.0 → arraykit-1.7.0}/src/__init__.pyi +6 -0
  8. {arraykit-1.5.0 → arraykit-1.7.0}/src/_arraykit.c +8 -0
  9. {arraykit-1.5.0 → arraykit-1.7.0}/src/auto_map.c +400 -0
  10. {arraykit-1.5.0 → arraykit-1.7.0}/src/auto_map.h +2 -0
  11. {arraykit-1.5.0 → arraykit-1.7.0}/src/methods.c +143 -0
  12. {arraykit-1.5.0 → arraykit-1.7.0}/src/methods.h +3 -0
  13. arraykit-1.7.0/test/test_factorize.py +274 -0
  14. arraykit-1.7.0/test/test_group_ordering.py +231 -0
  15. arraykit-1.5.0/VERSION +0 -2
  16. {arraykit-1.5.0 → arraykit-1.7.0}/LICENSE.txt +0 -0
  17. {arraykit-1.5.0 → arraykit-1.7.0}/MANIFEST.in +0 -0
  18. {arraykit-1.5.0 → arraykit-1.7.0}/arraykit.egg-info/dependency_links.txt +0 -0
  19. {arraykit-1.5.0 → arraykit-1.7.0}/arraykit.egg-info/requires.txt +0 -0
  20. {arraykit-1.5.0 → arraykit-1.7.0}/arraykit.egg-info/top_level.txt +0 -0
  21. {arraykit-1.5.0 → arraykit-1.7.0}/pyproject.toml +0 -0
  22. {arraykit-1.5.0 → arraykit-1.7.0}/setup.cfg +0 -0
  23. {arraykit-1.5.0 → arraykit-1.7.0}/setup.py +0 -0
  24. {arraykit-1.5.0 → arraykit-1.7.0}/src/array_go.c +0 -0
  25. {arraykit-1.5.0 → arraykit-1.7.0}/src/array_go.h +0 -0
  26. {arraykit-1.5.0 → arraykit-1.7.0}/src/array_to_tuple.c +0 -0
  27. {arraykit-1.5.0 → arraykit-1.7.0}/src/array_to_tuple.h +0 -0
  28. {arraykit-1.5.0 → arraykit-1.7.0}/src/block_index.c +0 -0
  29. {arraykit-1.5.0 → arraykit-1.7.0}/src/block_index.h +0 -0
  30. {arraykit-1.5.0 → arraykit-1.7.0}/src/delimited_to_arrays.c +0 -0
  31. {arraykit-1.5.0 → arraykit-1.7.0}/src/delimited_to_arrays.h +0 -0
  32. {arraykit-1.5.0 → arraykit-1.7.0}/src/py.typed +0 -0
  33. {arraykit-1.5.0 → arraykit-1.7.0}/src/tri_map.c +0 -0
  34. {arraykit-1.5.0 → arraykit-1.7.0}/src/tri_map.h +0 -0
  35. {arraykit-1.5.0 → arraykit-1.7.0}/src/utilities.h +0 -0
  36. {arraykit-1.5.0 → arraykit-1.7.0}/test/test_array_go.py +0 -0
  37. {arraykit-1.5.0 → arraykit-1.7.0}/test/test_astype_array.py +0 -0
  38. {arraykit-1.5.0 → arraykit-1.7.0}/test/test_auto_map.py +0 -0
  39. {arraykit-1.5.0 → arraykit-1.7.0}/test/test_auto_map_property.py +0 -0
  40. {arraykit-1.5.0 → arraykit-1.7.0}/test/test_block_index.py +0 -0
  41. {arraykit-1.5.0 → arraykit-1.7.0}/test/test_delimited_to_arrays.py +0 -0
  42. {arraykit-1.5.0 → arraykit-1.7.0}/test/test_delimited_to_arrays_integration.py +0 -0
  43. {arraykit-1.5.0 → arraykit-1.7.0}/test/test_delimited_to_arrays_property.py +0 -0
  44. {arraykit-1.5.0 → arraykit-1.7.0}/test/test_nonzero_1d.py +0 -0
  45. {arraykit-1.5.0 → arraykit-1.7.0}/test/test_nonzero_1d_property.py +0 -0
  46. {arraykit-1.5.0 → arraykit-1.7.0}/test/test_objectable.py +0 -0
  47. {arraykit-1.5.0 → arraykit-1.7.0}/test/test_pyi.py +0 -0
  48. {arraykit-1.5.0 → arraykit-1.7.0}/test/test_split_after_count.py +0 -0
  49. {arraykit-1.5.0 → arraykit-1.7.0}/test/test_transition_slices_from_group.py +0 -0
  50. {arraykit-1.5.0 → arraykit-1.7.0}/test/test_tri_map.py +0 -0
  51. {arraykit-1.5.0 → arraykit-1.7.0}/test/test_type_discovery.py +0 -0
  52. {arraykit-1.5.0 → arraykit-1.7.0}/test/test_util.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arraykit
3
- Version: 1.5.0
3
+ Version: 1.7.0
4
4
  Summary: Array utilities for StaticFrame
5
5
  Author: Christopher Ariza, Brandt Bucher, Charles Burkland
6
6
  License: MIT
@@ -47,7 +47,7 @@ arraykit
47
47
 
48
48
  The ArrayKit library provides utilities for creating and transforming NumPy arrays, implementing performance-critical StaticFrame operations as Python C extensions.
49
49
 
50
- Code: https://github.com/InvestmentSystems/arraykit
50
+ Code: https://github.com/static-frame/arraykit
51
51
 
52
52
  Packages: https://pypi.org/project/arraykit
53
53
 
@@ -64,9 +64,20 @@ ArrayKit requires the following:
64
64
  What is New in ArrayKit
65
65
  -------------------------
66
66
 
67
+ 1.7.0
68
+ ............
69
+
70
+ Added ``group_ordering()``.
71
+
72
+
73
+ 1.6.0
74
+ ............
75
+ Added ``factorize()``.
76
+
77
+
67
78
  1.5.0
68
79
  ............
69
- Added ``transition_slices_from_group``.
80
+ Added ``transition_slices_from_group()``.
70
81
 
71
82
 
72
83
  1.4.0
@@ -18,7 +18,7 @@ arraykit
18
18
 
19
19
  The ArrayKit library provides utilities for creating and transforming NumPy arrays, implementing performance-critical StaticFrame operations as Python C extensions.
20
20
 
21
- Code: https://github.com/InvestmentSystems/arraykit
21
+ Code: https://github.com/static-frame/arraykit
22
22
 
23
23
  Packages: https://pypi.org/project/arraykit
24
24
 
@@ -35,9 +35,20 @@ ArrayKit requires the following:
35
35
  What is New in ArrayKit
36
36
  -------------------------
37
37
 
38
+ 1.7.0
39
+ ............
40
+
41
+ Added ``group_ordering()``.
42
+
43
+
44
+ 1.6.0
45
+ ............
46
+ Added ``factorize()``.
47
+
48
+
38
49
  1.5.0
39
50
  ............
40
- Added ``transition_slices_from_group``.
51
+ Added ``transition_slices_from_group()``.
41
52
 
42
53
 
43
54
  1.4.0
arraykit-1.7.0/VERSION ADDED
@@ -0,0 +1,2 @@
1
+ 1.7.0
2
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: arraykit
3
- Version: 1.5.0
3
+ Version: 1.7.0
4
4
  Summary: Array utilities for StaticFrame
5
5
  Author: Christopher Ariza, Brandt Bucher, Charles Burkland
6
6
  License: MIT
@@ -47,7 +47,7 @@ arraykit
47
47
 
48
48
  The ArrayKit library provides utilities for creating and transforming NumPy arrays, implementing performance-critical StaticFrame operations as Python C extensions.
49
49
 
50
- Code: https://github.com/InvestmentSystems/arraykit
50
+ Code: https://github.com/static-frame/arraykit
51
51
 
52
52
  Packages: https://pypi.org/project/arraykit
53
53
 
@@ -64,9 +64,20 @@ ArrayKit requires the following:
64
64
  What is New in ArrayKit
65
65
  -------------------------
66
66
 
67
+ 1.7.0
68
+ ............
69
+
70
+ Added ``group_ordering()``.
71
+
72
+
73
+ 1.6.0
74
+ ............
75
+ Added ``factorize()``.
76
+
77
+
67
78
  1.5.0
68
79
  ............
69
- Added ``transition_slices_from_group``.
80
+ Added ``transition_slices_from_group()``.
70
81
 
71
82
 
72
83
  1.4.0
@@ -36,6 +36,8 @@ test/test_block_index.py
36
36
  test/test_delimited_to_arrays.py
37
37
  test/test_delimited_to_arrays_integration.py
38
38
  test/test_delimited_to_arrays_property.py
39
+ test/test_factorize.py
40
+ test/test_group_ordering.py
39
41
  test/test_nonzero_1d.py
40
42
  test/test_nonzero_1d_property.py
41
43
  test/test_objectable.py
@@ -25,6 +25,8 @@ from ._arraykit import iterable_str_to_array_1d as iterable_str_to_array_1d
25
25
  from ._arraykit import split_after_count as split_after_count
26
26
  from ._arraykit import get_new_indexers_and_screen as get_new_indexers_and_screen
27
27
  from ._arraykit import write_array_to_file as write_array_to_file
28
+ from ._arraykit import factorize as factorize
29
+ from ._arraykit import group_ordering as group_ordering
28
30
  from ._arraykit import count_iteration as count_iteration
29
31
  from ._arraykit import first_true_1d as first_true_1d
30
32
  from ._arraykit import first_true_2d as first_true_2d
@@ -224,6 +224,12 @@ def write_array_to_file(
224
224
  fortran_order: bool = False,
225
225
  buffersize: int = 8192,
226
226
  ) -> None: ...
227
+ def factorize(
228
+ array: np.ndarray, *, sort: bool = ...
229
+ ) -> tp.Tuple[np.ndarray, np.ndarray]: ...  # returns (uniques, codes); `sort` is keyword-only
230
+ def group_ordering(
231
+ codes: np.ndarray, *, size: tp.Optional[int] = ...
232
+ ) -> tp.Tuple[np.ndarray, np.ndarray]: ...  # NOTE(review): implementation is not visible in this view; confirm what each returned array holds
227
233
  def first_true_1d(__array: np.ndarray, *, forward: bool) -> int: ...
228
234
  def first_true_2d(__array: np.ndarray, *, forward: bool, axis: int) -> np.ndarray: ...
229
235
  def nonzero_1d(__array: np.ndarray, /) -> np.ndarray: ...
@@ -70,6 +70,14 @@ static PyMethodDef arraykit_methods[] = {
70
70
  (PyCFunction)write_array_to_file,
71
71
  METH_VARARGS | METH_KEYWORDS,
72
72
  NULL},
73
+ {"factorize",
74
+ (PyCFunction)factorize,
75
+ METH_VARARGS | METH_KEYWORDS,
76
+ NULL},
77
+ {"group_ordering",
78
+ (PyCFunction)group_ordering,
79
+ METH_VARARGS | METH_KEYWORDS,
80
+ NULL},
73
81
  {NULL},
74
82
  };
75
83
 
@@ -2520,6 +2520,406 @@ error:
2520
2520
  # undef INSERT_FLEXIBLE
2521
2521
 
2522
2522
 
2523
+ //------------------------------------------------------------------------------
2524
+ // factorize
2525
+
2526
+ // A fast float hash for factorize only. Unlike double_to_hash (CPython-
2527
+ // compatible, so hash(1.0)==hash(1), via frexp + a loop), factorize never needs
2528
+ // cross-type hashing -- it only compares floats to floats from the same array by
2529
+ // `==`. So we canonicalize -0.0 to +0.0 (they compare equal, must hash equal),
2530
+ // reinterpret the bits, and apply the MurmurHash3 64-bit finalizer (fmix64) for good avalanche.
2531
+ // NaN never reaches here (handled before probing); +/-inf hash distinctly, which
2532
+ // is correct since they are distinct values. lookup_hash_double takes the hash as
2533
+ // a parameter, so this stays self-consistent within factorize's scratch table.
2534
+ static inline Py_hash_t
2535
+ factorize_double_to_hash(double v)
2536
+ {
2537
+ if (v == 0.0) {
2538
+ v = 0.0; // collapse -0.0 to +0.0
2539
+ }
2540
+ npy_uint64 x;
2541
+ memcpy(&x, &v, sizeof(x)); // bit-copy instead of a pointer cast; assumes double is 64 bits wide
2542
+ x ^= x >> 33; // fmix64: fold the high bits into the low bits
2543
+ x *= 0xff51afd7ed558ccdULL; // fmix64 first multiplier
2544
+ x ^= x >> 33;
2545
+ x *= 0xc4ceb9fe1a85ec53ULL; // fmix64 second multiplier
2546
+ x ^= x >> 33;
2547
+ Py_hash_t h = (Py_hash_t)x;
2548
+ return h == -1 ? -2 : h; // -1 marks an empty slot
2549
+ }
2550
+
2551
+ // Detect a Python-level float NaN (a Python float or a NumPy Half/Float32/Float64 scalar) so
2552
+ // that object-dtype arrays collapse all NaN into a single code, matching the
2553
+ // float-dtype behavior. Only real floats are treated as NaN here; None, NaT, and
2554
+ // complex NaN remain ordinary distinct keys. Returns nonzero for float NaN, else 0.
2555
+ static inline int
2556
+ factorize_obj_is_float_nan(PyObject* key)
2557
+ {
2558
+ if (PyFloat_Check(key)) {
2559
+ return isnan(PyFloat_AS_DOUBLE(key));
2560
+ }
2561
+ if (PyArray_IsScalar(key, Half)) {
2562
+ return npy_half_isnan(PyArrayScalar_VAL(key, Half)); // half floats need the npy_half helper rather than isnan()
2563
+ }
2564
+ if (PyArray_IsScalar(key, Float32)) {
2565
+ return isnan(PyArrayScalar_VAL(key, Float32));
2566
+ }
2567
+ if (PyArray_IsScalar(key, Float64)) {
2568
+ return isnan(PyArrayScalar_VAL(key, Float64));
2569
+ }
2570
+ return 0; // any other object is never reported as NaN
2571
+ }
2572
+
2573
+ // Given a probe result `pos_expr` and its `hash_val` for the element at index
2574
+ // `i`, either assign a new sequential code (empty slot) or reuse the code of the
2575
+ // already-seen key (occupied slot). Stores the first-occurrence input index in
2576
+ // the table so the reused lookup_hash_* probes compare against the right value.
2577
+ // Requires `scratch`, `codes`, `code_of_index`, `first_index`, `k`, `i` in scope, plus a `fail` label that is jumped to when the probe result is negative.
2578
+ # define FACTORIZE_RECORD(pos_expr, hash_val) \
2579
+ { \
2580
+ Py_ssize_t _pos = (pos_expr); /* evaluate the probe expression exactly once */ \
2581
+ if (_pos < 0) { /* negative position is treated as a failed lookup */ \
2582
+ goto fail; \
2583
+ } \
2584
+ if (scratch.table[_pos].hash == -1) { /* empty slot: first occurrence of this key */ \
2585
+ scratch.table[_pos].keys_pos = i; \
2586
+ scratch.table[_pos].hash = (hash_val); \
2587
+ first_index[k] = i; \
2588
+ code_of_index[i] = k; \
2589
+ codes[i] = k; \
2590
+ k++; \
2591
+ } \
2592
+ else { /* occupied slot: reuse the code recorded for the first occurrence */ \
2593
+ codes[i] = code_of_index[scratch.table[_pos].keys_pos]; \
2594
+ } \
2595
+ } \
2596
+
2597
+ // Integer/unsigned scalar factorize loop. When the input is C-contiguous, walk
2598
+ // a typed pointer (`*b` / `b++`, compile-time itemsize) instead of PyArray_GETPTR1's
2599
+ // runtime stride multiply -- materially faster, matching INSERT_SCALARS.
2600
+ # define FACTORIZE_SCALAR_LOOP(npy_type, value_t, hash_func, lookup_func, kat_lookup) \
2601
+ { \
2602
+ if (contiguous) { \
2603
+ const npy_type* b = (const npy_type*)PyArray_DATA(a); \
2604
+ const npy_type* b_end = b + n; \
2605
+ npy_intp i = 0; /* element index, advanced in step with b; FACTORIZE_RECORD reads it */ \
2606
+ while (b < b_end) { \
2607
+ value_t v = (value_t)*b; /* widen to the type the hash and lookup functions take */ \
2608
+ Py_hash_t hash = hash_func(v); \
2609
+ FACTORIZE_RECORD(lookup_func(&scratch, v, hash, kat_lookup), hash); \
2610
+ b++; \
2611
+ i++; \
2612
+ } \
2613
+ } \
2614
+ else { /* non-contiguous input: address each element through PyArray_GETPTR1 */ \
2615
+ for (npy_intp i = 0; i < n; i++) { \
2616
+ value_t v = (value_t)*(const npy_type*)PyArray_GETPTR1(a, i); \
2617
+ Py_hash_t hash = hash_func(v); \
2618
+ FACTORIZE_RECORD(lookup_func(&scratch, v, hash, kat_lookup), hash); \
2619
+ } \
2620
+ } \
2621
+ } \
2622
+
2623
+ # define FACTORIZE_INT(npy_type, kat_lookup) /* signed loop: each element is cast to npy_int64 before hashing and lookup */ \
2624
+ FACTORIZE_SCALAR_LOOP(npy_type, npy_int64, int_to_hash, lookup_hash_int, kat_lookup)
2625
+
2626
+ # define FACTORIZE_UINT(npy_type, kat_lookup) /* unsigned loop: each element is cast to npy_uint64 before hashing and lookup */ \
2627
+ FACTORIZE_SCALAR_LOOP(npy_type, npy_uint64, uint_to_hash, lookup_hash_uint, kat_lookup)
2628
+
2629
+ // Per-element float body. All NaN collapse to one code and never enter the table
2630
+ // (the map compares with `==`, so NaN would otherwise never match itself);
2631
+ // +0.0/-0.0 and inf are handled correctly by the normal path. Requires `nan_code`, `first_index`, `codes`, `k`, `i` in scope, plus everything FACTORIZE_RECORD needs.
2632
+ # define FACTORIZE_FLOAT_ELEM(value_expr, kat_lookup) \
2633
+ { \
2634
+ npy_double v = (value_expr); /* every float width is handled as a double from here on */ \
2635
+ if (v != v) { /* self-inequality holds only for NaN */ \
2636
+ if (nan_code < 0) { /* first NaN seen: allocate its code */ \
2637
+ nan_code = k; \
2638
+ first_index[k] = i; \
2639
+ codes[i] = k; \
2640
+ k++; \
2641
+ } \
2642
+ else { \
2643
+ codes[i] = nan_code; \
2644
+ } \
2645
+ } \
2646
+ else { \
2647
+ Py_hash_t hash = factorize_double_to_hash(v); \
2648
+ FACTORIZE_RECORD(lookup_hash_double(&scratch, v, hash, kat_lookup), hash); \
2649
+ } \
2650
+ } \
2651
+
2652
+ # define FACTORIZE_FLOAT(npy_type, kat_lookup, post_deref) /* float loop; post_deref wraps each loaded element (a conversion function, or empty for none) */ \
2653
+ { \
2654
+ if (contiguous) { \
2655
+ const npy_type* b = (const npy_type*)PyArray_DATA(a); \
2656
+ const npy_type* b_end = b + n; \
2657
+ npy_intp i = 0; /* element index, advanced in step with b; the element macro reads it */ \
2658
+ while (b < b_end) { \
2659
+ FACTORIZE_FLOAT_ELEM(post_deref(*b), kat_lookup) \
2660
+ b++; \
2661
+ i++; \
2662
+ } \
2663
+ } \
2664
+ else { /* non-contiguous input: address each element through PyArray_GETPTR1 */ \
2665
+ for (npy_intp i = 0; i < n; i++) { \
2666
+ FACTORIZE_FLOAT_ELEM(post_deref(*(const npy_type*)PyArray_GETPTR1(a, i)), kat_lookup) \
2667
+ } \
2668
+ } \
2669
+ } \
2670
+
2671
+ // Flexible (unicode/string) loop. In the contiguous case step a running pointer
2672
+ // by dt_size (incremental add, no per-element multiply), matching INSERT_FLEXIBLE.
2673
+ # define FACTORIZE_FLEXIBLE(char_type, lookup_func, hash_func, get_end_func, dt_size_expr) \
2674
+ { \
2675
+ Py_ssize_t dt_size = (dt_size_expr); /* per-element width, counted in char_type units */ \
2676
+ if (contiguous) { \
2677
+ char_type* v = (char_type*)PyArray_DATA(a); \
2678
+ for (npy_intp i = 0; i < n; i++) { \
2679
+ Py_ssize_t ksize = get_end_func(v, dt_size) - v; /* distance from v to the pointer get_end_func returns */ \
2680
+ Py_hash_t hash = hash_func(v, ksize); \
2681
+ FACTORIZE_RECORD(lookup_func(&scratch, v, ksize, hash), hash); \
2682
+ v += dt_size; \
2683
+ } \
2684
+ } \
2685
+ else { /* non-contiguous input: address each element through PyArray_GETPTR1 */ \
2686
+ for (npy_intp i = 0; i < n; i++) { \
2687
+ char_type* v = (char_type*)PyArray_GETPTR1(a, i); \
2688
+ Py_ssize_t ksize = get_end_func(v, dt_size) - v; \
2689
+ Py_hash_t hash = hash_func(v, ksize); \
2690
+ FACTORIZE_RECORD(lookup_func(&scratch, v, ksize, hash), hash); \
2691
+ } \
2692
+ } \
2693
+ } \
2694
+
2695
+ // Hash-based factorize: return (uniques, codes) such that
2696
+ // array[i] == uniques[codes[i]] (all float NaN share a single code), in O(n) for the hashing pass plus an argsort of the uniques when `sort` is true, reusing the AutoMap hash table. Both returned arrays are made read-only; returns NULL with an exception set on failure.
2697
+ PyObject *
2698
+ factorize(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
2699
+ {
2700
+ static char *kwlist[] = {"array", "sort", NULL};
2701
+ PyObject *array_obj = NULL;
2702
+ int sort = 0; // default: codes follow first-occurrence order
2703
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs,
2704
+ "O|$p:factorize", kwlist, // one object, then an optional keyword-only truth-value flag
2705
+ &array_obj,
2706
+ &sort)) {
2707
+ return NULL;
2708
+ }
2709
+ if (!PyArray_Check(array_obj)) {
2710
+ PyErr_Format(PyExc_TypeError,
2711
+ "Expected a NumPy array, not %s.", Py_TYPE(array_obj)->tp_name);
2712
+ return NULL;
2713
+ }
2714
+ PyArrayObject *a = (PyArrayObject *)array_obj;
2715
+ if (PyArray_NDIM(a) != 1) {
2716
+ PyErr_SetString(PyExc_TypeError, "Array must be 1-dimensional");
2717
+ return NULL;
2718
+ }
2719
+
2720
+ npy_intp n = PyArray_SIZE(a);
2721
+ int array_t = PyArray_TYPE(a);
2722
+ KeysArrayType kat = at_to_kat(array_t, a); // a falsy result selects the list-based path below
2723
+
2724
+ PyObject *codes_arr = NULL;
2725
+ PyObject *idx_arr = NULL;
2726
+ PyObject *uniques = NULL;
2727
+ PyObject *list = NULL; // object/KAT_LIST path only
2728
+ npy_intp *code_of_index = NULL; // input index -> code
2729
+ npy_intp *first_index = NULL; // code -> first-occurrence input index
2730
+ npy_intp *rank = NULL; // sort remap
2731
+ Py_ssize_t k = 0; // running distinct count
2732
+ Py_ssize_t nan_code = -1; // shared code for all NaN, -1 until seen
2733
+
2734
+ // A private scratch FAMObject held on the stack (its `table` is released with PyMem_Free below); never exposed to Python. Only
2735
+ // the fields read by grow_table / lookup_hash_* are set.
2736
+ FAMObject scratch;
2737
+ scratch.table = NULL;
2738
+ scratch.table_size = 0;
2739
+ scratch.keys = NULL;
2740
+ scratch.keys_array_type = kat;
2741
+ scratch.keys_size = n;
2742
+ scratch.key_buffer = NULL; // never needed by the array lookups
2743
+
2744
+ codes_arr = PyArray_EMPTY(1, &n, NPY_INTP, 0); // one code per input element
2745
+ if (!codes_arr) {
2746
+ goto fail;
2747
+ }
2748
+ npy_intp *codes = (npy_intp*)PyArray_DATA((PyArrayObject*)codes_arr);
2749
+
2750
+ if (n > 0) { // empty input leaves both side arrays NULL; the loops below then never run
2751
+ code_of_index = PyMem_New(npy_intp, n);
2752
+ first_index = PyMem_New(npy_intp, n);
2753
+ if (!code_of_index || !first_index) {
2754
+ PyErr_NoMemory();
2755
+ goto fail;
2756
+ }
2757
+ }
2758
+
2759
+ // The lookups dereference scratch.keys at stored input indices. For a usable
2760
+ // array KAT that is the (borrowed) input array; otherwise build a list.
2761
+ if (kat) {
2762
+ scratch.keys = array_obj;
2763
+ }
2764
+ else {
2765
+ if (array_t == NPY_DATETIME || array_t == NPY_TIMEDELTA) {
2766
+ list = PySequence_List(array_obj); // NOTE(review): presumably keeps NumPy scalar elements instead of tolist()'s converted values -- confirm
2767
+ }
2768
+ else {
2769
+ list = PyArray_ToList(a);
2770
+ }
2771
+ if (!list) {
2772
+ goto fail;
2773
+ }
2774
+ scratch.keys = list;
2775
+ }
2776
+
2777
+ if (grow_table(&scratch, n)) { // a nonzero return is treated as failure
2778
+ goto fail;
2779
+ }
2780
+
2781
+ // Enables the typed-pointer fast path in the scalar/flexible loops below.
2782
+ int contiguous = PyArray_IS_C_CONTIGUOUS(a);
2783
+
2784
+ switch (kat) {
2785
+ case KAT_INT8: FACTORIZE_INT(npy_int8, KAT_INT8); break;
2786
+ case KAT_INT16: FACTORIZE_INT(npy_int16, KAT_INT16); break;
2787
+ case KAT_INT32: FACTORIZE_INT(npy_int32, KAT_INT32); break;
2788
+ case KAT_INT64: FACTORIZE_INT(npy_int64, KAT_INT64); break;
2789
+ case KAT_UINT8: FACTORIZE_UINT(npy_uint8, KAT_UINT8); break;
2790
+ case KAT_UINT16: FACTORIZE_UINT(npy_uint16, KAT_UINT16); break;
2791
+ case KAT_UINT32: FACTORIZE_UINT(npy_uint32, KAT_UINT32); break;
2792
+ case KAT_UINT64: FACTORIZE_UINT(npy_uint64, KAT_UINT64); break;
2793
+ case KAT_FLOAT16: FACTORIZE_FLOAT(npy_half, KAT_FLOAT16, npy_half_to_double); break;
2794
+ case KAT_FLOAT32: FACTORIZE_FLOAT(npy_float, KAT_FLOAT32, ); break;
2795
+ case KAT_FLOAT64: FACTORIZE_FLOAT(npy_double, KAT_FLOAT64, ); break;
2796
+ case KAT_UNICODE:
2797
+ FACTORIZE_FLEXIBLE(Py_UCS4, lookup_hash_unicode, unicode_to_hash,
2798
+ ucs4_get_end_p, PyArray_ITEMSIZE(a) / UCS4_SIZE);
2799
+ break;
2800
+ case KAT_STRING:
2801
+ FACTORIZE_FLEXIBLE(char, lookup_hash_string, string_to_hash,
2802
+ char_get_end_p, PyArray_ITEMSIZE(a));
2803
+ break;
2804
+ case KAT_DTY:
2805
+ case KAT_DTM:
2806
+ case KAT_DTW:
2807
+ case KAT_DTD:
2808
+ case KAT_DTh:
2809
+ case KAT_DTm:
2810
+ case KAT_DTs:
2811
+ case KAT_DTms:
2812
+ case KAT_DTus:
2813
+ case KAT_DTns:
2814
+ case KAT_DTps:
2815
+ case KAT_DTfs:
2816
+ case KAT_DTas:
2817
+ // datetime64/timedelta64 store an int64; NaT (INT64_MIN) compares
2818
+ // equal to itself here, so all NaT collapse into one code naturally.
2819
+ FACTORIZE_INT(npy_int64, KAT_INT64);
2820
+ break;
2821
+ default: { // KAT_LIST: object dtype, complex, dt64 without a unit
2822
+ for (npy_intp i = 0; i < n; i++) {
2823
+ PyObject* key = PyList_GET_ITEM(list, i); // borrowed
2824
+ if (factorize_obj_is_float_nan(key)) {
2825
+ if (nan_code < 0) {
2826
+ nan_code = k;
2827
+ first_index[k] = i;
2828
+ codes[i] = k;
2829
+ k++;
2830
+ }
2831
+ else {
2832
+ codes[i] = nan_code;
2833
+ }
2834
+ continue; // float NaN never goes through the hash table
2835
+ }
2836
+ Py_hash_t hash = PyObject_Hash(key);
2837
+ if (hash == -1) { // hashing failed (e.g. an unhashable element)
2838
+ goto fail;
2839
+ }
2840
+ FACTORIZE_RECORD(lookup_hash_obj(&scratch, key, hash), hash);
2841
+ }
2842
+ break;
2843
+ }
2844
+ }
2845
+
2846
+ // uniques: take the first-occurrence values out of the input, same dtype.
2847
+ idx_arr = PyArray_EMPTY(1, &k, NPY_INTP, 0); // NOTE(review): passes &k (Py_ssize_t*) as the dims pointer -- assumes Py_ssize_t matches npy_intp
2848
+ if (!idx_arr) {
2849
+ goto fail;
2850
+ }
2851
+ if (k > 0) {
2852
+ memcpy(PyArray_DATA((PyArrayObject*)idx_arr), first_index, k * sizeof(npy_intp));
2853
+ }
2854
+ uniques = PyArray_TakeFrom(a, idx_arr, 0, NULL, NPY_RAISE);
2855
+ if (!uniques) {
2856
+ goto fail;
2857
+ }
2858
+
2859
+ if (sort && k > 1) { // zero or one unique value is already in sorted order
2860
+ PyObject* order = PyArray_ArgSort((PyArrayObject*)uniques, 0, NPY_QUICKSORT);
2861
+ if (!order) {
2862
+ goto fail;
2863
+ }
2864
+ npy_intp* order_data = (npy_intp*)PyArray_DATA((PyArrayObject*)order);
2865
+ rank = PyMem_New(npy_intp, k);
2866
+ if (!rank) {
2867
+ Py_DECREF(order);
2868
+ PyErr_NoMemory();
2869
+ goto fail;
2870
+ }
2871
+ for (npy_intp j = 0; j < k; j++) {
2872
+ rank[order_data[j]] = j; // invert the permutation: old code -> sorted position
2873
+ }
2874
+ for (npy_intp i = 0; i < n; i++) {
2875
+ codes[i] = rank[codes[i]]; // rewrite each code in place
2876
+ }
2877
+ PyObject* uniques_sorted = PyArray_TakeFrom(
2878
+ (PyArrayObject*)uniques, order, 0, NULL, NPY_RAISE);
2879
+ Py_DECREF(order);
2880
+ if (!uniques_sorted) {
2881
+ goto fail;
2882
+ }
2883
+ Py_SETREF(uniques, uniques_sorted);
2884
+ }
2885
+
2886
+ PyArray_CLEARFLAGS((PyArrayObject*)codes_arr, NPY_ARRAY_WRITEABLE); // hand back read-only arrays
2887
+ PyArray_CLEARFLAGS((PyArrayObject*)uniques, NPY_ARRAY_WRITEABLE);
2888
+
2889
+ PyMem_Free(scratch.table);
2890
+ PyMem_Free(code_of_index);
2891
+ PyMem_Free(first_index);
2892
+ PyMem_Free(rank);
2893
+ Py_DECREF(idx_arr);
2894
+ Py_XDECREF(list);
2895
+
2896
+ PyObject* result = PyTuple_Pack(2, uniques, codes_arr); // may be NULL; it is returned unchanged after the local references are dropped
2897
+ Py_DECREF(uniques);
2898
+ Py_DECREF(codes_arr);
2899
+ return result;
2900
+
2901
+ fail: // shared error exit: releases whatever was acquired so far and returns NULL
2902
+ PyMem_Free(scratch.table);
2903
+ PyMem_Free(code_of_index);
2904
+ PyMem_Free(first_index);
2905
+ PyMem_Free(rank);
2906
+ Py_XDECREF(idx_arr);
2907
+ Py_XDECREF(list);
2908
+ Py_XDECREF(codes_arr);
2909
+ Py_XDECREF(uniques);
2910
+ return NULL;
2911
+ }
2912
+
2913
+ # undef FACTORIZE_RECORD
2914
+ # undef FACTORIZE_SCALAR_LOOP
2915
+ # undef FACTORIZE_INT
2916
+ # undef FACTORIZE_UINT
2917
+ # undef FACTORIZE_FLOAT_ELEM
2918
+ # undef FACTORIZE_FLOAT
2919
+ # undef FACTORIZE_FLEXIBLE
2920
+
2921
+ //------------------------------------------------------------------------------
2922
+
2523
2923
  static PyObject *
2524
2924
  fam_repr(FAMObject *self)
2525
2925
  {
@@ -10,5 +10,7 @@ extern PyTypeObject FAMVType;
10
10
  extern PyTypeObject FAMType;
11
11
  extern PyObject *NonUniqueError;
12
12
 
13
+ PyObject *factorize(PyObject *m, PyObject *args, PyObject *kwargs); // module-level function taking args and kwargs; returns a (uniques, codes) tuple, or NULL on failure
14
+
13
15
 
14
16
  # endif /* ARRAYKIT_SRC_AUTO_MAP_H_ */