PyPI - arraykit - Versions diffs - 1.6.0__tar.gz → 1.7.0__tar.gz - Mend

arraykit 1.6.0.tar.gz → 1.7.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52)

{arraykit-1.6.0/arraykit.egg-info → arraykit-1.7.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: arraykit
-Version: 1.6.0
+Version: 1.7.0
 Summary: Array utilities for StaticFrame
 Author: Christopher Ariza, Brandt Bucher, Charles Burkland
 License: MIT
@@ -64,14 +64,20 @@ ArrayKit requires the following:
 What is New in ArrayKit
 -------------------------
+1.7.0
+............
+Added ``group_ordering()``.
 1.6.0
 ............
-Added ``factorize``.
+Added ``factorize()``.
 1.5.0
 ............
-Added ``transition_slices_from_group``.
+Added ``transition_slices_from_group()``.
 1.4.0

{arraykit-1.6.0 → arraykit-1.7.0}/README.rst RENAMED Viewed

@@ -35,14 +35,20 @@ ArrayKit requires the following:
 What is New in ArrayKit
 -------------------------
+1.7.0
+............
+Added ``group_ordering()``.
 1.6.0
 ............
-Added ``factorize``.
+Added ``factorize()``.
 1.5.0
 ............
-Added ``transition_slices_from_group``.
+Added ``transition_slices_from_group()``.
 1.4.0

arraykit-1.7.0/VERSION ADDED Viewed

@@ -0,0 +1,2 @@
+1.7.0
+

{arraykit-1.6.0 → arraykit-1.7.0/arraykit.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: arraykit
-Version: 1.6.0
+Version: 1.7.0
 Summary: Array utilities for StaticFrame
 Author: Christopher Ariza, Brandt Bucher, Charles Burkland
 License: MIT
@@ -64,14 +64,20 @@ ArrayKit requires the following:
 What is New in ArrayKit
 -------------------------
+1.7.0
+............
+Added ``group_ordering()``.
 1.6.0
 ............
-Added ``factorize``.
+Added ``factorize()``.
 1.5.0
 ............
-Added ``transition_slices_from_group``.
+Added ``transition_slices_from_group()``.
 1.4.0

{arraykit-1.6.0 → arraykit-1.7.0}/arraykit.egg-info/SOURCES.txt RENAMED Viewed

@@ -37,6 +37,7 @@ test/test_delimited_to_arrays.py
 test/test_delimited_to_arrays_integration.py
 test/test_delimited_to_arrays_property.py
 test/test_factorize.py
+test/test_group_ordering.py
 test/test_nonzero_1d.py
 test/test_nonzero_1d_property.py
 test/test_objectable.py

{arraykit-1.6.0 → arraykit-1.7.0}/src/__init__.py RENAMED Viewed

@@ -26,6 +26,7 @@ from ._arraykit import split_after_count as split_after_count
 from ._arraykit import get_new_indexers_and_screen as get_new_indexers_and_screen
 from ._arraykit import write_array_to_file as write_array_to_file
 from ._arraykit import factorize as factorize
+from ._arraykit import group_ordering as group_ordering
 from ._arraykit import count_iteration as count_iteration
 from ._arraykit import first_true_1d as first_true_1d
 from ._arraykit import first_true_2d as first_true_2d

{arraykit-1.6.0 → arraykit-1.7.0}/src/__init__.pyi RENAMED Viewed

@@ -227,6 +227,9 @@ def write_array_to_file(
 def factorize(
     array: np.ndarray, *, sort: bool = ...
 ) -> tp.Tuple[np.ndarray, np.ndarray]: ...
+def group_ordering(
+    codes: np.ndarray, *, size: tp.Optional[int] = ...
+) -> tp.Tuple[np.ndarray, np.ndarray]: ...
 def first_true_1d(__array: np.ndarray, *, forward: bool) -> int: ...
 def first_true_2d(__array: np.ndarray, *, forward: bool, axis: int) -> np.ndarray: ...
 def nonzero_1d(__array: np.ndarray, /) -> np.ndarray: ...

{arraykit-1.6.0 → arraykit-1.7.0}/src/_arraykit.c RENAMED Viewed

@@ -74,6 +74,10 @@ static PyMethodDef arraykit_methods[] =  {
             (PyCFunction)factorize,
             METH_VARARGS | METH_KEYWORDS,
             NULL},
+    {"group_ordering",
+            (PyCFunction)group_ordering,
+            METH_VARARGS | METH_KEYWORDS,
+            NULL},
     {NULL},
 };

{arraykit-1.6.0 → arraykit-1.7.0}/src/methods.c RENAMED Viewed

@@ -985,6 +985,149 @@ first_true_2d(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
     return (PyObject *)array_pos;
 }
+static char *group_ordering_kwarg_names[] = {
+    "codes",
+    "size",
+    NULL
+};
+// Stable counting sort of dense factorize codes. Given `codes` in [0, size),
+// return (permutation, offsets) such that permutation[offsets[g]:offsets[g+1]]
+// are the input positions of group g, in ascending (stable) order. This is an
+// O(n) alternative to np.argsort(codes, kind='stable') for already-dense codes.
+PyObject *
+group_ordering(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
+{
+    PyArrayObject *codes = NULL;
+    PyObject *size_obj = NULL;
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs,
+            "O!|$O:group_ordering",
+            group_ordering_kwarg_names,
+            &PyArray_Type, &codes,
+            &size_obj
+            )) {
+        return NULL;
+    }
+    if (PyArray_NDIM(codes) != 1) {
+        PyErr_SetString(PyExc_ValueError, "Array must be 1-dimensional");
+        return NULL;
+    }
+    if (PyArray_TYPE(codes) != NPY_INTP) {
+        PyErr_SetString(PyExc_ValueError, "Array must be of type intp");
+        return NULL;
+    }
+    if (!PyArray_IS_C_CONTIGUOUS(codes)) {
+        PyErr_SetString(PyExc_ValueError, "Array must be contiguous");
+        return NULL;
+    }
+    npy_intp n = PyArray_SIZE(codes);
+    npy_intp *codes_buffer = (npy_intp*)PyArray_DATA(codes);
+    // Determine the number of groups: caller-provided, else max(codes) + 1.
+    npy_intp size = 0;
+    int size_given = (size_obj != NULL && size_obj != Py_None);
+    if (size_given) {
+        size = (npy_intp)PyNumber_AsSsize_t(size_obj, PyExc_OverflowError);
+        if (size == -1 && PyErr_Occurred()) {
+            return NULL;
+        }
+        if (size < 0) {
+            PyErr_SetString(PyExc_ValueError, "size must be non-negative");
+            return NULL;
+        }
+    }
+    else {
+        for (npy_intp i = 0; i < n; i++) {
+            npy_intp c = codes_buffer[i];
+            if (c < 0) {
+                PyErr_SetString(PyExc_ValueError, "codes must be non-negative");
+                return NULL;
+            }
+            // guard c + 1 against signed overflow (undefined behavior)
+            if (c == NPY_MAX_INTP) {
+                PyErr_SetString(PyExc_OverflowError,
+                        "cannot infer size: code value too large");
+                return NULL;
+            }
+            if (c + 1 > size) {
+                size = c + 1;
+            }
+        }
+    }
+    // offsets has length size + 1; guard that against signed overflow (covers
+    // both a caller-provided size and an inferred one)
+    if (size == NPY_MAX_INTP) {
+        PyErr_SetString(PyExc_OverflowError, "size too large");
+        return NULL;
+    }
+    PyObject *perm_arr = NULL;
+    PyObject *offsets_arr = NULL;
+    npy_intp *cursor = NULL;
+    perm_arr = PyArray_EMPTY(1, &n, NPY_INTP, 0);
+    if (!perm_arr) {
+        goto fail;
+    }
+    npy_intp size_plus = size + 1;
+    offsets_arr = PyArray_ZEROS(1, &size_plus, NPY_INTP, 0);
+    if (!offsets_arr) {
+        goto fail;
+    }
+    npy_intp *perm = (npy_intp*)PyArray_DATA((PyArrayObject*)perm_arr);
+    npy_intp *offsets = (npy_intp*)PyArray_DATA((PyArrayObject*)offsets_arr);
+    // Count pass: tally each group into offsets[c + 1]. When size was inferred
+    // the codes are already known to be in [0, size); only a caller-provided
+    // size needs the range validated here.
+    for (npy_intp i = 0; i < n; i++) {
+        npy_intp c = codes_buffer[i];
+        if (size_given && (c < 0 || c >= size)) {
+            PyErr_Format(PyExc_ValueError,
+                    "code %zd out of range [0, %zd)",
+                    (Py_ssize_t)c, (Py_ssize_t)size);
+            goto fail;
+        }
+        offsets[c + 1]++;
+    }
+    // Prefix sum: offsets[g] becomes the start index of group g; offsets[size] == n.
+    for (npy_intp g = 0; g < size; g++) {
+        offsets[g + 1] += offsets[g];
+    }
+    // Scatter pass: place each input position at its group's running cursor.
+    // Ascending i preserves original order within each group (stability).
+    if (size > 0) {
+        cursor = PyMem_New(npy_intp, size);
+        if (!cursor) {
+            PyErr_NoMemory();
+            goto fail;
+        }
+        memcpy(cursor, offsets, size * sizeof(npy_intp));
+        for (npy_intp i = 0; i < n; i++) {
+            perm[cursor[codes_buffer[i]]++] = i;
+        }
+    }
+    PyArray_CLEARFLAGS((PyArrayObject*)perm_arr, NPY_ARRAY_WRITEABLE);
+    PyArray_CLEARFLAGS((PyArrayObject*)offsets_arr, NPY_ARRAY_WRITEABLE);
+    PyMem_Free(cursor);
+    PyObject *result = PyTuple_Pack(2, perm_arr, offsets_arr);
+    Py_DECREF(perm_arr);
+    Py_DECREF(offsets_arr);
+    return result;
+fail:
+    PyMem_Free(cursor);
+    Py_XDECREF(perm_arr);
+    Py_XDECREF(offsets_arr);
+    return NULL;
+}
 PyObject *
 dtype_from_element(PyObject *Py_UNUSED(m), PyObject *arg)
 {

{arraykit-1.6.0 → arraykit-1.7.0}/src/methods.h RENAMED Viewed

@@ -69,6 +69,9 @@ first_true_1d(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs);
 PyObject *
 first_true_2d(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs);
+PyObject *
+group_ordering(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs);
 PyObject *
 dtype_from_element(PyObject *Py_UNUSED(m), PyObject *arg);

arraykit-1.7.0/test/test_group_ordering.py ADDED Viewed

@@ -0,0 +1,231 @@
+import unittest
+import numpy as np
+from arraykit import group_ordering
+from arraykit import factorize
+def offsets_from_codes(codes, size):
+    # CSR-style offsets: [0, *cumsum(bincount(codes, minlength=size))]
+    counts = np.bincount(codes, minlength=size)
+    return np.concatenate([[0], np.cumsum(counts)]).astype(np.intp)
+class TestUnit(unittest.TestCase):
+    # ------------------------------------------------------------------
+    # basic behavior
+    def test_group_ordering_basic_a(self) -> None:
+        codes = np.array([0, 0, 0, 1, 1, 2], dtype=np.intp)
+        perm, offsets = group_ordering(codes)
+        self.assertEqual(perm.tolist(), [0, 1, 2, 3, 4, 5])
+        self.assertEqual(offsets.tolist(), [0, 3, 5, 6])
+    def test_group_ordering_basic_b(self) -> None:
+        codes = np.array([2, 0, 0, 2, 1, 1, 0, 0, 3, 0], dtype=np.intp)
+        perm, offsets = group_ordering(codes)
+        self.assertEqual(perm.tolist(), [1, 2, 6, 7, 9, 4, 5, 0, 3, 8])
+        self.assertEqual(offsets.tolist(), [0,  5,  7,  9, 10])
+    def test_group_ordering_interleaved(self) -> None:
+        codes = np.array([2, 0, 1, 0, 2, 1], dtype=np.intp)
+        perm, offsets = group_ordering(codes)
+        # group 0 -> positions 1, 3; group 1 -> 2, 5; group 2 -> 0, 4
+        self.assertEqual(offsets.tolist(), [0, 2, 4, 6])
+        self.assertEqual(perm[offsets[0]:offsets[1]].tolist(), [1, 3])
+        self.assertEqual(perm[offsets[1]:offsets[2]].tolist(), [2, 5])
+        self.assertEqual(perm[offsets[2]:offsets[3]].tolist(), [0, 4])
+    def test_group_ordering_stability(self) -> None:
+        # original positions within each group must stay ascending
+        codes = np.array([0, 1, 0, 1, 0, 1], dtype=np.intp)
+        perm, offsets = group_ordering(codes)
+        self.assertEqual(perm[offsets[0]:offsets[1]].tolist(), [0, 2, 4])
+        self.assertEqual(perm[offsets[1]:offsets[2]].tolist(), [1, 3, 5])
+    # ------------------------------------------------------------------
+    # parity against numpy oracle
+    def test_group_ordering_parity_argsort(self) -> None:
+        rng = np.random.default_rng(0)
+        for size in (1, 5, 50, 500):
+            codes = rng.integers(0, size, size=10_000).astype(np.intp)
+            perm, offsets = group_ordering(codes)
+            expected = np.argsort(codes, kind='stable').astype(np.intp)
+            self.assertEqual(perm.tolist(), expected.tolist())
+            self.assertEqual(
+                offsets.tolist(), offsets_from_codes(codes, size).tolist()
+            )
+    def test_group_ordering_parity_inferred_size(self) -> None:
+        rng = np.random.default_rng(1)
+        codes = rng.integers(0, 100, size=5_000).astype(np.intp)
+        perm, offsets = group_ordering(codes)
+        size = int(codes.max()) + 1
+        self.assertEqual(len(offsets), size + 1)
+        expected = np.argsort(codes, kind='stable').astype(np.intp)
+        self.assertEqual(perm.tolist(), expected.tolist())
+    # ------------------------------------------------------------------
+    # dtype / shape
+    def test_group_ordering_dtypes(self) -> None:
+        codes = np.array([0, 1, 0, 2], dtype=np.intp)
+        perm, offsets = group_ordering(codes)
+        self.assertEqual(perm.dtype, np.dtype(np.intp))
+        self.assertEqual(offsets.dtype, np.dtype(np.intp))
+    def test_group_ordering_offsets_length(self) -> None:
+        codes = np.array([0, 1, 2, 3], dtype=np.intp)
+        _, offsets = group_ordering(codes, size=10)
+        self.assertEqual(len(offsets), 11)
+        self.assertEqual(offsets[-1], 4)
+    # ------------------------------------------------------------------
+    # size keyword
+    def test_group_ordering_size_explicit(self) -> None:
+        codes = np.array([0, 0, 1, 1], dtype=np.intp)
+        perm, offsets = group_ordering(codes, size=2)
+        self.assertEqual(perm.tolist(), [0, 1, 2, 3])
+        self.assertEqual(offsets.tolist(), [0, 2, 4])
+    def test_group_ordering_size_trailing_empty(self) -> None:
+        codes = np.array([0, 0, 1], dtype=np.intp)
+        _, offsets = group_ordering(codes, size=4)
+        # groups 2 and 3 are empty: offsets[g] == offsets[g+1]
+        self.assertEqual(offsets.tolist(), [0, 2, 3, 3, 3])
+    def test_group_ordering_size_none(self) -> None:
+        codes = np.array([0, 1, 1], dtype=np.intp)
+        perm, offsets = group_ordering(codes, size=None)
+        self.assertEqual(offsets.tolist(), [0, 1, 3])
+    def test_group_ordering_size_is_keyword_only(self) -> None:
+        codes = np.array([0, 1], dtype=np.intp)
+        with self.assertRaises(TypeError):
+            group_ordering(codes, 2)
+    # ------------------------------------------------------------------
+    # edge cases
+    def test_group_ordering_empty(self) -> None:
+        codes = np.array([], dtype=np.intp)
+        perm, offsets = group_ordering(codes)
+        self.assertEqual(perm.tolist(), [])
+        self.assertEqual(offsets.tolist(), [0])
+    def test_group_ordering_empty_with_size(self) -> None:
+        codes = np.array([], dtype=np.intp)
+        perm, offsets = group_ordering(codes, size=3)
+        self.assertEqual(perm.tolist(), [])
+        self.assertEqual(offsets.tolist(), [0, 0, 0, 0])
+    def test_group_ordering_single(self) -> None:
+        codes = np.array([0], dtype=np.intp)
+        perm, offsets = group_ordering(codes)
+        self.assertEqual(perm.tolist(), [0])
+        self.assertEqual(offsets.tolist(), [0, 1])
+    def test_group_ordering_single_group(self) -> None:
+        codes = np.array([0, 0, 0], dtype=np.intp)
+        perm, offsets = group_ordering(codes)
+        self.assertEqual(perm.tolist(), [0, 1, 2])
+        self.assertEqual(offsets.tolist(), [0, 3])
+    def test_group_ordering_all_distinct(self) -> None:
+        codes = np.array([3, 2, 1, 0], dtype=np.intp)
+        perm, offsets = group_ordering(codes)
+        self.assertEqual(perm.tolist(), [3, 2, 1, 0])
+        self.assertEqual(offsets.tolist(), [0, 1, 2, 3, 4])
+    # ------------------------------------------------------------------
+    # validation
+    def test_group_ordering_not_array(self) -> None:
+        with self.assertRaises(TypeError):
+            group_ordering([0, 1, 2])
+    def test_group_ordering_2d(self) -> None:
+        codes = np.array([[0, 1], [1, 0]], dtype=np.intp)
+        with self.assertRaises(ValueError):
+            group_ordering(codes)
+    def test_group_ordering_wrong_dtype(self) -> None:
+        # pick an integer width that differs from intp on this platform
+        # (intp is 32-bit on some Windows builds, 64-bit elsewhere)
+        wrong = np.int32 if np.dtype(np.intp).itemsize != 4 else np.int64
+        codes = np.array([0, 1, 2], dtype=wrong)
+        with self.assertRaises(ValueError):
+            group_ordering(codes)
+    def test_group_ordering_wrong_dtype_float(self) -> None:
+        codes = np.array([0.0, 1.0, 2.0], dtype=np.float64)
+        with self.assertRaises(ValueError):
+            group_ordering(codes)
+    def test_group_ordering_non_contiguous(self) -> None:
+        codes = np.arange(10, dtype=np.intp)[::2]
+        self.assertFalse(codes.flags['C_CONTIGUOUS'])
+        with self.assertRaises(ValueError):
+            group_ordering(codes)
+    def test_group_ordering_negative_code_inferred(self) -> None:
+        codes = np.array([0, -1, 1], dtype=np.intp)
+        with self.assertRaises(ValueError):
+            group_ordering(codes)
+    def test_group_ordering_out_of_range(self) -> None:
+        codes = np.array([0, 1, 5], dtype=np.intp)
+        with self.assertRaises(ValueError):
+            group_ordering(codes, size=3)
+    def test_group_ordering_negative_size(self) -> None:
+        codes = np.array([0, 1], dtype=np.intp)
+        with self.assertRaises(ValueError):
+            group_ordering(codes, size=-1)
+    def test_group_ordering_size_out_of_range_zero(self) -> None:
+        codes = np.array([0, 1], dtype=np.intp)
+        with self.assertRaises(ValueError):
+            group_ordering(codes, size=0)
+    def test_group_ordering_infer_overflow(self) -> None:
+        # a code at the intp max would overflow when inferring size = c + 1
+        codes = np.array([np.iinfo(np.intp).max], dtype=np.intp)
+        with self.assertRaises(OverflowError):
+            group_ordering(codes)
+    def test_group_ordering_size_overflow(self) -> None:
+        # an explicit size at the intp max would overflow computing size + 1
+        codes = np.array([0, 1], dtype=np.intp)
+        with self.assertRaises(OverflowError):
+            group_ordering(codes, size=np.iinfo(np.intp).max)
+    # ------------------------------------------------------------------
+    # immutability
+    def test_group_ordering_outputs_immutable(self) -> None:
+        codes = np.array([0, 1, 0], dtype=np.intp)
+        perm, offsets = group_ordering(codes)
+        self.assertFalse(perm.flags.writeable)
+        self.assertFalse(offsets.flags.writeable)
+    # ------------------------------------------------------------------
+    # round-trip with factorize
+    def test_group_ordering_with_factorize(self) -> None:
+        a = np.array(['b', 'a', 'b', 'c', 'a', 'a'])
+        uniques, codes = factorize(a)
+        perm, offsets = group_ordering(codes, size=len(uniques))
+        ordered = a[perm]
+        # each group's slice of the reordered array is constant
+        for g in range(len(uniques)):
+            segment = ordered[offsets[g]:offsets[g + 1]]
+            self.assertTrue((segment == segment[0]).all())
+if __name__ == '__main__':
+    unittest.main()