mlsort 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mlsort-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Siddharth Chaudhary
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
mlsort-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,135 @@
1
+ Metadata-Version: 2.4
2
+ Name: mlsort
3
+ Version: 0.1.0
4
+ Summary: ML-guided sorting backend selector with install-time benchmarking
5
+ Author: Siddharth Chaudhary
6
+ License: MIT License
7
+
8
+ Copyright (c) 2025 Siddharth Chaudhary
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/sidcoding/mlsort
29
+ Project-URL: Repository, https://github.com/sidcoding/mlsort
30
+ Project-URL: Issues, https://github.com/sidcoding/mlsort/issues
31
+ Keywords: sorting,machine-learning,numpy,performance,benchmark,timsort,radix,counting-sort,decision-tree
32
+ Classifier: Programming Language :: Python :: 3
33
+ Classifier: Programming Language :: Python :: 3 :: Only
34
+ Classifier: Programming Language :: Python :: 3.9
35
+ Classifier: Programming Language :: Python :: 3.10
36
+ Classifier: Programming Language :: Python :: 3.11
37
+ Classifier: License :: OSI Approved :: MIT License
38
+ Classifier: Operating System :: OS Independent
39
+ Classifier: Intended Audience :: Developers
40
+ Classifier: Topic :: Software Development :: Libraries
41
+ Classifier: Topic :: System :: Benchmark
42
+ Requires-Python: >=3.9
43
+ Description-Content-Type: text/markdown
44
+ License-File: LICENSE
45
+ Requires-Dist: numpy>=1.24
46
+ Requires-Dist: scikit-learn>=1.3
47
+ Requires-Dist: scipy>=1.10
48
+ Requires-Dist: joblib>=1.3
49
+ Dynamic: license-file
50
+
51
+ # mlsort
52
+
53
+ ML-guided sorting backend selector. Chooses between Python Timsort, NumPy sorts, and integer-only counting/radix based on cheap, sampled properties of your data. Defaults are safe; selection only activates for large arrays.
54
+
55
+ ## Install
56
+
57
+ ```bash
58
+ pip install mlsort
59
+ ```
60
+
61
+ Optionally initialize thresholds and optimized cutoffs (recommended once per machine/user):
62
+
63
+ ```bash
64
+ mlsort-init # all params optional; see below
65
+ ```
66
+
67
+ ## Quick usage
68
+
69
+ Top-level API:
70
+
71
+ ```python
72
+ from mlsort import sort, select_algorithm
73
+
74
+ data = [3, 1, 2, 5, 4]
75
+ algo = select_algorithm(data) # e.g., 'timsort' or a NumPy backend
76
+ out = sort(data) # returns a new sorted list
77
+
78
+ # Options compatible with Python sort()
79
+ out_desc = sort(data, reverse=True)
80
+ out_by_len = sort(["aa", "b", "cccc"], key=len) # forces builtin Timsort
81
+ ```
82
+
83
+ Behavior summary:
84
+ - Mixed/object/string inputs default to builtin Timsort for safety and compatibility.
85
+ - Passing a key function forces builtin Timsort (NumPy/counting/radix do not support key).
86
+ - reverse=True is supported for all backends; for non-Timsort, results are reversed after sorting.
87
+ - For small arrays, Timsort is used; for medium arrays, NumPy quicksort; the ML decision runs only for very large arrays.
88
+
89
+ ## CLI: initialize thresholds (optional)
90
+
91
+ ```bash
92
+ mlsort-init \
93
+ --samples 1200 `# training samples (default 1200)` \
94
+ --max-n 200000 `# max array size to consider (default 200000)` \
95
+ --seed 42 `# default from MLSORT_SEED or 42` \
96
+ --artifacts /path/to/cache # default MLSORT_ARTIFACTS_DIR or OS cache
97
+ ```
98
+
99
+ This writes `thresholds.json` under the artifacts directory and optimizes two size thresholds:
100
+ - cutoff_n: below this, always use Timsort.
101
+ - activation_n: only run ML decision at/above this size; between cutoff and activation use a fast default (NumPy quicksort).
102
+
103
+ ## Configuration
104
+
105
+ Use environment variables to control behavior:
106
+
107
+ - MLSORT_ARTIFACTS_DIR: directory for cached artifacts (default: OS cache, e.g., `~/Library/Caches/mlsort` on macOS).
108
+ - MLSORT_ENABLE_INSTALL_BENCH=1: allow benchmarking during lazy first-use initialization.
109
+ - MLSORT_INIT_ON_IMPORT=1: opt-in to run a short init automatically on first import if artifacts are missing.
110
+ - MLSORT_SEED=...: deterministic random seed for benchmarking.
111
+ - MLSORT_DEBUG=1: debug logs showing the selected algorithm and paths.
112
+
113
+ ## Supported algorithms
114
+
115
+ - Python Timsort (`list.sort`)
116
+ - NumPy quicksort and mergesort
117
+ - Counting sort (integers only; guarded by range to avoid large memory)
118
+ - Radix LSD sort (integers only)
119
+
120
+ ## Safety and limits
121
+
122
+ - Always-safe fallback: if selection fails or types are unsupported, we use builtin Timsort.
123
+ - Type handling: strings/bytes/mixed objects use Timsort. Numeric-only arrays may use NumPy or integer algorithms.
124
+ - Resource bounds: counting/radix only used when safe; decision is skipped for small/medium arrays to avoid overhead.
125
+
126
+ ## Python versions
127
+
128
+ Tested on Python 3.9–3.11 in CI.
129
+
130
+ ## Troubleshooting
131
+
132
+ - Selection slower than a single baseline: ensure you ran `mlsort-init` and that your data sizes reach the activation threshold. For mostly small arrays, Timsort/NumPy will be chosen automatically.
133
+ - Custom cache location: set `MLSORT_ARTIFACTS_DIR` before running `mlsort-init` or your program.
134
+ - Need full control: call `select_algorithm(...)` to see what would be chosen, then run your preferred sort.
135
+
mlsort-0.1.0/README.md ADDED
@@ -0,0 +1,85 @@
1
+ # mlsort
2
+
3
+ ML-guided sorting backend selector. Chooses between Python Timsort, NumPy sorts, and integer-only counting/radix based on cheap, sampled properties of your data. Defaults are safe; selection only activates for large arrays.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install mlsort
9
+ ```
10
+
11
+ Optionally initialize thresholds and optimized cutoffs (recommended once per machine/user):
12
+
13
+ ```bash
14
+ mlsort-init # all params optional; see below
15
+ ```
16
+
17
+ ## Quick usage
18
+
19
+ Top-level API:
20
+
21
+ ```python
22
+ from mlsort import sort, select_algorithm
23
+
24
+ data = [3, 1, 2, 5, 4]
25
+ algo = select_algorithm(data) # e.g., 'timsort' or a NumPy backend
26
+ out = sort(data) # returns a new sorted list
27
+
28
+ # Options compatible with Python sort()
29
+ out_desc = sort(data, reverse=True)
30
+ out_by_len = sort(["aa", "b", "cccc"], key=len) # forces builtin Timsort
31
+ ```
32
+
33
+ Behavior summary:
34
+ - Mixed/object/string inputs default to builtin Timsort for safety and compatibility.
35
+ - Passing a key function forces builtin Timsort (NumPy/counting/radix do not support key).
36
+ - reverse=True is supported for all backends; for non-Timsort, results are reversed after sorting.
37
+ - For small arrays, Timsort is used; for medium arrays, NumPy quicksort; the ML decision runs only for very large arrays.
38
+
39
+ ## CLI: initialize thresholds (optional)
40
+
41
+ ```bash
42
+ mlsort-init \
43
+ --samples 1200 `# training samples (default 1200)` \
44
+ --max-n 200000 `# max array size to consider (default 200000)` \
45
+ --seed 42 `# default from MLSORT_SEED or 42` \
46
+ --artifacts /path/to/cache # default MLSORT_ARTIFACTS_DIR or OS cache
47
+ ```
48
+
49
+ This writes `thresholds.json` under the artifacts directory and optimizes two size thresholds:
50
+ - cutoff_n: below this, always use Timsort.
51
+ - activation_n: only run ML decision at/above this size; between cutoff and activation use a fast default (NumPy quicksort).
52
+
53
+ ## Configuration
54
+
55
+ Use environment variables to control behavior:
56
+
57
+ - MLSORT_ARTIFACTS_DIR: directory for cached artifacts (default: OS cache, e.g., `~/Library/Caches/mlsort` on macOS).
58
+ - MLSORT_ENABLE_INSTALL_BENCH=1: allow benchmarking during lazy first-use initialization.
59
+ - MLSORT_INIT_ON_IMPORT=1: opt-in to run a short init automatically on first import if artifacts are missing.
60
+ - MLSORT_SEED=...: deterministic random seed for benchmarking.
61
+ - MLSORT_DEBUG=1: debug logs showing the selected algorithm and paths.
62
+
63
+ ## Supported algorithms
64
+
65
+ - Python Timsort (`list.sort`)
66
+ - NumPy quicksort and mergesort
67
+ - Counting sort (integers only; guarded by range to avoid large memory)
68
+ - Radix LSD sort (integers only)
69
+
70
+ ## Safety and limits
71
+
72
+ - Always-safe fallback: if selection fails or types are unsupported, we use builtin Timsort.
73
+ - Type handling: strings/bytes/mixed objects use Timsort. Numeric-only arrays may use NumPy or integer algorithms.
74
+ - Resource bounds: counting/radix only used when safe; decision is skipped for small/medium arrays to avoid overhead.
75
+
76
+ ## Python versions
77
+
78
+ Tested on Python 3.9–3.11 in CI.
79
+
80
+ ## Troubleshooting
81
+
82
+ - Selection slower than a single baseline: ensure you ran `mlsort-init` and that your data sizes reach the activation threshold. For mostly small arrays, Timsort/NumPy will be chosen automatically.
83
+ - Custom cache location: set `MLSORT_ARTIFACTS_DIR` before running `mlsort-init` or your program.
84
+ - Need full control: call `select_algorithm(...)` to see what would be chosen, then run your preferred sort.
85
+
@@ -0,0 +1,30 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+
5
+ from .api import sort, select_algorithm
6
+ from .config import get_artifacts_dir, get_env_bool, get_seed
7
+ from .installer import train_thresholds, save_thresholds, load_thresholds
8
+ from .optimize import gen_cases, optimize_cutoffs
9
+
10
+ __all__ = ["sort", "select_algorithm", "features", "algorithms", "baseline", "model"]
11
+
12
+
13
def _maybe_init_on_import() -> None:
    """Create ``thresholds.json`` at import time when the user opted in.

    Nothing happens unless ``MLSORT_INIT_ON_IMPORT`` is enabled and the
    thresholds file is absent from the artifacts directory. Otherwise a
    reduced-budget training run is saved, the size cutoffs are optimized,
    and the thresholds are saved a second time with the tuned cutoffs.
    """
    opted_in = get_env_bool("MLSORT_INIT_ON_IMPORT", False)
    if not opted_in:
        return

    target = os.path.join(get_artifacts_dir(), "thresholds.json")
    if os.path.exists(target):
        return
    os.makedirs(os.path.dirname(target) or ".", exist_ok=True)

    seed = get_seed()
    thresholds = train_thresholds(num_samples=600, max_n=120_000, seed=seed, max_depth=3)
    # Persist the trained model first, then again once the cutoffs are tuned.
    save_thresholds(target, thresholds)

    cases = gen_cases(num_samples=250, max_n=120_000, seed=seed + 7)
    outcome = optimize_cutoffs(thresholds, cases)
    thresholds.cutoff_n = int(outcome["best"]["cutoff_n"])  # type: ignore[attr-defined]
    thresholds.activation_n = int(outcome["best"]["activation_n"])  # type: ignore[attr-defined]
    save_thresholds(target, thresholds)
28
+
29
+
30
+ _maybe_init_on_import()
@@ -0,0 +1,160 @@
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ import time
5
+ from typing import Any, Dict, List, Sequence, Tuple
6
+
7
+ import numpy as np
8
+
9
+
10
+ ALG_TIMSORT = "timsort"
11
+ ALG_NP_QUICK = "np_quick"
12
+ ALG_NP_MERGE = "np_merge"
13
+ ALG_COUNTING = "counting"
14
+ ALG_RADIX = "radix"
15
+
16
+ ALL_ALGOS = [ALG_TIMSORT, ALG_NP_QUICK, ALG_NP_MERGE, ALG_COUNTING, ALG_RADIX]
17
+
18
+
19
+ def _as_numpy(arr: Sequence[Any]) -> np.ndarray:
20
+ if isinstance(arr, np.ndarray):
21
+ return arr
22
+ return np.asarray(arr)
23
+
24
+
25
+ def _as_list(arr: Sequence[Any]) -> List[Any]:
26
+ if isinstance(arr, list):
27
+ return list(arr)
28
+ return list(arr)
29
+
30
+
31
def sort_timsort(arr: Sequence[Any]) -> List[Any]:
    """Sort ``arr`` with Python's builtin sort and return the result as a new list."""
    return sorted(arr)
35
+
36
+
37
def sort_np(arr: Sequence[Any], kind: str) -> np.ndarray:
    """Return a sorted copy of ``arr`` produced by ``np.sort`` with the given ``kind``."""
    data = arr if isinstance(arr, np.ndarray) else np.asarray(arr)
    return np.sort(data, kind=kind)
40
+
41
+
42
def sort_counting(arr: Sequence[int]) -> List[int]:
    """Sort integers with a counting sort and return them as a new list.

    Args:
        arr: One-dimensional sequence or ndarray of integers.

    Returns:
        The values in ascending order as Python ints. Empty input yields
        ``[]`` regardless of dtype.

    Raises:
        TypeError: If the (non-empty) data do not have an integer dtype.
        ValueError: If ``max - min + 1`` exceeds 1,000,000, to bound the
            memory used by the count table.
    """
    a = arr if isinstance(arr, np.ndarray) else np.asarray(arr)
    # Check emptiness first: np.asarray([]) is float64 and would otherwise
    # be rejected by the dtype check below.
    if a.size == 0:
        return []
    if not np.issubdtype(a.dtype, np.integer):
        raise TypeError("counting sort requires integer dtype")
    amin = int(a.min())
    amax = int(a.max())
    rng = amax - amin + 1
    # Safety cap: avoid huge memory
    if rng > 1_000_000:
        raise ValueError("range too large for counting sort")
    # Subtracting the minimum in a narrow signed dtype can overflow (int8:
    # 100 - (-100) wraps to -56), so signed data are widened to int64 first.
    # Unsigned data stay in their own dtype, where value - min cannot wrap.
    work = a if a.dtype.kind == "u" else a.astype(np.int64)
    low = work.dtype.type(amin)
    counts = np.bincount((work - low).astype(np.intp), minlength=rng)
    # Every value in [amin, amax], repeated as often as it occurred.
    values = np.arange(rng).astype(work.dtype) + low
    return np.repeat(values, counts).astype(a.dtype, copy=False).tolist()
70
+
71
+
72
def sort_radix_lsd(arr: Sequence[int], base: int = 256) -> List[int]:
    """Sort integers with a least-significant-digit radix sort.

    Args:
        arr: One-dimensional sequence or ndarray of integers.
        base: Radix of one digit; must be a power of two (default 256,
            i.e. one byte per pass).

    Returns:
        The values in ascending order as Python ints. Empty input yields
        ``[]`` regardless of dtype.

    Raises:
        TypeError: If the (non-empty) data do not have an integer dtype.
        ValueError: If ``base`` is not a power of two >= 2. Digits are
            extracted with shift/mask, which is only correct for such bases.
    """
    a = arr if isinstance(arr, np.ndarray) else np.asarray(arr)
    if a.size == 0:
        return []
    if not np.issubdtype(a.dtype, np.integer):
        raise TypeError("radix sort requires integer dtype")
    if base < 2 or base & (base - 1):
        raise ValueError("radix base must be a power of two >= 2")

    dtype = a.dtype
    bits = np.iinfo(dtype).bits
    # Sort unsigned keys of the same width as the input. For signed input,
    # flipping the sign bit of the two's-complement pattern maps the values
    # onto unsigned keys in the same order. Adding a 2**(bits-1) bias in
    # int64 instead would overflow for 64-bit data.
    key_dtype = np.dtype(f"u{dtype.itemsize}")
    key_type = key_dtype.type
    sign_bit = key_type(1 << (bits - 1)) if dtype.kind == "i" else key_type(0)
    keys = a.astype(key_dtype) ^ sign_bit

    digit_bits = base.bit_length() - 1
    # Clamp the mask to the key width so bases wider than the dtype still work.
    mask = key_type((base - 1) & ((1 << bits) - 1))
    for shift in range(0, bits, digit_bits):
        digits = (keys >> key_type(shift)) & mask
        # A stable reorder by the current digit is one LSD distribution pass.
        keys = keys[np.argsort(digits, kind="stable")]

    # Undo the sign-bit flip and reinterpret in the original dtype.
    return (keys ^ sign_bit).astype(dtype).tolist()
108
+
109
+
110
def available_algorithms_for(arr: Sequence[Any]) -> List[str]:
    """List the backend names that may be applied to ``arr``.

    Timsort and both NumPy sorts are always offered. Integer data also get
    radix sort, plus counting sort when the data are non-empty and the value
    span is at most 100,000 and at most eight times the element count.
    """
    data = _as_numpy(arr)
    candidates = [ALG_TIMSORT, ALG_NP_QUICK, ALG_NP_MERGE]
    if not np.issubdtype(data.dtype, np.integer):
        return candidates

    if data.size > 0:
        # Counting sort only pays off while the count table stays small.
        span = int(data.max()) - int(data.min()) + 1
        if span <= 100_000 and span <= 8 * data.size:
            candidates.append(ALG_COUNTING)
    candidates.append(ALG_RADIX)
    return candidates
123
+
124
+
125
def time_algorithm(arr: Sequence[Any], algo: str, repeats: int = 1) -> float:
    """Return the fastest wall-clock time, in seconds, of ``repeats`` runs of ``algo``.

    Raises ``ValueError`` for an unknown backend name on the first run.
    With ``repeats`` < 1 nothing is executed and ``inf`` is returned.
    """
    runners = {
        ALG_TIMSORT: lambda: sort_timsort(arr),
        ALG_NP_QUICK: lambda: sort_np(arr, kind="quicksort"),
        ALG_NP_MERGE: lambda: sort_np(arr, kind="mergesort"),
        ALG_COUNTING: lambda: sort_counting(arr),
        ALG_RADIX: lambda: sort_radix_lsd(arr),
    }
    clock = time.perf_counter
    fastest = float("inf")
    for _ in range(repeats):
        began = clock()
        runner = runners.get(algo)
        if runner is None:
            raise ValueError(f"unknown algo {algo}")
        runner()
        fastest = min(fastest, clock() - began)
    return fastest
144
+
145
+
146
def measure_best_algorithm(arr: Sequence[Any], repeats: int = 1):
    """Time every applicable backend on ``arr`` and report the fastest.

    Returns a ``(best_name, timings)`` pair where ``timings`` maps each
    backend that ran successfully to its best time in seconds. Backends
    that raise are left out. If none succeeds, Timsort is reported with an
    infinite time.
    """
    timings: Dict[str, float] = {}
    for name in available_algorithms_for(arr):
        try:
            timings[name] = time_algorithm(arr, name, repeats=repeats)
        except Exception:
            # Best effort: a backend that cannot handle this input is skipped.
            continue

    if not timings:
        return ALG_TIMSORT, {ALG_TIMSORT: float("inf")}
    return min(timings, key=timings.get), timings
@@ -0,0 +1,159 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import os
5
+ from typing import Any, Dict, Iterable, List, Sequence, Tuple
6
+
7
+ import numpy as np
8
+
9
+ from .config import get_artifacts_dir, get_env_bool, get_seed
10
+ from .decision import decide
11
+ from .installer import load_thresholds, train_thresholds, save_thresholds, Thresholds
12
+ from .optimize import gen_cases, optimize_cutoffs
13
+ from .algorithms import (
14
+ ALG_TIMSORT, ALG_NP_QUICK, ALG_NP_MERGE, ALG_COUNTING, ALG_RADIX,
15
+ sort_timsort, sort_np, sort_counting, sort_radix_lsd, available_algorithms_for
16
+ )
17
+
18
+
19
+ log = logging.getLogger("mlsort")
20
+
21
+
22
def _ensure_thresholds(path: str) -> Thresholds:
    """Load the thresholds stored at ``path``, creating the file when missing.

    With ``MLSORT_ENABLE_INSTALL_BENCH`` enabled, a missing file triggers a
    reduced-budget training run followed by cutoff optimization. Otherwise
    fixed defaults are written, whose decision tree is a single leaf that
    selects NumPy quicksort.
    """
    if os.path.exists(path):
        return load_thresholds(path)

    if get_env_bool("MLSORT_ENABLE_INSTALL_BENCH", False):
        # Benchmarking allowed: train with a small budget, then tune the cutoffs.
        seed = get_seed()
        trained = train_thresholds(num_samples=600, max_n=120_000, seed=seed, max_depth=3)
        save_thresholds(path, trained)
        cases = gen_cases(num_samples=250, max_n=120_000, seed=seed + 7)
        outcome = optimize_cutoffs(trained, cases)
        trained.cutoff_n = int(outcome["best"]["cutoff_n"])  # type: ignore[attr-defined]
        trained.activation_n = int(outcome["best"]["activation_n"])  # type: ignore[attr-defined]
        save_thresholds(path, trained)
        return trained

    # Benchmarks disabled: persist and return the built-in defaults.
    feature_names = [
        "n",
        "dtype_code",
        "est_sortedness",
        "est_dup_ratio",
        "est_range",
        "est_entropy",
        "est_run_len",
    ]
    defaults = Thresholds(
        cutoff_n=1024,
        activation_n=98304,
        tree={"leaf": True, "label": ALG_NP_QUICK},
        feature_names=feature_names,
    )
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    save_thresholds(path, defaults)
    return defaults
44
+
45
+
46
def select_algorithm(arr: Sequence[Any], thresholds_path: str | None = None, *, key: Any = None, reverse: bool = False) -> str:
    """Return the name of the sorting backend chosen for ``arr``.

    Args:
        arr: Sized, int-indexable data to inspect.
        thresholds_path: Thresholds file to use; defaults to
            ``thresholds.json`` in the artifacts directory. The file and its
            directory are created when missing.
        key: Optional key function; any key forces builtin Timsort.
        reverse: Accepted so the signature mirrors ``sort``; it does not
            influence the selection.

    Returns:
        One of the ``ALG_*`` backend names.

    Raises:
        TypeError: If ``len(arr)`` cannot be computed.
    """
    try:
        n = len(arr)  # type: ignore[arg-type]
    except Exception as exc:
        # Chain the original error so the real cause stays visible.
        raise TypeError("arr must be a sequence with __len__ and indexable by int") from exc
    # Empty input and key-based sorting always use the builtin sort.
    if n == 0 or key is not None:
        return ALG_TIMSORT

    # Strings and mixed/object data default to Python's Timsort.
    try:
        if isinstance(arr, np.ndarray):
            if arr.dtype.kind in {"O", "U", "S"}:
                return ALG_TIMSORT
        else:
            # Classify only the first 256 items to keep detection cheap.
            cats = set()
            for i in range(min(n, 256)):
                v = arr[i]
                if isinstance(v, (str, bytes)):
                    cats.add("string")
                elif isinstance(v, (int, float, np.integer, np.floating)):
                    cats.add("number")
                else:
                    # None and any unknown/object type
                    cats.add("other")
                if len(cats) > 1:
                    break
            # Anything but a purely numeric sample stays on Timsort.
            if cats != {"number"}:
                return ALG_TIMSORT
    except Exception:
        # On any detection error, prefer the safe fallback.
        return ALG_TIMSORT

    thr_path = thresholds_path or os.path.join(get_artifacts_dir(), "thresholds.json")
    os.makedirs(os.path.dirname(thr_path) or ".", exist_ok=True)
    th = _ensure_thresholds(thr_path)
    algo = decide(arr, th)
    if get_env_bool("MLSORT_DEBUG", False):
        log.debug("mlsort.select algo=%s n=%d path=%s", algo, n, thr_path)
    return algo
95
+
96
+
97
def _sort_with_backend(data: np.ndarray, algo: str) -> List[Any] | None:
    """Run a non-Timsort backend on ``data``; return ``None`` for unknown names."""
    if algo in (ALG_COUNTING, ALG_RADIX):
        integer_sort = sort_counting if algo == ALG_COUNTING else sort_radix_lsd
        try:
            return integer_sort(data)
        except Exception:
            # The integer-only backend rejected the data; use NumPy quicksort.
            algo = ALG_NP_QUICK
    if algo == ALG_NP_QUICK:
        return sort_np(data, kind="quicksort").tolist()
    if algo == ALG_NP_MERGE:
        return sort_np(data, kind="mergesort").tolist()
    return None


def sort(
    arr: Sequence[Any],
    thresholds_path: str | None = None,
    *,
    key: Any = None,
    reverse: bool = False,
) -> List[Any]:
    """Return a new sorted list of the items in ``arr``.

    The backend is picked by ``select_algorithm``. Builtin Timsort is used
    whenever selection fails, the chosen backend fails, or converting the
    data to NumPy yields a string/object array (non-numeric items beyond the
    sampled prefix), so such inputs behave like ``list.sort``.

    Args:
        arr: Data to sort; the input is not modified.
        thresholds_path: Optional thresholds file passed to ``select_algorithm``.
        key: Optional key function, applied by the builtin sort.
        reverse: Sort in descending order. Non-Timsort results are reversed
            after sorting.

    Returns:
        A new list with the sorted items.
    """
    debug = get_env_bool("MLSORT_DEBUG", False)
    try:
        algo = select_algorithm(arr, thresholds_path, key=key, reverse=reverse)
    except Exception as e:  # strict safety: fallback
        if debug:
            log.debug("mlsort.select failed: %s; falling back to timsort", e)
        algo = ALG_TIMSORT

    # For non-Timsort backends, key is unsupported (it forces Timsort above).
    if algo != ALG_TIMSORT:
        try:
            data = np.asarray(arr)
            if data.dtype.kind in {"O", "U", "S"}:
                # NumPy would coerce or compare these differently from Python.
                res = None
            else:
                res = _sort_with_backend(data, algo)
        except Exception as e:
            if debug:
                log.debug("mlsort.sort backend %s failed: %s; falling back to timsort", algo, e)
            res = None
        if res is not None:
            return res[::-1] if reverse else res

    # Builtin sort: the Timsort selection and the last resort for failures.
    a = list(arr)
    a.sort(key=key, reverse=reverse)
    return a
144
+
145
+
146
def profile_decisions(samples: int = 100, max_n: int = 200_000, thresholds_path: str | None = None) -> Dict[str, Any]:
    """Measure decision overhead and sort time on generated test arrays.

    For every generated case the time spent in ``decide`` (milliseconds) and
    the time of one run of the chosen backend (seconds) are recorded.
    Returns the number of cases together with at most the first 50 rows.
    """
    import time
    from .algorithms import time_algorithm

    if thresholds_path:
        path = thresholds_path
    else:
        path = os.path.join(get_artifacts_dir(), "thresholds.json")
    thresholds = _ensure_thresholds(path)

    records = []
    for case in gen_cases(samples, max_n, seed=get_seed() + 99):
        started = time.perf_counter()
        chosen = decide(case, thresholds)
        finished = time.perf_counter()
        sort_seconds = time_algorithm(case, chosen, repeats=1)
        records.append(
            {
                "n": len(case),
                "algo": chosen,
                "decision_ms": (finished - started) * 1000.0,
                "sort_s": sort_seconds,
            }
        )
    return {"count": len(records), "rows": records[:50]}
@@ -0,0 +1,33 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Dict
4
+
5
+ from .algorithms import ALG_COUNTING, ALG_NP_MERGE, ALG_NP_QUICK, ALG_RADIX, ALG_TIMSORT
6
+
7
+
8
def heuristic_baseline(props: Dict[str, float]) -> str:
    """Choose a backend from estimated data properties using fixed rules.

    ``props`` must provide ``n``, ``dtype_code`` (0 float, 1 int),
    ``est_sortedness``, ``est_dup_ratio``, ``est_range``, ``est_entropy``
    and ``est_run_len``. The rules are tried in order: Timsort for nearly
    sorted data or long runs, counting/radix for suitable integer data, and
    otherwise a NumPy sort chosen by size.
    """
    n = props["n"]
    is_integer = int(props["dtype_code"]) == 1
    sortedness = props["est_sortedness"]
    dup_ratio = props["est_dup_ratio"]
    value_range = props["est_range"]
    entropy = props["est_entropy"]
    run_len = props["est_run_len"]

    # Nearly sorted input or long runs: Timsort.
    if sortedness >= 0.9 or run_len >= 32:
        return ALG_TIMSORT

    if is_integer:
        # Counting sort: relatively small range with many duplicates.
        small_range = 0 < value_range <= max(1024.0, 8.0 * n)
        if small_range and dup_ratio >= 0.3 and entropy <= 0.7:
            return ALG_COUNTING
        # Radix sort: remaining integer data of moderate entropy.
        if n >= 512 and entropy <= 0.9:
            return ALG_RADIX

    # General case: NumPy quicksort for larger inputs, mergesort below 2000.
    return ALG_NP_QUICK if n >= 2000 else ALG_NP_MERGE