mlsort-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mlsort/data.py ADDED
@@ -0,0 +1,109 @@
+ from __future__ import annotations
+
+ import random
+ from dataclasses import dataclass
+ from typing import Any, Dict, List, Sequence, Tuple
+
+ import numpy as np
+
+ from .algorithms import measure_best_algorithm
+ from .features import estimate_properties
+
+
+ @dataclass
+ class Sample:
+     X: List[float]
+     y: str
+     props: Dict[str, float]
+
+
+ # Generators
+
+ def gen_sorted(n: int, dtype: str = "int") -> Sequence:
+     if dtype == "int":
+         return np.arange(n, dtype=np.int32)
+     else:
+         return np.linspace(0.0, 1.0, n, dtype=np.float64)
+
+
+ def gen_reverse(n: int, dtype: str = "int") -> Sequence:
+     if dtype == "int":
+         return np.arange(n, 0, -1, dtype=np.int32)
+     else:
+         return np.linspace(1.0, 0.0, n, dtype=np.float64)
+
+
+ def gen_nearly_sorted(n: int, swaps: int = 10, dtype: str = "int") -> Sequence:
+     a = gen_sorted(n, dtype)
+     a = np.array(a, copy=True)
+     # Cap the number of swaps relative to n so the array stays "nearly" sorted
+     swaps = min(swaps, max(1, n // 50))
+     for _ in range(swaps):
+         i = random.randrange(n)
+         j = random.randrange(n)
+         a[i], a[j] = a[j], a[i]
+     return a
+
+
+ def gen_uniform(n: int, dtype: str = "int", low: int = 0, high: int = 10000) -> Sequence:
+     if dtype == "int":
+         return np.random.randint(low, high, size=n, dtype=np.int32)
+     else:
+         return np.random.uniform(0.0, 1.0, size=n).astype(np.float64)
+
+
+ def gen_small_range(n: int, k: int = 256) -> Sequence:
+     # ints with many duplicates
+     return np.random.randint(0, k, size=n, dtype=np.int32)
+
+
+ def gen_zipf(n: int, a: float = 2.0, dtype: str = "int", max_val: int = 100000) -> Sequence:
+     # Zipf-distributed positive integers, clipped
+     vals = np.random.zipf(a, size=n)
+     vals = np.clip(vals, 0, max_val)
+     if dtype == "int":
+         return vals.astype(np.int32)
+     else:
+         return vals.astype(np.float64)
+
+
+ def gen_normal(n: int, mean: float = 0.0, std: float = 1.0, dtype: str = "float") -> Sequence:
+     vals = np.random.normal(mean, std, size=n)
+     if dtype == "int":
+         return vals.round().astype(np.int32)
+     else:
+         return vals.astype(np.float64)
+
+
+ def synthesize_dataset(num_samples: int, max_n: int, seed: int = 42) -> List[Sample]:
+     # Seed both the stdlib and NumPy RNGs so dataset synthesis is reproducible
+     random.seed(seed)
+     np.random.seed(seed)
+     samples: List[Sample] = []
+     gens = [
+         lambda n: gen_sorted(n, "int"),
+         lambda n: gen_reverse(n, "int"),
+         lambda n: gen_nearly_sorted(n, dtype="int"),
+         lambda n: gen_uniform(n, "int", 0, 10_000),
+         lambda n: gen_uniform(n, "float"),
+         lambda n: gen_small_range(n, 128),
+         lambda n: gen_zipf(n, a=2.0, dtype="int", max_val=50_000),
+         lambda n: gen_normal(n, dtype="float"),
+     ]
+     for _ in range(num_samples):
+         n = random.randint(128, max_n)
+         g = random.choice(gens)
+         arr = g(n)
+         props = estimate_properties(arr)
+         # Measure which algorithm is actually fastest for this concrete array
+         label, _times = measure_best_algorithm(arr, repeats=1)
+         X = [
+             props["n"],
+             props["dtype_code"],
+             props["est_sortedness"],
+             props["est_dup_ratio"],
+             props["est_range"],
+             props["est_entropy"],
+             props["est_run_len"],
+         ]
+         samples.append(Sample(X=X, y=label, props=props))
+     return samples
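A minimal usage sketch for the dataset builder above, assuming the wheel's sibling mlsort.algorithms module (imported by data.py but not shown in this diff) is available:

# Hypothetical usage: synthesize a small labeled dataset and inspect the label balance.
from collections import Counter

from mlsort.data import synthesize_dataset

samples = synthesize_dataset(num_samples=50, max_n=4096, seed=0)
print(Counter(s.y for s in samples))   # which algorithm "won" how often
print(samples[0].X)                    # 7-element feature vector for the first sample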
mlsort/decision.py ADDED
@@ -0,0 +1,48 @@
+ from __future__ import annotations
+
+ from typing import Any, Dict, List
+
+ import numpy as np
+
+ from .features import estimate_properties, to_feature_vector
+ from .algorithms import ALG_TIMSORT, ALG_NP_QUICK, ALG_NP_MERGE, ALG_COUNTING, ALG_RADIX, available_algorithms_for
+
+
+ def _eval_tree(tree: Dict[str, Any], feature_names: List[str], X: List[float]) -> str:
+     node = tree
+     while not node.get("leaf", False):
+         idx = int(node["feature_index"])
+         thr = float(node["threshold"])
+         if X[idx] <= thr:
+             node = node["left"]
+         else:
+             node = node["right"]
+     return str(node["label"])
+
+
+ def decide(arr, thresholds) -> str:
+     # 1) Size cutoffs
+     n = len(arr)
+     if n < thresholds.cutoff_n:
+         return ALG_TIMSORT
+     # For arrays between cutoff and activation, use a fast default (np_quick)
+     if n < getattr(thresholds, "activation_n", thresholds.cutoff_n * 4):
+         return ALG_NP_QUICK
+
+     # 2) Estimate features only for very large arrays
+     props = estimate_properties(arr)
+     X = to_feature_vector(props)
+
+     # 3) Evaluate decision tree
+     label = _eval_tree(thresholds.tree, thresholds.feature_names, X)
+
+     # 4) Respect algorithm availability for dtype/range; fallback if needed
+     algos = set(available_algorithms_for(arr))
+     if label in algos:
+         return label
+     # fallback preference order depending on dtype
+     if ALG_NP_QUICK in algos:
+         return ALG_NP_QUICK
+     if ALG_NP_MERGE in algos:
+         return ALG_NP_MERGE
+     return ALG_TIMSORT
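A short sketch of how decide() might be driven at runtime, assuming thresholds have already been trained and saved; the file path below is an assumption, not something the package mandates:

# Hypothetical usage: load trained thresholds and pick a sort algorithm for an array.
import numpy as np

from mlsort.decision import decide
from mlsort.installer import load_thresholds

th = load_thresholds("thresholds.json")   # assumed path
arr = np.random.randint(0, 10_000, size=100_000, dtype=np.int32)
print(decide(arr, th))                    # e.g. "np_quick", "counting", or "radix"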
mlsort/features.py ADDED
@@ -0,0 +1,178 @@
+ from __future__ import annotations
+
+ import math
+ import random
+ from typing import Any, Dict, Iterable, List, Sequence, Tuple
+
+ import numpy as np
+
+
+ DTYPE_FLOAT = 0
+ DTYPE_INT = 1
+
+
+ def infer_dtype(arr: Sequence[Any]) -> int:
+     if len(arr) == 0:
+         return DTYPE_FLOAT
+     # Fast path: trust the dtype of integer numpy arrays instead of scanning every element
+     if isinstance(arr, np.ndarray) and np.issubdtype(arr.dtype, np.integer):
+         return DTYPE_INT
+     # Heuristic: treat as int if all elements are ints
+     if all(isinstance(x, (int, np.integer)) for x in arr):
+         return DTYPE_INT
+     return DTYPE_FLOAT
+
+
+ def _sample_indices(n: int, k: int) -> List[int]:
+     if n <= 1 or k <= 0:
+         return []
+     k = min(k, n - 1)
+     # The caller is expected to seed the global RNG if reproducibility is needed
+     return random.sample(range(n - 1), k)
+
+
+ def est_sortedness(arr: Sequence[Any], sample: int = 256) -> float:
+     n = len(arr)
+     if n < 2:
+         return 1.0
+     idxs = _sample_indices(n, min(sample, n - 1))
+     if not idxs:
+         return 1.0
+     good = 0
+     for i in idxs:
+         try:
+             if arr[i] <= arr[i + 1]:
+                 good += 1
+         except Exception:
+             # Fallback: treat incomparable pairs as unsorted
+             pass
+     return good / len(idxs)
+
+
+ def est_duplicate_ratio(arr: Sequence[Any], sample: int = 256) -> float:
+     if len(arr) == 0:
+         return 0.0
+     if sample > len(arr):
+         sample = len(arr)
+     idxs = random.sample(range(len(arr)), sample)
+     vals = [arr[i] for i in idxs]
+     uniq = len(set(vals))
+     dup_ratio = 1.0 - (uniq / sample if sample > 0 else 1.0)
+     return dup_ratio
+
+
+ def est_range(arr: Sequence[Any], sample: int = 256) -> float:
+     if len(arr) == 0:
+         return 0.0
+     if sample > len(arr):
+         sample = len(arr)
+     idxs = random.sample(range(len(arr)), sample)
+     vals = [arr[i] for i in idxs]
+     try:
+         vmin = min(vals)
+         vmax = max(vals)
+         return float(vmax) - float(vmin)
+     except Exception:
+         return 0.0
+
+
+ def est_entropy(arr: Sequence[Any], bins: int = 32, sample: int = 512) -> float:
+     if len(arr) == 0:
+         return 0.0
+     if sample > len(arr):
+         sample = len(arr)
+     idxs = random.sample(range(len(arr)), sample)
+     vals = np.asarray([arr[i] for i in idxs])
+     if np.issubdtype(vals.dtype, np.integer):
+         vmin = int(vals.min())
+         vmax = int(vals.max())
+         if vmax == vmin:
+             return 0.0
+         # For ints, clamp the number of bins to the observed range
+         rng = vmax - vmin + 1
+         bins_ = min(bins, rng)
+         hist, _ = np.histogram(vals, bins=bins_, range=(vmin, vmax + 1))
+     else:
+         vmin = float(np.min(vals))
+         vmax = float(np.max(vals))
+         if vmax == vmin:
+             return 0.0
+         hist, _ = np.histogram(vals, bins=bins, range=(vmin, vmax))
+     p = hist.astype(np.float64)
+     p_sum = p.sum()
+     if p_sum == 0:
+         return 0.0
+     p = p / p_sum
+     # Shannon entropy
+     ent = -np.sum(p[p > 0] * np.log2(p[p > 0]))
+     # Normalize by the max entropy (log2 of the number of non-empty bins)
+     nonzero_bins = max(1, (p > 0).sum())
+     max_ent = math.log2(nonzero_bins)
+     return float(ent / max_ent) if max_ent > 0 else 0.0
+
+
+ def est_run_length(arr: Sequence[Any], sample_windows: int = 16, window_size: int = 128) -> float:
+     n = len(arr)
+     if n == 0:
+         return 0.0
+     if n <= 1:
+         return float(n)
+     windows = []
+     for _ in range(min(sample_windows, max(1, n // max(1, window_size)))):
+         start = random.randint(0, max(0, n - window_size)) if n > window_size else 0
+         end = min(n, start + window_size)
+         windows.append((start, end))
+     runs_total = 0
+     elems_total = 0
+     for s, e in windows:
+         if e - s <= 1:
+             runs_total += (e - s)
+             elems_total += (e - s)
+             continue
+         prev = arr[s]
+         direction = 0  # 1 increasing, -1 decreasing, 0 unknown
+         runs = 1
+         for i in range(s + 1, e):
+             curr = arr[i]
+             # Avoid numpy boolean arithmetic; use explicit branching
+             if curr > prev:
+                 curr_dir = 1
+             elif curr < prev:
+                 curr_dir = -1
+             else:
+                 curr_dir = direction  # equal values extend the current run
+             if curr_dir != direction and direction != 0:
+                 runs += 1
+             direction = curr_dir
+             prev = curr
+         runs_total += runs
+         elems_total += (e - s)
+     avg_run_len = (elems_total / runs_total) if runs_total > 0 else float(n)
+     return float(avg_run_len)
+
+
+ def estimate_properties(arr: Sequence[Any]) -> Dict[str, float]:
+     n = len(arr)
+     dtype_code = infer_dtype(arr)
+     props = {
+         "n": float(n),
+         "dtype_code": float(dtype_code),
+         "est_sortedness": est_sortedness(arr),
+         "est_dup_ratio": est_duplicate_ratio(arr),
+         "est_range": est_range(arr),
+         "est_entropy": est_entropy(arr),
+         "est_run_len": est_run_length(arr),
+     }
+     return props
+
+
+ def to_feature_vector(props: Dict[str, float]) -> List[float]:
+     keys = [
+         "n",
+         "dtype_code",
+         "est_sortedness",
+         "est_dup_ratio",
+         "est_range",
+         "est_entropy",
+         "est_run_len",
+     ]
+     return [float(props[k]) for k in keys]
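To illustrate what the estimators above produce, a small sketch; exact values will vary because the sampling uses the global random module:

# Sketch: estimated properties of a mostly sorted int array with a few injected outliers.
import numpy as np

from mlsort.features import estimate_properties, to_feature_vector

arr = np.arange(10_000, dtype=np.int32)
arr[::500] = 0                          # inject a little disorder
props = estimate_properties(arr)
print(props["est_sortedness"], props["est_run_len"])   # near 1.0, long average runs
print(to_feature_vector(props))         # 7 floats in the order defined by to_feature_vector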
mlsort/installer.py ADDED
@@ -0,0 +1,139 @@
+ from __future__ import annotations
+
+ import json
+ import os
+ from dataclasses import dataclass
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import numpy as np
+ from sklearn.tree import DecisionTreeClassifier
+
+ from .data import synthesize_dataset
+ from .model import LABELS, LABEL_TO_ID, ID_TO_LABEL
+
+
+ @dataclass
+ class Thresholds:
+     cutoff_n: int  # use builtin timsort below this
+     activation_n: int  # only run ML decision when n >= activation_n; else use a fast default
+     tree: Dict[str, Any]
+     feature_names: List[str]
+
+
+ FEATURE_NAMES = [
+     "n",
+     "dtype_code",
+     "est_sortedness",
+     "est_dup_ratio",
+     "est_range",
+     "est_entropy",
+     "est_run_len",
+ ]
+
+
+ def _train_tree(X: List[List[float]], y: List[str], max_depth: int = 3, random_state: int = 42) -> DecisionTreeClassifier:
+     y_ids = np.array([LABEL_TO_ID[l] for l in y], dtype=np.int64)
+     X_arr = np.asarray(X, dtype=np.float32)
+     tree = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=10, random_state=random_state)
+     tree.fit(X_arr, y_ids)
+     return tree
+
+
+ def _serialize_tree(tree: DecisionTreeClassifier) -> Dict[str, Any]:
+     # Convert sklearn tree into a nested dict
+     t = tree.tree_
+     def node_to_dict(i: int) -> Dict[str, Any]:
+         if t.children_left[i] == t.children_right[i]:
+             # leaf
+             # value shape: (1, n_classes)
+             value = t.value[i][0]
+             cls_id = int(np.argmax(value))
+             return {"leaf": True, "label": ID_TO_LABEL[cls_id]}
+         feat_idx = int(t.feature[i])
+         thresh = float(t.threshold[i])
+         left = int(t.children_left[i])
+         right = int(t.children_right[i])
+         return {
+             "leaf": False,
+             "feature_index": feat_idx,
+             "threshold": thresh,
+             "left": node_to_dict(left),
+             "right": node_to_dict(right),
+         }
+     return node_to_dict(0)
+
+
+ def _estimate_timsort_cutoff(seed: int = 42) -> int:
+     # Probe across small sizes and varied distributions and pick the largest n where
+     # timsort is best in >= 60% of cases.
+     from .algorithms import measure_best_algorithm
+     from .data import (
+         gen_sorted, gen_reverse, gen_nearly_sorted, gen_uniform, gen_small_range, gen_zipf, gen_normal
+     )
+     rng = np.random.default_rng(seed)
+     sizes = [32, 64, 128, 256, 384, 512, 768, 1024, 1536, 2048]
+     gens = [
+         lambda n: gen_sorted(n, "int"),
+         lambda n: gen_reverse(n, "int"),
+         lambda n: gen_nearly_sorted(n, dtype="int"),
+         lambda n: gen_uniform(n, "int", 0, 10_000),
+         lambda n: gen_uniform(n, "float"),
+         lambda n: gen_small_range(n, 128),
+         lambda n: gen_zipf(n, a=2.0, dtype="int", max_val=50_000),
+         lambda n: gen_normal(n, dtype="float"),
+     ]
+     cutoff = sizes[0]
+     for n in sizes:
+         wins = 0
+         trials = 0
+         for _ in range(12):  # 12 trials per size
+             g = gens[rng.integers(0, len(gens))]
+             arr = g(n)
+             best, _ = measure_best_algorithm(arr, repeats=1)
+             if best == "timsort":
+                 wins += 1
+             trials += 1
+         frac = wins / max(1, trials)
+         if frac >= 0.6:
+             cutoff = n
+     return int(cutoff)
+
+
+ def _choose_activation_n(cutoff_n: int, max_n: int) -> int:
+     # Heuristic: only run ML decision when arrays are "very large".
+     # Pick at least 4x cutoff, but not below 32k; cap by max_n.
+     base = max(32768, cutoff_n * 4)
+     if max_n > 0:
+         return int(min(max_n, base))
+     return int(base)
+
+
+ def train_thresholds(num_samples: int = 1000, max_n: int = 20000, seed: int = 42, max_depth: int = 3) -> Thresholds:
+     samples = synthesize_dataset(num_samples=num_samples, max_n=max_n, seed=seed)
+     X = [s.X for s in samples]
+     y = [s.y for s in samples]
+     tree = _train_tree(X, y, max_depth=max_depth, random_state=seed)
+     rules = _serialize_tree(tree)
+     cutoff_n = _estimate_timsort_cutoff(seed)
+     activation_n = _choose_activation_n(cutoff_n, max_n)
+     return Thresholds(cutoff_n=cutoff_n, activation_n=activation_n, tree=rules, feature_names=FEATURE_NAMES)
+
+
+ def save_thresholds(path: str, thresholds: Thresholds) -> None:
+     os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
+     data = {
+         "cutoff_n": thresholds.cutoff_n,
+         "activation_n": thresholds.activation_n,
+         "feature_names": thresholds.feature_names,
+         "tree": thresholds.tree,
+     }
+     with open(path, "w") as f:
+         json.dump(data, f, indent=2)
+
+
+ def load_thresholds(path: str) -> Thresholds:
+     with open(path, "r") as f:
+         obj = json.load(f)
+     # Backward-compatible: if activation_n missing, derive a conservative default
+     activation_n = int(obj.get("activation_n", max(32768, int(obj["cutoff_n"]) * 4)))
+     return Thresholds(cutoff_n=int(obj["cutoff_n"]), activation_n=activation_n, tree=obj["tree"], feature_names=list(obj["feature_names"]))
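A sketch of the install-time flow this module implies: train the tree and cutoffs once, persist them as JSON, reload later. The parameter values and path are illustrative assumptions; training runs real timing probes, so small sample counts keep it quick:

# Sketch: offline training and persistence of the Thresholds policy.
from mlsort.installer import train_thresholds, save_thresholds, load_thresholds

th = train_thresholds(num_samples=200, max_n=20_000, seed=0, max_depth=3)
save_thresholds("thresholds.json", th)   # assumed output path

th2 = load_thresholds("thresholds.json")
print(th2.cutoff_n, th2.activation_n)
print(th2.feature_names)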
mlsort/model.py ADDED
@@ -0,0 +1,84 @@
+ from __future__ import annotations
+
+ import json
+ from dataclasses import dataclass
+ from typing import Dict, List, Tuple
+
+ import joblib
+ import numpy as np
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.metrics import accuracy_score, classification_report
+
+ from .features import to_feature_vector
+
+
+ LABELS = ["timsort", "np_quick", "np_merge", "counting", "radix"]
+ LABEL_TO_ID = {l: i for i, l in enumerate(LABELS)}
+ ID_TO_LABEL = {i: l for l, i in LABEL_TO_ID.items()}
+
+
+ @dataclass
+ class ModelArtifacts:
+     model: RandomForestClassifier
+     feature_names: List[str]
+
+
+ def make_model(random_state: int = 42) -> RandomForestClassifier:
+     return RandomForestClassifier(
+         n_estimators=200,
+         max_depth=None,
+         min_samples_leaf=2,
+         random_state=random_state,
+         n_jobs=-1,
+     )
+
+
+ def fit_model(X: List[List[float]], y: List[str], random_state: int = 42) -> ModelArtifacts:
+     y_ids = np.array([LABEL_TO_ID[l] for l in y], dtype=np.int64)
+     X_arr = np.asarray(X, dtype=np.float32)
+     model = make_model(random_state)
+     model.fit(X_arr, y_ids)
+     return ModelArtifacts(model=model, feature_names=[
+         "n",
+         "dtype_code",
+         "est_sortedness",
+         "est_dup_ratio",
+         "est_range",
+         "est_entropy",
+         "est_run_len",
+     ])
+
+
+ def predict(model: RandomForestClassifier, props: Dict[str, float]) -> str:
+     X = np.asarray([to_feature_vector(props)], dtype=np.float32)
+     y_id = int(model.predict(X)[0])
+     return ID_TO_LABEL[y_id]
+
+
+ def load_model(path: str) -> RandomForestClassifier:
+     return joblib.load(path)
+
+
+ def save_model(path: str, model: RandomForestClassifier) -> None:
+     joblib.dump(model, path)
+
+
+ def predict_best_algo(model: RandomForestClassifier, props: Dict[str, float]) -> str:
+     return predict(model, props)
+
+
+ def evaluate_model(model: RandomForestClassifier, X: List[List[float]], y: List[str]) -> Dict:
+     y_true = np.array([LABEL_TO_ID[l] for l in y], dtype=np.int64)
+     X_arr = np.asarray(X, dtype=np.float32)
+     y_pred = model.predict(X_arr)
+     acc = accuracy_score(y_true, y_pred)
+     all_labels = list(range(len(LABELS)))
+     report = classification_report(
+         y_true,
+         y_pred,
+         labels=all_labels,
+         target_names=LABELS,
+         zero_division=0,
+         output_dict=True,
+     )
+     return {"accuracy": float(acc), "report": report}
mlsort/optimize.py ADDED
@@ -0,0 +1,80 @@
+ from __future__ import annotations
+
+ import statistics
+ import time
+ from typing import Dict, List, Tuple
+
+ import numpy as np
+
+ from .decision import decide
+ from .installer import Thresholds
+ from .algorithms import time_algorithm
+ from .data import (
+     gen_sorted, gen_reverse, gen_nearly_sorted, gen_uniform, gen_small_range, gen_zipf, gen_normal,
+ )
+
+
+ def gen_cases(num_samples: int, max_n: int, seed: int) -> List[np.ndarray]:
+     rng = np.random.default_rng(seed)
+     gens = [
+         lambda n: gen_sorted(n, "int"),
+         lambda n: gen_reverse(n, "int"),
+         lambda n: gen_nearly_sorted(n, dtype="int"),
+         lambda n: gen_uniform(n, "int", 0, 10_000),
+         lambda n: gen_uniform(n, "float"),
+         lambda n: gen_small_range(n, 128),
+         lambda n: gen_zipf(n, a=2.0, dtype="int", max_val=50_000),
+         lambda n: gen_normal(n, dtype="float"),
+     ]
+     cases = []
+     for _ in range(num_samples):
+         n = int(rng.integers(128, max_n + 1))
+         g = gens[int(rng.integers(0, len(gens)))]
+         cases.append(g(n))
+     return cases
+
+
+ def essential_stats(vals: List[float]) -> Dict[str, float]:
+     return {
+         "mean": float(statistics.fmean(vals)),
+         "median": float(statistics.median(vals)),
+     }
+
+
+ def eval_policy(th: Thresholds, arrays: List[np.ndarray]) -> Dict[str, float]:
+     # Total cost of the policy = decision overhead + time of the chosen sort
+     total_times: List[float] = []
+     for arr in arrays:
+         t0 = time.perf_counter()
+         algo = decide(arr, th)
+         t1 = time.perf_counter()
+         t_sort = time_algorithm(arr, algo, repeats=1)
+         total_times.append((t1 - t0) + t_sort)
+     return essential_stats(total_times)
+
+
+ def grid_candidates(max_n: int) -> Tuple[List[int], List[int]]:
+     cutoff_grid = [32, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536, 2048, 3072, 4096]
+     cutoff_grid = [c for c in cutoff_grid if c < max_n]
+     act_grid_base = [8192, 12000, 16384, 24576, 32768, 49152, 65536, 98304, 131072]
+     act_grid = sorted({min(max_n, a) for a in act_grid_base if a <= max_n})
+     if not act_grid:
+         act_grid = [min(max_n, 8192)]
+     return cutoff_grid, act_grid
+
+
+ def optimize_cutoffs(th: Thresholds, arrays: List[np.ndarray]) -> Dict:
+     cutoff_grid, act_grid = grid_candidates(max(len(a) for a in arrays))
+     best = {"mean": float("inf"), "cutoff_n": th.cutoff_n, "activation_n": getattr(th, "activation_n", th.cutoff_n * 4)}
+     tried = []
+     for c in cutoff_grid:
+         for a in act_grid:
+             if a <= c:
+                 continue
+             th_try = Thresholds(cutoff_n=c, activation_n=a, tree=th.tree, feature_names=th.feature_names)
+             stats = eval_policy(th_try, arrays)
+             tried.append({"cutoff_n": c, "activation_n": a, **stats})
+             if stats["mean"] < best["mean"]:
+                 best = {"mean": stats["mean"], "cutoff_n": c, "activation_n": a}
+     top = sorted(tried, key=lambda x: x["mean"])[:10]
+     return {"best": best, "tried": top}