mlsort-0.1.0-py3-none-any.whl
- mlsort/__init__.py +30 -0
- mlsort/algorithms.py +160 -0
- mlsort/api.py +159 -0
- mlsort/baseline.py +33 -0
- mlsort/benchmark.py +118 -0
- mlsort/cli_bench_compare.py +44 -0
- mlsort/cli_bench_install.py +25 -0
- mlsort/cli_init.py +38 -0
- mlsort/cli_optimize_cutoffs.py +34 -0
- mlsort/config.py +35 -0
- mlsort/data.py +109 -0
- mlsort/decision.py +48 -0
- mlsort/features.py +178 -0
- mlsort/installer.py +139 -0
- mlsort/model.py +84 -0
- mlsort/optimize.py +80 -0
- mlsort-0.1.0.dist-info/METADATA +135 -0
- mlsort-0.1.0.dist-info/RECORD +22 -0
- mlsort-0.1.0.dist-info/WHEEL +5 -0
- mlsort-0.1.0.dist-info/entry_points.txt +5 -0
- mlsort-0.1.0.dist-info/licenses/LICENSE +21 -0
- mlsort-0.1.0.dist-info/top_level.txt +1 -0
mlsort/data.py
ADDED
@@ -0,0 +1,109 @@
from __future__ import annotations

import random
from dataclasses import dataclass
from typing import Any, Dict, List, Sequence, Tuple

import numpy as np

from .algorithms import measure_best_algorithm
from .features import estimate_properties


@dataclass
class Sample:
    X: List[float]
    y: str
    props: Dict[str, float]


# Generators

def gen_sorted(n: int, dtype: str = "int") -> Sequence:
    if dtype == "int":
        return np.arange(n, dtype=np.int32)
    else:
        return np.linspace(0.0, 1.0, n, dtype=np.float64)


def gen_reverse(n: int, dtype: str = "int") -> Sequence:
    if dtype == "int":
        return np.arange(n, 0, -1, dtype=np.int32)
    else:
        return np.linspace(1.0, 0.0, n, dtype=np.float64)


def gen_nearly_sorted(n: int, swaps: int = 10, dtype: str = "int") -> Sequence:
    a = gen_sorted(n, dtype)
    a = np.array(a, copy=True)
    swaps = min(swaps, max(1, n // 50))
    for _ in range(swaps):
        i = random.randrange(n)
        j = random.randrange(n)
        a[i], a[j] = a[j], a[i]
    return a


def gen_uniform(n: int, dtype: str = "int", low: int = 0, high: int = 10000) -> Sequence:
    if dtype == "int":
        return np.random.randint(low, high, size=n, dtype=np.int32)
    else:
        return np.random.uniform(0.0, 1.0, size=n).astype(np.float64)


def gen_small_range(n: int, k: int = 256) -> Sequence:
    # ints with many duplicates
    return np.random.randint(0, k, size=n, dtype=np.int32)


def gen_zipf(n: int, a: float = 2.0, dtype: str = "int", max_val: int = 100000) -> Sequence:
    # Zipf-distributed positive integers, clipped
    vals = np.random.zipf(a, size=n)
    vals = np.clip(vals, 0, max_val)
    if dtype == "int":
        return vals.astype(np.int32)
    else:
        return vals.astype(np.float64)


def gen_normal(n: int, mean: float = 0.0, std: float = 1.0, dtype: str = "float") -> Sequence:
    vals = np.random.normal(mean, std, size=n)
    if dtype == "int":
        return vals.round().astype(np.int32)
    else:
        return vals.astype(np.float64)


def synthesize_dataset(num_samples: int, max_n: int, seed: int = 42):
    random.seed(seed)
    import numpy as _np
    _np.random.seed(seed)
    samples = []
    gens = [
        lambda n: gen_sorted(n, "int"),
        lambda n: gen_reverse(n, "int"),
        lambda n: gen_nearly_sorted(n, dtype="int"),
        lambda n: gen_uniform(n, "int", 0, 10_000),
        lambda n: gen_uniform(n, "float"),
        lambda n: gen_small_range(n, 128),
        lambda n: gen_zipf(n, a=2.0, dtype="int", max_val=50_000),
        lambda n: gen_normal(n, dtype="float"),
    ]
    for _ in range(num_samples):
        n = random.randint(128, max_n)
        g = random.choice(gens)
        arr = g(n)
        props = estimate_properties(arr)
        # Measure which algorithm is best for this concrete arr
        label, _times = measure_best_algorithm(arr, repeats=1)
        X = [
            props["n"],
            props["dtype_code"],
            props["est_sortedness"],
            props["est_dup_ratio"],
            props["est_range"],
            props["est_entropy"],
            props["est_run_len"],
        ]
        samples.append(Sample(X=X, y=label, props=props))
    return samples
mlsort/decision.py
ADDED
@@ -0,0 +1,48 @@
from __future__ import annotations

from typing import Any, Dict, List

import numpy as np

from .features import estimate_properties, to_feature_vector
from .algorithms import ALG_TIMSORT, ALG_NP_QUICK, ALG_NP_MERGE, ALG_COUNTING, ALG_RADIX, available_algorithms_for


def _eval_tree(tree: Dict[str, Any], feature_names: List[str], X: List[float]) -> str:
    node = tree
    while not node.get("leaf", False):
        idx = int(node["feature_index"])
        thr = float(node["threshold"])
        if X[idx] <= thr:
            node = node["left"]
        else:
            node = node["right"]
    return str(node["label"])


def decide(arr, thresholds) -> str:
    # 1) Size cutoffs
    n = len(arr)
    if n < thresholds.cutoff_n:
        return ALG_TIMSORT
    # For arrays between cutoff and activation, use a fast default (np_quick)
    if n < getattr(thresholds, "activation_n", thresholds.cutoff_n * 4):
        return ALG_NP_QUICK

    # 2) Estimate features only for very large arrays
    props = estimate_properties(arr)
    X = to_feature_vector(props)

    # 3) Evaluate decision tree
    label = _eval_tree(thresholds.tree, thresholds.feature_names, X)

    # 4) Respect algorithm availability for dtype/range; fall back if needed
    algos = set(available_algorithms_for(arr))
    if label in algos:
        return label
    # Fallback preference order depending on dtype
    if ALG_NP_QUICK in algos:
        return ALG_NP_QUICK
    if ALG_NP_MERGE in algos:
        return ALG_NP_MERGE
    return ALG_TIMSORT
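
Usage note (not part of the wheel): a minimal sketch of calling decide() with a hand-built Thresholds whose tree is a single leaf; the Thresholds fields and the "np_quick" label are taken from installer.py and model.py in this diff, while the concrete numbers are illustrative only.

import numpy as np
from mlsort.installer import Thresholds, FEATURE_NAMES
from mlsort.decision import decide

# A one-leaf "tree" that always proposes np_quick once the ML path activates.
th = Thresholds(cutoff_n=512, activation_n=32_768,
                tree={"leaf": True, "label": "np_quick"},
                feature_names=FEATURE_NAMES)
arr = np.random.randint(0, 10_000, size=100_000, dtype=np.int32)
print(decide(arr, th))  # e.g. "np_quick", or a fallback allowed for this dtype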
mlsort/features.py
ADDED
@@ -0,0 +1,178 @@
from __future__ import annotations

import math
import random
from typing import Any, Dict, Iterable, List, Sequence, Tuple

import numpy as np


DTYPE_FLOAT = 0
DTYPE_INT = 1


def infer_dtype(arr: Sequence[Any]) -> int:
    if len(arr) == 0:
        return DTYPE_FLOAT
    # Heuristic: treat as int if all elements are ints or can be safely cast to ints
    if all(isinstance(x, (int, np.integer)) for x in arr):
        return DTYPE_INT
    if isinstance(arr, np.ndarray) and np.issubdtype(arr.dtype, np.integer):
        return DTYPE_INT
    return DTYPE_FLOAT


def _sample_indices(n: int, k: int) -> List[int]:
    if n <= 1 or k <= 0:
        return []
    k = min(k, n - 1)
    # No per-call seeding here; the caller is expected to set the global seed.
    return random.sample(range(n - 1), k)


def est_sortedness(arr: Sequence[Any], sample: int = 256) -> float:
    n = len(arr)
    if n < 2:
        return 1.0
    idxs = _sample_indices(n, min(sample, n - 1))
    if not idxs:
        return 1.0
    good = 0
    for i in idxs:
        try:
            if arr[i] <= arr[i + 1]:
                good += 1
        except Exception:
            # Fallback: treat incomparable pairs as unsorted
            pass
    return good / len(idxs)


def est_duplicate_ratio(arr: Sequence[Any], sample: int = 256) -> float:
    if len(arr) == 0:
        return 0.0
    if sample > len(arr):
        sample = len(arr)
    idxs = random.sample(range(len(arr)), sample)
    vals = [arr[i] for i in idxs]
    uniq = len(set(vals))
    dup_ratio = 1.0 - (uniq / sample if sample > 0 else 1.0)
    return dup_ratio


def est_range(arr: Sequence[Any], sample: int = 256) -> float:
    if len(arr) == 0:
        return 0.0
    if sample > len(arr):
        sample = len(arr)
    idxs = random.sample(range(len(arr)), sample)
    vals = [arr[i] for i in idxs]
    try:
        vmin = min(vals)
        vmax = max(vals)
        return float(vmax) - float(vmin)
    except Exception:
        return 0.0


def est_entropy(arr: Sequence[Any], bins: int = 32, sample: int = 512) -> float:
    if len(arr) == 0:
        return 0.0
    if sample > len(arr):
        sample = len(arr)
    idxs = random.sample(range(len(arr)), sample)
    vals = np.asarray([arr[i] for i in idxs])
    if np.issubdtype(vals.dtype, np.integer):
        vmin = int(vals.min())
        vmax = int(vals.max())
        if vmax == vmin:
            return 0.0
        # For ints, clamp the number of bins to the observed range
        rng = vmax - vmin + 1
        bins_ = min(bins, rng)
        hist, _ = np.histogram(vals, bins=bins_, range=(vmin, vmax + 1))
    else:
        vmin = float(np.min(vals))
        vmax = float(np.max(vals))
        if vmax == vmin:
            return 0.0
        hist, _ = np.histogram(vals, bins=bins, range=(vmin, vmax))
    p = hist.astype(np.float64)
    p_sum = p.sum()
    if p_sum == 0:
        return 0.0
    p = p / p_sum
    # Shannon entropy
    ent = -np.sum(p[p > 0] * np.log2(p[p > 0]))
    # Normalize by max entropy (log2 of the number of non-empty bins)
    nonzero_bins = max(1, (p > 0).sum())
    max_ent = math.log2(nonzero_bins)
    return float(ent / max_ent) if max_ent > 0 else 0.0


def est_run_length(arr: Sequence[Any], sample_windows: int = 16, window_size: int = 128) -> float:
    n = len(arr)
    if n == 0:
        return 0.0
    if n <= 1:
        return float(n)
    windows = []
    for _ in range(min(sample_windows, max(1, n // max(1, window_size)))):
        start = random.randint(0, max(0, n - window_size)) if n > window_size else 0
        end = min(n, start + window_size)
        windows.append((start, end))
    runs_total = 0
    elems_total = 0
    for s, e in windows:
        if e - s <= 1:
            runs_total += (e - s)
            elems_total += (e - s)
            continue
        prev = arr[s]
        direction = 0  # 1 increasing, -1 decreasing, 0 unknown
        runs = 1
        for i in range(s + 1, e):
            curr = arr[i]
            # Avoid numpy boolean arithmetic; use explicit branching
            if curr > prev:
                curr_dir = 1
            elif curr < prev:
                curr_dir = -1
            else:
                curr_dir = direction  # equal values extend the current run
            if curr_dir != direction and direction != 0:
                runs += 1
            direction = curr_dir if direction != 0 else curr_dir or 0
            prev = curr
        runs_total += runs
        elems_total += (e - s)
    avg_run_len = (elems_total / runs_total) if runs_total > 0 else float(n)
    return float(avg_run_len)


def estimate_properties(arr: Sequence[Any]) -> Dict[str, float]:
    n = len(arr)
    dtype_code = infer_dtype(arr)
    props = {
        "n": float(n),
        "dtype_code": float(dtype_code),
        "est_sortedness": est_sortedness(arr),
        "est_dup_ratio": est_duplicate_ratio(arr),
        "est_range": est_range(arr),
        "est_entropy": est_entropy(arr),
        "est_run_len": est_run_length(arr),
    }
    return props


def to_feature_vector(props: Dict[str, float]) -> List[float]:
    keys = [
        "n",
        "dtype_code",
        "est_sortedness",
        "est_dup_ratio",
        "est_range",
        "est_entropy",
        "est_run_len",
    ]
    return [float(props[k]) for k in keys]
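
Usage note (not part of the wheel): a minimal sketch of the feature pipeline above. estimate_properties samples the array instead of scanning it fully, so repeated calls can return slightly different values; the array below is an arbitrary example.

import numpy as np
from mlsort.features import estimate_properties, to_feature_vector

arr = np.random.randint(0, 256, size=50_000, dtype=np.int32)  # many duplicates
props = estimate_properties(arr)
print(props["est_dup_ratio"], props["est_entropy"], props["est_run_len"])
print(to_feature_vector(props))  # seven floats, in the FEATURE_NAMES order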
mlsort/installer.py
ADDED
@@ -0,0 +1,139 @@
from __future__ import annotations

import json
import os
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
from sklearn.tree import DecisionTreeClassifier

from .data import synthesize_dataset
from .model import LABELS, LABEL_TO_ID, ID_TO_LABEL


@dataclass
class Thresholds:
    cutoff_n: int  # use builtin timsort below this
    activation_n: int  # only run ML decision when n >= activation_n; else use a fast default
    tree: Dict[str, Any]
    feature_names: List[str]


FEATURE_NAMES = [
    "n",
    "dtype_code",
    "est_sortedness",
    "est_dup_ratio",
    "est_range",
    "est_entropy",
    "est_run_len",
]


def _train_tree(X: List[List[float]], y: List[str], max_depth: int = 3, random_state: int = 42) -> DecisionTreeClassifier:
    y_ids = np.array([LABEL_TO_ID[l] for l in y], dtype=np.int64)
    X_arr = np.asarray(X, dtype=np.float32)
    tree = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=10, random_state=random_state)
    tree.fit(X_arr, y_ids)
    return tree


def _serialize_tree(tree: DecisionTreeClassifier) -> Dict[str, Any]:
    # Convert sklearn tree into a nested dict
    t = tree.tree_

    def node_to_dict(i: int) -> Dict[str, Any]:
        if t.children_left[i] == t.children_right[i]:
            # leaf; value shape: (1, n_classes)
            value = t.value[i][0]
            cls_id = int(np.argmax(value))
            return {"leaf": True, "label": ID_TO_LABEL[cls_id]}
        feat_idx = int(t.feature[i])
        thresh = float(t.threshold[i])
        left = int(t.children_left[i])
        right = int(t.children_right[i])
        return {
            "leaf": False,
            "feature_index": feat_idx,
            "threshold": thresh,
            "left": node_to_dict(left),
            "right": node_to_dict(right),
        }

    return node_to_dict(0)


def _estimate_timsort_cutoff(seed: int = 42) -> int:
    # Probe across small sizes and varied distributions and pick the largest n where
    # timsort is best in >= 60% of cases.
    from .algorithms import measure_best_algorithm
    from .data import (
        gen_sorted, gen_reverse, gen_nearly_sorted, gen_uniform, gen_small_range, gen_zipf, gen_normal
    )
    rng = np.random.default_rng(seed)
    sizes = [32, 64, 128, 256, 384, 512, 768, 1024, 1536, 2048]
    gens = [
        lambda n: gen_sorted(n, "int"),
        lambda n: gen_reverse(n, "int"),
        lambda n: gen_nearly_sorted(n, dtype="int"),
        lambda n: gen_uniform(n, "int", 0, 10_000),
        lambda n: gen_uniform(n, "float"),
        lambda n: gen_small_range(n, 128),
        lambda n: gen_zipf(n, a=2.0, dtype="int", max_val=50_000),
        lambda n: gen_normal(n, dtype="float"),
    ]
    cutoff = sizes[0]
    for n in sizes:
        wins = 0
        trials = 0
        for _ in range(12):  # 12 trials per size
            g = gens[rng.integers(0, len(gens))]
            arr = g(n)
            best, _ = measure_best_algorithm(arr, repeats=1)
            if best == "timsort":
                wins += 1
            trials += 1
        frac = wins / max(1, trials)
        if frac >= 0.6:
            cutoff = n
    return int(cutoff)


def _choose_activation_n(cutoff_n: int, max_n: int) -> int:
    # Heuristic: only run the ML decision when arrays are "very large".
    # Pick at least 4x cutoff, but not below 32k; cap by max_n.
    base = max(32768, cutoff_n * 4)
    if max_n > 0:
        return int(min(max_n, base))
    return int(base)


def train_thresholds(num_samples: int = 1000, max_n: int = 20000, seed: int = 42, max_depth: int = 3) -> Thresholds:
    samples = synthesize_dataset(num_samples=num_samples, max_n=max_n, seed=seed)
    X = [s.X for s in samples]
    y = [s.y for s in samples]
    tree = _train_tree(X, y, max_depth=max_depth, random_state=seed)
    rules = _serialize_tree(tree)
    cutoff_n = _estimate_timsort_cutoff(seed)
    activation_n = _choose_activation_n(cutoff_n, max_n)
    return Thresholds(cutoff_n=cutoff_n, activation_n=activation_n, tree=rules, feature_names=FEATURE_NAMES)


def save_thresholds(path: str, thresholds: Thresholds) -> None:
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    data = {
        "cutoff_n": thresholds.cutoff_n,
        "activation_n": thresholds.activation_n,
        "feature_names": thresholds.feature_names,
        "tree": thresholds.tree,
    }
    with open(path, "w") as f:
        json.dump(data, f, indent=2)


def load_thresholds(path: str) -> Thresholds:
    with open(path, "r") as f:
        obj = json.load(f)
    # Backward-compatible: if activation_n is missing, derive a conservative default
    activation_n = int(obj.get("activation_n", max(32768, int(obj["cutoff_n"]) * 4)))
    return Thresholds(cutoff_n=int(obj["cutoff_n"]), activation_n=activation_n, tree=obj["tree"], feature_names=list(obj["feature_names"]))
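
Usage note (not part of the wheel): a minimal sketch of training and persisting thresholds with the functions above. train_thresholds benchmarks real sorts on synthetic data, so even a small run takes noticeable time; the file path is arbitrary.

from mlsort.installer import train_thresholds, save_thresholds, load_thresholds

th = train_thresholds(num_samples=200, max_n=20_000, seed=42)
save_thresholds("thresholds.json", th)   # path chosen for illustration
th2 = load_thresholds("thresholds.json")
print(th2.cutoff_n, th2.activation_n)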
mlsort/model.py
ADDED
@@ -0,0 +1,84 @@
from __future__ import annotations

import json
from dataclasses import dataclass
from typing import Dict, List, Tuple

import joblib
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

from .features import to_feature_vector


LABELS = ["timsort", "np_quick", "np_merge", "counting", "radix"]
LABEL_TO_ID = {l: i for i, l in enumerate(LABELS)}
ID_TO_LABEL = {i: l for l, i in LABEL_TO_ID.items()}


@dataclass
class ModelArtifacts:
    model: RandomForestClassifier
    feature_names: List[str]


def make_model(random_state: int = 42) -> RandomForestClassifier:
    return RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        min_samples_leaf=2,
        random_state=random_state,
        n_jobs=-1,
    )


def fit_model(X: List[List[float]], y: List[str], random_state: int = 42) -> ModelArtifacts:
    y_ids = np.array([LABEL_TO_ID[l] for l in y], dtype=np.int64)
    X_arr = np.asarray(X, dtype=np.float32)
    model = make_model(random_state)
    model.fit(X_arr, y_ids)
    return ModelArtifacts(model=model, feature_names=[
        "n",
        "dtype_code",
        "est_sortedness",
        "est_dup_ratio",
        "est_range",
        "est_entropy",
        "est_run_len",
    ])


def predict(model: RandomForestClassifier, props: Dict[str, float]) -> str:
    X = np.asarray([to_feature_vector(props)], dtype=np.float32)
    y_id = int(model.predict(X)[0])
    return ID_TO_LABEL[y_id]


def load_model(path: str) -> RandomForestClassifier:
    return joblib.load(path)


def save_model(path: str, model: RandomForestClassifier) -> None:
    joblib.dump(model, path)


def predict_best_algo(model: RandomForestClassifier, props: Dict[str, float]) -> str:
    return predict(model, props)


def evaluate_model(model: RandomForestClassifier, X: List[List[float]], y: List[str]) -> Dict:
    y_true = np.array([LABEL_TO_ID[l] for l in y], dtype=np.int64)
    X_arr = np.asarray(X, dtype=np.float32)
    y_pred = model.predict(X_arr)
    acc = accuracy_score(y_true, y_pred)
    all_labels = list(range(len(LABELS)))
    report = classification_report(
        y_true,
        y_pred,
        labels=all_labels,
        target_names=LABELS,
        zero_division=0,
        output_dict=True,
    )
    return {"accuracy": float(acc), "report": report}
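
Usage note (not part of the wheel): a minimal sketch of the RandomForest path in model.py, reusing the dataset synthesizer from mlsort.data to produce X/y; the sample counts are illustrative and the accuracy here is on the training data, so it will be optimistic.

from mlsort.data import synthesize_dataset
from mlsort.model import fit_model, predict, evaluate_model

samples = synthesize_dataset(num_samples=300, max_n=10_000, seed=1)
X = [s.X for s in samples]
y = [s.y for s in samples]
art = fit_model(X, y)
print(evaluate_model(art.model, X, y)["accuracy"])
print(predict(art.model, samples[0].props))  # one of LABELS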
mlsort/optimize.py
ADDED
@@ -0,0 +1,80 @@
from __future__ import annotations

import statistics
import time
from typing import Dict, List, Tuple

import numpy as np

from .decision import decide
from .installer import Thresholds
from .algorithms import time_algorithm
from .data import (
    gen_sorted, gen_reverse, gen_nearly_sorted, gen_uniform, gen_small_range, gen_zipf, gen_normal,
)


def gen_cases(num_samples: int, max_n: int, seed: int) -> List[np.ndarray]:
    rng = np.random.default_rng(seed)
    gens = [
        lambda n: gen_sorted(n, "int"),
        lambda n: gen_reverse(n, "int"),
        lambda n: gen_nearly_sorted(n, dtype="int"),
        lambda n: gen_uniform(n, "int", 0, 10_000),
        lambda n: gen_uniform(n, "float"),
        lambda n: gen_small_range(n, 128),
        lambda n: gen_zipf(n, a=2.0, dtype="int", max_val=50_000),
        lambda n: gen_normal(n, dtype="float"),
    ]
    cases = []
    for _ in range(num_samples):
        n = int(rng.integers(128, max_n + 1))
        g = gens[int(rng.integers(0, len(gens)))]
        cases.append(g(n))
    return cases


def essential_stats(vals: List[float]) -> Dict[str, float]:
    vals_sorted = sorted(vals)
    return {
        "mean": float(statistics.fmean(vals)),
        "median": float(vals_sorted[len(vals_sorted) // 2]),
    }


def eval_policy(th: Thresholds, arrays: List[np.ndarray]) -> Dict[str, float]:
    total_times: List[float] = []
    for arr in arrays:
        t0 = time.perf_counter()
        algo = decide(arr, th)
        t1 = time.perf_counter()
        t_sort = time_algorithm(arr, algo, repeats=1)
        total_times.append((t1 - t0) + t_sort)
    return essential_stats(total_times)


def grid_candidates(max_n: int) -> Tuple[List[int], List[int]]:
    cutoff_grid = [32, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536, 2048, 3072, 4096]
    cutoff_grid = [c for c in cutoff_grid if c < max_n]
    act_grid_base = [8192, 12000, 16384, 24576, 32768, 49152, 65536, 98304, 131072]
    act_grid = sorted({min(max_n, a) for a in act_grid_base if a <= max_n})
    if not act_grid:
        act_grid = [min(max_n, 8192)]
    return cutoff_grid, act_grid


def optimize_cutoffs(th: Thresholds, arrays: List[np.ndarray]) -> Dict:
    cutoff_grid, act_grid = grid_candidates(max(len(a) for a in arrays))
    best = {"mean": float("inf"), "cutoff_n": th.cutoff_n, "activation_n": getattr(th, "activation_n", th.cutoff_n * 4)}
    tried = []
    for c in cutoff_grid:
        for a in act_grid:
            if a <= c:
                continue
            th_try = Thresholds(cutoff_n=c, activation_n=a, tree=th.tree, feature_names=th.feature_names)
            stats = eval_policy(th_try, arrays)
            tried.append({"cutoff_n": c, "activation_n": a, **stats})
            if stats["mean"] < best["mean"]:
                best = {"mean": stats["mean"], "cutoff_n": c, "activation_n": a}
    top = sorted(tried, key=lambda x: x["mean"])[:10]
    return {"best": best, "tried": top}
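
Usage note (not part of the wheel): a minimal sketch of the cutoff grid search above. It times every (cutoff_n, activation_n) pair against the generated cases, so keep the case count small; the thresholds file is assumed to have been produced earlier (for example by the installer sketch) and its name is arbitrary.

from mlsort.installer import load_thresholds
from mlsort.optimize import gen_cases, optimize_cutoffs

th = load_thresholds("thresholds.json")  # assumed to exist from a prior training run
cases = gen_cases(num_samples=30, max_n=50_000, seed=7)
result = optimize_cutoffs(th, cases)
print(result["best"])   # best mean decision+sort time and the winning cutoffs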