mlsort 0.1.0__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {mlsort-0.1.0 → mlsort-0.1.1}/PKG-INFO +5 -4
  2. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort/api.py +15 -1
  3. mlsort-0.1.1/mlsort/cli_export_forest.py +47 -0
  4. mlsort-0.1.1/mlsort/fast_model.py +56 -0
  5. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort.egg-info/PKG-INFO +5 -4
  6. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort.egg-info/SOURCES.txt +2 -0
  7. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort.egg-info/entry_points.txt +1 -0
  8. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort.egg-info/requires.txt +2 -0
  9. {mlsort-0.1.0 → mlsort-0.1.1}/pyproject.toml +6 -1
  10. {mlsort-0.1.0 → mlsort-0.1.1}/LICENSE +0 -0
  11. {mlsort-0.1.0 → mlsort-0.1.1}/README.md +0 -0
  12. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort/__init__.py +0 -0
  13. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort/algorithms.py +0 -0
  14. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort/baseline.py +0 -0
  15. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort/benchmark.py +0 -0
  16. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort/cli_bench_compare.py +0 -0
  17. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort/cli_bench_install.py +0 -0
  18. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort/cli_init.py +0 -0
  19. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort/cli_optimize_cutoffs.py +0 -0
  20. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort/config.py +0 -0
  21. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort/data.py +0 -0
  22. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort/decision.py +0 -0
  23. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort/features.py +0 -0
  24. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort/installer.py +0 -0
  25. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort/model.py +0 -0
  26. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort/optimize.py +0 -0
  27. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort.egg-info/dependency_links.txt +0 -0
  28. {mlsort-0.1.0 → mlsort-0.1.1}/mlsort.egg-info/top_level.txt +0 -0
  29. {mlsort-0.1.0 → mlsort-0.1.1}/setup.cfg +0 -0
  30. {mlsort-0.1.0 → mlsort-0.1.1}/tests/test_decision.py +0 -0
  31. {mlsort-0.1.0 → mlsort-0.1.1}/tests/test_features.py +0 -0
  32. {mlsort-0.1.0 → mlsort-0.1.1}/tests/test_training_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mlsort
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: ML-guided sorting backend selector with install-time benchmarking
5
5
  Author: Siddharth Chaudhary
6
6
  License: MIT License
@@ -43,9 +43,10 @@ Requires-Python: >=3.9
43
43
  Description-Content-Type: text/markdown
44
44
  License-File: LICENSE
45
45
  Requires-Dist: numpy>=1.24
46
- Requires-Dist: scikit-learn>=1.3
47
- Requires-Dist: scipy>=1.10
48
- Requires-Dist: joblib>=1.3
46
+ Provides-Extra: train
47
+ Requires-Dist: scikit-learn>=1.3; extra == "train"
48
+ Requires-Dist: scipy>=1.10; extra == "train"
49
+ Requires-Dist: joblib>=1.3; extra == "train"
49
50
  Dynamic: license-file
50
51
 
51
52
  # mlsort
@@ -43,6 +43,10 @@ def _ensure_thresholds(path: str) -> Thresholds:
43
43
  return th
44
44
 
45
45
 
46
def _use_fast_model() -> bool:
    """Report whether the optional fast-model code path is enabled.

    Reads the ``MLSORT_USE_FAST_MODEL`` environment variable.  The values
    ``1``, ``true``, ``yes`` and ``on`` enable the fast model; matching is
    case-insensitive and ignores surrounding whitespace.  When the variable
    is unset, or holds any other value, the fast model is disabled.

    Returns:
        True when the fast model should be used, False otherwise.
    """
    raw = os.environ.get("MLSORT_USE_FAST_MODEL", "0")
    # Strip before comparing: values such as "1 " or " true\n" are easy to
    # produce via shell quoting or env files and would otherwise silently
    # leave the feature switched off.
    return raw.strip().lower() in {"1", "true", "yes", "on"}
48
+
49
+
46
50
  def select_algorithm(arr: Sequence[Any], thresholds_path: str | None = None, *, key: Any = None, reverse: bool = False) -> str:
47
51
  # Input validation
48
52
  try:
@@ -88,7 +92,17 @@ def select_algorithm(arr: Sequence[Any], thresholds_path: str | None = None, *,
88
92
  thr_path = thresholds_path or os.path.join(get_artifacts_dir(), "thresholds.json")
89
93
  os.makedirs(os.path.dirname(thr_path) or ".", exist_ok=True)
90
94
  th = _ensure_thresholds(thr_path)
91
- algo = decide(arr, th)
95
+ # Large arrays: optionally use fast model
96
+ if _use_fast_model():
97
+ try:
98
+ from .features import estimate_properties
99
+ from .fast_model import predict_fast
100
+ props = estimate_properties(arr)
101
+ algo = predict_fast(props)
102
+ except Exception:
103
+ algo = decide(arr, th)
104
+ else:
105
+ algo = decide(arr, th)
92
106
  if get_env_bool("MLSORT_DEBUG", False):
93
107
  log.debug("mlsort.select algo=%s n=%d path=%s", algo, n, thr_path)
94
108
  return algo
@@ -0,0 +1,47 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ from pathlib import Path
6
+
7
+ import joblib
8
+ from sklearn.ensemble import RandomForestClassifier
9
+
10
+ from .model import LABELS
11
+
12
+
13
def export_forest(model: RandomForestClassifier) -> dict:
    """Convert a fitted random forest into a plain, JSON-serialisable dict.

    Every estimator in ``model.estimators_`` becomes ``{"nodes": [...]}``,
    with one entry per tree node in node-index order:

    * a node whose left and right child are both ``-1`` is written as
      ``{"value": [...]}`` (the first row of that node's ``value`` array);
    * any other node is written with its ``feature`` index, ``threshold``
      and the indices of its ``left`` and ``right`` children.

    Args:
        model: Fitted forest whose trees are exported.

    Returns:
        ``{"label_names": LABELS, "trees": [...]}``.
    """
    # NOTE(review): positions in each "value" vector presumably follow the
    # model's class ordering while "label_names" is the project-wide LABELS
    # list -- confirm the two orderings agree for every trained model.

    def _encode(tree, idx: int) -> dict:
        # Describe a single node of ``tree`` as a small dict.
        lo = tree.children_left[idx]
        hi = tree.children_right[idx]
        if lo != -1 or hi != -1:
            return {
                "feature": int(tree.feature[idx]),
                "threshold": float(tree.threshold[idx]),
                "left": int(lo),
                "right": int(hi),
            }
        return {"value": tree.value[idx][0].tolist()}

    forest = [
        {"nodes": [_encode(est.tree_, idx) for idx in range(est.tree_.node_count)]}
        for est in model.estimators_
    ]
    return {"label_names": LABELS, "trees": forest}
31
+
32
+
33
def main():
    """Command-line entry point: export a joblib-saved forest to JSON.

    Parses ``--model`` (input model file) and ``--out`` (output JSON file),
    loads the model, converts it with ``export_forest``, creates the output
    directory if necessary, writes the JSON and prints a confirmation line.
    """
    parser = argparse.ArgumentParser(description="Export sklearn RandomForest to fast JSON format")
    parser.add_argument("--model", required=True, help="Path to model.joblib")
    parser.add_argument("--out", required=True, help="Path to write forest.json")
    opts = parser.parse_args()

    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code -- only point --model at files from a trusted source.
    forest: RandomForestClassifier = joblib.load(opts.model)
    exported = export_forest(forest)

    destination = Path(opts.out)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(exported))
    print(f"Wrote {opts.out}")  # noqa: T201


if __name__ == "__main__":
    main()
@@ -0,0 +1,56 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from typing import Dict, List, Optional
6
+
7
+ from .features import to_feature_vector
8
+ from .model import ID_TO_LABEL
9
+ from .config import get_artifacts_dir
10
+
11
+ _FAST_MODEL: Optional[Dict] = None
12
+ _FAST_MODEL_PATH: Optional[str] = None
13
+
14
+
15
def _get_default_fast_model_path() -> str:
    """Return the default forest path: ``forest.json`` under ``get_artifacts_dir()``."""
    artifacts_dir = get_artifacts_dir()
    return os.path.join(artifacts_dir, "forest.json")
17
+
18
+
19
def load_fast_model(path: Optional[str] = None) -> Dict:
    """Load the exported forest JSON, reusing the cached copy when possible.

    The parsed model is kept in the module-level ``_FAST_MODEL`` together
    with the path it came from (``_FAST_MODEL_PATH``).  The file is read
    again only when nothing is cached yet or a different path is requested.

    Args:
        path: JSON file to read.  When omitted or empty, the result of
            ``_get_default_fast_model_path()`` is used.

    Returns:
        The parsed model dict; repeated calls for the same path return the
        same object.

    Raises:
        OSError: The file cannot be opened.
        ValueError: The file is not valid JSON (``json.JSONDecodeError`` is
            a ``ValueError``), or its top level is not an object holding a
            list under ``"trees"``.
    """
    global _FAST_MODEL, _FAST_MODEL_PATH
    use_path = path or _get_default_fast_model_path()
    if _FAST_MODEL is None or _FAST_MODEL_PATH != use_path:
        # Explicit encoding so the read does not depend on the platform locale.
        with open(use_path, "r", encoding="utf-8") as f:
            loaded = json.load(f)
        # Validate before caching: a payload of the wrong shape would
        # otherwise be stored and only fail later, deep inside prediction.
        if not isinstance(loaded, dict) or not isinstance(loaded.get("trees"), list):
            raise ValueError(
                f"{use_path}: expected a JSON object with a 'trees' list"
            )
        _FAST_MODEL = loaded
        _FAST_MODEL_PATH = use_path
    return _FAST_MODEL  # type: ignore[return-value]
27
+
28
+
29
def _tree_predict(tree: Dict, x: List[float]) -> int:
    """Walk one exported tree and return the index of the winning class.

    Starting at node 0, a split node sends the walk to ``left`` when
    ``x[feature] <= threshold`` and to ``right`` otherwise.  A node that
    carries ``"value"`` is a leaf; the result is the position of the
    largest entry in that vector (the first one on ties).

    Args:
        tree: Mapping with a ``"nodes"`` list in the exported format.
        x: Feature vector indexed by each split node's ``feature``.

    Returns:
        Index of the largest entry in the reached leaf's ``value`` vector.

    Raises:
        IndexError: A node index (including the root of an empty tree) or a
            feature index is out of range.
        ValueError: The walk revisits nodes, i.e. the tree contains a cycle,
            or a leaf has an empty ``value`` vector.
    """
    nodes = tree["nodes"]
    total = len(nodes)
    # A root-to-leaf walk in a well-formed tree visits each node at most
    # once, so running out of this budget means the child links loop back.
    # Without the bound a corrupted file would hang the caller forever.
    budget = total
    i = 0
    while True:
        # Reject negative indices explicitly; Python would otherwise wrap
        # them around to the end of the list.
        if not 0 <= i < total:
            raise IndexError(f"tree node index {i} out of range (0..{total - 1})")
        if budget == 0:
            raise ValueError("malformed tree: cycle detected while walking nodes")
        budget -= 1
        node = nodes[i]
        if "value" in node:
            vec: List[float] = node["value"]
            return int(max(range(len(vec)), key=vec.__getitem__))
        i = node["left"] if x[node["feature"]] <= node["threshold"] else node["right"]
42
+
43
+
44
def predict_fast(props: Dict[str, float], *, model_path: Optional[str] = None) -> str:
    """Choose an algorithm label by majority vote over the exported trees.

    The model is obtained from ``load_fast_model(model_path)`` and ``props``
    is turned into a feature vector with ``to_feature_vector``.  Every tree
    in ``"trees"`` casts one vote via ``_tree_predict``; the id with the
    most votes is looked up in ``ID_TO_LABEL``.  On a tie the id that was
    voted for first wins.

    Args:
        props: Property mapping passed to ``to_feature_vector``.
        model_path: Optional model file, forwarded to ``load_fast_model``.

    Returns:
        The label for the winning id, or ``"timsort"`` when the model
        contains no trees.
    """
    forest = load_fast_model(model_path)
    features = to_feature_vector(props)
    ballots: List[int] = [_tree_predict(member, features) for member in forest["trees"]]
    if len(ballots) == 0:
        return "timsort"

    # dict.fromkeys keeps first-seen order and max() returns the first
    # maximal key, so ties resolve in favour of the earliest-voted id.
    tally: Dict[int, int] = dict.fromkeys(ballots, 0)
    for ballot in ballots:
        tally[ballot] += 1
    winner = max(tally, key=tally.__getitem__)
    return ID_TO_LABEL[int(winner)]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mlsort
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: ML-guided sorting backend selector with install-time benchmarking
5
5
  Author: Siddharth Chaudhary
6
6
  License: MIT License
@@ -43,9 +43,10 @@ Requires-Python: >=3.9
43
43
  Description-Content-Type: text/markdown
44
44
  License-File: LICENSE
45
45
  Requires-Dist: numpy>=1.24
46
- Requires-Dist: scikit-learn>=1.3
47
- Requires-Dist: scipy>=1.10
48
- Requires-Dist: joblib>=1.3
46
+ Provides-Extra: train
47
+ Requires-Dist: scikit-learn>=1.3; extra == "train"
48
+ Requires-Dist: scipy>=1.10; extra == "train"
49
+ Requires-Dist: joblib>=1.3; extra == "train"
49
50
  Dynamic: license-file
50
51
 
51
52
  # mlsort
@@ -8,11 +8,13 @@ mlsort/baseline.py
8
8
  mlsort/benchmark.py
9
9
  mlsort/cli_bench_compare.py
10
10
  mlsort/cli_bench_install.py
11
+ mlsort/cli_export_forest.py
11
12
  mlsort/cli_init.py
12
13
  mlsort/cli_optimize_cutoffs.py
13
14
  mlsort/config.py
14
15
  mlsort/data.py
15
16
  mlsort/decision.py
17
+ mlsort/fast_model.py
16
18
  mlsort/features.py
17
19
  mlsort/installer.py
18
20
  mlsort/model.py
@@ -1,5 +1,6 @@
1
1
  [console_scripts]
2
2
  mlsort-bench-compare = mlsort.cli_bench_compare:main
3
3
  mlsort-bench-install = mlsort.cli_bench_install:main
4
+ mlsort-export-forest = mlsort.cli_export_forest:main
4
5
  mlsort-init = mlsort.cli_init:main
5
6
  mlsort-optimize-cutoffs = mlsort.cli_optimize_cutoffs:main
@@ -1,4 +1,6 @@
1
1
  numpy>=1.24
2
+
3
+ [train]
2
4
  scikit-learn>=1.3
3
5
  scipy>=1.10
4
6
  joblib>=1.3
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "mlsort"
7
- version = "0.1.0"
7
+ version = "0.1.1"
8
8
  description = "ML-guided sorting backend selector with install-time benchmarking"
9
9
  authors = [{ name = "Siddharth Chaudhary" }]
10
10
  requires-python = ">=3.9"
@@ -28,6 +28,10 @@ classifiers = [
28
28
  ]
29
29
  dependencies = [
30
30
  "numpy>=1.24",
31
+ ]
32
+
33
+ [project.optional-dependencies]
34
+ train = [
31
35
  "scikit-learn>=1.3",
32
36
  "scipy>=1.10",
33
37
  "joblib>=1.3",
@@ -47,3 +51,4 @@ mlsort-bench-install = "mlsort.cli_bench_install:main"
47
51
  mlsort-bench-compare = "mlsort.cli_bench_compare:main"
48
52
  mlsort-optimize-cutoffs = "mlsort.cli_optimize_cutoffs:main"
49
53
  mlsort-init = "mlsort.cli_init:main"
54
+ mlsort-export-forest = "mlsort.cli_export_forest:main"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes