slento-mesh-optimizer 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- slento_mesh_optimizer-0.1.0/PKG-INFO +83 -0
- slento_mesh_optimizer-0.1.0/README.md +43 -0
- slento_mesh_optimizer-0.1.0/mesh/__init__.py +2 -0
- slento_mesh_optimizer-0.1.0/mesh/_version.py +29 -0
- slento_mesh_optimizer-0.1.0/mesh/agent/__init__.py +0 -0
- slento_mesh_optimizer-0.1.0/mesh/agent/dataset_tracker.py +54 -0
- slento_mesh_optimizer-0.1.0/mesh/agent/hardware_scanner.py +372 -0
- slento_mesh_optimizer-0.1.0/mesh/agent/health_reporter.py +147 -0
- slento_mesh_optimizer-0.1.0/mesh/agent/job_executor.py +254 -0
- slento_mesh_optimizer-0.1.0/mesh/agent/network_probe.py +610 -0
- slento_mesh_optimizer-0.1.0/mesh/agent/node_agent.py +302 -0
- slento_mesh_optimizer-0.1.0/mesh/agent/probe_runner.py +113 -0
- slento_mesh_optimizer-0.1.0/mesh/agent/storage_probe.py +571 -0
- slento_mesh_optimizer-0.1.0/mesh/agent/system_optimizer.py +988 -0
- slento_mesh_optimizer-0.1.0/mesh/api/__init__.py +0 -0
- slento_mesh_optimizer-0.1.0/mesh/api/agent_api.py +349 -0
- slento_mesh_optimizer-0.1.0/mesh/api/controller_api.py +574 -0
- slento_mesh_optimizer-0.1.0/mesh/codex/__init__.py +1 -0
- slento_mesh_optimizer-0.1.0/mesh/codex/analyzer.py +290 -0
- slento_mesh_optimizer-0.1.0/mesh/codex/api.py +97 -0
- slento_mesh_optimizer-0.1.0/mesh/codex/base_agent.py +146 -0
- slento_mesh_optimizer-0.1.0/mesh/codex/diagnostics.py +316 -0
- slento_mesh_optimizer-0.1.0/mesh/codex/models.py +89 -0
- slento_mesh_optimizer-0.1.0/mesh/codex/orchestrator.py +269 -0
- slento_mesh_optimizer-0.1.0/mesh/codex/test_gen.py +354 -0
- slento_mesh_optimizer-0.1.0/mesh/codex/tuner.py +245 -0
- slento_mesh_optimizer-0.1.0/mesh/config.py +178 -0
- slento_mesh_optimizer-0.1.0/mesh/dashboard/__init__.py +0 -0
- slento_mesh_optimizer-0.1.0/mesh/dashboard/app.py +246 -0
- slento_mesh_optimizer-0.1.0/mesh/db/__init__.py +0 -0
- slento_mesh_optimizer-0.1.0/mesh/db/mesh_db.py +307 -0
- slento_mesh_optimizer-0.1.0/mesh/errors.py +91 -0
- slento_mesh_optimizer-0.1.0/mesh/licensing/__init__.py +1 -0
- slento_mesh_optimizer-0.1.0/mesh/licensing/client.py +188 -0
- slento_mesh_optimizer-0.1.0/mesh/licensing/enforcement.py +230 -0
- slento_mesh_optimizer-0.1.0/mesh/licensing/keygen.py +58 -0
- slento_mesh_optimizer-0.1.0/mesh/licensing/models.py +161 -0
- slento_mesh_optimizer-0.1.0/mesh/licensing/server.py +415 -0
- slento_mesh_optimizer-0.1.0/mesh/licensing/telemetry.py +158 -0
- slento_mesh_optimizer-0.1.0/mesh/models.py +342 -0
- slento_mesh_optimizer-0.1.0/mesh/net/__init__.py +3 -0
- slento_mesh_optimizer-0.1.0/mesh/net/client.py +149 -0
- slento_mesh_optimizer-0.1.0/mesh/scripts/__init__.py +0 -0
- slento_mesh_optimizer-0.1.0/mesh/scripts/license_admin.py +275 -0
- slento_mesh_optimizer-0.1.0/mesh/scripts/meshrun.py +395 -0
- slento_mesh_optimizer-0.1.0/mesh/scripts/self_update.py +285 -0
- slento_mesh_optimizer-0.1.0/mesh/scripts/start_agent.py +55 -0
- slento_mesh_optimizer-0.1.0/mesh/scripts/start_controller.py +103 -0
- slento_mesh_optimizer-0.1.0/mesh/security/__init__.py +18 -0
- slento_mesh_optimizer-0.1.0/mesh/security/audit.py +220 -0
- slento_mesh_optimizer-0.1.0/mesh/security/auth.py +148 -0
- slento_mesh_optimizer-0.1.0/mesh/security/tls.py +112 -0
- slento_mesh_optimizer-0.1.0/mesh/security/tokens.py +154 -0
- slento_mesh_optimizer-0.1.0/mesh/security/validation.py +377 -0
- slento_mesh_optimizer-0.1.0/mesh/updates/__init__.py +1 -0
- slento_mesh_optimizer-0.1.0/mesh/updates/api.py +230 -0
- slento_mesh_optimizer-0.1.0/mesh/updates/distributor.py +208 -0
- slento_mesh_optimizer-0.1.0/mesh/updates/manager.py +377 -0
- slento_mesh_optimizer-0.1.0/mesh/updates/models.py +53 -0
- slento_mesh_optimizer-0.1.0/mesh/updates/server.py +161 -0
- slento_mesh_optimizer-0.1.0/mesh/vault/__init__.py +5 -0
- slento_mesh_optimizer-0.1.0/mesh/vault/cli.py +486 -0
- slento_mesh_optimizer-0.1.0/mesh/vault/crypto.py +147 -0
- slento_mesh_optimizer-0.1.0/mesh/vault/mcp_server.py +382 -0
- slento_mesh_optimizer-0.1.0/mesh/vault/models.py +94 -0
- slento_mesh_optimizer-0.1.0/mesh/vault/policies.py +374 -0
- slento_mesh_optimizer-0.1.0/mesh/vault/store.py +455 -0
- slento_mesh_optimizer-0.1.0/pyproject.toml +56 -0
- slento_mesh_optimizer-0.1.0/setup.cfg +4 -0
- slento_mesh_optimizer-0.1.0/slento_mesh_optimizer.egg-info/PKG-INFO +83 -0
- slento_mesh_optimizer-0.1.0/slento_mesh_optimizer.egg-info/SOURCES.txt +73 -0
- slento_mesh_optimizer-0.1.0/slento_mesh_optimizer.egg-info/dependency_links.txt +1 -0
- slento_mesh_optimizer-0.1.0/slento_mesh_optimizer.egg-info/entry_points.txt +2 -0
- slento_mesh_optimizer-0.1.0/slento_mesh_optimizer.egg-info/requires.txt +16 -0
- slento_mesh_optimizer-0.1.0/slento_mesh_optimizer.egg-info/top_level.txt +2 -0
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: slento-mesh-optimizer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Distributed hardware optimization agent — JEPA-driven job routing, GPU/CPU/FPGA probing, and performance tuning across heterogeneous compute clusters.
|
|
5
|
+
Author-email: Slento Systems <support@slentosystems.com>
|
|
6
|
+
License: Proprietary
|
|
7
|
+
Project-URL: Homepage, https://slentosystems.com
|
|
8
|
+
Project-URL: Documentation, https://docs.slentosystems.com
|
|
9
|
+
Project-URL: Repository, https://github.com/slentoai/mesh-optimizer
|
|
10
|
+
Project-URL: Portal, https://portal.slentosystems.com
|
|
11
|
+
Keywords: gpu,optimization,mesh,distributed,jepa,hardware,profiling
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: System Administrators
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: System :: Hardware
|
|
22
|
+
Classifier: Topic :: System :: Monitoring
|
|
23
|
+
Classifier: Topic :: System :: Systems Administration
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
Requires-Dist: aiohttp>=3.9
|
|
27
|
+
Requires-Dist: fastapi>=0.109
|
|
28
|
+
Requires-Dist: httpx>=0.25
|
|
29
|
+
Requires-Dist: numpy>=1.24
|
|
30
|
+
Requires-Dist: psutil>=5.9
|
|
31
|
+
Requires-Dist: pydantic>=2.0
|
|
32
|
+
Requires-Dist: pyyaml>=6.0
|
|
33
|
+
Requires-Dist: uvicorn[standard]>=0.25
|
|
34
|
+
Provides-Extra: ml
|
|
35
|
+
Requires-Dist: torch>=2.0; extra == "ml"
|
|
36
|
+
Provides-Extra: dev
|
|
37
|
+
Requires-Dist: pytest; extra == "dev"
|
|
38
|
+
Requires-Dist: pytest-asyncio; extra == "dev"
|
|
39
|
+
Requires-Dist: httpx; extra == "dev"
|
|
40
|
+
|
|
41
|
+
# Mesh Optimizer
|
|
42
|
+
|
|
43
|
+
Distributed hardware optimization agent by [Slento Systems](https://slentosystems.com).
|
|
44
|
+
|
|
45
|
+
JEPA-driven job routing, GPU/CPU/FPGA probing, and performance tuning across heterogeneous compute clusters.
|
|
46
|
+
|
|
47
|
+
## Quick Start
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install slento-mesh-optimizer
|
|
51
|
+
mesh-optimizer start
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## One-Line Install (Linux)
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
curl -fsSL https://mesh.slentosystems.com/install.sh | bash
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Features
|
|
61
|
+
|
|
62
|
+
- **Hardware Discovery** — Auto-detects GPUs (AMD, NVIDIA), CPUs, FPGAs, memory subsystems
|
|
63
|
+
- **Performance Probing** — Runs targeted benchmarks to map hardware capabilities
|
|
64
|
+
- **JEPA Optimization** — Joint Embedding Predictive Architecture learns your hardware's performance characteristics
|
|
65
|
+
- **Job Routing** — Automatically routes compute jobs to the best-suited hardware
|
|
66
|
+
- **NAT/WAN Support** — Works behind firewalls with outbound-only connections
|
|
67
|
+
- **Multi-Platform** — Linux, macOS, Windows
|
|
68
|
+
|
|
69
|
+
## Supported Hardware
|
|
70
|
+
|
|
71
|
+
- AMD Radeon RX 7000 (RDNA3), Instinct MI200/MI300 (CDNA)
|
|
72
|
+
- NVIDIA GeForce RTX 30/40/50, Quadro, Tesla, A100
|
|
73
|
+
- Intel/AMD CPUs (10th gen+, Ryzen/EPYC)
|
|
74
|
+
- Apple M1/M2/M3/M4
|
|
75
|
+
- Xilinx Alveo/UltraScale+ FPGAs
|
|
76
|
+
|
|
77
|
+
## Documentation
|
|
78
|
+
|
|
79
|
+
Full docs at [docs.slentosystems.com](https://docs.slentosystems.com)
|
|
80
|
+
|
|
81
|
+
## License
|
|
82
|
+
|
|
83
|
+
Proprietary. Free community tier available at [portal.slentosystems.com](https://portal.slentosystems.com).
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Mesh Optimizer
|
|
2
|
+
|
|
3
|
+
Distributed hardware optimization agent by [Slento Systems](https://slentosystems.com).
|
|
4
|
+
|
|
5
|
+
JEPA-driven job routing, GPU/CPU/FPGA probing, and performance tuning across heterogeneous compute clusters.
|
|
6
|
+
|
|
7
|
+
## Quick Start
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install slento-mesh-optimizer
|
|
11
|
+
mesh-optimizer start
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## One-Line Install (Linux)
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
curl -fsSL https://mesh.slentosystems.com/install.sh | bash
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Features
|
|
21
|
+
|
|
22
|
+
- **Hardware Discovery** — Auto-detects GPUs (AMD, NVIDIA), CPUs, FPGAs, memory subsystems
|
|
23
|
+
- **Performance Probing** — Runs targeted benchmarks to map hardware capabilities
|
|
24
|
+
- **JEPA Optimization** — Joint Embedding Predictive Architecture learns your hardware's performance characteristics
|
|
25
|
+
- **Job Routing** — Automatically routes compute jobs to the best-suited hardware
|
|
26
|
+
- **NAT/WAN Support** — Works behind firewalls with outbound-only connections
|
|
27
|
+
- **Multi-Platform** — Linux, macOS, Windows
|
|
28
|
+
|
|
29
|
+
## Supported Hardware
|
|
30
|
+
|
|
31
|
+
- AMD Radeon RX 7000 (RDNA3), Instinct MI200/MI300 (CDNA)
|
|
32
|
+
- NVIDIA GeForce RTX 30/40/50, Quadro, Tesla, A100
|
|
33
|
+
- Intel/AMD CPUs (10th gen+, Ryzen/EPYC)
|
|
34
|
+
- Apple M1/M2/M3/M4
|
|
35
|
+
- Xilinx Alveo/UltraScale+ FPGAs
|
|
36
|
+
|
|
37
|
+
## Documentation
|
|
38
|
+
|
|
39
|
+
Full docs at [docs.slentosystems.com](https://docs.slentosystems.com)
|
|
40
|
+
|
|
41
|
+
## License
|
|
42
|
+
|
|
43
|
+
Proprietary. Free community tier available at [portal.slentosystems.com](https://portal.slentosystems.com).
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Single source of truth for mesh-optimizer version."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import sys
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
__version__ = "0.1.0"


def get_version_info() -> dict:
    """Collect version metadata: package version, build timestamp, Python, git SHA."""
    git_sha = None
    try:
        import subprocess

        proc = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            capture_output=True, text=True, timeout=5,
        )
        if proc.returncode == 0:
            git_sha = proc.stdout.strip()
    except Exception:
        # Best effort: not a git checkout, git not installed, or the call timed out.
        pass
    return {
        "version": __version__,
        "build_date": datetime.now(timezone.utc).isoformat(),
        "python_version": sys.version,
        "git_sha": git_sha,
    }
|
|
File without changes
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Track which datasets are cached locally on this node."""
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DatasetTracker:
    """Persist a small on-disk registry of datasets cached on this node.

    The registry is a JSON file under *cache_dir* (or /tmp when unset);
    every mutation is written back immediately, best-effort.
    """

    def __init__(self, cache_dir: str = ""):
        self.cache_dir = cache_dir
        base = Path(cache_dir or "/tmp")
        self._registry_path = base / "mesh_dataset_registry.json"
        self._datasets: Dict[str, dict] = {}
        self._load()

    def _load(self):
        """Read the registry file if present; fall back to empty on parse errors."""
        if not self._registry_path.exists():
            return
        try:
            self._datasets = json.loads(self._registry_path.read_text())
        except Exception as e:
            logger.warning("Failed to load dataset registry: %s", e)
            self._datasets = {}

    def _save(self):
        """Write the registry to disk, creating parent dirs; never raises."""
        try:
            self._registry_path.parent.mkdir(parents=True, exist_ok=True)
            payload = json.dumps(self._datasets, indent=2)
            self._registry_path.write_text(payload)
        except Exception as e:
            logger.warning("Failed to save dataset registry: %s", e)

    def register(self, dataset_id: str, path: str, size_mb: float = 0.0):
        """Record a locally cached dataset and persist the registry."""
        entry = {
            "path": path,
            "size_mb": size_mb,
            "registered_at": datetime.now(timezone.utc).isoformat(),
        }
        self._datasets[dataset_id] = entry
        self._save()
        logger.info("Dataset registered: %s at %s (%.1f MB)", dataset_id, path, size_mb)

    def unregister(self, dataset_id: str):
        """Drop *dataset_id* from the registry if present and persist."""
        try:
            del self._datasets[dataset_id]
        except KeyError:
            return
        self._save()

    def list_ids(self) -> List[str]:
        """Return the ids of all registered datasets."""
        return list(self._datasets)

    def get(self, dataset_id: str) -> Optional[dict]:
        """Return the registry entry for *dataset_id*, or None if unknown."""
        return self._datasets.get(dataset_id)

    def list_all(self) -> Dict[str, dict]:
        """Return a shallow copy of the whole registry."""
        return dict(self._datasets)
|
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
"""Auto-discovery of hardware on a mesh node."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
import multiprocessing
|
|
6
|
+
import os
|
|
7
|
+
import platform
|
|
8
|
+
import re
|
|
9
|
+
import shutil
|
|
10
|
+
import subprocess
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import psutil
|
|
14
|
+
|
|
15
|
+
from mesh.models import CPUInfo, FPGAInfo, GPUInfo, HardwareInventory, MountInfo
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
_SLENTO_REQUIRED_LIBS = [
|
|
20
|
+
"pytorch_compat/libcudart.so.12",
|
|
21
|
+
"pytorch_compat/libcublas.so.12",
|
|
22
|
+
"pytorch_compat/libcublasLt.so.12",
|
|
23
|
+
"pytorch_compat/libcudnn.so.9",
|
|
24
|
+
"pytorch_compat/libcuda.so.1",
|
|
25
|
+
]
|
|
26
|
+
_SLENTO_GPU_ARCH_MAP = {
|
|
27
|
+
"7900 XTX": "gfx1100",
|
|
28
|
+
"7900 XT": "gfx1100",
|
|
29
|
+
"MI100": "gfx908",
|
|
30
|
+
"MI250": "gfx90a",
|
|
31
|
+
"MI300": "gfx942",
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def scan_hardware(hostname_override: str = "", config=None) -> HardwareInventory:
    """Build a full hardware inventory for this machine.

    Combines CPU/GPU/FPGA probes, filesystem mounts, framework availability
    flags, and Slento CUDA-compat detection into one HardwareInventory.
    """
    total_mb = psutil.virtual_memory().total // (1024 * 1024)
    inv = HardwareInventory(
        hostname=hostname_override or platform.node(),
        platform=platform.system(),
        memory_total_mb=total_mb,
    )

    # Device probes.
    inv.cpu = _detect_cpu()
    inv.gpus = _detect_gpus()
    inv.fpgas = _detect_fpgas()

    # Toolchain / framework availability.
    inv.has_pytorch = _check_pytorch()
    inv.has_rocm = shutil.which("rocm-smi") is not None
    inv.has_cuda = shutil.which("nvidia-smi") is not None

    # Storage.
    inv.mounts = _detect_mounts()
    inv.scratch_paths = _get_config_value(config, "scratch_paths", []) or []

    # Slento CUDA-compat layer.
    slento = _detect_slento_cuda(config=config)
    inv.has_slento_cuda = slento["has_slento_cuda"]
    inv.slento_cuda_version = slento["slento_cuda_version"]
    inv.slento_cuda_gpu_archs = slento["slento_cuda_gpu_archs"]
    inv.slento_cuda_lib_path = slento["slento_cuda_lib_path"]
    return inv
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _get_config_value(config, key: str, default=None):
|
|
60
|
+
if config is None:
|
|
61
|
+
return default
|
|
62
|
+
if isinstance(config, dict):
|
|
63
|
+
if key in config:
|
|
64
|
+
return config[key]
|
|
65
|
+
node_cfg = config.get("node")
|
|
66
|
+
if isinstance(node_cfg, dict) and key in node_cfg:
|
|
67
|
+
return node_cfg[key]
|
|
68
|
+
return default
|
|
69
|
+
if hasattr(config, key):
|
|
70
|
+
return getattr(config, key)
|
|
71
|
+
node_cfg = getattr(config, "node", None)
|
|
72
|
+
if node_cfg is not None and hasattr(node_cfg, key):
|
|
73
|
+
return getattr(node_cfg, key)
|
|
74
|
+
return default
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _detect_cpu() -> CPUInfo:
    """Probe CPU topology, model name, frequency, and cache sizes.

    Best-effort: every probe is wrapped so a failure degrades to defaults
    rather than raising.
    """
    info = CPUInfo(cores=multiprocessing.cpu_count() or 1)

    # Model name (Linux: /proc/cpuinfo; macOS: sysctl).
    if platform.system() == "Linux":
        try:
            with open("/proc/cpuinfo") as f:
                for line in f:
                    if "model name" in line:
                        info.model = line.split(":")[1].strip()
                        break
        except Exception:
            pass
    elif platform.system() == "Darwin":
        try:
            info.model = subprocess.check_output(
                ["sysctl", "-n", "machdep.cpu.brand_string"],
                text=True, stderr=subprocess.DEVNULL
            ).strip()
        except Exception:
            pass

    # Topology and frequency via lscpu (Linux; fails harmlessly elsewhere).
    try:
        out = subprocess.check_output("lscpu", text=True, stderr=subprocess.DEVNULL)
        cores_per = 1
        sockets = 1
        for line in out.splitlines():
            if "Core(s) per socket:" in line:
                cores_per = int(line.split(":")[1].strip())
            elif "Socket(s):" in line:
                sockets = int(line.split(":")[1].strip())
            elif "Thread(s) per core:" in line:
                info.threads_per_core = int(line.split(":")[1].strip())
            elif "NUMA node(s):" in line:
                info.numa_nodes = int(line.split(":")[1].strip())
            elif "CPU max MHz:" in line:
                info.freq_mhz = float(line.split(":")[1].strip())
            elif "CPU MHz:" in line and info.freq_mhz == 0:
                info.freq_mhz = float(line.split(":")[1].strip())
        info.physical_cores = cores_per * sockets
    except Exception:
        # Assume SMT-2, but never report fewer than one physical core
        # (the old `cores // 2` fallback yielded 0 on single-core hosts).
        info.physical_cores = max(1, info.cores // 2)

    # L1d/L2/L3 cache sizes from sysfs (Linux). Path.read_text avoids the
    # previous leaked file handles from bare open(...).read() calls.
    try:
        for idx in range(10):
            cache_dir = Path(f"/sys/devices/system/cpu/cpu0/cache/index{idx}")
            if not cache_dir.exists():
                break
            level = (cache_dir / "level").read_text().strip()
            ctype = (cache_dir / "type").read_text().strip()
            size_str = (cache_dir / "size").read_text().strip()
            m = re.match(r"(\d+)", size_str)
            if not m:
                # Unparseable size string: skip this index but keep scanning
                # (previously this raised and aborted the whole cache walk).
                continue
            size_kb = int(m.group(1))
            if "M" in size_str:
                size_kb *= 1024
            if level == "1" and "Data" in ctype:
                info.l1d_kb = size_kb
            elif level == "2":
                info.l2_kb = size_kb
            elif level == "3":
                info.l3_kb = size_kb
    except Exception:
        pass
    return info
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _detect_gpus() -> list[GPUInfo]:
    """Return every GPU found on this host: NVIDIA first, then AMD."""
    return [*_detect_nvidia_gpus(), *_detect_amd_gpus()]
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _detect_mounts() -> list[MountInfo]:
    """Enumerate mounted filesystems with total/free space via psutil."""
    found: list[MountInfo] = []
    try:
        for part in psutil.disk_partitions(all=False):
            try:
                usage = psutil.disk_usage(part.mountpoint)
            except Exception:
                # Unreadable mount (permissions, stale network fs, ...): skip.
                continue
            found.append(MountInfo(
                mountpoint=part.mountpoint,
                device=part.device,
                fstype=part.fstype,
                total_mb=int(usage.total // (1024 * 1024)),
                free_mb=int(usage.free // (1024 * 1024)),
            ))
    except Exception as e:
        logger.debug("Mount detection failed: %s", e)
    return found
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _detect_nvidia_gpus() -> list[GPUInfo]:
    """Query nvidia-smi for NVIDIA GPUs; returns [] when the tool is absent or fails."""
    if not shutil.which("nvidia-smi"):
        return []
    found: list[GPUInfo] = []
    cmd = [
        "nvidia-smi",
        "--query-gpu=name,memory.total,driver_version,temperature.gpu,utilization.gpu,memory.used",
        "--format=csv,noheader,nounits",
    ]
    try:
        out = subprocess.check_output(
            cmd, text=True, stderr=subprocess.DEVNULL, timeout=10
        )
        for row in out.strip().splitlines():
            fields = [f.strip() for f in row.split(",")]
            if len(fields) < 6:
                continue
            name, mem_total, driver, temp, util, mem_used = fields[:6]
            found.append(GPUInfo(
                name=name,
                vendor="nvidia",
                vram_mb=int(float(mem_total)),
                driver=driver,
                temperature_c=float(temp) if temp != "N/A" else 0.0,
                utilization_pct=float(util) if util != "N/A" else 0.0,
                memory_used_mb=int(float(mem_used)) if mem_used != "N/A" else 0,
            ))
    except Exception as e:
        logger.warning("nvidia-smi failed: %s", e)
    return found
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _detect_amd_gpus() -> list[GPUInfo]:
    """Detect AMD GPUs via rocm-smi JSON output, falling back to sysfs."""
    if not shutil.which("rocm-smi"):
        return []
    found: list[GPUInfo] = []
    try:
        raw = subprocess.check_output(
            ["rocm-smi", "--showproductname", "--showmeminfo", "vram",
             "--showtemp", "--showuse", "--json"],
            text=True, stderr=subprocess.DEVNULL, timeout=10
        )
        import json
        for card_id, props in json.loads(raw).items():
            # rocm-smi keys per-GPU entries as "card0", "card1", ...
            if not card_id.startswith("card"):
                continue
            gpu = GPUInfo(vendor="amd")
            # Key casing varies across rocm-smi releases.
            gpu.name = props.get("Card Series", props.get("Card series", "AMD GPU"))
            total_b = props.get("VRAM Total Memory (B)", 0)
            if total_b:
                gpu.vram_mb = int(total_b) // (1024 * 1024)
            used_b = props.get("VRAM Total Used Memory (B)", 0)
            if used_b:
                gpu.memory_used_mb = int(used_b) // (1024 * 1024)
            edge_temp = props.get("Temperature (Sensor edge) (C)", 0)
            if edge_temp:
                gpu.temperature_c = float(edge_temp)
            busy = props.get("GPU use (%)", 0)
            if busy:
                gpu.utilization_pct = float(busy)
            found.append(gpu)
    except Exception as e:
        logger.warning("rocm-smi failed: %s", e)

    if found:
        return found

    # Fallback: look for AMD cards directly under /sys/class/drm.
    try:
        for card in sorted(Path("/sys/class/drm").glob("card[0-9]*")):
            device = card / "device"
            vendor_file = device / "vendor"
            if not vendor_file.exists():
                continue
            if vendor_file.read_text().strip() != "0x1002":  # 0x1002 = AMD PCI vendor id
                continue
            product = device / "product_name"
            name = product.read_text().strip() if product.exists() else "AMD GPU"
            mem_file = device / "mem_info_vram_total"
            vram = 0
            if mem_file.exists():
                vram = int(mem_file.read_text().strip()) // (1024 * 1024)
            found.append(GPUInfo(name=name, vendor="amd", vram_mb=vram))
    except Exception:
        pass
    return found
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def _detect_fpgas() -> list[FPGAInfo]:
|
|
256
|
+
fpgas = []
|
|
257
|
+
try:
|
|
258
|
+
out = subprocess.check_output(
|
|
259
|
+
["lspci", "-nn"], text=True, stderr=subprocess.DEVNULL, timeout=5
|
|
260
|
+
)
|
|
261
|
+
for line in out.splitlines():
|
|
262
|
+
lower = line.lower()
|
|
263
|
+
if "xilinx" in lower or "altera" in lower or "intel.*fpga" in lower:
|
|
264
|
+
bdf = line.split()[0]
|
|
265
|
+
vendor = "xilinx" if "xilinx" in lower else "altera/intel"
|
|
266
|
+
name = line.split(": ", 1)[1] if ": " in line else "FPGA"
|
|
267
|
+
# Check driver
|
|
268
|
+
driver = ""
|
|
269
|
+
driver_link = Path(f"/sys/bus/pci/devices/0000:{bdf}/driver")
|
|
270
|
+
if driver_link.is_symlink():
|
|
271
|
+
driver = driver_link.resolve().name
|
|
272
|
+
fpgas.append(FPGAInfo(
|
|
273
|
+
name=name, vendor=vendor, pcie_bdf=bdf, driver=driver
|
|
274
|
+
))
|
|
275
|
+
except Exception:
|
|
276
|
+
pass
|
|
277
|
+
return fpgas
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def _check_pytorch() -> bool:
|
|
281
|
+
try:
|
|
282
|
+
import torch
|
|
283
|
+
return True
|
|
284
|
+
except ImportError:
|
|
285
|
+
return False
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _detect_slento_cuda(config=None) -> dict[str, object]:
    """Probe for the Slento CUDA-compat layer.

    Returns a dict with has_slento_cuda / version / gpu archs / lib path;
    all fields stay falsy unless every prerequisite (Linux, rocm-smi, an AMD
    GPU, the runner script, and the compat libraries) is present.
    """
    detected: dict[str, object] = {
        "has_slento_cuda": False,
        "slento_cuda_version": "",
        "slento_cuda_gpu_archs": [],
        "slento_cuda_lib_path": "",
    }

    # Prerequisite chain: bail out at the first missing piece.
    if platform.system() != "Linux" or not shutil.which("rocm-smi"):
        return detected
    if not _detect_amd_gpus():
        return detected
    script_path = _find_slento_script(config=config)
    if script_path is None:
        return detected

    base_dir = script_path.parent
    missing = [rel for rel in _SLENTO_REQUIRED_LIBS if not (base_dir / rel).exists()]
    if missing:
        logger.debug("Slento CUDA candidate missing libs under %s: %s", base_dir, missing)
        return detected

    detected["has_slento_cuda"] = True
    detected["slento_cuda_version"] = _detect_slento_version(base_dir)
    detected["slento_cuda_gpu_archs"] = _detect_slento_gpu_archs()
    detected["slento_cuda_lib_path"] = str(base_dir / "pytorch_compat")
    return detected
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def _find_slento_script(config=None) -> Path | None:
    """Locate cuda_emulator_run.sh: configured path first, then known dirs, then PATH."""
    candidates: list[Path] = []

    configured = _get_config_value(config, "slento_cuda_path", "")
    if configured:
        cfg_path = Path(configured).expanduser()
        # Accept either the script itself or the directory that contains it.
        if cfg_path.name != "cuda_emulator_run.sh":
            cfg_path = cfg_path / "cuda_emulator_run.sh"
        candidates.append(cfg_path)

    # Well-known install locations.
    candidates.append(
        Path("~/CascadeProjects/rocm-cuda-compat/build/cuda_emulator_run.sh").expanduser()
    )
    candidates.append(Path("/opt/slento-cuda/cuda_emulator_run.sh"))

    which_path = shutil.which("cuda_emulator_run.sh")
    if which_path:
        candidates.append(Path(which_path))

    for candidate in candidates:
        if candidate.exists() and candidate.is_file():
            return candidate.resolve()
    return None
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def _detect_slento_version(base_dir: Path) -> str:
|
|
346
|
+
for version_file in ("VERSION", "version.txt"):
|
|
347
|
+
path = base_dir / version_file
|
|
348
|
+
if path.exists():
|
|
349
|
+
try:
|
|
350
|
+
return path.read_text().strip()
|
|
351
|
+
except Exception:
|
|
352
|
+
pass
|
|
353
|
+
if re.search(r"slento", str(base_dir), re.IGNORECASE):
|
|
354
|
+
return base_dir.name
|
|
355
|
+
return "detected"
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def _detect_slento_gpu_archs() -> list[str]:
    """Map rocm-smi product names to gfx arch strings (deduplicated, map order)."""
    archs: list[str] = []
    try:
        product_out = subprocess.check_output(
            ["rocm-smi", "--showproductname"],
            text=True,
            stderr=subprocess.DEVNULL,
            timeout=10,
        )
        for product_name, arch in _SLENTO_GPU_ARCH_MAP.items():
            if product_name in product_out and arch not in archs:
                archs.append(arch)
    except Exception as e:
        logger.debug("rocm-smi product lookup failed for Slento CUDA detection: %s", e)
    return archs