slento-mesh-optimizer 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. slento_mesh_optimizer-0.1.0/PKG-INFO +83 -0
  2. slento_mesh_optimizer-0.1.0/README.md +43 -0
  3. slento_mesh_optimizer-0.1.0/mesh/__init__.py +2 -0
  4. slento_mesh_optimizer-0.1.0/mesh/_version.py +29 -0
  5. slento_mesh_optimizer-0.1.0/mesh/agent/__init__.py +0 -0
  6. slento_mesh_optimizer-0.1.0/mesh/agent/dataset_tracker.py +54 -0
  7. slento_mesh_optimizer-0.1.0/mesh/agent/hardware_scanner.py +372 -0
  8. slento_mesh_optimizer-0.1.0/mesh/agent/health_reporter.py +147 -0
  9. slento_mesh_optimizer-0.1.0/mesh/agent/job_executor.py +254 -0
  10. slento_mesh_optimizer-0.1.0/mesh/agent/network_probe.py +610 -0
  11. slento_mesh_optimizer-0.1.0/mesh/agent/node_agent.py +302 -0
  12. slento_mesh_optimizer-0.1.0/mesh/agent/probe_runner.py +113 -0
  13. slento_mesh_optimizer-0.1.0/mesh/agent/storage_probe.py +571 -0
  14. slento_mesh_optimizer-0.1.0/mesh/agent/system_optimizer.py +988 -0
  15. slento_mesh_optimizer-0.1.0/mesh/api/__init__.py +0 -0
  16. slento_mesh_optimizer-0.1.0/mesh/api/agent_api.py +349 -0
  17. slento_mesh_optimizer-0.1.0/mesh/api/controller_api.py +574 -0
  18. slento_mesh_optimizer-0.1.0/mesh/codex/__init__.py +1 -0
  19. slento_mesh_optimizer-0.1.0/mesh/codex/analyzer.py +290 -0
  20. slento_mesh_optimizer-0.1.0/mesh/codex/api.py +97 -0
  21. slento_mesh_optimizer-0.1.0/mesh/codex/base_agent.py +146 -0
  22. slento_mesh_optimizer-0.1.0/mesh/codex/diagnostics.py +316 -0
  23. slento_mesh_optimizer-0.1.0/mesh/codex/models.py +89 -0
  24. slento_mesh_optimizer-0.1.0/mesh/codex/orchestrator.py +269 -0
  25. slento_mesh_optimizer-0.1.0/mesh/codex/test_gen.py +354 -0
  26. slento_mesh_optimizer-0.1.0/mesh/codex/tuner.py +245 -0
  27. slento_mesh_optimizer-0.1.0/mesh/config.py +178 -0
  28. slento_mesh_optimizer-0.1.0/mesh/dashboard/__init__.py +0 -0
  29. slento_mesh_optimizer-0.1.0/mesh/dashboard/app.py +246 -0
  30. slento_mesh_optimizer-0.1.0/mesh/db/__init__.py +0 -0
  31. slento_mesh_optimizer-0.1.0/mesh/db/mesh_db.py +307 -0
  32. slento_mesh_optimizer-0.1.0/mesh/errors.py +91 -0
  33. slento_mesh_optimizer-0.1.0/mesh/licensing/__init__.py +1 -0
  34. slento_mesh_optimizer-0.1.0/mesh/licensing/client.py +188 -0
  35. slento_mesh_optimizer-0.1.0/mesh/licensing/enforcement.py +230 -0
  36. slento_mesh_optimizer-0.1.0/mesh/licensing/keygen.py +58 -0
  37. slento_mesh_optimizer-0.1.0/mesh/licensing/models.py +161 -0
  38. slento_mesh_optimizer-0.1.0/mesh/licensing/server.py +415 -0
  39. slento_mesh_optimizer-0.1.0/mesh/licensing/telemetry.py +158 -0
  40. slento_mesh_optimizer-0.1.0/mesh/models.py +342 -0
  41. slento_mesh_optimizer-0.1.0/mesh/net/__init__.py +3 -0
  42. slento_mesh_optimizer-0.1.0/mesh/net/client.py +149 -0
  43. slento_mesh_optimizer-0.1.0/mesh/scripts/__init__.py +0 -0
  44. slento_mesh_optimizer-0.1.0/mesh/scripts/license_admin.py +275 -0
  45. slento_mesh_optimizer-0.1.0/mesh/scripts/meshrun.py +395 -0
  46. slento_mesh_optimizer-0.1.0/mesh/scripts/self_update.py +285 -0
  47. slento_mesh_optimizer-0.1.0/mesh/scripts/start_agent.py +55 -0
  48. slento_mesh_optimizer-0.1.0/mesh/scripts/start_controller.py +103 -0
  49. slento_mesh_optimizer-0.1.0/mesh/security/__init__.py +18 -0
  50. slento_mesh_optimizer-0.1.0/mesh/security/audit.py +220 -0
  51. slento_mesh_optimizer-0.1.0/mesh/security/auth.py +148 -0
  52. slento_mesh_optimizer-0.1.0/mesh/security/tls.py +112 -0
  53. slento_mesh_optimizer-0.1.0/mesh/security/tokens.py +154 -0
  54. slento_mesh_optimizer-0.1.0/mesh/security/validation.py +377 -0
  55. slento_mesh_optimizer-0.1.0/mesh/updates/__init__.py +1 -0
  56. slento_mesh_optimizer-0.1.0/mesh/updates/api.py +230 -0
  57. slento_mesh_optimizer-0.1.0/mesh/updates/distributor.py +208 -0
  58. slento_mesh_optimizer-0.1.0/mesh/updates/manager.py +377 -0
  59. slento_mesh_optimizer-0.1.0/mesh/updates/models.py +53 -0
  60. slento_mesh_optimizer-0.1.0/mesh/updates/server.py +161 -0
  61. slento_mesh_optimizer-0.1.0/mesh/vault/__init__.py +5 -0
  62. slento_mesh_optimizer-0.1.0/mesh/vault/cli.py +486 -0
  63. slento_mesh_optimizer-0.1.0/mesh/vault/crypto.py +147 -0
  64. slento_mesh_optimizer-0.1.0/mesh/vault/mcp_server.py +382 -0
  65. slento_mesh_optimizer-0.1.0/mesh/vault/models.py +94 -0
  66. slento_mesh_optimizer-0.1.0/mesh/vault/policies.py +374 -0
  67. slento_mesh_optimizer-0.1.0/mesh/vault/store.py +455 -0
  68. slento_mesh_optimizer-0.1.0/pyproject.toml +56 -0
  69. slento_mesh_optimizer-0.1.0/setup.cfg +4 -0
  70. slento_mesh_optimizer-0.1.0/slento_mesh_optimizer.egg-info/PKG-INFO +83 -0
  71. slento_mesh_optimizer-0.1.0/slento_mesh_optimizer.egg-info/SOURCES.txt +73 -0
  72. slento_mesh_optimizer-0.1.0/slento_mesh_optimizer.egg-info/dependency_links.txt +1 -0
  73. slento_mesh_optimizer-0.1.0/slento_mesh_optimizer.egg-info/entry_points.txt +2 -0
  74. slento_mesh_optimizer-0.1.0/slento_mesh_optimizer.egg-info/requires.txt +16 -0
  75. slento_mesh_optimizer-0.1.0/slento_mesh_optimizer.egg-info/top_level.txt +2 -0
@@ -0,0 +1,83 @@
1
+ Metadata-Version: 2.4
2
+ Name: slento-mesh-optimizer
3
+ Version: 0.1.0
4
+ Summary: Distributed hardware optimization agent — JEPA-driven job routing, GPU/CPU/FPGA probing, and performance tuning across heterogeneous compute clusters.
5
+ Author-email: Slento Systems <support@slentosystems.com>
6
+ License: Proprietary
7
+ Project-URL: Homepage, https://slentosystems.com
8
+ Project-URL: Documentation, https://docs.slentosystems.com
9
+ Project-URL: Repository, https://github.com/slentoai/mesh-optimizer
10
+ Project-URL: Portal, https://portal.slentosystems.com
11
+ Keywords: gpu,optimization,mesh,distributed,jepa,hardware,profiling
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: System Administrators
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: System :: Hardware
22
+ Classifier: Topic :: System :: Monitoring
23
+ Classifier: Topic :: System :: Systems Administration
24
+ Requires-Python: >=3.10
25
+ Description-Content-Type: text/markdown
26
+ Requires-Dist: aiohttp>=3.9
27
+ Requires-Dist: fastapi>=0.109
28
+ Requires-Dist: httpx>=0.25
29
+ Requires-Dist: numpy>=1.24
30
+ Requires-Dist: psutil>=5.9
31
+ Requires-Dist: pydantic>=2.0
32
+ Requires-Dist: pyyaml>=6.0
33
+ Requires-Dist: uvicorn[standard]>=0.25
34
+ Provides-Extra: ml
35
+ Requires-Dist: torch>=2.0; extra == "ml"
36
+ Provides-Extra: dev
37
+ Requires-Dist: pytest; extra == "dev"
38
+ Requires-Dist: pytest-asyncio; extra == "dev"
39
+ Requires-Dist: httpx; extra == "dev"
40
+
41
+ # Mesh Optimizer
42
+
43
+ Distributed hardware optimization agent by [Slento Systems](https://slentosystems.com).
44
+
45
+ JEPA-driven job routing, GPU/CPU/FPGA probing, and performance tuning across heterogeneous compute clusters.
46
+
47
+ ## Quick Start
48
+
49
+ ```bash
50
+ pip install slento-mesh-optimizer
51
+ mesh-optimizer start
52
+ ```
53
+
54
+ ## One-Line Install (Linux)
55
+
56
+ ```bash
57
+ curl -fsSL https://mesh.slentosystems.com/install.sh | bash
58
+ ```
59
+
60
+ ## Features
61
+
62
+ - **Hardware Discovery** — Auto-detects GPUs (AMD, NVIDIA), CPUs, FPGAs, memory subsystems
63
+ - **Performance Probing** — Runs targeted benchmarks to map hardware capabilities
64
+ - **JEPA Optimization** — Joint Embedding Predictive Architecture learns your hardware's performance characteristics
65
+ - **Job Routing** — Automatically routes compute jobs to the best-suited hardware
66
+ - **NAT/WAN Support** — Works behind firewalls with outbound-only connections
67
+ - **Multi-Platform** — Linux, macOS, Windows
68
+
69
+ ## Supported Hardware
70
+
71
+ - AMD Radeon RX 7000 (RDNA3), Instinct MI200/MI300 (CDNA)
72
+ - NVIDIA GeForce RTX 30/40/50, Quadro, Tesla, A100
73
+ - Intel/AMD CPUs (10th gen+, Ryzen/EPYC)
74
+ - Apple M1/M2/M3/M4
75
+ - Xilinx Alveo/UltraScale+ FPGAs
76
+
77
+ ## Documentation
78
+
79
+ Full docs at [docs.slentosystems.com](https://docs.slentosystems.com)
80
+
81
+ ## License
82
+
83
+ Proprietary. Free community tier available at [portal.slentosystems.com](https://portal.slentosystems.com).
@@ -0,0 +1,43 @@
1
+ # Mesh Optimizer
2
+
3
+ Distributed hardware optimization agent by [Slento Systems](https://slentosystems.com).
4
+
5
+ JEPA-driven job routing, GPU/CPU/FPGA probing, and performance tuning across heterogeneous compute clusters.
6
+
7
+ ## Quick Start
8
+
9
+ ```bash
10
+ pip install slento-mesh-optimizer
11
+ mesh-optimizer start
12
+ ```
13
+
14
+ ## One-Line Install (Linux)
15
+
16
+ ```bash
17
+ curl -fsSL https://mesh.slentosystems.com/install.sh | bash
18
+ ```
19
+
20
+ ## Features
21
+
22
+ - **Hardware Discovery** — Auto-detects GPUs (AMD, NVIDIA), CPUs, FPGAs, memory subsystems
23
+ - **Performance Probing** — Runs targeted benchmarks to map hardware capabilities
24
+ - **JEPA Optimization** — Joint Embedding Predictive Architecture learns your hardware's performance characteristics
25
+ - **Job Routing** — Automatically routes compute jobs to the best-suited hardware
26
+ - **NAT/WAN Support** — Works behind firewalls with outbound-only connections
27
+ - **Multi-Platform** — Linux, macOS, Windows
28
+
29
+ ## Supported Hardware
30
+
31
+ - AMD Radeon RX 7000 (RDNA3), Instinct MI200/MI300 (CDNA)
32
+ - NVIDIA GeForce RTX 30/40/50, Quadro, Tesla, A100
33
+ - Intel/AMD CPUs (10th gen+, Ryzen/EPYC)
34
+ - Apple M1/M2/M3/M4
35
+ - Xilinx Alveo/UltraScale+ FPGAs
36
+
37
+ ## Documentation
38
+
39
+ Full docs at [docs.slentosystems.com](https://docs.slentosystems.com)
40
+
41
+ ## License
42
+
43
+ Proprietary. Free community tier available at [portal.slentosystems.com](https://portal.slentosystems.com).
@@ -0,0 +1,2 @@
1
+ """RDNA3 Discovery Mesh — Distributed profiling and job routing."""
2
+ __version__ = "0.1.0"
@@ -0,0 +1,29 @@
1
+ """Single source of truth for mesh-optimizer version."""
2
+ from __future__ import annotations
3
+
4
+ import sys
5
+ from datetime import datetime, timezone
6
+
7
+
8
__version__ = "0.1.0"


def get_version_info() -> dict:
    """Return version metadata for this installation.

    Returns a dict with:
      - version: the package version string
      - build_date: current UTC timestamp in ISO-8601 form (time of the
        call, not an actual build timestamp)
      - python_version: the interpreter's ``sys.version`` string
      - git_sha: short commit hash when running from a git checkout,
        otherwise None
    """
    info = {
        "version": __version__,
        "build_date": datetime.now(timezone.utc).isoformat(),
        "python_version": sys.version,
        "git_sha": None,
    }
    try:
        import subprocess
        from pathlib import Path

        # Pin cwd to this file's directory so we report the SHA of the
        # package's own checkout — not whatever unrelated repo the
        # calling process happens to be running from.
        result = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            capture_output=True, text=True, timeout=5,
            cwd=Path(__file__).resolve().parent,
        )
        if result.returncode == 0:
            info["git_sha"] = result.stdout.strip()
    except Exception:
        # Best effort: git may be absent or this may be an installed wheel.
        pass
    return info
File without changes
@@ -0,0 +1,54 @@
1
+ """Track which datasets are cached locally on this node."""
2
+ import json
3
+ import logging
4
+ from datetime import datetime, timezone
5
+ from pathlib import Path
6
+ from typing import Dict, List, Optional
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
+ class DatasetTracker:
12
+ def __init__(self, cache_dir: str = ""):
13
+ self.cache_dir = cache_dir
14
+ self._registry_path = Path(cache_dir or "/tmp") / "mesh_dataset_registry.json"
15
+ self._datasets: Dict[str, dict] = {}
16
+ self._load()
17
+
18
+ def _load(self):
19
+ if self._registry_path.exists():
20
+ try:
21
+ self._datasets = json.loads(self._registry_path.read_text())
22
+ except Exception as e:
23
+ logger.warning("Failed to load dataset registry: %s", e)
24
+ self._datasets = {}
25
+
26
+ def _save(self):
27
+ try:
28
+ self._registry_path.parent.mkdir(parents=True, exist_ok=True)
29
+ self._registry_path.write_text(json.dumps(self._datasets, indent=2))
30
+ except Exception as e:
31
+ logger.warning("Failed to save dataset registry: %s", e)
32
+
33
+ def register(self, dataset_id: str, path: str, size_mb: float = 0.0):
34
+ self._datasets[dataset_id] = {
35
+ "path": path,
36
+ "size_mb": size_mb,
37
+ "registered_at": datetime.now(timezone.utc).isoformat(),
38
+ }
39
+ self._save()
40
+ logger.info("Dataset registered: %s at %s (%.1f MB)", dataset_id, path, size_mb)
41
+
42
+ def unregister(self, dataset_id: str):
43
+ if dataset_id in self._datasets:
44
+ del self._datasets[dataset_id]
45
+ self._save()
46
+
47
+ def list_ids(self) -> List[str]:
48
+ return list(self._datasets.keys())
49
+
50
+ def get(self, dataset_id: str) -> Optional[dict]:
51
+ return self._datasets.get(dataset_id)
52
+
53
+ def list_all(self) -> Dict[str, dict]:
54
+ return dict(self._datasets)
@@ -0,0 +1,372 @@
1
+ """Auto-discovery of hardware on a mesh node."""
2
+ from __future__ import annotations
3
+
4
+ import logging
5
+ import multiprocessing
6
+ import os
7
+ import platform
8
+ import re
9
+ import shutil
10
+ import subprocess
11
+ from pathlib import Path
12
+
13
+ import psutil
14
+
15
+ from mesh.models import CPUInfo, FPGAInfo, GPUInfo, HardwareInventory, MountInfo
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
# Shared libraries that must all exist (relative to the directory holding
# cuda_emulator_run.sh) for the Slento CUDA shim to count as installed.
_SLENTO_REQUIRED_LIBS = [
    "pytorch_compat/libcudart.so.12",
    "pytorch_compat/libcublas.so.12",
    "pytorch_compat/libcublasLt.so.12",
    "pytorch_compat/libcudnn.so.9",
    "pytorch_compat/libcuda.so.1",
]
# Maps substrings of rocm-smi product names to AMD gfx ISA identifiers.
_SLENTO_GPU_ARCH_MAP = {
    "7900 XTX": "gfx1100",
    "7900 XT": "gfx1100",
    "MI100": "gfx908",
    "MI250": "gfx90a",
    "MI300": "gfx942",
}
33
+
34
+
35
def scan_hardware(hostname_override: str = "", config=None) -> HardwareInventory:
    """Detect all hardware on this machine.

    Populates a HardwareInventory with CPU/GPU/FPGA details, mount
    points, toolchain availability (PyTorch/ROCm/CUDA), and Slento CUDA
    shim metadata. *config* may be a dict or attribute-style object;
    only its "scratch_paths" and (indirectly) "slento_cuda_path"
    entries are consulted here.
    """
    inv = HardwareInventory(
        hostname=hostname_override or platform.node(),
        platform=platform.system(),
        memory_total_mb=psutil.virtual_memory().total // (1024 * 1024),
    )
    inv.cpu = _detect_cpu()
    inv.gpus = _detect_gpus()
    inv.fpgas = _detect_fpgas()
    inv.has_pytorch = _check_pytorch()
    # Presence of the vendor CLI on PATH is used as a proxy for the
    # ROCm / CUDA stacks being installed.
    inv.has_rocm = shutil.which("rocm-smi") is not None
    inv.has_cuda = shutil.which("nvidia-smi") is not None
    inv.mounts = _detect_mounts()
    inv.scratch_paths = _get_config_value(config, "scratch_paths", []) or []

    # Slento CUDA compatibility shim (AMD/Linux only; see _detect_slento_cuda).
    slento = _detect_slento_cuda(config=config)
    inv.has_slento_cuda = slento["has_slento_cuda"]
    inv.slento_cuda_version = slento["slento_cuda_version"]
    inv.slento_cuda_gpu_archs = slento["slento_cuda_gpu_archs"]
    inv.slento_cuda_lib_path = slento["slento_cuda_lib_path"]
    return inv
57
+
58
+
59
def _get_config_value(config, key: str, default=None):
    """Fetch *key* from a config mapping or object.

    Checks the top level first, then an optional "node" sub-section of
    the same kind; returns *default* when the key is absent or *config*
    is None.
    """
    if config is None:
        return default
    if isinstance(config, dict):
        for section in (config, config.get("node")):
            if isinstance(section, dict) and key in section:
                return section[key]
        return default
    for obj in (config, getattr(config, "node", None)):
        if obj is not None and hasattr(obj, key):
            return getattr(obj, key)
    return default
75
+
76
+
77
def _detect_cpu() -> CPUInfo:
    """Build a CPUInfo by probing /proc, sysctl, lscpu, and sysfs.

    All probes are best-effort: any failure leaves the corresponding
    fields at their defaults.
    """
    info = CPUInfo(cores=multiprocessing.cpu_count() or 1)

    # Model name (platform-specific sources).
    if platform.system() == "Linux":
        try:
            with open("/proc/cpuinfo") as f:
                for line in f:
                    if "model name" in line:
                        info.model = line.split(":")[1].strip()
                        break
        except Exception:
            pass
    elif platform.system() == "Darwin":
        try:
            info.model = subprocess.check_output(
                ["sysctl", "-n", "machdep.cpu.brand_string"],
                text=True, stderr=subprocess.DEVNULL
            ).strip()
        except Exception:
            pass

    # Topology / frequency details from lscpu (Linux only; fails cleanly elsewhere).
    try:
        out = subprocess.check_output("lscpu", text=True, stderr=subprocess.DEVNULL)
        cores_per = 1
        sockets = 1
        for line in out.splitlines():
            if "Core(s) per socket:" in line:
                cores_per = int(line.split(":")[1].strip())
            elif "Socket(s):" in line:
                sockets = int(line.split(":")[1].strip())
            elif "Thread(s) per core:" in line:
                info.threads_per_core = int(line.split(":")[1].strip())
            elif "NUMA node(s):" in line:
                info.numa_nodes = int(line.split(":")[1].strip())
            elif "CPU max MHz:" in line:
                info.freq_mhz = float(line.split(":")[1].strip())
            elif "CPU MHz:" in line and info.freq_mhz == 0:
                # Current frequency — only used when "max MHz" was absent.
                info.freq_mhz = float(line.split(":")[1].strip())
        info.physical_cores = cores_per * sockets
    except Exception:
        # lscpu unavailable (non-Linux / minimal images): ask psutil first,
        # then assume SMT-2. Was `cores // 2`, which reported 0 physical
        # cores on single-core hosts; clamp to at least 1.
        info.physical_cores = psutil.cpu_count(logical=False) or max(1, info.cores // 2)

    # L1d/L2/L3 cache sizes from sysfs; cpu0 is taken as representative.
    try:
        for idx in range(10):
            cache_dir = Path(f"/sys/devices/system/cpu/cpu0/cache/index{idx}")
            if not cache_dir.exists():
                break
            # read_text() instead of bare open().read(): the original
            # leaked three file handles per cache index.
            level = (cache_dir / "level").read_text().strip()
            ctype = (cache_dir / "type").read_text().strip()
            size_str = (cache_dir / "size").read_text().strip()
            m = re.match(r"(\d+)", size_str)
            if not m:
                # Unparseable size: skip this level instead of aborting
                # the whole scan via the except below.
                continue
            size_kb = int(m.group(1))
            if "M" in size_str:
                size_kb *= 1024
            if level == "1" and "Data" in ctype:
                info.l1d_kb = size_kb
            elif level == "2":
                info.l2_kb = size_kb
            elif level == "3":
                info.l3_kb = size_kb
    except Exception:
        pass
    return info
141
+
142
+
143
def _detect_gpus() -> list[GPUInfo]:
    """Enumerate all GPUs visible on this host (NVIDIA first, then AMD)."""
    return [*_detect_nvidia_gpus(), *_detect_amd_gpus()]
150
+
151
+
152
def _detect_mounts() -> list[MountInfo]:
    """Describe physical disk partitions with capacity/free space in MB.

    Partitions whose usage cannot be queried (permissions, stale
    mounts) are skipped individually; a wider psutil failure yields [].
    """
    found: list[MountInfo] = []
    mb = 1024 * 1024
    try:
        for part in psutil.disk_partitions(all=False):
            try:
                usage = psutil.disk_usage(part.mountpoint)
            except Exception:
                continue
            found.append(MountInfo(
                mountpoint=part.mountpoint,
                device=part.device,
                fstype=part.fstype,
                total_mb=int(usage.total // mb),
                free_mb=int(usage.free // mb),
            ))
    except Exception as e:
        logger.debug("Mount detection failed: %s", e)
    return found
170
+
171
+
172
def _detect_nvidia_gpus() -> list[GPUInfo]:
    """Query nvidia-smi for per-GPU name, VRAM, driver, and live stats.

    Returns [] when nvidia-smi is absent or the query fails; individual
    "N/A" readings fall back to 0.
    """
    gpus = []
    if not shutil.which("nvidia-smi"):
        return gpus
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=name,memory.total,driver_version,temperature.gpu,utilization.gpu,memory.used",
             "--format=csv,noheader,nounits"],
            text=True, stderr=subprocess.DEVNULL, timeout=10
        )
        # One CSV row per GPU; fields arrive in --query-gpu order.
        for line in out.strip().splitlines():
            parts = [p.strip() for p in line.split(",")]
            if len(parts) >= 6:
                gpus.append(GPUInfo(
                    name=parts[0],
                    vendor="nvidia",
                    vram_mb=int(float(parts[1])),
                    driver=parts[2],
                    temperature_c=float(parts[3]) if parts[3] != "N/A" else 0.0,
                    utilization_pct=float(parts[4]) if parts[4] != "N/A" else 0.0,
                    memory_used_mb=int(float(parts[5])) if parts[5] != "N/A" else 0,
                ))
    except Exception as e:
        logger.warning("nvidia-smi failed: %s", e)
    return gpus
197
+
198
+
199
def _detect_amd_gpus() -> list[GPUInfo]:
    """Detect AMD GPUs via rocm-smi JSON output, with a sysfs fallback.

    The primary path parses `rocm-smi --json`; if that yields no GPUs,
    /sys/class/drm is scanned for PCI vendor 0x1002 devices so AMD
    cards are still reported (without temperature/utilization) when
    ROCm tooling is broken or partially installed.
    """
    gpus = []
    if not shutil.which("rocm-smi"):
        return gpus
    try:
        out = subprocess.check_output(
            ["rocm-smi", "--showproductname", "--showmeminfo", "vram",
             "--showtemp", "--showuse", "--json"],
            text=True, stderr=subprocess.DEVNULL, timeout=10
        )
        import json
        data = json.loads(out)
        for key, val in data.items():
            # rocm-smi keys GPUs as "card0", "card1", ...; skip any
            # non-card metadata entries.
            if not key.startswith("card"):
                continue
            gpu = GPUInfo(vendor="amd")
            # Field casing differs across rocm-smi releases
            # ("Card Series" vs "Card series") — try both.
            gpu.name = val.get("Card Series", val.get("Card series", "AMD GPU"))
            vram_total = val.get("VRAM Total Memory (B)", 0)
            if vram_total:
                gpu.vram_mb = int(vram_total) // (1024 * 1024)
            vram_used = val.get("VRAM Total Used Memory (B)", 0)
            if vram_used:
                gpu.memory_used_mb = int(vram_used) // (1024 * 1024)
            temp = val.get("Temperature (Sensor edge) (C)", 0)
            if temp:
                gpu.temperature_c = float(temp)
            use = val.get("GPU use (%)", 0)
            if use:
                gpu.utilization_pct = float(use)
            gpus.append(gpu)
    except Exception as e:
        logger.warning("rocm-smi failed: %s", e)
    # Fallback: check /sys/class/drm when rocm-smi reported nothing.
    if not gpus:
        try:
            drm = Path("/sys/class/drm")
            for card in sorted(drm.glob("card[0-9]*")):
                device = card / "device"
                vendor_file = device / "vendor"
                if vendor_file.exists():
                    vendor_id = vendor_file.read_text().strip()
                    if vendor_id == "0x1002":  # AMD's PCI vendor ID
                        name = "AMD GPU"
                        product = device / "product_name"
                        if product.exists():
                            name = product.read_text().strip()
                        vram = 0
                        mem_file = device / "mem_info_vram_total"
                        if mem_file.exists():
                            # sysfs reports bytes; convert to MB.
                            vram = int(mem_file.read_text().strip()) // (1024 * 1024)
                        gpus.append(GPUInfo(name=name, vendor="amd", vram_mb=vram))
        except Exception:
            pass
    return gpus
253
+
254
+
255
def _detect_fpgas() -> list[FPGAInfo]:
    """Find FPGA boards by scanning `lspci -nn` output.

    Matches Xilinx and Altera/Intel devices and resolves the bound
    kernel driver from sysfs when available. Best effort: returns []
    when lspci is missing or anything fails.
    """
    fpgas = []
    try:
        out = subprocess.check_output(
            ["lspci", "-nn"], text=True, stderr=subprocess.DEVNULL, timeout=5
        )
        for line in out.splitlines():
            lower = line.lower()
            # BUGFIX: the original tested the literal substring
            # "intel.*fpga", which can never occur in lspci output, so
            # Intel-branded FPGAs were never detected. Use a regex.
            if "xilinx" in lower or "altera" in lower or re.search(r"intel.*fpga", lower):
                bdf = line.split()[0]
                vendor = "xilinx" if "xilinx" in lower else "altera/intel"
                name = line.split(": ", 1)[1] if ": " in line else "FPGA"
                # Resolve the bound kernel driver; lspci omits the PCI
                # domain when it is 0, so prepend "0000:" for sysfs.
                driver = ""
                driver_link = Path(f"/sys/bus/pci/devices/0000:{bdf}/driver")
                if driver_link.is_symlink():
                    driver = driver_link.resolve().name
                fpgas.append(FPGAInfo(
                    name=name, vendor=vendor, pcie_bdf=bdf, driver=driver
                ))
    except Exception:
        pass
    return fpgas
278
+
279
+
280
def _check_pytorch() -> bool:
    """Return True when the `torch` package can be imported on this node."""
    try:
        import torch  # noqa: F401 — presence check only
    except ImportError:
        return False
    return True
286
+
287
+
288
def _detect_slento_cuda(config=None) -> dict[str, object]:
    """Probe for the Slento CUDA compatibility shim (AMD GPUs only).

    Returns a dict with keys has_slento_cuda (bool),
    slento_cuda_version (str), slento_cuda_gpu_archs (list[str]), and
    slento_cuda_lib_path (str). Every gate below must pass; otherwise
    the all-false/empty default is returned unchanged.
    """
    result = {
        "has_slento_cuda": False,
        "slento_cuda_version": "",
        "slento_cuda_gpu_archs": [],
        "slento_cuda_lib_path": "",
    }

    # Gate 1: the shim targets Linux hosts with ROCm tooling on PATH.
    if platform.system() != "Linux":
        return result
    if not shutil.which("rocm-smi"):
        return result

    # Gate 2: at least one AMD GPU must actually be present.
    amd_gpus = _detect_amd_gpus()
    if not amd_gpus:
        return result

    # Gate 3: the launcher script must be locatable (config, known
    # install paths, or $PATH).
    script_path = _find_slento_script(config=config)
    if not script_path:
        return result

    # Gate 4: all required compat libraries must sit next to the launcher.
    base_dir = script_path.parent
    missing = [rel for rel in _SLENTO_REQUIRED_LIBS if not (base_dir / rel).exists()]
    if missing:
        logger.debug("Slento CUDA candidate missing libs under %s: %s", base_dir, missing)
        return result

    result["has_slento_cuda"] = True
    result["slento_cuda_version"] = _detect_slento_version(base_dir)
    result["slento_cuda_gpu_archs"] = _detect_slento_gpu_archs()
    result["slento_cuda_lib_path"] = str(base_dir / "pytorch_compat")
    return result
320
+
321
+
322
def _find_slento_script(config=None) -> Path | None:
    """Locate cuda_emulator_run.sh.

    Tries the configured "slento_cuda_path" first (accepting either the
    script itself or its directory), then known install locations, then
    $PATH. Returns the resolved path of the first existing regular
    file, or None.
    """
    candidates: list[Path] = []

    configured = _get_config_value(config, "slento_cuda_path", "")
    if configured:
        cfg_path = Path(configured).expanduser()
        if cfg_path.name != "cuda_emulator_run.sh":
            cfg_path = cfg_path / "cuda_emulator_run.sh"
        candidates.append(cfg_path)

    candidates.append(
        Path("~/CascadeProjects/rocm-cuda-compat/build/cuda_emulator_run.sh").expanduser()
    )
    candidates.append(Path("/opt/slento-cuda/cuda_emulator_run.sh"))

    on_path = shutil.which("cuda_emulator_run.sh")
    if on_path:
        candidates.append(Path(on_path))

    for cand in candidates:
        if cand.exists() and cand.is_file():
            return cand.resolve()
    return None
343
+
344
+
345
def _detect_slento_version(base_dir: Path) -> str:
    """Best-effort version string for a Slento CUDA install at *base_dir*.

    Prefers a VERSION or version.txt file; otherwise uses the directory
    name when the path mentions "slento"; otherwise the sentinel
    "detected".
    """
    for fname in ("VERSION", "version.txt"):
        candidate = base_dir / fname
        if not candidate.exists():
            continue
        try:
            return candidate.read_text().strip()
        except Exception:
            # Unreadable file: fall through to the next candidate.
            continue
    if re.search(r"slento", str(base_dir), re.IGNORECASE):
        return base_dir.name
    return "detected"
356
+
357
+
358
def _detect_slento_gpu_archs() -> list[str]:
    """Map rocm-smi product names to the gfx architectures in
    _SLENTO_GPU_ARCH_MAP. Returns [] when rocm-smi is unavailable."""
    detected: list[str] = []
    try:
        listing = subprocess.check_output(
            ["rocm-smi", "--showproductname"],
            text=True,
            stderr=subprocess.DEVNULL,
            timeout=10,
        )
    except Exception as e:
        logger.debug("rocm-smi product lookup failed for Slento CUDA detection: %s", e)
        return detected
    for product, arch in _SLENTO_GPU_ARCH_MAP.items():
        if product in listing and arch not in detected:
            detected.append(arch)
    return detected