matrice-compute 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/action_instance.py +45 -3
- matrice_compute/resources_tracker.py +730 -87
- {matrice_compute-0.1.33.dist-info → matrice_compute-0.1.34.dist-info}/METADATA +1 -1
- {matrice_compute-0.1.33.dist-info → matrice_compute-0.1.34.dist-info}/RECORD +7 -7
- {matrice_compute-0.1.33.dist-info → matrice_compute-0.1.34.dist-info}/WHEEL +0 -0
- {matrice_compute-0.1.33.dist-info → matrice_compute-0.1.34.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_compute-0.1.33.dist-info → matrice_compute-0.1.34.dist-info}/top_level.txt +0 -0
matrice_compute/action_instance.py
@@ -83,7 +83,8 @@ class ActionInstance:
             "inference_ws_server": inference_ws_server_execute,
             "fe_analytics_service": fe_analytics_service_execute,
             "lpr_setup": lpr_setup_execute,
-            "inference_tracker_server": inference_tracker_setup_execute
+            "inference_tracker_server": inference_tracker_setup_execute,
+            "video_storage_setup" : video_storage_setup_execute
         }
         if self.action_type not in self.actions_map:
             raise ValueError(f"Unknown action type: {self.action_type}")
@@ -309,7 +310,7 @@ class ActionInstance:
         )

     @log_errors(default_return=None, raise_exception=False, log_error=False)
-    def
+    def get_action_details(self):
         """Get action details from scaling service.

         Returns:
@@ -2196,4 +2197,45 @@ def inference_tracker_setup_execute(self: ActionInstance):
         f"{image}"
     )

-    self.start(worker_cmd, "inference_tracker_setup")
+    self.start(worker_cmd, "inference_tracker_setup")
+
+@log_errors(raise_exception=False)
+def video_storage_setup_execute(self: ActionInstance):
+
+    """
+    Creates and start Video Storage
+    Video Stroage runs on port 8106 (localhost only with --net=host).
+    """
+
+    action_details = self.get_action_details()
+    if not action_details:
+        return
+
+    image = self.docker_container
+
+    self.setup_action_requirements(action_details)
+
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for inference tracker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "video_storage_setup_execute")
+        return
+
+    # This is the existing Docker run command
+    worker_cmd = (
+        f"docker run -d --pull=always --net=host "
+        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name media_server "
+        f"-v matrice_myvol:/matrice_data "
+        f'-e ENV="{os.environ.get("ENV", "prod")}" '
+        f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
+        f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
+        f'-e ACTION_ID="{self.action_record_id}" '
+        f"{image}"
+    )
+
+    self.start(worker_cmd, "video_storage_setup_execute")
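For orientation, the new worker exposes its media server on localhost port 8106 (per the docstring above) because the container shares the host network. A minimal, hypothetical readiness probe against that port could look like the sketch below; the helper name, host, and timeout are illustrative and not part of the package.

```python
# Hypothetical readiness check for the video-storage container started above.
# Assumes only what the docstring states: the service listens on localhost:8106.
import socket
import time


def wait_for_video_storage(host: str = "127.0.0.1", port: int = 8106, timeout_s: float = 60.0) -> bool:
    """Poll until the TCP port accepts connections or the timeout expires."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=2):
                return True  # port is open, the media server is reachable
        except OSError:
            time.sleep(1)  # not up yet, retry until the deadline
    return False
```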
matrice_compute/resources_tracker.py
@@ -21,7 +21,19 @@ from matrice_common.utils import log_errors


 class ResourcesTracker:
-    """Tracks machine and container resources.
+    """Tracks machine and container resources.
+
+    GPU Utilization Note:
+    GPU utilization is tracked at the DEVICE level, not per-container.
+    NVIDIA does not expose reliable per-process GPU utilization.
+    Per-container GPU MEMORY is accurate; per-container GPU UTILIZATION is best-effort.
+    """
+
+    # Cache for nvidia-smi output to reduce subprocess overhead
+    _gpu_cache: Dict = {}
+    _gpu_cache_timestamp: float = 0
+    _gpu_cache_ttl: float = 1.0  # Cache TTL in seconds
+    _gpu_cache_lock = threading.Lock()

     def __init__(self) -> None:
         """
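The new class attributes set up a shared, lock-protected TTL cache for nvidia-smi output. A stripped-down sketch of the same pattern, with illustrative names rather than the package's, assuming the one-second TTL shown above:

```python
import threading
import time
from typing import Callable, Dict


class SnapshotCache:
    """Share one expensive snapshot across callers for `_ttl` seconds."""
    _data: Dict = {}
    _timestamp: float = 0.0
    _ttl: float = 1.0  # seconds, mirrors _gpu_cache_ttl above
    _lock = threading.Lock()

    @classmethod
    def get(cls, refresh: Callable[[], Dict]) -> Dict:
        now = time.time()
        with cls._lock:
            if cls._data and now - cls._timestamp < cls._ttl:
                return cls._data   # fresh enough: reuse the cached snapshot
            cls._data = refresh()  # stale or empty: rebuild once, under the lock
            cls._timestamp = now
            return cls._data
```

`_get_cached_gpu_data()`, added further down in this diff, follows the same shape with the refresh step replaced by two nvidia-smi invocations.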
@@ -38,11 +50,11 @@ class ResourcesTracker:
             container (docker.models.containers.Container): Docker container instance.

         Returns:
-            Tuple[float, float]: CPU utilization percentage and memory
+            Tuple[float, float]: CPU utilization percentage (0-100 per core used) and memory usage in MB.
         """
         stats = container.stats(stream=False)
         if stats:
-            cpu_utilization = 0
+            cpu_utilization = 0.0
             cpu_delta = (
                 stats["cpu_stats"]["cpu_usage"]["total_usage"]
                 - stats["precpu_stats"]["cpu_usage"]["total_usage"]
@@ -50,14 +62,92 @@ class ResourcesTracker:
             system_delta = stats["cpu_stats"].get("system_cpu_usage", 0) - stats[
                 "precpu_stats"
             ].get("system_cpu_usage", 0)
+
             if system_delta > 0:
-
-
-
-
-
+                # FIX: Multiply by online_cpus to get correct percentage
+                # Docker formula: (cpu_delta / system_delta) * online_cpus * 100
+                online_cpus = stats["cpu_stats"].get("online_cpus")
+                if not online_cpus:
+                    # Fallback: count from percpu_usage or use system CPU count
+                    percpu = stats["cpu_stats"]["cpu_usage"].get("percpu_usage", [])
+                    online_cpus = len(percpu) if percpu else psutil.cpu_count()
+                cpu_utilization = (cpu_delta / system_delta) * online_cpus * 100.0
+
+            # Return memory in MB (consistent units) instead of percentage
+            memory_usage_bytes = stats["memory_stats"].get("usage", 0)
+            # Subtract cache if available for more accurate "real" memory
+            cache_bytes = stats["memory_stats"].get("stats", {}).get("cache", 0)
+            memory_usage_mb = (memory_usage_bytes - cache_bytes) / (1024 * 1024)
+
+            return cpu_utilization, max(0, memory_usage_mb)
         return 0, 0

+    @staticmethod
+    def _parse_memory_string(memory_str: str) -> float:
+        """
+        Parse Docker memory string to MB.
+
+        Handles: "1.5GiB", "512MiB", "1024KiB", "1.5GB", "512MB", "1024KB", "1024B"
+
+        Args:
+            memory_str: Memory string from docker stats
+
+        Returns:
+            float: Memory in MB
+        """
+        import re
+        memory_str = memory_str.strip()
+
+        # Match number (with optional decimal) and unit
+        match = re.match(r'^([\d.]+)\s*([A-Za-z]+)$', memory_str)
+        if not match:
+            # Try splitting by space
+            parts = memory_str.split()
+            if len(parts) >= 2:
+                value_str, unit = parts[0], parts[1]
+            else:
+                # Last resort: assume it's bytes
+                try:
+                    return float(memory_str) / (1024 * 1024)
+                except ValueError:
+                    return 0.0
+        else:
+            value_str, unit = match.groups()
+
+        try:
+            value = float(value_str)
+        except ValueError:
+            return 0.0
+
+        # Normalize unit to lowercase for comparison
+        unit = unit.lower()
+
+        # Binary units (IEC)
+        if unit in ('kib', 'ki'):
+            return value / 1024
+        elif unit in ('mib', 'mi'):
+            return value
+        elif unit in ('gib', 'gi'):
+            return value * 1024
+        elif unit in ('tib', 'ti'):
+            return value * 1024 * 1024
+        # Decimal units (SI)
+        elif unit in ('kb', 'k'):
+            return value / 1000
+        elif unit in ('mb', 'm'):
+            return value
+        elif unit in ('gb', 'g'):
+            return value * 1000
+        elif unit in ('tb', 't'):
+            return value * 1000 * 1000
+        # Bytes
+        elif unit in ('b', 'bytes'):
+            return value / (1024 * 1024)
+        else:
+            # Unknown unit, assume MB
+            logging.debug("Unknown memory unit '%s', assuming MB", unit)
+            return value
+
     @log_errors(default_return=(0, 0), raise_exception=False, log_error=False)
     def get_container_cpu_and_memory_with_container_id(self, container_id: str) -> Tuple[float, float]:
         """
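As a quick sanity check on the two changes above, the sketch below works through the corrected Docker CPU formula and the conversions `_parse_memory_string` is written to handle; all numbers are invented for illustration.

```python
# Docker CPU formula: (cpu_delta / system_delta) * online_cpus * 100
cpu_delta = 2_000_000_000      # ns of container CPU time since the previous sample
system_delta = 8_000_000_000   # ns of total system CPU time over the same window
online_cpus = 4
cpu_utilization = (cpu_delta / system_delta) * online_cpus * 100.0
print(cpu_utilization)  # 100.0 -> the container kept roughly one of four cores busy

# Expected conversions from the memory-string parser (binary vs decimal units):
#   "1.5GiB" -> 1.5 * 1024 = 1536.0   (IEC branch, returned as MB)
#   "512MiB" -> 512.0
#   "1.5GB"  -> 1.5 * 1000 = 1500.0   (SI branch)
#   "1024B"  -> 1024 / (1024 * 1024) ≈ 0.00098
```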
@@ -70,13 +160,14 @@ class ResourcesTracker:
             Tuple[float, float]: CPU utilization percentage and memory usage in MB.
         """
         try:
+            # Use JSON format for more reliable parsing
             stats_result = subprocess.run(
                 [
                     "docker",
                     "stats",
                     "--no-stream",
                     "--format",
-                    "
+                    '{"cpu":"{{.CPUPerc}}","mem":"{{.MemUsage}}"}',
                     container_id,
                 ],
                 capture_output=True,
@@ -87,19 +178,24 @@ class ResourcesTracker:
             if stats_result.returncode != 0:
                 logging.debug("docker stats command failed for container %s", container_id)
                 return 0, 0
-
-
-
-
-
-
-
-
-
-
-
-
+
+            # Parse JSON output
+            stats_json = json.loads(stats_result.stdout.strip())
+
+            # Parse CPU (remove % sign)
+            cpu_str = stats_json.get("cpu", "0%").replace("%", "").strip()
+            cpu_usage = float(cpu_str) if cpu_str else 0.0
+
+            # Parse memory (format: "used / limit")
+            mem_str = stats_json.get("mem", "0B / 0B")
+            mem_used = mem_str.split("/")[0].strip()
+            memory_usage_mb = self._parse_memory_string(mem_used)
+
             return cpu_usage, memory_usage_mb
+
+        except json.JSONDecodeError as e:
+            logging.debug("JSON parse error for container %s: %s", container_id, e)
+            return 0, 0
         except subprocess.TimeoutExpired:
             logging.debug("docker stats command timed out for container %s", container_id)
             return 0, 0
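To make the new parsing path concrete: with the Go-template format string above, `docker stats --no-stream` prints one JSON object per container line. A small sketch of what the code expects to see and how it is taken apart (the sample values are invented):

```python
import json

# One output line produced by:
#   docker stats --no-stream --format '{"cpu":"{{.CPUPerc}}","mem":"{{.MemUsage}}"}' <container>
sample = '{"cpu":"12.34%","mem":"512MiB / 7.5GiB"}'

stats = json.loads(sample)
cpu = float(stats["cpu"].replace("%", "").strip())  # 12.34
mem_used = stats["mem"].split("/")[0].strip()        # "512MiB"
# _parse_memory_string(mem_used) from the previous hunk would return 512.0 (MB)
print(cpu, mem_used)
```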
@@ -110,20 +206,395 @@ class ResourcesTracker:
             logging.debug("Unexpected error getting container stats for %s: %s", container_id, e)
             return 0, 0

+    def _get_cached_gpu_data(self) -> Dict:
+        """
+        Get cached GPU data from nvidia-smi to reduce subprocess overhead.
+
+        Returns:
+            Dict: Cached GPU data with keys:
+                - 'processes': List of {pid, gpu_idx, memory_mb}
+                - 'gpus': List of {idx, utilization, memory_used, memory_total}
+                - 'timestamp': When cache was populated
+        """
+        import time as time_module
+        current_time = time_module.time()
+
+        with ResourcesTracker._gpu_cache_lock:
+            # Return cache if still valid
+            if (ResourcesTracker._gpu_cache and
+                    current_time - ResourcesTracker._gpu_cache_timestamp < ResourcesTracker._gpu_cache_ttl):
+                return ResourcesTracker._gpu_cache
+
+            # Refresh cache
+            cache = {
+                'processes': [],
+                'gpus': [],
+                'timestamp': current_time,
+            }
+
+            if not has_gpu():
+                ResourcesTracker._gpu_cache = cache
+                ResourcesTracker._gpu_cache_timestamp = current_time
+                return cache
+
+            try:
+                # Single nvidia-smi call for all GPU info
+                result = subprocess.run(
+                    [
+                        "nvidia-smi",
+                        "--query-gpu=index,utilization.gpu,memory.used,memory.total",
+                        "--format=csv,noheader,nounits"
+                    ],
+                    capture_output=True,
+                    text=True,
+                    timeout=10,
+                )
+                if result.returncode == 0:
+                    for line in result.stdout.strip().split("\n"):
+                        if not line.strip():
+                            continue
+                        parts = [p.strip() for p in line.split(",")]
+                        if len(parts) >= 4:
+                            cache['gpus'].append({
+                                'idx': int(parts[0]) if parts[0].isdigit() else 0,
+                                'utilization': float(parts[1]) if parts[1].replace('.', '').isdigit() else 0,
+                                'memory_used': int(parts[2]) if parts[2].isdigit() else 0,
+                                'memory_total': int(parts[3]) if parts[3].isdigit() else 0,
+                            })
+
+                # Single nvidia-smi call for all processes
+                result = subprocess.run(
+                    [
+                        "nvidia-smi",
+                        "--query-compute-apps=pid,gpu_uuid,used_memory",
+                        "--format=csv,noheader,nounits"
+                    ],
+                    capture_output=True,
+                    text=True,
+                    timeout=10,
+                )
+                if result.returncode == 0:
+                    for line in result.stdout.strip().split("\n"):
+                        if not line.strip():
+                            continue
+                        parts = [p.strip() for p in line.split(",")]
+                        if len(parts) >= 3:
+                            cache['processes'].append({
+                                'pid': parts[0],
+                                'gpu_uuid': parts[1],
+                                'memory_mb': int(parts[2]) if parts[2].isdigit() else 0,
+                            })
+
+            except subprocess.TimeoutExpired:
+                logging.debug("nvidia-smi cache refresh timed out")
+            except Exception as e:
+                logging.debug("Error refreshing GPU cache: %s", e)
+
+            ResourcesTracker._gpu_cache = cache
+            ResourcesTracker._gpu_cache_timestamp = current_time
+            return cache
+
     @log_errors(default_return=(0, 0), raise_exception=False, log_error=False)
     def get_container_gpu_info(self, container_id: str) -> Tuple[float, int]:
         """
         Get GPU usage for a specific container.

+        IMPORTANT: GPU utilization tracking limitations:
+        - GPU MEMORY per container is ACCURATE (from nvidia-smi per-process data)
+        - GPU UTILIZATION per container is BEST-EFFORT (NVIDIA doesn't expose per-process SM usage)
+
+        For GPU utilization, we report the utilization of GPUs that have container processes.
+        If multiple containers share a GPU, they will all report similar utilization.
+
+        Args:
+            container_id (str): ID of the Docker container.
+
+        Returns:
+            Tuple[float, int]:
+                - GPU utilization percentage (device-level, for GPUs used by container)
+                - GPU memory usage in MB (accurate per-container)
+        """
+        # Get ALL PIDs belonging to this container (not just main PID)
+        container_pids = self.get_all_container_pids(container_id)
+        if not container_pids:
+            # Fallback to main PID only
+            main_pid = self.get_pid_id_by_container_id(container_id)
+            if main_pid:
+                container_pids = {main_pid}
+            else:
+                return 0, 0
+
+        # Check if this is a Jetson device
+        if self._is_jetson_device():
+            return self._get_jetson_gpu_usage(container_pids)
+
+        # Use cached GPU data for efficiency
+        gpu_data = self._get_cached_gpu_data()
+
+        # Find GPU memory used by container (ACCURATE)
+        gpu_mem_used = 0
+        container_gpu_uuids = set()
+
+        for proc in gpu_data.get('processes', []):
+            if proc['pid'] in container_pids:
+                gpu_mem_used += proc['memory_mb']
+                container_gpu_uuids.add(proc['gpu_uuid'])
+
+        # Get utilization of GPUs used by container (DEVICE-LEVEL approximation)
+        # NOTE: This is NOT per-container utilization - it's the utilization of shared GPUs
+        gpu_util = 0.0
+        if container_gpu_uuids:
+            # If we have GPU UUIDs, get their utilization
+            # For now, just use overall utilization as approximation
+            total_util = sum(g['utilization'] for g in gpu_data.get('gpus', []))
+            gpu_count = len(gpu_data.get('gpus', [])) or 1
+            gpu_util = total_util / gpu_count
+
+        return gpu_util, gpu_mem_used
+
+    @log_errors(default_return=False, raise_exception=False, log_error=False)
+    def _is_jetson_device(self) -> bool:
+        """
+        Check if the current device is an NVIDIA Jetson.
+
+        Returns:
+            bool: True if Jetson device, False otherwise.
+        """
+        # Check for Jetson-specific indicators
+        try:
+            # Method 1: Check /etc/nv_tegra_release (Jetson specific)
+            if os.path.exists("/etc/nv_tegra_release"):
+                return True
+
+            # Method 2: Check for tegra in /proc/device-tree/compatible
+            if os.path.exists("/proc/device-tree/compatible"):
+                with open("/proc/device-tree/compatible", "r") as f:
+                    content = f.read().lower()
+                    if "tegra" in content or "jetson" in content:
+                        return True
+
+            # Method 3: Check if tegrastats exists
+            result = subprocess.run(
+                ["which", "tegrastats"],
+                capture_output=True,
+                text=True,
+                timeout=5,
+            )
+            if result.returncode == 0:
+                return True
+
+        except Exception as e:
+            logging.debug("Error checking for Jetson device: %s", e)
+
+        return False
+
+    @log_errors(default_return=set(), raise_exception=False, log_error=False)
+    def get_all_container_pids(self, container_id: str) -> set:
+        """
+        Get ALL PIDs belonging to a container (including child processes).
+
+        Uses multiple methods for robustness:
+        1. docker top (most reliable for standard Docker)
+        2. Docker API inspect + process tree enumeration
+        3. cgroup procs files (v1 and v2)
+
+        Known limitations:
+        - May miss processes in rootless Docker
+        - CRI-O/containerd may have different layouts
+
         Args:
             container_id (str): ID of the Docker container.

+        Returns:
+            set: Set of all PIDs (as strings) belonging to the container.
+        """
+        pids = set()
+
+        # Method 1: Use docker top (most reliable)
+        try:
+            result = subprocess.run(
+                ["docker", "top", container_id, "-o", "pid"],
+                capture_output=True,
+                text=True,
+                timeout=10,
+            )
+            if result.returncode == 0:
+                lines = result.stdout.strip().split("\n")
+                for line in lines[1:]:  # Skip header
+                    pid = line.strip()
+                    if pid.isdigit():
+                        pids.add(pid)
+        except subprocess.TimeoutExpired:
+            logging.debug("docker top command timed out for container %s", container_id)
+        except Exception as e:
+            logging.debug("docker top failed for %s: %s", container_id, e)
+
+        # Method 2: Get init PID from docker inspect and enumerate children
+        if not pids:
+            try:
+                result = subprocess.run(
+                    ["docker", "inspect", "--format", "{{.State.Pid}}", container_id],
+                    capture_output=True,
+                    text=True,
+                    timeout=10,
+                )
+                if result.returncode == 0:
+                    init_pid = result.stdout.strip()
+                    if init_pid and init_pid.isdigit() and init_pid != "0":
+                        pids.add(init_pid)
+                        # Enumerate all child processes recursively
+                        pids.update(self._get_child_pids(init_pid))
+            except Exception as e:
+                logging.debug("docker inspect failed for %s: %s", container_id, e)
+
+        # Method 3: Check cgroup procs files (fallback)
+        cgroup_paths = [
+            # cgroup v2 paths
+            f"/sys/fs/cgroup/system.slice/docker-{container_id}.scope/cgroup.procs",
+            f"/sys/fs/cgroup/docker/{container_id}/cgroup.procs",
+            # cgroup v1 paths
+            f"/sys/fs/cgroup/pids/docker/{container_id}/cgroup.procs",
+            f"/sys/fs/cgroup/cpu/docker/{container_id}/cgroup.procs",
+            f"/sys/fs/cgroup/memory/docker/{container_id}/cgroup.procs",
+        ]
+
+        for cgroup_path in cgroup_paths:
+            try:
+                if os.path.exists(cgroup_path):
+                    with open(cgroup_path, "r") as f:
+                        for line in f:
+                            pid = line.strip()
+                            if pid.isdigit():
+                                pids.add(pid)
+                    break
+            except Exception as e:
+                logging.debug("Error reading cgroup %s: %s", cgroup_path, e)
+
+        return pids
+
+    @log_errors(default_return=set(), raise_exception=False, log_error=False)
+    def _get_child_pids(self, parent_pid: str, visited: set = None) -> set:
+        """
+        Recursively get all child PIDs of a process.
+
+        Args:
+            parent_pid (str): Parent PID to get children for.
+            visited (set): Set of already visited PIDs to prevent cycles.
+
+        Returns:
+            set: Set of all child PIDs (as strings).
+        """
+        if visited is None:
+            visited = set()
+
+        if parent_pid in visited:
+            return set()
+        visited.add(parent_pid)
+
+        children = set()
+        children_path = f"/proc/{parent_pid}/task/{parent_pid}/children"
+
+        try:
+            if os.path.exists(children_path):
+                with open(children_path, "r") as f:
+                    child_pids = f.read().strip().split()
+                    for child_pid in child_pids:
+                        if child_pid.isdigit():
+                            children.add(child_pid)
+                            # Recursively get grandchildren
+                            children.update(self._get_child_pids(child_pid, visited))
+        except Exception as e:
+            logging.debug("Error getting children of PID %s: %s", parent_pid, e)
+
+        return children
+
+    @log_errors(default_return=(0, 0), raise_exception=False, log_error=False)
+    def _get_jetson_gpu_usage(self, container_pids: set) -> Tuple[float, int]:
+        """
+        Get GPU usage for Jetson devices.
+
+        Args:
+            container_pids (set): Set of container PIDs.
+
         Returns:
             Tuple[float, int]: GPU utilization percentage and GPU memory usage in MB.
         """
-
-
-
+        gpu_util = 0.0
+        gpu_mem_used = 0
+
+        try:
+            # Method 1: Try using tegrastats (one-shot)
+            result = subprocess.run(
+                ["tegrastats", "--interval", "100", "--stop", "1"],
+                capture_output=True,
+                text=True,
+                timeout=5,
+            )
+
+            if result.returncode == 0 and result.stdout:
+                output = result.stdout.strip()
+                # Parse tegrastats output - format varies by Jetson model
+                # Example: "RAM 2457/7773MB (lfb 1x512kB) CPU [...] GR3D_FREQ 0% ..."
+
+                # Extract GR3D (GPU) utilization
+                import re
+                gr3d_match = re.search(r'GR3D_FREQ\s+(\d+)%', output)
+                if gr3d_match:
+                    gpu_util = float(gr3d_match.group(1))
+
+                # For Jetson, GPU memory is shared with system RAM
+                # We can estimate based on total GPU memory allocation
+                # Try to get from /sys/kernel/debug/nvmap or similar
+
+        except subprocess.TimeoutExpired:
+            logging.debug("tegrastats timed out")
+        except FileNotFoundError:
+            logging.debug("tegrastats not found, trying alternative methods")
+        except Exception as e:
+            logging.debug("Error running tegrastats: %s", e)
+
+        # Method 2: Try jtop Python library info from /sys
+        if gpu_util == 0:
+            try:
+                # Read GPU frequency/utilization from sysfs
+                gpu_load_paths = [
+                    "/sys/devices/gpu.0/load",
+                    "/sys/devices/platform/host1x/gpu.0/load",
+                    "/sys/devices/57000000.gpu/load",
+                    "/sys/devices/17000000.ga10b/load",  # Orin
+                ]
+
+                for path in gpu_load_paths:
+                    if os.path.exists(path):
+                        with open(path, "r") as f:
+                            # Load is reported as 0-1000, convert to percentage
+                            load_val = int(f.read().strip())
+                            gpu_util = load_val / 10.0
+                        break
+
+            except Exception as e:
+                logging.debug("Error reading Jetson GPU load from sysfs: %s", e)
+
+        # Method 3: Get GPU memory from /proc for container processes
+        if container_pids:
+            try:
+                # On Jetson, GPU memory is unified with system RAM
+                # Check /proc/[pid]/smaps for GPU-related mappings
+                for pid in container_pids:
+                    smaps_path = f"/proc/{pid}/smaps"
+                    if os.path.exists(smaps_path):
+                        with open(smaps_path, "r") as f:
+                            content = f.read()
+                            # Look for nvmap or GPU memory regions
+                            for line in content.split("\n"):
+                                if "nvmap" in line.lower() or "gpu" in line.lower():
+                                    # Extract size if present
+                                    if "Size:" in line:
+                                        size_kb = int(line.split()[1])
+                                        gpu_mem_used += size_kb // 1024  # Convert to MB
+            except Exception as e:
+                logging.debug("Error getting Jetson GPU memory: %s", e)
+
         return gpu_util, gpu_mem_used

     @log_errors(default_return="", raise_exception=False, log_error=False)
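The PID-collection fallback above walks the kernel's `children` files under /proc. The same walk as a standalone sketch (function name and error handling are illustrative, not the package's):

```python
from typing import Optional, Set


def descendant_pids(parent: str, visited: Optional[Set[str]] = None) -> Set[str]:
    """Recursively collect descendants of `parent` via /proc/<pid>/task/<pid>/children."""
    visited = visited if visited is not None else set()
    if parent in visited:
        return set()
    visited.add(parent)
    found: Set[str] = set()
    path = f"/proc/{parent}/task/{parent}/children"
    try:
        with open(path) as f:
            for pid in f.read().split():
                if pid.isdigit():
                    found.add(pid)
                    found |= descendant_pids(pid, visited)  # grandchildren and deeper
    except OSError:
        pass  # the process exited, or /proc is not available on this platform
    return found

# e.g. descendant_pids("1234") -> {"1240", "1241", ...} on a Linux host
```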
@@ -174,30 +645,97 @@ class ResourcesTracker:
         Returns:
             float: GPU utilization percentage.
         """
+        return self.get_container_gpu_usage_multi_pid({str(container_pid)})
+
+    @log_errors(default_return=0, raise_exception=False, log_error=False)
+    def get_container_gpu_usage_multi_pid(self, container_pids: set) -> float:
+        """
+        Get GPU usage for multiple container PIDs.
+
+        Args:
+            container_pids (set): Set of container PIDs (as strings).
+
+        Returns:
+            float: Total GPU utilization percentage across all matching processes.
+        """
         if not has_gpu():
             return 0
-
+        if not container_pids:
+            return 0
+
+        gpu_util = 0.0
+
         try:
+            # Method 1: nvidia-smi pmon (process monitoring)
             result = subprocess.run(
-                ["nvidia-smi", "pmon", "-c", "1"],
+                ["nvidia-smi", "pmon", "-c", "1", "-s", "u"],
                 capture_output=True,
                 text=True,
                 check=False,
-                timeout=
+                timeout=10,
             )
-            if result.returncode
-
-
-
-
-
-
-
-
-
-
+            if result.returncode == 0:
+                pmon_output = result.stdout.strip().split("\n")
+                for line in pmon_output:
+                    # Skip header lines (start with # or contain column names)
+                    if line.startswith("#") or "gpu" in line.lower() and "pid" in line.lower():
+                        continue
+                    parts = line.split()
+                    if len(parts) >= 4:
+                        pid = parts[1]
+                        sm_usage = parts[3] if len(parts) > 3 else "0"
+                        if pid in container_pids:
+                            if sm_usage != "-" and sm_usage.replace(".", "").isdigit():
+                                gpu_util += float(sm_usage)
+
+            if gpu_util > 0:
+                return gpu_util
+
+            # Method 2: Query per-process GPU utilization
+            result = subprocess.run(
+                ["nvidia-smi", "--query-compute-apps=pid,gpu_uuid", "--format=csv,noheader,nounits"],
+                capture_output=True,
+                text=True,
+                check=False,
+                timeout=10,
+            )
+            if result.returncode == 0:
+                # Get overall GPU utilization per GPU
+                gpu_utils = {}
+                util_result = subprocess.run(
+                    ["nvidia-smi", "--query-gpu=uuid,utilization.gpu", "--format=csv,noheader,nounits"],
+                    capture_output=True,
+                    text=True,
+                    check=False,
+                    timeout=10,
+                )
+                if util_result.returncode == 0:
+                    for line in util_result.stdout.strip().split("\n"):
+                        parts = line.split(",")
+                        if len(parts) >= 2:
+                            gpu_uuid = parts[0].strip()
+                            util = float(parts[1].strip()) if parts[1].strip().replace(".", "").isdigit() else 0
+                            gpu_utils[gpu_uuid] = util
+
+                # Check which GPUs have our container processes
+                matched_gpus = set()
+                for line in result.stdout.strip().split("\n"):
+                    if not line.strip():
+                        continue
+                    parts = line.split(",")
+                    if len(parts) >= 2:
+                        pid = parts[0].strip()
+                        gpu_uuid = parts[1].strip()
+                        if pid in container_pids:
+                            matched_gpus.add(gpu_uuid)
+
+                # Sum utilization for matched GPUs
+                for gpu_uuid in matched_gpus:
+                    if gpu_uuid in gpu_utils:
+                        gpu_util += gpu_utils[gpu_uuid]
+
         except subprocess.TimeoutExpired:
-            logging.debug("nvidia-smi
+            logging.debug("nvidia-smi command timed out in get_container_gpu_usage_multi_pid")
             return 0
         except (ValueError, IndexError) as e:
             logging.debug("Error parsing GPU usage info: %s", e)
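For reference, `nvidia-smi pmon -c 1 -s u` prints a short header plus one row per GPU process; the loop above keys on the second column (pid) and fourth column (sm%). A hedged parsing sketch against an invented sample (exact columns can vary by driver version, which is why the code treats `-` as missing data):

```python
sample = """\
# gpu        pid  type    sm   mem   enc   dec   command
# Idx          #   C/G     %     %     %     %   name
    0      23131     C    37    12     -     -   python
    0      24022     C     -     -     -     -   python"""

container_pids = {"23131"}
gpu_util = 0.0
for line in sample.splitlines():
    if line.lstrip().startswith("#"):
        continue  # skip the two header lines
    parts = line.split()
    if len(parts) >= 4 and parts[1] in container_pids:
        sm = parts[3]
        if sm != "-" and sm.replace(".", "").isdigit():
            gpu_util += float(sm)
print(gpu_util)  # 37.0
```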
@@ -206,8 +744,9 @@ class ResourcesTracker:
             logging.debug("nvidia-smi not found on this system")
             return 0
         except Exception as e:
-            logging.debug("Unexpected error in
+            logging.debug("Unexpected error in get_container_gpu_usage_multi_pid: %s", e)
             return 0
+
         return gpu_util

     @log_errors(default_return=0, raise_exception=False, log_error=False)
@@ -221,34 +760,85 @@ class ResourcesTracker:
         Returns:
             int: GPU memory usage in MB.
         """
+        return self.get_container_gpu_memory_usage_multi_pid({str(container_pid)})
+
+    @log_errors(default_return=0, raise_exception=False, log_error=False)
+    def get_container_gpu_memory_usage_multi_pid(self, container_pids: set) -> int:
+        """
+        Get GPU memory usage for multiple container PIDs.
+
+        Args:
+            container_pids (set): Set of container PIDs (as strings).
+
+        Returns:
+            int: Total GPU memory usage in MB across all matching processes.
+        """
         if not has_gpu():
             return 0
-
-
-
-                "--format=csv,noheader,nounits",
-            ]
+        if not container_pids:
+            return 0
+
         total_memory = 0
+
         try:
+            # Method 1: Query compute apps for memory usage
+            cmd = [
+                "nvidia-smi",
+                "--query-compute-apps=pid,used_memory",
+                "--format=csv,noheader,nounits",
+            ]
             result = subprocess.run(
                 cmd,
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
                 text=True,
                 check=False,
-                timeout=
+                timeout=10,
             )
-            if result.returncode
-
-
-
-
-
-
-
-
+            if result.returncode == 0:
+                for line in result.stdout.splitlines():
+                    line = line.strip()
+                    if not line:
+                        continue
+                    # Handle both ", " and "," separators
+                    if ", " in line:
+                        parts = line.split(", ")
+                    else:
+                        parts = line.split(",")
+                    if len(parts) >= 2:
+                        process_pid = parts[0].strip()
+                        used_memory = parts[1].strip()
+                        if process_pid in container_pids:
+                            if used_memory.isdigit():
+                                total_memory += int(used_memory)
+
+            if total_memory > 0:
+                return total_memory
+
+            # Method 2: Use pmon for memory info
+            result = subprocess.run(
+                ["nvidia-smi", "pmon", "-c", "1", "-s", "m"],
+                capture_output=True,
+                text=True,
+                check=False,
+                timeout=10,
+            )
+            if result.returncode == 0:
+                pmon_output = result.stdout.strip().split("\n")
+                for line in pmon_output:
+                    if line.startswith("#") or "gpu" in line.lower() and "pid" in line.lower():
+                        continue
+                    parts = line.split()
+                    # Format: gpu pid type fb_mem (MB)
+                    if len(parts) >= 4:
+                        pid = parts[1]
+                        fb_mem = parts[3] if len(parts) > 3 else "0"
+                        if pid in container_pids:
+                            if fb_mem != "-" and fb_mem.isdigit():
+                                total_memory += int(fb_mem)
+
         except subprocess.TimeoutExpired:
-            logging.debug("nvidia-smi command timed out
+            logging.debug("nvidia-smi command timed out in get_container_gpu_memory_usage_multi_pid")
             return 0
         except (ValueError, IndexError) as e:
             logging.debug("Error parsing GPU memory usage info: %s", e)
@@ -257,72 +847,125 @@ class ResourcesTracker:
             logging.debug("nvidia-smi not found on this system")
             return 0
         except Exception as e:
-            logging.debug("Unexpected error in
+            logging.debug("Unexpected error in get_container_gpu_memory_usage_multi_pid: %s", e)
             return 0
+
         return total_memory

     @log_errors(default_return=(0, 0, 0, 0), raise_exception=False, log_error=True)
     def get_available_resources(self) -> Tuple[float, float, int, float]:
         """
         Get available machine resources.
+
+        Note: CPU measurement is non-blocking (uses interval=0).
+        For more accurate CPU usage, call this method periodically and track trends.

         Returns:
-            Tuple[float, float, int, float]:
-
+            Tuple[float, float, int, float]:
+                - Available memory in GB
+                - Available CPU percentage (100 - current_usage)
+                - Free GPU memory in MB
+                - GPU utilization percentage (0-100)
         """
-
-
+        # Memory: straightforward
+        available_memory = psutil.virtual_memory().available / (1024 ** 3)
+
+        # CPU: NON-BLOCKING - interval=0 returns instant snapshot
+        # For better accuracy, consider using load average or tracking over time
+        # Note: Inside containers, this may not reflect cgroup limits
+        try:
+            # Use interval=0 for non-blocking (returns cached value or 0.0 on first call)
+            cpu_percent = psutil.cpu_percent(1)
+            # # If first call (returns 0.0), try load average as fallback
+            # if cpu_percent == 0.0:
+            #     try:
+            #         # Use 1-minute load average as percentage of CPU count
+            #         load_avg = os.getloadavg()[0]
+            #         cpu_count = psutil.cpu_count() or 1
+            #         cpu_percent = min(100.0, (load_avg / cpu_count) * 100.0)
+            #     except (OSError, AttributeError):
+            #         # os.getloadavg() not available on Windows
+            #         pass
+            available_cpu = max(0.0, 100.0 - cpu_percent)
+        except Exception:
+            available_cpu = 100.0
+
         gpu_memory_free, gpu_utilization = self._get_gpu_resources()
         return available_memory, available_cpu, gpu_memory_free, gpu_utilization

     @log_errors(default_return=(0, 0.0), raise_exception=False, log_error=False)
     def _get_gpu_resources(self) -> Tuple[int, float]:
         """
-        Get available GPU resources.
+        Get available GPU resources using cached data.
+
+        Returns:
+            Tuple[int, float]: Free GPU memory in MB and GPU utilization percentage.
+        """
+        if not has_gpu():
+            return 0, 0.0
+
+        # Use cached GPU data for efficiency
+        gpu_data = self._get_cached_gpu_data()
+
+        if not gpu_data.get('gpus'):
+            # Cache miss or no GPUs, fall back to direct query
+            return self._get_gpu_resources_direct()
+
+        gpu_memory_free = 0
+        gpu_utilization = 0.0
+        gpu_count = 0
+
+        for gpu in gpu_data['gpus']:
+            gpu_memory_free += gpu['memory_total'] - gpu['memory_used']
+            gpu_utilization += gpu['utilization']
+            gpu_count += 1
+
+        if gpu_count > 0:
+            gpu_utilization /= gpu_count
+
+        return gpu_memory_free, gpu_utilization
+
+    @log_errors(default_return=(0, 0.0), raise_exception=False, log_error=False)
+    def _get_gpu_resources_direct(self) -> Tuple[int, float]:
+        """
+        Get GPU resources directly (fallback when cache is empty).

         Returns:
             Tuple[int, float]: Free GPU memory in MB and GPU utilization percentage.
         """
         gpu_memory_free = 0
         gpu_utilization = 0.0
-        if not has_gpu():
-            return gpu_memory_free, gpu_utilization

         try:
             result = subprocess.run(
-                ["nvidia-smi"],
+                ["nvidia-smi", "--query-gpu=memory.free,utilization.gpu", "--format=csv,noheader,nounits"],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
+                text=True,
                 timeout=5,
-                check=False,
             )
             if result.returncode != 0:
-                logging.debug("nvidia-smi command failed in _get_gpu_resources")
                 return 0, 0.0
+
+            gpu_count = 0
+            for line in result.stdout.strip().split("\n"):
+                if not line.strip():
+                    continue
+                parts = [p.strip() for p in line.split(",")]
+                if len(parts) >= 2:
+                    gpu_memory_free += int(parts[0]) if parts[0].isdigit() else 0
+                    gpu_utilization += float(parts[1]) if parts[1].replace('.', '').isdigit() else 0
+                    gpu_count += 1
+
+            if gpu_count > 0:
+                gpu_utilization /= gpu_count
+
         except subprocess.TimeoutExpired:
-            logging.debug("nvidia-smi command timed out
-            return 0, 0.0
+            logging.debug("nvidia-smi command timed out in _get_gpu_resources_direct")
         except FileNotFoundError:
             logging.debug("nvidia-smi not found on this system")
-            return 0, 0.0
         except Exception as e:
-            logging.debug("Error
-            return 0, 0.0
-
-            info_list = get_gpu_info()
-            if not info_list:
-                return 0, 0.0
-
-            try:
-                for info in info_list:
-                    info_split = info.split(", ")
-                    if len(info_split) >= 6:
-                        gpu_memory_free += int(info_split[5])
-                        gpu_utilization += float(info_split[2])
-                gpu_utilization /= len(info_list) if info_list else 1
-            except (ValueError, IndexError) as e:
-                logging.debug("Error parsing GPU resources: %s", e)
-                return 0, 0.0
+            logging.debug("Error in _get_gpu_resources_direct: %s", e)

         return gpu_memory_free, gpu_utilization

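As a small standalone illustration of the memory/CPU half of this snapshot (GPU handling omitted), assuming nothing beyond psutil; note that `psutil.cpu_percent(interval=1)` blocks for about a second per call, while `interval=0`/`None` returns immediately using the time since the previous call:

```python
import psutil
from typing import Tuple


def available_cpu_and_memory() -> Tuple[float, float]:
    """Return (available memory in GB, available CPU percent)."""
    available_memory_gb = psutil.virtual_memory().available / (1024 ** 3)
    cpu_busy = psutil.cpu_percent(interval=1)  # measured over a one-second window
    return available_memory_gb, max(0.0, 100.0 - cpu_busy)

# e.g. (12.4, 63.0) on a lightly loaded 16 GB machine (illustrative numbers)
```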
{matrice_compute-0.1.33.dist-info → matrice_compute-0.1.34.dist-info}/RECORD
@@ -1,5 +1,5 @@
 matrice_compute/__init__.py,sha256=YZhx7rQlD1TAlhBMbsU3_Xp-tpLyTAxWZDcQvqmwR2g,723
-matrice_compute/action_instance.py,sha256=
+matrice_compute/action_instance.py,sha256=GF49-yYJp_5EHZ6ZT5kY4U-y1zyPkFjjDt1xMb2BaIg,87439
 matrice_compute/actions_manager.py,sha256=a_TulMnu462xc0t_A-Mpug5zhQTmtpjiv7mhiC_IAVw,18280
 matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
 matrice_compute/compute_operations_handler.py,sha256=amcMhmXtv2irE6qK8Vbgec_8uFqjWmVVp0VWq-73_MU,17781
@@ -7,12 +7,12 @@ matrice_compute/instance_manager.py,sha256=9u3QRTP-MkAWmrSQMMbCKc0TfK584teAg1wWI
 matrice_compute/instance_utils.py,sha256=N4yPDvNukFEEBngR0lEt4x_XT5hur1q0P-spM2xQIlU,42025
 matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
 matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-matrice_compute/resources_tracker.py,sha256=
+matrice_compute/resources_tracker.py,sha256=DffKitGU1gran0OAuKIsfH0XeOe03xU7NGl-_uMsad4,58674
 matrice_compute/scaling.py,sha256=UQDI8wN9JEKafvUVPF0Pk9XmhKlbMkeu16AZyyOuSE8,55147
 matrice_compute/shutdown_manager.py,sha256=rnP9Qes6JJKDnebmBC9rqkH__X9a8TMjhWQPWoOQKFs,13232
 matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
-matrice_compute-0.1.
-matrice_compute-0.1.
-matrice_compute-0.1.
-matrice_compute-0.1.
-matrice_compute-0.1.
+matrice_compute-0.1.34.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
+matrice_compute-0.1.34.dist-info/METADATA,sha256=K4c_uaSlUeEbbC7yWB9RzW_qvLoxfgwGOk94BbbtaQs,1038
+matrice_compute-0.1.34.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+matrice_compute-0.1.34.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
+matrice_compute-0.1.34.dist-info/RECORD,,
The remaining files (WHEEL, licenses/LICENSE.txt, top_level.txt) are unchanged between 0.1.33 and 0.1.34.