matrice-compute 0.1.29 (matrice_compute-0.1.29-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/__init__.py +20 -0
- matrice_compute/action_instance.py +2023 -0
- matrice_compute/actions_manager.py +467 -0
- matrice_compute/actions_scaledown_manager.py +57 -0
- matrice_compute/compute_operations_handler.py +490 -0
- matrice_compute/instance_manager.py +470 -0
- matrice_compute/instance_utils.py +1266 -0
- matrice_compute/prechecks.py +538 -0
- matrice_compute/py.typed +0 -0
- matrice_compute/resources_tracker.py +842 -0
- matrice_compute/scaling.py +1395 -0
- matrice_compute/shutdown_manager.py +314 -0
- matrice_compute/task_utils.py +77 -0
- matrice_compute-0.1.29.dist-info/METADATA +28 -0
- matrice_compute-0.1.29.dist-info/RECORD +18 -0
- matrice_compute-0.1.29.dist-info/WHEEL +5 -0
- matrice_compute-0.1.29.dist-info/licenses/LICENSE.txt +21 -0
- matrice_compute-0.1.29.dist-info/top_level.txt +1 -0
matrice_compute/resources_tracker.py

@@ -0,0 +1,842 @@
"""
This module contains classes for tracking machine and action resources.
"""

import os
import subprocess
import logging
import threading
import json
from datetime import datetime, timezone
import psutil
import docker
from typing import List, Tuple, Dict, Optional
from matrice_compute.instance_utils import (
    has_gpu,
    get_gpu_info,
    calculate_time_difference,
)
from matrice_compute.scaling import Scaling
from matrice_common.utils import log_errors

|
193
|
+
parts = line.split()
|
|
194
|
+
if len(parts) >= 8:
|
|
195
|
+
pid = parts[1]
|
|
196
|
+
gpu_usage = parts[3]
|
|
197
|
+
if pid == str(container_pid):
|
|
198
|
+
gpu_util += float(gpu_usage) if gpu_usage != "-" else 0
|
|
199
|
+
except subprocess.TimeoutExpired:
|
|
200
|
+
logging.debug("nvidia-smi pmon command timed out after 5 seconds in get_container_gpu_usage")
|
|
201
|
+
return 0
|
|
202
|
+
except (ValueError, IndexError) as e:
|
|
203
|
+
logging.debug("Error parsing GPU usage info: %s", e)
|
|
204
|
+
return 0
|
|
205
|
+
except FileNotFoundError:
|
|
206
|
+
logging.debug("nvidia-smi not found on this system")
|
|
207
|
+
return 0
|
|
208
|
+
except Exception as e:
|
|
209
|
+
logging.debug("Unexpected error in get_container_gpu_usage: %s", e)
|
|
210
|
+
return 0
|
|
211
|
+
return gpu_util
|
|
212
|
+
|
|
213
|
+
@log_errors(default_return=0, raise_exception=False, log_error=False)
|
|
214
|
+
def get_container_gpu_memory_usage(self, container_pid: str) -> int:
|
|
215
|
+
"""
|
|
216
|
+
Get GPU memory usage for a container PID.
|
|
217
|
+
|
|
218
|
+
Args:
|
|
219
|
+
container_pid (str): PID of the Docker container.
|
|
220
|
+
|
|
221
|
+
Returns:
|
|
222
|
+
int: GPU memory usage in MB.
|
|
223
|
+
"""
|
|
224
|
+
if not has_gpu():
|
|
225
|
+
return 0
|
|
226
|
+
cmd = [
|
|
227
|
+
"nvidia-smi",
|
|
228
|
+
"--query-compute-apps=pid,used_memory",
|
|
229
|
+
"--format=csv,noheader,nounits",
|
|
230
|
+
]
|
|
231
|
+
total_memory = 0
|
|
232
|
+
try:
|
|
233
|
+
result = subprocess.run(
|
|
234
|
+
cmd,
|
|
235
|
+
stdout=subprocess.PIPE,
|
|
236
|
+
stderr=subprocess.PIPE,
|
|
237
|
+
text=True,
|
|
238
|
+
check=False,
|
|
239
|
+
timeout=5,
|
|
240
|
+
)
|
|
241
|
+
if result.returncode != 0:
|
|
242
|
+
logging.debug("nvidia-smi command failed in get_container_gpu_memory_usage")
|
|
243
|
+
return 0
|
|
244
|
+
for line in result.stdout.splitlines():
|
|
245
|
+
parts = line.strip().split(", ")
|
|
246
|
+
if len(parts) == 2:
|
|
247
|
+
process_pid, used_memory = parts
|
|
248
|
+
if process_pid == str(container_pid):
|
|
249
|
+
total_memory += int(used_memory)
|
|
250
|
+
except subprocess.TimeoutExpired:
|
|
251
|
+
logging.debug("nvidia-smi command timed out after 5 seconds in get_container_gpu_memory_usage")
|
|
252
|
+
return 0
|
|
253
|
+
except (ValueError, IndexError) as e:
|
|
254
|
+
logging.debug("Error parsing GPU memory usage info: %s", e)
|
|
255
|
+
return 0
|
|
256
|
+
except FileNotFoundError:
|
|
257
|
+
logging.debug("nvidia-smi not found on this system")
|
|
258
|
+
return 0
|
|
259
|
+
except Exception as e:
|
|
260
|
+
logging.debug("Unexpected error in get_container_gpu_memory_usage: %s", e)
|
|
261
|
+
return 0
|
|
262
|
+
return total_memory
|
|
263
|
+
|
|
264
|
+
@log_errors(default_return=(0, 0, 0, 0), raise_exception=False, log_error=True)
|
|
265
|
+
def get_available_resources(self) -> Tuple[float, float, int, float]:
|
|
266
|
+
"""
|
|
267
|
+
Get available machine resources.
|
|
268
|
+
|
|
269
|
+
Returns:
|
|
270
|
+
Tuple[float, float, int, float]: Available memory in GB, available CPU percentage,
|
|
271
|
+
free GPU memory in MB, and GPU utilization percentage.
|
|
272
|
+
"""
|
|
273
|
+
available_memory = psutil.virtual_memory().available / 1024**3
|
|
274
|
+
available_cpu = 100 - psutil.cpu_percent(1)
|
|
275
|
+
gpu_memory_free, gpu_utilization = self._get_gpu_resources()
|
|
276
|
+
return available_memory, available_cpu, gpu_memory_free, gpu_utilization
|
|
277
|
+
|
|
278
|
+
@log_errors(default_return=(0, 0.0), raise_exception=False, log_error=False)
|
|
279
|
+
def _get_gpu_resources(self) -> Tuple[int, float]:
|
|
280
|
+
"""
|
|
281
|
+
Get available GPU resources.
|
|
282
|
+
|
|
283
|
+
Returns:
|
|
284
|
+
Tuple[int, float]: Free GPU memory in MB and GPU utilization percentage.
|
|
285
|
+
"""
|
|
286
|
+
gpu_memory_free = 0
|
|
287
|
+
gpu_utilization = 0.0
|
|
288
|
+
if not has_gpu():
|
|
289
|
+
return gpu_memory_free, gpu_utilization
|
|
290
|
+
|
|
291
|
+
try:
|
|
292
|
+
result = subprocess.run(
|
|
293
|
+
["nvidia-smi"],
|
|
294
|
+
stdout=subprocess.PIPE,
|
|
295
|
+
stderr=subprocess.PIPE,
|
|
296
|
+
timeout=5,
|
|
297
|
+
check=False,
|
|
298
|
+
)
|
|
299
|
+
if result.returncode != 0:
|
|
300
|
+
logging.debug("nvidia-smi command failed in _get_gpu_resources")
|
|
301
|
+
return 0, 0.0
|
|
302
|
+
except subprocess.TimeoutExpired:
|
|
303
|
+
logging.debug("nvidia-smi command timed out after 5 seconds in _get_gpu_resources")
|
|
304
|
+
return 0, 0.0
|
|
305
|
+
except FileNotFoundError:
|
|
306
|
+
logging.debug("nvidia-smi not found on this system")
|
|
307
|
+
return 0, 0.0
|
|
308
|
+
except Exception as e:
|
|
309
|
+
logging.debug("Error running nvidia-smi in _get_gpu_resources: %s", e)
|
|
310
|
+
return 0, 0.0
|
|
311
|
+
|
|
312
|
+
info_list = get_gpu_info()
|
|
313
|
+
if not info_list:
|
|
314
|
+
return 0, 0.0
|
|
315
|
+
|
|
316
|
+
try:
|
|
317
|
+
for info in info_list:
|
|
318
|
+
info_split = info.split(", ")
|
|
319
|
+
if len(info_split) >= 6:
|
|
320
|
+
gpu_memory_free += int(info_split[5])
|
|
321
|
+
gpu_utilization += float(info_split[2])
|
|
322
|
+
gpu_utilization /= len(info_list) if info_list else 1
|
|
323
|
+
except (ValueError, IndexError) as e:
|
|
324
|
+
logging.debug("Error parsing GPU resources: %s", e)
|
|
325
|
+
return 0, 0.0
|
|
326
|
+
|
|
327
|
+
return gpu_memory_free, gpu_utilization
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
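
A minimal usage sketch (editor's illustration, not part of the packaged file), assuming a local Docker daemon and, for the GPU figures, a working nvidia-smi; on CPU-only hosts the GPU helpers simply return zeros:

    import docker
    from matrice_compute.resources_tracker import ResourcesTracker

    tracker = ResourcesTracker()

    # Host-level availability: free RAM (GB), idle CPU (%), free GPU memory (MB), GPU utilisation (%)
    mem_gb, cpu_pct, gpu_mem_mb, gpu_util = tracker.get_available_resources()
    print(f"RAM free: {mem_gb:.1f} GB, CPU idle: {cpu_pct:.0f}%, GPU mem free: {gpu_mem_mb} MB")

    # Per-container usage for every running container
    for container in docker.from_env().containers.list():
        cpu, mem = tracker.get_container_cpu_and_memory(container)
        gpu, gpu_mem = tracker.get_container_gpu_info(container_id=container.id)
        print(container.name, cpu, mem, gpu, gpu_mem)
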

class ActionsResourcesTracker:
    """Tracks Docker container action resources"""

    def __init__(self, scaling: Scaling):
        """Initialize ActionsResourcesTracker"""
        self.scaling = scaling
        self.max_actions_usage = {}
        self.resources_tracker = ResourcesTracker()
        self.client = docker.from_env()
        self.logged_stopped_containers = []

    @log_errors(raise_exception=False, log_error=True)
    def update_actions_resources(self) -> None:
        """Process both running and exited containers.

        Note: Does not remove containers to keep logs. Only tracks resource usage.
        """
        exited_containers = self.client.containers.list(
            filters={"status": "exited"},
            all=True,
        )
        running_containers = self.client.containers.list(filters={"status": "running"})
        if exited_containers:
            for container in exited_containers:
                try:
                    if container.id in self.logged_stopped_containers:
                        continue
                    self._update_container_action_status(container, "completed")
                    self.logged_stopped_containers.append(container.id)
                    # COMMENTED OUT: Do not remove containers to keep logs
                    # container.remove()
                except Exception as err:
                    logging.error(
                        "Error processing exited container %s: %s",
                        container.id,
                        str(err),
                    )
        if running_containers:
            for container in running_containers:
                try:
                    self._update_container_action_status(container, "running")
                except Exception as err:
                    logging.error(
                        "Error processing running container %s: %s",
                        container.id,
                        str(err),
                    )

    @log_errors(default_return=[], raise_exception=False)
    def get_sub_containers_by_label(self, label_key: str, label_value: str) -> list:
        """Get running containers with specified label key and value"""
        containers = self.client.containers.list(
            filters={
                "label": [f"{label_key}={label_value}"],
                "status": "running",
            }
        )
        return containers

    @log_errors(raise_exception=False, log_error=True)
    def _update_container_action_status(self, container, status: str) -> None:
        """Update action status for a specific container"""
        inspect_data = self.client.api.inspect_container(container.id)
        start_time = inspect_data["State"]["StartedAt"]
        finish_time = (
            inspect_data["State"]["FinishedAt"]
            if status == "completed"
            else datetime.now(timezone.utc).isoformat()
        )

        def remove_quotation_marks(args):
            """Remove quotes from container args"""
            new_args = []
            for arg in args:
                new_args.extend(x.replace('"', "").replace("'", "") for x in arg.split(" "))
            return new_args

        def is_valid_objectid(s: str) -> bool:
            """Check if string is a valid MongoDB ObjectId (24 hex characters)"""
            s = s.strip()
            return len(s) == 24 and all(c in '0123456789abcdefABCDEF' for c in s)

        valid_objectids = [arg for arg in remove_quotation_marks(inspect_data["Args"]) if is_valid_objectid(arg)]
        action_record_id = valid_objectids[-1] if valid_objectids else None
        if not action_record_id:
            logging.debug("No valid action_id found for the container. Container ID: %s, Args: %s", container.id, inspect_data["Args"])
        duration = calculate_time_difference(start_time, finish_time)
        (
            current_gpu_utilization,
            current_gpu_memory,
            current_cpu_utilization,
            current_memory_utilization,
        ) = self.get_current_action_usage(container, status)
        sub_containers = self.get_sub_containers_by_label("action_id", action_record_id)
        for sub_container in sub_containers:
            if sub_container.id in self.logged_stopped_containers:
                continue
            (
                sub_container_gpu_utilization,
                sub_container_gpu_memory,
                sub_container_cpu_utilization,
                sub_container_memory_utilization,
            ) = self.get_current_action_usage(sub_container, status)
            current_gpu_utilization += sub_container_gpu_utilization
            current_gpu_memory += sub_container_gpu_memory
            current_cpu_utilization += sub_container_cpu_utilization
            current_memory_utilization += sub_container_memory_utilization
            # COMMENTED OUT: Do not stop/remove sub-containers to keep logs
            if status == "completed":
                try:
                    sub_container.stop()
                    self.logged_stopped_containers.append(sub_container.id)
                    # sub_container.remove(force=True)
                except Exception as err:
                    logging.error(
                        "Error removing sub-container %s: %s",
                        sub_container.id,
                        str(err),
                    )
        (
            max_gpu_utilization,
            max_gpu_memory,
            max_cpu_utilization,
            max_memory_utilization,
        ) = self.update_max_action_usage(
            action_record_id,
            current_gpu_utilization,
            current_gpu_memory,
            current_cpu_utilization,
            current_memory_utilization,
        )
        logging.info(
            "Updating action status: service_provider=%s, action_id=%s, running=%s, status=%s, duration=%s, start=%s, gpu_util=%.2f%%, cpu_util=%.2f%%, gpu_mem=%dMB, mem_util=%.2f%%, created=%s, updated=%s",
            os.environ["SERVICE_PROVIDER"],
            action_record_id,
            status == "running",
            status,
            duration,
            start_time,
            max_gpu_utilization,
            max_cpu_utilization,
            max_gpu_memory,
            max_memory_utilization,
            start_time,
            finish_time,
        )
        self.scaling.update_action_status(
            service_provider=os.environ["SERVICE_PROVIDER"],
            action_record_id=action_record_id,
            isRunning=status == "running",
            status=status,
            action_duration=duration,
            docker_start_time=start_time,
            gpuUtilisation=max_gpu_utilization,
            cpuUtilisation=max_cpu_utilization,
            gpuMemoryUsed=max_gpu_memory,
            memoryUtilisation=max_memory_utilization,
            createdAt=start_time,
            updatedAt=finish_time,
        )

    @log_errors(default_return=(0, 0, 0, 0), raise_exception=False)
    def get_current_action_usage(self, container, status: str) -> Tuple[float, int, float, float]:
        """Get current resource usage for a container"""
        current_gpu_utilization = 0
        current_gpu_memory = 0
        current_cpu_utilization = 0
        current_memory_utilization = 0
        if status == "running":
            try:
                (
                    current_cpu_utilization,
                    current_memory_utilization,
                ) = self.resources_tracker.get_container_cpu_and_memory(container)
                (
                    current_gpu_utilization,
                    current_gpu_memory,
                ) = self.resources_tracker.get_container_gpu_info(container_id=container.id)
            except Exception as err:
                logging.error(
                    "Error getting container usage metrics: %s",
                    str(err),
                )
        return (
            current_gpu_utilization,
            current_gpu_memory,
            current_cpu_utilization,
            current_memory_utilization,
        )

    @log_errors(default_return=(0, 0, 0, 0), raise_exception=False, log_error=True)
    def update_max_action_usage(
        self,
        action_record_id: str,
        current_gpu_utilization: float,
        current_gpu_memory: int,
        current_cpu_utilization: float,
        current_memory_utilization: float,
    ) -> Tuple[float, int, float, float]:
        """Update and return maximum resource usage values for an action"""
        if action_record_id not in self.max_actions_usage:
            self.max_actions_usage[action_record_id] = {
                "gpu_utilization": 0,
                "gpu_memory": 0,
                "cpu_utilization": 0,
                "memory_utilization": 0,
            }
        current_values = {
            "gpu_utilization": current_gpu_utilization or 0,
            "gpu_memory": current_gpu_memory or 0,
            "cpu_utilization": current_cpu_utilization or 0,
            "memory_utilization": current_memory_utilization or 0,
        }
        for key in current_values:
            self.max_actions_usage[action_record_id][key] = max(
                current_values[key],
                self.max_actions_usage[action_record_id][key],
            )
        return (
            self.max_actions_usage[action_record_id]["gpu_utilization"],
            self.max_actions_usage[action_record_id]["gpu_memory"],
            self.max_actions_usage[action_record_id]["cpu_utilization"],
            self.max_actions_usage[action_record_id]["memory_utilization"],
        )
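
A sketch of the polling loop an agent might run around this class (editor's illustration; `scaling` is assumed to be an already-configured matrice_compute.scaling.Scaling client, and SERVICE_PROVIDER must be set in the environment because _update_container_action_status reads it):

    import time
    from matrice_compute.resources_tracker import ActionsResourcesTracker
    from matrice_compute.scaling import Scaling

    def track_actions(scaling: Scaling, poll_seconds: int = 30) -> None:
        """Report per-action peak usage until interrupted (poll interval is illustrative)."""
        tracker = ActionsResourcesTracker(scaling)
        while True:
            tracker.update_actions_resources()  # pushes peaks via scaling.update_action_status()
            time.sleep(poll_seconds)
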

class MachineResourcesTracker:
    """Tracks machine-level resources like CPU, memory and GPU"""

    def __init__(self, scaling: Scaling):
        """Initialize MachineResourcesTracker"""
        self.scaling = scaling
        self.resources_tracker = ResourcesTracker()

    @log_errors(raise_exception=False, log_error=True)
    def update_available_resources(self):
        """Update available machine resources"""
        (
            available_memory,
            available_cpu,
            gpu_memory_free,
            gpu_utilization,
        ) = self.resources_tracker.get_available_resources()
        _, err, _ = self.scaling.update_available_resources(
            availableCPU=available_cpu,
            availableMemory=available_memory,
            availableGPU=100 - gpu_utilization,
            availableGPUMemory=gpu_memory_free,
        )
        if err is not None:
            logging.error(
                "Error in updating available resources: %s",
                err,
            )
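
The machine-level tracker follows the same pattern at host granularity; a two-line sketch under the same `scaling` assumption. Note it reports availableGPU as 100 minus the averaged GPU utilisation returned by ResourcesTracker._get_gpu_resources():

    machine_tracker = MachineResourcesTracker(scaling)  # `scaling`: configured Scaling client (assumed)
    machine_tracker.update_available_resources()        # sends availableCPU/Memory/GPU/GPUMemory upstream
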

class KafkaResourceMonitor:
    """
    Monitors system resources and publishes them to Kafka in a separate thread.
    This class provides thread-safe start/stop operations for resource monitoring.
    """

    def __init__(
        self,
        instance_id: Optional[str] = None,
        kafka_bootstrap: Optional[str] = None,
        interval_seconds: int = 60,
    ):
        """
        Initialize KafkaResourceMonitor.

        Args:
            instance_id: Instance identifier for Kafka topic. Defaults to INSTANCE_ID env var.
            kafka_bootstrap: Kafka bootstrap servers. Required - should be obtained from Scaling.get_kafka_bootstrap_servers().
            interval_seconds: Interval between resource checks in seconds. Defaults to 60.
        """
        self.instance_id = instance_id or os.getenv("INSTANCE_ID")
        if not self.instance_id:
            raise ValueError("instance_id must be provided or INSTANCE_ID env var must be set")

        if not kafka_bootstrap:
            raise ValueError("kafka_bootstrap must be provided - use Scaling.get_kafka_bootstrap_servers() to get internal Kafka config")

        self.kafka_bootstrap = kafka_bootstrap
        self.interval_seconds = interval_seconds
        self.topic_name = "compute_instance_resource_utilization"

        self._stop_event = threading.Event()
        self._monitor_thread: Optional[threading.Thread] = None
        self._producer = None
        self._is_running = False

    @staticmethod
    def get_all_gpu_memory() -> Dict[int, tuple]:
        """
        Get GPU memory usage and total for all GPUs.

        Returns:
            Dict[int, tuple]: Dictionary mapping GPU ID to (used_gb, total_gb).
            Returns empty dict if nvidia-smi is not available.
        """
        gpu_usage = {}

        try:
            cmd = [
                "nvidia-smi",
                "--query-gpu=index,memory.used,memory.total",
                "--format=csv,noheader,nounits"
            ]
            result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL, timeout=5)
            lines = result.decode().strip().split("\n")

            for line in lines:
                gpu_id_str, mem_used_mb_str, mem_total_mb_str = line.split(",")
                gpu_id = int(gpu_id_str.strip())
                mem_used_gb = int(mem_used_mb_str.strip()) / 1024  # MB → GB
                mem_total_gb = int(mem_total_mb_str.strip()) / 1024  # MB → GB
                gpu_usage[gpu_id] = (round(mem_used_gb, 2), round(mem_total_gb, 2))

        except Exception as e:
            logging.debug("Failed to get GPU memory info: %s", e)
            return {}

        return gpu_usage

    @staticmethod
    def get_all_storage_info() -> Dict[str, float]:
        """
        Get free storage space for all mounted drives.

        Returns:
            Dict[str, float]: Dictionary mapping mount point to free storage space in GB.
        """
        storage_info = {}

        try:
            # Get all disk partitions
            partitions = psutil.disk_partitions()

            for partition in partitions:
                try:
                    # Get usage statistics for this partition
                    usage = psutil.disk_usage(partition.mountpoint)

                    # Convert bytes to GB
                    free_gb = usage.free / (1024 ** 3)

                    storage_info[partition.mountpoint] = round(free_gb, 2)

                except PermissionError:
                    # Skip drives that we can't access (common on Windows)
                    logging.debug("Permission denied accessing %s", partition.mountpoint)
                    continue
                except Exception as e:
                    logging.debug("Error getting storage info for %s: %s", partition.mountpoint, e)
                    continue

        except Exception as e:
            logging.debug("Failed to get storage info: %s", e)
            return {}

        return storage_info

    def get_stats(self) -> Tuple[float, int, float, float, Dict[int, tuple], Dict[str, float]]:
        """
        Collect current system resource statistics.

        Returns:
            Tuple[float, int, float, float, Dict[int, tuple], Dict[str, float]]:
                CPU usage %, CPU cores, RAM total GB, RAM used GB, GPU memory dict (used, total), Free storage dict
        """
        cpu_usage = psutil.cpu_percent(interval=1)
        cpu_cores = psutil.cpu_count(logical=True)  # Total logical CPU cores

        mem = psutil.virtual_memory()
        ram_total = mem.total / (1024 ** 3)
        ram_used = mem.used / (1024 ** 3)

        gpu_usage = self.get_all_gpu_memory()
        storage_info = self.get_all_storage_info()

        return cpu_usage, cpu_cores, ram_total, ram_used, gpu_usage, storage_info

    def _monitor_worker(self):
        """
        Worker function that runs in a separate thread to monitor and publish resources.
        """
        try:
            from kafka import KafkaProducer

            self._producer = KafkaProducer(
                bootstrap_servers=self.kafka_bootstrap,
                value_serializer=lambda v: json.dumps(v).encode("utf-8"),
                retries=5,
            )
            logging.info("Kafka resource monitor started. Publishing to topic: %s", self.topic_name)

        except ImportError:
            logging.error("kafka-python not installed. Install with: pip install kafka-python")
            return
        except Exception as e:
            logging.error("Failed to initialize Kafka producer: %s", e)
            return

        while not self._stop_event.is_set():
            try:
                cpu, cpu_cores, total, used, gpus, storage = self.get_stats()

                # Format GPU info for output: {0: {"used_gb": x, "total_gb": y}, ...}
                gpu_memory_gb = {k: {"used_gb": v[0], "total_gb": v[1]} for k, v in gpus.items()}
                payload = {
                    "instance_id": self.instance_id,
                    "cpu_usage_percent": round(cpu, 2),
                    "cpu_cores": cpu_cores,
                    "ram_total_gb": round(total, 2),
                    "ram_used_gb": round(used, 2),
                    "gpu_memory_gb": gpu_memory_gb,  # dict: {0: {used_gb, total_gb}, ...}
                    "free_storage_gb": storage,  # dict: {"/": 50.5, "C:": 123.4}
                    "timestamp": datetime.now(timezone.utc).isoformat()
                }

                self._producer.send(self.topic_name, payload)
                self._producer.flush()

                logging.debug("Published resource stats: %s", payload)

            except Exception as e:
                logging.error("Error in resource monitor loop: %s", e)

            # Wait for interval or until stop event is set
            if self._stop_event.wait(self.interval_seconds):
                break

        # Cleanup
        if self._producer:
            try:
                self._producer.close()
            except Exception as e:
                logging.debug("Error closing Kafka producer: %s", e)

        logging.info("Kafka resource monitor stopped.")

    @log_errors(raise_exception=False, log_error=True)
    def start(self):
        """
        Start the resource monitoring thread.

        Returns:
            bool: True if started successfully, False otherwise.
        """
        if self._is_running:
            logging.warning("Kafka resource monitor is already running.")
            return False

        self._stop_event.clear()
        self._monitor_thread = threading.Thread(
            target=self._monitor_worker,
            daemon=True,
            name="KafkaResourceMonitor"
        )
        self._monitor_thread.start()
        self._is_running = True

        logging.info("Started Kafka resource monitor thread.")
        return True

    @log_errors(raise_exception=False, log_error=True)
    def stop(self, timeout: int = 10):
        """
        Stop the resource monitoring thread gracefully.

        Args:
            timeout: Maximum time to wait for thread to stop in seconds.

        Returns:
            bool: True if stopped successfully, False otherwise.
        """
        if not self._is_running:
            logging.warning("Kafka resource monitor is not running.")
            return False

        logging.info("Stopping Kafka resource monitor...")
        self._stop_event.set()

        if self._monitor_thread and self._monitor_thread.is_alive():
            self._monitor_thread.join(timeout=timeout)

            if self._monitor_thread.is_alive():
                logging.error("Kafka resource monitor thread did not stop within timeout.")
                return False

        self._is_running = False
        logging.info("Kafka resource monitor stopped successfully.")
        return True

    def is_running(self) -> bool:
        """
        Check if the resource monitor is currently running.

        Returns:
            bool: True if running, False otherwise.
        """
        return self._is_running

    def __enter__(self):
        """Context manager entry."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        self.stop()