matrice-compute 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,478 @@
1
+ """
2
+ This module contains classes for tracking machine and action resources.
3
+ """
4
+
5
+ import os
6
+ import subprocess
7
+ import logging
8
+ from datetime import datetime, timezone
9
+ import psutil
10
+ import docker
11
+ from typing import List, Tuple, Dict, Optional
12
+ from matrice_compute.instance_utils import (
13
+ has_gpu,
14
+ get_gpu_info,
15
+ calculate_time_difference,
16
+ )
17
+ from matrice_compute.scaling import Scaling
18
+ from matrice_common.utils import log_errors
19
+
20
+
21
class ResourcesTracker:
    """Tracks machine and container resources (CPU, memory and GPU)."""

    def __init__(self) -> None:
        """Initialize ResourcesTracker (stateless; nothing to set up)."""
        pass

    @log_errors(default_return=(0, 0), raise_exception=False)
    def get_container_cpu_and_memory(self, container: docker.models.containers.Container) -> Tuple[float, float]:
        """
        Get CPU and memory usage for a container.

        Args:
            container (docker.models.containers.Container): Docker container instance.

        Returns:
            Tuple[float, float]: CPU utilization percentage and memory
            utilization percentage. (0, 0) when stats are unavailable.
        """
        stats = container.stats(stream=False)
        if not stats:
            return 0, 0
        cpu_utilization = 0
        cpu_delta = (
            stats["cpu_stats"]["cpu_usage"]["total_usage"]
            - stats["precpu_stats"]["cpu_usage"]["total_usage"]
        )
        system_delta = stats["cpu_stats"].get("system_cpu_usage", 0) - stats[
            "precpu_stats"
        ].get("system_cpu_usage", 0)
        if system_delta > 0:
            # Match the formula used by `docker stats`: scale the delta ratio
            # by the number of CPUs available to the container. Without this
            # multiplier the percentage is understated on multi-core hosts and
            # disagrees with get_container_cpu_and_memory_with_container_id(),
            # which parses `docker stats` output directly.
            online_cpus = stats["cpu_stats"].get("online_cpus") or len(
                stats["cpu_stats"]["cpu_usage"].get("percpu_usage") or [0]
            )
            cpu_utilization = cpu_delta / system_delta * online_cpus * 100.0
        memory_usage = stats["memory_stats"].get("usage", 0)
        # `or 1` guards against a present-but-zero limit (the .get default
        # only covers a missing key), preventing ZeroDivisionError.
        memory_limit = stats["memory_stats"].get("limit", 1) or 1
        memory_utilization = memory_usage / memory_limit * 100.0
        return cpu_utilization, memory_utilization

    @log_errors(default_return=(0, 0), raise_exception=False)
    def get_container_cpu_and_memory_with_container_id(self, container_id: str) -> Tuple[float, float]:
        """
        Get CPU and memory usage for a specific container by its ID.

        Args:
            container_id (str): ID of the Docker container.

        Returns:
            Tuple[float, float]: CPU utilization percentage and memory usage in MB.
        """
        stats_result = subprocess.run(
            [
                "docker",
                "stats",
                "--no-stream",
                "--format",
                "{{.ID}}: {{.CPUPerc}} CPU, {{.MemUsage}} RAM",
                container_id,
            ],
            capture_output=True,
            text=True,
            check=True,
        )
        # Output shape: "<id>: <cpu%> CPU, <used> / <limit> RAM".
        stats = stats_result.stdout.strip().split(": ")[1].split(", ")
        cpu_usage = float(stats[0].replace("% CPU", "").strip())
        memory_usage = stats[1].split(" / ")[0]
        # docker reports binary units (KiB/MiB/GiB); normalize to MiB.
        mem_value, mem_unit = memory_usage[:-3], memory_usage[-3:]
        if mem_unit == "KiB":
            memory_usage_mb = float(mem_value) / 1024
        elif mem_unit == "MiB":
            memory_usage_mb = float(mem_value)
        elif mem_unit == "GiB":
            memory_usage_mb = float(mem_value) * 1024
        else:
            # NOTE(review): unrecognized suffixes (e.g. plain "B") fall through
            # with the value treated as MiB — TODO confirm acceptable.
            memory_usage_mb = float(mem_value)
        return cpu_usage, memory_usage_mb

    @log_errors(default_return=(0, 0), raise_exception=False, log_error=False)
    def get_container_gpu_info(self, container_id: str) -> Tuple[float, int]:
        """
        Get GPU usage for a specific container.

        Args:
            container_id (str): ID of the Docker container.

        Returns:
            Tuple[float, int]: GPU utilization percentage and GPU memory usage in MB.
        """
        container_pid = self.get_pid_id_by_container_id(container_id)
        gpu_util = self.get_container_gpu_usage(container_pid)
        gpu_mem_used = self.get_container_gpu_memory_usage(container_pid)
        return gpu_util, gpu_mem_used

    @log_errors(default_return="", raise_exception=False)
    def get_pid_id_by_container_id(self, container_id: str) -> str:
        """
        Get the host PID for a container ID via `docker inspect`.

        Args:
            container_id (str): ID of the Docker container.

        Returns:
            str: PID of the container's main process (as a string).
        """
        pid_result = subprocess.run(
            [
                "docker",
                "inspect",
                "--format",
                "{{.State.Pid}}",
                container_id,
            ],
            capture_output=True,
            text=True,
            check=True,
        )
        return pid_result.stdout.strip()

    @log_errors(default_return=0, raise_exception=False, log_error=False)
    def get_container_gpu_usage(self, container_pid: str) -> float:
        """
        Get GPU utilization attributed to a container PID via `nvidia-smi pmon`.

        Args:
            container_pid (str): PID of the Docker container's main process.

        Returns:
            float: Summed GPU utilization percentage across GPUs for this PID.
        """
        if not has_gpu():
            return 0
        gpu_util = 0
        result = subprocess.run(
            ["nvidia-smi", "pmon", "-c", "1"],
            capture_output=True,
            text=True,
            check=True,
        )
        pmon_output = result.stdout.strip().split("\n")
        # First two lines of pmon output are column headers.
        for line in pmon_output[2:]:
            parts = line.split()
            if len(parts) >= 8:
                pid = parts[1]
                gpu_usage = parts[3]
                if pid == str(container_pid):
                    # pmon prints "-" when a sample is unavailable.
                    gpu_util += float(gpu_usage) if gpu_usage != "-" else 0
        return gpu_util

    @log_errors(default_return=0, raise_exception=False, log_error=False)
    def get_container_gpu_memory_usage(self, container_pid: str) -> int:
        """
        Get GPU memory usage attributed to a container PID.

        Args:
            container_pid (str): PID of the Docker container's main process.

        Returns:
            int: Total GPU memory usage in MB across all matching compute apps.
        """
        if not has_gpu():
            return 0
        cmd = [
            "nvidia-smi",
            "--query-compute-apps=pid,used_memory",
            "--format=csv,noheader,nounits",
        ]
        total_memory = 0
        result = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
        for line in result.stdout.splitlines():
            parts = line.strip().split(", ")
            if len(parts) == 2:
                process_pid, used_memory = parts
                if process_pid == str(container_pid):
                    total_memory += int(used_memory)
        return total_memory

    @log_errors(default_return=(0, 0, 0, 0), raise_exception=False, log_error=True)
    def get_available_resources(self) -> Tuple[float, float, int, float]:
        """
        Get available machine resources.

        Returns:
            Tuple[float, float, int, float]: Available memory in GB, available
            CPU percentage, free GPU memory in MB, and GPU utilization percentage.
        """
        available_memory = psutil.virtual_memory().available / 1024**3
        # cpu_percent(1) samples over a 1-second interval (blocking).
        available_cpu = 100 - psutil.cpu_percent(1)
        gpu_memory_free, gpu_utilization = self._get_gpu_resources()
        return available_memory, available_cpu, gpu_memory_free, gpu_utilization

    @log_errors(default_return=(0, 0.0), raise_exception=False, log_error=False)
    def _get_gpu_resources(self) -> Tuple[int, float]:
        """
        Get available GPU resources.

        Returns:
            Tuple[int, float]: Free GPU memory in MB (summed across GPUs) and
            average GPU utilization percentage.
        """
        gpu_memory_free = 0
        gpu_utilization = 0.0
        if not has_gpu():
            return gpu_memory_free, gpu_utilization

        # Raises if the driver is unavailable; the decorator converts that
        # into the (0, 0.0) default.
        subprocess.check_output("nvidia-smi")
        info_list = get_gpu_info()
        for info in info_list:
            # NOTE(review): assumes get_gpu_info() yields CSV rows with
            # utilization at index 2 and free memory at index 5 — confirm
            # against instance_utils.
            info_split = info.split(", ")
            gpu_memory_free += int(info_split[5])
            gpu_utilization += float(info_split[2])
        gpu_utilization /= len(info_list) if info_list else 1

        return gpu_memory_free, gpu_utilization
239
+
240
+
241
class ActionsResourcesTracker:
    """Tracks Docker container action resources"""

    def __init__(self, scaling: Scaling):
        """Initialize ActionsResourcesTracker.

        Args:
            scaling (Scaling): Client used to push action status updates.
        """
        self.scaling = scaling
        # Peak observed usage per action, keyed by action_record_id.
        self.max_actions_usage = {}
        self.resources_tracker = ResourcesTracker()
        self.client = docker.from_env()

    @log_errors(raise_exception=False, log_error=True)
    def update_actions_resources(self) -> None:
        """Process both running and exited containers.

        Exited containers are reported as "completed" and then removed;
        running containers are reported as "running". Failures on one
        container are logged and do not block the others.
        """
        exited_containers = self.client.containers.list(
            filters={"status": "exited"},
            all=True,
        )
        running_containers = self.client.containers.list(filters={"status": "running"})
        for container in exited_containers:
            try:
                self._update_container_action_status(container, "completed")
                container.remove()
            except Exception as err:
                logging.error(
                    "Error processing exited container %s: %s",
                    container.id,
                    str(err),
                )
        for container in running_containers:
            try:
                self._update_container_action_status(container, "running")
            except Exception as err:
                logging.error(
                    "Error processing running container %s: %s",
                    container.id,
                    str(err),
                )

    @log_errors(default_return=[], raise_exception=False)
    def get_sub_containers_by_label(self, label_key: str, label_value: str) -> list:
        """Get running containers with the specified label key and value.

        Args:
            label_key (str): Label name to filter on.
            label_value (str): Required label value.

        Returns:
            list: Matching running containers ([] on error).
        """
        containers = self.client.containers.list(
            filters={
                "label": [f"{label_key}={label_value}"],
                "status": "running",
            }
        )
        return containers

    @log_errors(raise_exception=False, log_error=True)
    def _update_container_action_status(self, container, status: str) -> None:
        """Update action status (usage, duration) for a specific container.

        Args:
            container: Docker container whose action is being reported.
            status (str): Either "running" or "completed".
        """
        inspect_data = self.client.api.inspect_container(container.id)
        start_time = inspect_data["State"]["StartedAt"]
        # Running containers have no FinishedAt yet; use "now" (UTC).
        finish_time = (
            inspect_data["State"]["FinishedAt"]
            if status == "completed"
            else datetime.now(timezone.utc).isoformat()
        )

        def remove_quotation_marks(args):
            """Remove quotes from container args"""
            new_args = []
            for arg in args:
                new_args.extend(x.replace('"', "").replace("'", "") for x in arg.split(" "))
            return new_args

        # The action record id is a 24-char hex ObjectId passed in the
        # container args; "pypi" tokens of the same length are excluded.
        args_24 = [arg for arg in remove_quotation_marks(inspect_data["Args"]) if len(arg) == 24 and "pypi" not in arg]
        action_record_id = args_24[-1] if args_24 else None
        if not action_record_id:
            logging.warning("No valid action_id found for the container. Container ID: %s, Args: %s", container.id, inspect_data["Args"])
            # Bug fix: without an id there is nothing to report; previously we
            # fell through and called update_action_status with None.
            return
        duration = calculate_time_difference(start_time, finish_time)
        (
            current_gpu_utilization,
            current_gpu_memory,
            current_cpu_utilization,
            current_memory_utilization,
        ) = self.get_current_action_usage(container, status)
        # Aggregate usage of helper containers labelled with this action id.
        sub_containers = self.get_sub_containers_by_label("action_id", action_record_id)
        for sub_container in sub_containers:
            (
                sub_container_gpu_utilization,
                sub_container_gpu_memory,
                sub_container_cpu_utilization,
                sub_container_memory_utilization,
            ) = self.get_current_action_usage(sub_container, status)
            current_gpu_utilization += sub_container_gpu_utilization
            current_gpu_memory += sub_container_gpu_memory
            current_cpu_utilization += sub_container_cpu_utilization
            current_memory_utilization += sub_container_memory_utilization
            if status == "completed":
                # The parent action finished: tear down its helpers too.
                try:
                    sub_container.stop()
                    sub_container.remove(force=True)
                except Exception as err:
                    logging.error(
                        "Error removing sub-container %s: %s",
                        sub_container.id,
                        str(err),
                    )
        (
            max_gpu_utilization,
            max_gpu_memory,
            max_cpu_utilization,
            max_memory_utilization,
        ) = self.update_max_action_usage(
            action_record_id,
            current_gpu_utilization,
            current_gpu_memory,
            current_cpu_utilization,
            current_memory_utilization,
        )
        logging.info(
            "Updating action status: service_provider=%s, action_id=%s, running=%s, status=%s, duration=%s, start=%s, gpu_util=%.2f%%, cpu_util=%.2f%%, gpu_mem=%dMB, mem_util=%.2f%%, created=%s, updated=%s",
            os.environ["SERVICE_PROVIDER"],
            action_record_id,
            status == "running",
            status,
            duration,
            start_time,
            max_gpu_utilization,
            max_cpu_utilization,
            max_gpu_memory,
            max_memory_utilization,
            start_time,
            finish_time,
        )
        self.scaling.update_action_status(
            service_provider=os.environ["SERVICE_PROVIDER"],
            action_record_id=action_record_id,
            isRunning=status == "running",
            status=status,
            action_duration=duration,
            docker_start_time=start_time,
            gpuUtilisation=max_gpu_utilization,
            cpuUtilisation=max_cpu_utilization,
            gpuMemoryUsed=max_gpu_memory,
            memoryUtilisation=max_memory_utilization,
            createdAt=start_time,
            updatedAt=finish_time,
        )

    @log_errors(default_return=(0, 0, 0, 0), raise_exception=False)
    def get_current_action_usage(self, container, status: str) -> Tuple[float, int, float, float]:
        """Get current resource usage for a container.

        Args:
            container: Docker container to sample.
            status (str): Container status; only "running" containers are
                sampled, everything else returns zeros.

        Returns:
            Tuple[float, int, float, float]: GPU utilization %, GPU memory MB,
            CPU utilization %, memory utilization %.
        """
        current_gpu_utilization = 0
        current_gpu_memory = 0
        current_cpu_utilization = 0
        current_memory_utilization = 0
        if status == "running":
            try:
                (
                    current_cpu_utilization,
                    current_memory_utilization,
                ) = self.resources_tracker.get_container_cpu_and_memory(container)
                (
                    current_gpu_utilization,
                    current_gpu_memory,
                ) = self.resources_tracker.get_container_gpu_info(container_id=container.id)
            except Exception as err:
                logging.error(
                    "Error getting container usage metrics: %s",
                    str(err),
                )
        return (
            current_gpu_utilization,
            current_gpu_memory,
            current_cpu_utilization,
            current_memory_utilization,
        )

    @log_errors(default_return=(0, 0, 0, 0), raise_exception=False, log_error=True)
    def update_max_action_usage(
        self,
        action_record_id: str,
        current_gpu_utilization: float,
        current_gpu_memory: int,
        current_cpu_utilization: float,
        current_memory_utilization: float,
    ) -> Tuple[float, int, float, float]:
        """Update and return maximum resource usage values for an action.

        Keeps a running element-wise maximum in self.max_actions_usage so the
        final report reflects peak (not last-sampled) usage.

        Returns:
            Tuple[float, int, float, float]: Peak GPU utilization %, GPU
            memory MB, CPU utilization %, memory utilization %.
        """
        if action_record_id not in self.max_actions_usage:
            self.max_actions_usage[action_record_id] = {
                "gpu_utilization": 0,
                "gpu_memory": 0,
                "cpu_utilization": 0,
                "memory_utilization": 0,
            }
        # `or 0` coerces None (e.g. decorator defaults) to a comparable value.
        current_values = {
            "gpu_utilization": current_gpu_utilization or 0,
            "gpu_memory": current_gpu_memory or 0,
            "cpu_utilization": current_cpu_utilization or 0,
            "memory_utilization": current_memory_utilization or 0,
        }
        maxima = self.max_actions_usage[action_record_id]
        for key, value in current_values.items():
            maxima[key] = max(value, maxima[key])
        return (
            maxima["gpu_utilization"],
            maxima["gpu_memory"],
            maxima["cpu_utilization"],
            maxima["memory_utilization"],
        )
449
+
450
+
451
class MachineResourcesTracker:
    """Tracks machine-level resources like CPU, memory and GPU"""

    def __init__(self, scaling: Scaling):
        """Initialize MachineResourcesTracker.

        Args:
            scaling (Scaling): Client used to report available resources.
        """
        self.scaling = scaling
        self.resources_tracker = ResourcesTracker()

    @log_errors(raise_exception=False, log_error=True)
    def update_available_resources(self):
        """Sample current machine capacity and push it to the scaling service."""
        memory_gb, cpu_pct, gpu_mem_free_mb, gpu_util_pct = (
            self.resources_tracker.get_available_resources()
        )
        # The scaling API expects *available* GPU, hence 100 - utilization.
        _, err, _ = self.scaling.update_available_resources(
            availableCPU=cpu_pct,
            availableMemory=memory_gb,
            availableGPU=100 - gpu_util_pct,
            availableGPUMemory=gpu_mem_free_mb,
        )
        if err is not None:
            logging.error(
                "Error in updating available resources: %s",
                err,
            )