matrice-compute 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/action_instance.py +104 -19
- matrice_compute/instance_utils.py +520 -111
- matrice_compute/resources_tracker.py +125 -53
- matrice_compute/scaling.py +658 -406
- {matrice_compute-0.1.19.dist-info → matrice_compute-0.1.21.dist-info}/METADATA +1 -1
- {matrice_compute-0.1.19.dist-info → matrice_compute-0.1.21.dist-info}/RECORD +9 -9
- {matrice_compute-0.1.19.dist-info → matrice_compute-0.1.21.dist-info}/WHEEL +0 -0
- {matrice_compute-0.1.19.dist-info → matrice_compute-0.1.21.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_compute-0.1.19.dist-info → matrice_compute-0.1.21.dist-info}/top_level.txt +0 -0
matrice_compute/resources_tracker.py

@@ -56,7 +56,7 @@ class ResourcesTracker:
             return cpu_utilization, memory_utilization
         return 0, 0

-    @log_errors(default_return=(0, 0), raise_exception=False)
+    @log_errors(default_return=(0, 0), raise_exception=False, log_error=False)
     def get_container_cpu_and_memory_with_container_id(self, container_id: str) -> Tuple[float, float]:
         """
         Get CPU and memory usage for a specific container by its ID.
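The only change in this hunk (and in the similar decorator hunks below) is the extra `log_error=False` argument, which turns these probes into silent best-effort lookups that fall back to `default_return`. The decorator itself is defined elsewhere in the package and is not part of this diff; the following is only a hedged sketch of what a `log_errors(default_return, raise_exception, log_error)` wrapper of this shape typically does, not the package's actual implementation.

```python
import functools
import logging

def log_errors(default_return=None, raise_exception=True, log_error=True):
    """Hypothetical sketch of a decorator with the signature used in this diff."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as exc:
                if log_error:          # the new flag: suppress error logging when False
                    logging.error("%s failed: %s", func.__name__, exc)
                if raise_exception:    # re-raise only when explicitly requested
                    raise
                return default_return  # e.g. (0, 0) for the CPU/memory probes
        return wrapper
    return decorator
```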
@@ -67,32 +67,46 @@ class ResourcesTracker:
         Returns:
             Tuple[float, float]: CPU utilization percentage and memory usage in MB.
         """
-        [… 26 removed lines collapsed in this diff view …]
+        try:
+            stats_result = subprocess.run(
+                [
+                    "docker",
+                    "stats",
+                    "--no-stream",
+                    "--format",
+                    "{{.ID}}: {{.CPUPerc}} CPU, {{.MemUsage}} RAM",
+                    container_id,
+                ],
+                capture_output=True,
+                text=True,
+                check=False,
+                timeout=10,
+            )
+            if stats_result.returncode != 0:
+                logging.debug("docker stats command failed for container %s", container_id)
+                return 0, 0
+            stats = stats_result.stdout.strip().split(": ")[1].split(", ")
+            cpu_usage = float(stats[0].replace("% CPU", "").strip())
+            memory_usage = stats[1].split(" / ")[0]
+            mem_value, mem_unit = memory_usage[:-3], memory_usage[-3:]
+            if mem_unit == "KiB":
+                memory_usage_mb = float(mem_value) / 1024
+            elif mem_unit == "MiB":
+                memory_usage_mb = float(mem_value)
+            elif mem_unit == "GiB":
+                memory_usage_mb = float(mem_value) * 1024
+            else:
+                memory_usage_mb = float(mem_value)
+            return cpu_usage, memory_usage_mb
+        except subprocess.TimeoutExpired:
+            logging.debug("docker stats command timed out for container %s", container_id)
+            return 0, 0
+        except (ValueError, IndexError) as e:
+            logging.debug("Error parsing docker stats for container %s: %s", container_id, e)
+            return 0, 0
+        except Exception as e:
+            logging.debug("Unexpected error getting container stats for %s: %s", container_id, e)
+            return 0, 0

     @log_errors(default_return=(0, 0), raise_exception=False, log_error=False)
     def get_container_gpu_info(self, container_id: str) -> Tuple[float, int]:
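The new implementation shells out to `docker stats --no-stream` with a custom format string and then does plain string surgery on the single output line. A small worked example of that parsing, using a made-up stats line (real values and container IDs will differ):

```python
# Hypothetical line produced by --format "{{.ID}}: {{.CPUPerc}} CPU, {{.MemUsage}} RAM"
line = "f2a9c0b1d3e4: 12.34% CPU, 256MiB / 7.654GiB RAM"

stats = line.strip().split(": ")[1].split(", ")             # ["12.34% CPU", "256MiB / 7.654GiB RAM"]
cpu_usage = float(stats[0].replace("% CPU", "").strip())    # 12.34
memory_usage = stats[1].split(" / ")[0]                     # "256MiB" (usage part only)
mem_value, mem_unit = memory_usage[:-3], memory_usage[-3:]  # ("256", "MiB")

# Same unit handling as the diff: KiB/MiB/GiB normalized to MB, anything else taken as-is.
factor = {"KiB": 1 / 1024, "MiB": 1.0, "GiB": 1024.0}.get(mem_unit, 1.0)
memory_usage_mb = float(mem_value) * factor                 # 256.0

print(cpu_usage, memory_usage_mb)                           # 12.34 256.0
```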
@@ -110,7 +124,7 @@ class ResourcesTracker:
         gpu_mem_used = self.get_container_gpu_memory_usage(container_pid)
         return gpu_util, gpu_mem_used

-    @log_errors(default_return="", raise_exception=False)
+    @log_errors(default_return="", raise_exception=False, log_error=False)
     def get_pid_id_by_container_id(self, container_id: str) -> str:
         """
         Get PID for a container ID.
@@ -121,20 +135,31 @@ class ResourcesTracker:
         Returns:
             str: PID of the container.
         """
-        [… 14 removed lines collapsed in this diff view …]
+        try:
+            pid_result = subprocess.run(
+                [
+                    "docker",
+                    "inspect",
+                    "--format",
+                    "{{.State.Pid}}",
+                    container_id,
+                ],
+                capture_output=True,
+                text=True,
+                check=False,
+                timeout=10,
+            )
+            if pid_result.returncode != 0:
+                logging.debug("docker inspect command failed for container %s", container_id)
+                return ""
+            container_pid = pid_result.stdout.strip()
+            return container_pid
+        except subprocess.TimeoutExpired:
+            logging.debug("docker inspect command timed out for container %s", container_id)
+            return ""
+        except Exception as e:
+            logging.debug("Error getting PID for container %s: %s", container_id, e)
+            return ""

     @log_errors(default_return=0, raise_exception=False, log_error=False)
     def get_container_gpu_usage(self, container_pid: str) -> float:
@@ -155,9 +180,12 @@ class ResourcesTracker:
                 ["nvidia-smi", "pmon", "-c", "1"],
                 capture_output=True,
                 text=True,
-                check=
+                check=False,
                 timeout=5,
             )
+            if result.returncode != 0:
+                logging.debug("nvidia-smi pmon command failed in get_container_gpu_usage")
+                return 0
             pmon_output = result.stdout.strip().split("\n")
             for line in pmon_output[2:]:
                 parts = line.split()
@@ -167,7 +195,16 @@ class ResourcesTracker:
                 if pid == str(container_pid):
                     gpu_util += float(gpu_usage) if gpu_usage != "-" else 0
         except subprocess.TimeoutExpired:
-            logging.
+            logging.debug("nvidia-smi pmon command timed out after 5 seconds in get_container_gpu_usage")
+            return 0
+        except (ValueError, IndexError) as e:
+            logging.debug("Error parsing GPU usage info: %s", e)
+            return 0
+        except FileNotFoundError:
+            logging.debug("nvidia-smi not found on this system")
+            return 0
+        except Exception as e:
+            logging.debug("Unexpected error in get_container_gpu_usage: %s", e)
             return 0
         return gpu_util

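The GPU-utilization lookup joins the container's host PID (from `get_pid_id_by_container_id` above) against per-process rows of `nvidia-smi pmon -c 1`. The snippet below is an illustrative sketch only: the sample output and the column positions for the PID and SM utilization are assumptions (pmon's layout varies by driver version), but it shows why the code skips the first two header lines and treats `-` as zero.

```python
# Illustrative `nvidia-smi pmon -c 1` output; the real layout depends on the driver.
pmon_output = """\
# gpu         pid   type    sm    mem    enc    dec    command
# Idx           #    C/G     %      %      %      %    name
    0       41237     C     37     12      -      -    python
    0       52210     C      -      -      -      -    python""".split("\n")

container_pid = "41237"
gpu_util = 0.0
for line in pmon_output[2:]:                # first two lines are headers
    parts = line.split()
    pid, gpu_usage = parts[1], parts[3]     # assumed columns: gpu, pid, type, sm%, ...
    if pid == str(container_pid):
        gpu_util += float(gpu_usage) if gpu_usage != "-" else 0
print(gpu_util)  # 37.0
```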
@@ -196,9 +233,12 @@ class ResourcesTracker:
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
                 text=True,
-                check=
+                check=False,
                 timeout=5,
             )
+            if result.returncode != 0:
+                logging.debug("nvidia-smi command failed in get_container_gpu_memory_usage")
+                return 0
             for line in result.stdout.splitlines():
                 parts = line.strip().split(", ")
                 if len(parts) == 2:
@@ -206,7 +246,16 @@ class ResourcesTracker:
                     if process_pid == str(container_pid):
                         total_memory += int(used_memory)
         except subprocess.TimeoutExpired:
-            logging.
+            logging.debug("nvidia-smi command timed out after 5 seconds in get_container_gpu_memory_usage")
+            return 0
+        except (ValueError, IndexError) as e:
+            logging.debug("Error parsing GPU memory usage info: %s", e)
+            return 0
+        except FileNotFoundError:
+            logging.debug("nvidia-smi not found on this system")
+            return 0
+        except Exception as e:
+            logging.debug("Unexpected error in get_container_gpu_memory_usage: %s", e)
             return 0
         return total_memory

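`get_container_gpu_memory_usage` applies the same PID join to per-process memory rows. The query command itself sits outside this hunk; the `split(", ")` / two-field parsing is consistent with a `pid, used_memory` CSV query such as `nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader,nounits`, which is assumed in this sketch:

```python
# Hypothetical "pid, used_memory" rows (MiB, no header or units).
stdout = "41237, 1536\n52210, 512\n"

container_pid = "41237"
total_memory = 0
for line in stdout.splitlines():
    parts = line.strip().split(", ")
    if len(parts) == 2:
        process_pid, used_memory = parts
        if process_pid == str(container_pid):  # count only the container's own processes
            total_memory += int(used_memory)
print(total_memory)  # 1536
```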
@@ -238,17 +287,40 @@ class ResourcesTracker:
             return gpu_memory_free, gpu_utilization

         try:
-            subprocess.
+            result = subprocess.run(
+                ["nvidia-smi"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                timeout=5,
+                check=False,
+            )
+            if result.returncode != 0:
+                logging.debug("nvidia-smi command failed in _get_gpu_resources")
+                return 0, 0.0
         except subprocess.TimeoutExpired:
-            logging.
+            logging.debug("nvidia-smi command timed out after 5 seconds in _get_gpu_resources")
+            return 0, 0.0
+        except FileNotFoundError:
+            logging.debug("nvidia-smi not found on this system")
+            return 0, 0.0
+        except Exception as e:
+            logging.debug("Error running nvidia-smi in _get_gpu_resources: %s", e)
             return 0, 0.0

         info_list = get_gpu_info()
-        [… 5 removed lines collapsed in this diff view …]
+        if not info_list:
+            return 0, 0.0
+
+        try:
+            for info in info_list:
+                info_split = info.split(", ")
+                if len(info_split) >= 6:
+                    gpu_memory_free += int(info_split[5])
+                    gpu_utilization += float(info_split[2])
+            gpu_utilization /= len(info_list) if info_list else 1
+        except (ValueError, IndexError) as e:
+            logging.debug("Error parsing GPU resources: %s", e)
+            return 0, 0.0

         return gpu_memory_free, gpu_utilization
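`_get_gpu_resources` now verifies that `nvidia-smi` runs at all, then aggregates whatever `get_gpu_info()` returns: field 5 is summed as free memory and field 2 is averaged as utilization. The field layout is not visible in this diff; the worked example below assumes entries shaped like `index, name, utilization.gpu, memory.total, memory.used, memory.free`, which is one layout consistent with those indices.

```python
# Hypothetical get_gpu_info() entries:
# "index, name, utilization.gpu, memory.total, memory.used, memory.free"
info_list = [
    "0, NVIDIA A10, 35, 24564, 8192, 16372",
    "1, NVIDIA A10, 5, 24564, 1024, 23540",
]

gpu_memory_free, gpu_utilization = 0, 0.0
for info in info_list:
    info_split = info.split(", ")
    if len(info_split) >= 6:
        gpu_memory_free += int(info_split[5])     # sum free memory across GPUs
        gpu_utilization += float(info_split[2])   # accumulate per-GPU utilization
gpu_utilization /= len(info_list) if info_list else 1  # average across GPUs

print(gpu_memory_free, gpu_utilization)  # 39912 20.0
```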