matrice-compute 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/action_instance.py +1 -1
- matrice_compute/instance_utils.py +39 -14
- matrice_compute/resources_tracker.py +43 -28
- {matrice_compute-0.1.14.dist-info → matrice_compute-0.1.15.dist-info}/METADATA +1 -1
- {matrice_compute-0.1.14.dist-info → matrice_compute-0.1.15.dist-info}/RECORD +8 -8
- {matrice_compute-0.1.14.dist-info → matrice_compute-0.1.15.dist-info}/WHEEL +0 -0
- {matrice_compute-0.1.14.dist-info → matrice_compute-0.1.15.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_compute-0.1.14.dist-info → matrice_compute-0.1.15.dist-info}/top_level.txt +0 -0
|
@@ -1117,7 +1117,7 @@ def lpr_setup_execute(self: ActionInstance):
|
|
|
1117
1117
|
|
|
1118
1118
|
# Add worker container run command
|
|
1119
1119
|
worker_cmd = (
|
|
1120
|
-
f"docker run -d --pull=always "
|
|
1120
|
+
f"docker run -d --net=host --pull=always "
|
|
1121
1121
|
f"--name lpr-worker "
|
|
1122
1122
|
f"-p {external_port}:8082 "
|
|
1123
1123
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
@@ -128,8 +128,12 @@ def has_gpu() -> bool:
|
|
|
128
128
|
Returns:
|
|
129
129
|
bool: True if GPU is present, False otherwise
|
|
130
130
|
"""
|
|
131
|
-
|
|
132
|
-
|
|
131
|
+
try:
|
|
132
|
+
subprocess.run("nvidia-smi", timeout=5)
|
|
133
|
+
return True
|
|
134
|
+
except subprocess.TimeoutExpired:
|
|
135
|
+
logging.warning("nvidia-smi command timed out after 5 seconds")
|
|
136
|
+
return False
|
|
133
137
|
|
|
134
138
|
|
|
135
139
|
@log_errors(default_return=0, raise_exception=False)
|
|
@@ -141,13 +145,17 @@ def get_gpu_memory_usage() -> float:
|
|
|
141
145
|
float: Memory usage between 0 and 1
|
|
142
146
|
"""
|
|
143
147
|
command = "nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader"
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
148
|
+
try:
|
|
149
|
+
output = subprocess.check_output(command.split(), timeout=5).decode("ascii").strip().split("\n")
|
|
150
|
+
memory_percentages = []
|
|
151
|
+
for line in output:
|
|
152
|
+
used, total = map(int, line.split(","))
|
|
153
|
+
usage_percentage = used / total
|
|
154
|
+
memory_percentages.append(usage_percentage)
|
|
155
|
+
return min(memory_percentages)
|
|
156
|
+
except subprocess.TimeoutExpired:
|
|
157
|
+
logging.warning("nvidia-smi command timed out after 5 seconds in get_gpu_memory_usage")
|
|
158
|
+
return 0
|
|
151
159
|
|
|
152
160
|
|
|
153
161
|
@log_errors(default_return=0, raise_exception=False)
|
|
@@ -194,17 +202,24 @@ def get_gpu_info() -> list:
|
|
|
194
202
|
Returns:
|
|
195
203
|
list: GPU information strings
|
|
196
204
|
"""
|
|
197
|
-
|
|
205
|
+
proc = subprocess.Popen(
|
|
198
206
|
[
|
|
199
207
|
"nvidia-smi",
|
|
200
208
|
"--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu",
|
|
201
209
|
"--format=csv,noheader,nounits",
|
|
202
210
|
],
|
|
203
211
|
stdout=subprocess.PIPE,
|
|
204
|
-
|
|
205
|
-
|
|
212
|
+
stderr=subprocess.PIPE,
|
|
213
|
+
)
|
|
214
|
+
try:
|
|
215
|
+
stdout, stderr = proc.communicate(timeout=5)
|
|
206
216
|
output = stdout.decode("UTF-8")
|
|
207
217
|
return output.split("\n")[:-1]
|
|
218
|
+
except subprocess.TimeoutExpired:
|
|
219
|
+
logging.warning("nvidia-smi command timed out after 5 seconds in get_gpu_info")
|
|
220
|
+
proc.kill()
|
|
221
|
+
proc.communicate() # flush output after kill
|
|
222
|
+
return []
|
|
208
223
|
|
|
209
224
|
|
|
210
225
|
@log_errors(default_return="", raise_exception=False)
|
|
@@ -505,7 +520,12 @@ def get_gpu_with_sufficient_memory_for_action(
|
|
|
505
520
|
"""
|
|
506
521
|
required_gpu_memory = get_required_gpu_memory(action_details)
|
|
507
522
|
command = "nvidia-smi --query-gpu=memory.free --format=csv"
|
|
508
|
-
|
|
523
|
+
try:
|
|
524
|
+
memory_free_info = subprocess.check_output(command.split(), timeout=5).decode("ascii").split("\n")
|
|
525
|
+
except subprocess.TimeoutExpired:
|
|
526
|
+
logging.error("nvidia-smi command timed out after 5 seconds in get_gpu_with_sufficient_memory_for_action")
|
|
527
|
+
raise ValueError("Failed to get GPU information - nvidia-smi timed out")
|
|
528
|
+
|
|
509
529
|
if len(memory_free_info) < 2:
|
|
510
530
|
raise ValueError("No GPU information available from nvidia-smi")
|
|
511
531
|
memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:-1]]
|
|
@@ -548,7 +568,12 @@ def get_single_gpu_with_sufficient_memory_for_action(
|
|
|
548
568
|
"""
|
|
549
569
|
required_gpu_memory = get_required_gpu_memory(action_details)
|
|
550
570
|
command = "nvidia-smi --query-gpu=memory.free --format=csv"
|
|
551
|
-
|
|
571
|
+
try:
|
|
572
|
+
memory_free_info = subprocess.check_output(command.split(), timeout=5).decode("ascii").split("\n")
|
|
573
|
+
except subprocess.TimeoutExpired:
|
|
574
|
+
logging.error("nvidia-smi command timed out after 5 seconds in get_single_gpu_with_sufficient_memory_for_action")
|
|
575
|
+
raise ValueError("Failed to get GPU information - nvidia-smi timed out")
|
|
576
|
+
|
|
552
577
|
if len(memory_free_info) < 2:
|
|
553
578
|
raise ValueError("No GPU information available from nvidia-smi")
|
|
554
579
|
memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:-1]]
|
|
@@ -150,20 +150,25 @@ class ResourcesTracker:
|
|
|
150
150
|
if not has_gpu():
|
|
151
151
|
return 0
|
|
152
152
|
gpu_util = 0
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
153
|
+
try:
|
|
154
|
+
result = subprocess.run(
|
|
155
|
+
["nvidia-smi", "pmon", "-c", "1"],
|
|
156
|
+
capture_output=True,
|
|
157
|
+
text=True,
|
|
158
|
+
check=True,
|
|
159
|
+
timeout=5,
|
|
160
|
+
)
|
|
161
|
+
pmon_output = result.stdout.strip().split("\n")
|
|
162
|
+
for line in pmon_output[2:]:
|
|
163
|
+
parts = line.split()
|
|
164
|
+
if len(parts) >= 8:
|
|
165
|
+
pid = parts[1]
|
|
166
|
+
gpu_usage = parts[3]
|
|
167
|
+
if pid == str(container_pid):
|
|
168
|
+
gpu_util += float(gpu_usage) if gpu_usage != "-" else 0
|
|
169
|
+
except subprocess.TimeoutExpired:
|
|
170
|
+
logging.warning("nvidia-smi pmon command timed out after 5 seconds in get_container_gpu_usage")
|
|
171
|
+
return 0
|
|
167
172
|
return gpu_util
|
|
168
173
|
|
|
169
174
|
@log_errors(default_return=0, raise_exception=False, log_error=False)
|
|
@@ -185,19 +190,24 @@ class ResourcesTracker:
|
|
|
185
190
|
"--format=csv,noheader,nounits",
|
|
186
191
|
]
|
|
187
192
|
total_memory = 0
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
if
|
|
200
|
-
|
|
193
|
+
try:
|
|
194
|
+
result = subprocess.run(
|
|
195
|
+
cmd,
|
|
196
|
+
stdout=subprocess.PIPE,
|
|
197
|
+
stderr=subprocess.PIPE,
|
|
198
|
+
text=True,
|
|
199
|
+
check=True,
|
|
200
|
+
timeout=5,
|
|
201
|
+
)
|
|
202
|
+
for line in result.stdout.splitlines():
|
|
203
|
+
parts = line.strip().split(", ")
|
|
204
|
+
if len(parts) == 2:
|
|
205
|
+
process_pid, used_memory = parts
|
|
206
|
+
if process_pid == str(container_pid):
|
|
207
|
+
total_memory += int(used_memory)
|
|
208
|
+
except subprocess.TimeoutExpired:
|
|
209
|
+
logging.warning("nvidia-smi command timed out after 5 seconds in get_container_gpu_memory_usage")
|
|
210
|
+
return 0
|
|
201
211
|
return total_memory
|
|
202
212
|
|
|
203
213
|
@log_errors(default_return=(0, 0, 0, 0), raise_exception=False, log_error=True)
|
|
@@ -227,7 +237,12 @@ class ResourcesTracker:
|
|
|
227
237
|
if not has_gpu():
|
|
228
238
|
return gpu_memory_free, gpu_utilization
|
|
229
239
|
|
|
230
|
-
|
|
240
|
+
try:
|
|
241
|
+
subprocess.check_output("nvidia-smi", timeout=5)
|
|
242
|
+
except subprocess.TimeoutExpired:
|
|
243
|
+
logging.warning("nvidia-smi command timed out after 5 seconds in _get_gpu_resources")
|
|
244
|
+
return 0, 0.0
|
|
245
|
+
|
|
231
246
|
info_list = get_gpu_info()
|
|
232
247
|
for info in info_list:
|
|
233
248
|
info_split = info.split(", ")
|
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
matrice_compute/__init__.py,sha256=ZzQcFsT005VCgq9VZUh565f4upOooEb_FwZ6RgweNZs,597
|
|
2
|
-
matrice_compute/action_instance.py,sha256=
|
|
2
|
+
matrice_compute/action_instance.py,sha256=_EYTNmUDnKKMmiRX2WCjP_MC2o5kA1RYAvHVKhadnvk,59774
|
|
3
3
|
matrice_compute/actions_manager.py,sha256=5U-xM6tl_Z6x96bi-c7AJM9ru80LqTN8f5Oce8dAu_A,7780
|
|
4
4
|
matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
|
|
5
5
|
matrice_compute/instance_manager.py,sha256=8USyX09ZxLvnVNIrjRogbyUeMCfgWnasuRqYkkVF4tQ,10146
|
|
6
|
-
matrice_compute/instance_utils.py,sha256=
|
|
6
|
+
matrice_compute/instance_utils.py,sha256=cANKRUlUzfecnzVEMC6Gkg9K7GZajH9ojNPiChdJL9s,23455
|
|
7
7
|
matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
|
|
8
8
|
matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
-
matrice_compute/resources_tracker.py,sha256=
|
|
9
|
+
matrice_compute/resources_tracker.py,sha256=n57IJmT5GjNEX8yQL7nbKv57bjvESYM-vRQcQ0DgQXQ,19256
|
|
10
10
|
matrice_compute/scaling.py,sha256=3F8SWvy9wWczpJ6dbY5RrXWw5ByZlIzAPJklir3KIFI,35359
|
|
11
11
|
matrice_compute/shutdown_manager.py,sha256=0MYV_AqygqR9NEntYf7atUC-PbWXyNkm1f-8c2aizgA,13234
|
|
12
12
|
matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
|
|
13
|
-
matrice_compute-0.1.
|
|
14
|
-
matrice_compute-0.1.
|
|
15
|
-
matrice_compute-0.1.
|
|
16
|
-
matrice_compute-0.1.
|
|
17
|
-
matrice_compute-0.1.
|
|
13
|
+
matrice_compute-0.1.15.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
|
|
14
|
+
matrice_compute-0.1.15.dist-info/METADATA,sha256=kHaPM9mLcbUZF0k3QxCRQ1jn1zuasO2l73S8yPlfPf0,1038
|
|
15
|
+
matrice_compute-0.1.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
16
|
+
matrice_compute-0.1.15.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
|
|
17
|
+
matrice_compute-0.1.15.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|