matrice-compute 0.1.14-py3-none-any.whl → 0.1.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1117,7 +1117,7 @@ def lpr_setup_execute(self: ActionInstance):
 
     # Add worker container run command
     worker_cmd = (
-        f"docker run -d --pull=always "
+        f"docker run -d --net=host --pull=always "
        f"--name lpr-worker "
         f"-p {external_port}:8082 "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
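Note on the --net=host change: when a container uses host networking, Docker ignores -p port publishing on Linux (the worker binds directly to the host's port 8082), so the retained -p {external_port}:8082 flag becomes a no-op there. A rough sketch of the string the new f-string assembles, with placeholder values (external_port and the trailing options are illustrative; the real command continues past this hunk):

    import os

    external_port = 8082  # hypothetical value for the sketch
    worker_cmd = (
        f"docker run -d --net=host --pull=always "
        f"--name lpr-worker "
        f"-p {external_port}:8082 "
        f'-e ENV="{os.environ.get("ENV", "prod")}" '
    )
    # -> docker run -d --net=host --pull=always --name lpr-worker -p 8082:8082 -e ENV="prod"
    #    (when the ENV variable is unset)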
@@ -128,8 +128,12 @@ def has_gpu() -> bool:
     Returns:
         bool: True if GPU is present, False otherwise
     """
-    subprocess.run("nvidia-smi", check=True)
-    return True
+    try:
+        subprocess.run("nvidia-smi", timeout=5)
+        return True
+    except subprocess.TimeoutExpired:
+        logging.warning("nvidia-smi command timed out after 5 seconds")
+        return False
 
 
 @log_errors(default_return=0, raise_exception=False)
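The same guard is applied to every nvidia-smi invocation in this release: a 5-second timeout plus a TimeoutExpired handler that logs and returns a safe default. A minimal standalone sketch of the pattern (the helper name probe_nvidia_smi is hypothetical; note that with check=True dropped, a non-zero exit status no longer raises here):

    import logging
    import subprocess

    def probe_nvidia_smi() -> bool:
        """Return True if nvidia-smi answers within 5 seconds."""
        try:
            subprocess.run(["nvidia-smi"], timeout=5)
            return True
        except subprocess.TimeoutExpired:
            logging.warning("nvidia-smi timed out after 5 seconds")
            return False
        except FileNotFoundError:
            # nvidia-smi not installed on this host
            return False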
@@ -141,13 +145,17 @@ def get_gpu_memory_usage() -> float:
         float: Memory usage between 0 and 1
     """
     command = "nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader"
-    output = subprocess.check_output(command.split()).decode("ascii").strip().split("\n")
-    memory_percentages = []
-    for line in output:
-        used, total = map(int, line.split(","))
-        usage_percentage = used / total
-        memory_percentages.append(usage_percentage)
-    return min(memory_percentages)
+    try:
+        output = subprocess.check_output(command.split(), timeout=5).decode("ascii").strip().split("\n")
+        memory_percentages = []
+        for line in output:
+            used, total = map(int, line.split(","))
+            usage_percentage = used / total
+            memory_percentages.append(usage_percentage)
+        return min(memory_percentages)
+    except subprocess.TimeoutExpired:
+        logging.warning("nvidia-smi command timed out after 5 seconds in get_gpu_memory_usage")
+        return 0
 
 
 @log_errors(default_return=0, raise_exception=False)
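For reference, the parsing inside the new try block consumes nvidia-smi's "used, total" CSV rows (MiB, no units or header). A small worked example with two hypothetical GPUs:

    # Hypothetical nvidia-smi output lines: "memory.used, memory.total" in MiB.
    lines = ["1024, 16384", "8192, 16384"]
    fractions = []
    for line in lines:
        used, total = map(int, line.split(","))  # int() tolerates the space after the comma
        fractions.append(used / total)
    # fractions == [0.0625, 0.5]; the function reports the least-utilized GPU,
    # i.e. min(fractions) == 0.0625.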
@@ -194,17 +202,24 @@ def get_gpu_info() -> list:
     Returns:
         list: GPU information strings
     """
-    with subprocess.Popen(
+    proc = subprocess.Popen(
         [
             "nvidia-smi",
             "--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu",
             "--format=csv,noheader,nounits",
         ],
         stdout=subprocess.PIPE,
-    ) as proc:
-        stdout, _ = proc.communicate()
+        stderr=subprocess.PIPE,
+    )
+    try:
+        stdout, stderr = proc.communicate(timeout=5)
         output = stdout.decode("UTF-8")
         return output.split("\n")[:-1]
+    except subprocess.TimeoutExpired:
+        logging.warning("nvidia-smi command timed out after 5 seconds in get_gpu_info")
+        proc.kill()
+        proc.communicate()  # flush output after kill
+        return []
 
 
 @log_errors(default_return="", raise_exception=False)
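The kill-then-communicate sequence in the new get_gpu_info handler follows the idiom documented for subprocess.Popen.communicate: after TimeoutExpired the child is still running, so it must be killed and reaped or it lingers. A minimal sketch with a generic long-running command standing in for nvidia-smi:

    import subprocess

    proc = subprocess.Popen(["sleep", "60"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    try:
        stdout, stderr = proc.communicate(timeout=5)
    except subprocess.TimeoutExpired:
        proc.kill()
        stdout, stderr = proc.communicate()  # reap the child and collect any partial output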
@@ -505,7 +520,12 @@ def get_gpu_with_sufficient_memory_for_action(
     """
     required_gpu_memory = get_required_gpu_memory(action_details)
     command = "nvidia-smi --query-gpu=memory.free --format=csv"
-    memory_free_info = subprocess.check_output(command.split()).decode("ascii").split("\n")
+    try:
+        memory_free_info = subprocess.check_output(command.split(), timeout=5).decode("ascii").split("\n")
+    except subprocess.TimeoutExpired:
+        logging.error("nvidia-smi command timed out after 5 seconds in get_gpu_with_sufficient_memory_for_action")
+        raise ValueError("Failed to get GPU information - nvidia-smi timed out")
+
     if len(memory_free_info) < 2:
         raise ValueError("No GPU information available from nvidia-smi")
     memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:-1]]
@@ -548,7 +568,12 @@ def get_single_gpu_with_sufficient_memory_for_action(
     """
     required_gpu_memory = get_required_gpu_memory(action_details)
     command = "nvidia-smi --query-gpu=memory.free --format=csv"
-    memory_free_info = subprocess.check_output(command.split()).decode("ascii").split("\n")
+    try:
+        memory_free_info = subprocess.check_output(command.split(), timeout=5).decode("ascii").split("\n")
+    except subprocess.TimeoutExpired:
+        logging.error("nvidia-smi command timed out after 5 seconds in get_single_gpu_with_sufficient_memory_for_action")
+        raise ValueError("Failed to get GPU information - nvidia-smi timed out")
+
     if len(memory_free_info) < 2:
         raise ValueError("No GPU information available from nvidia-smi")
     memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:-1]]
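Unlike the metric helpers, the two allocation helpers above turn a timeout into a ValueError instead of returning a default, so schedulers fail fast rather than placing work on unknown hardware. A hedged caller-side sketch (the call signature is abridged and the CPU fallback is purely illustrative):

    import logging

    try:
        gpus = get_gpu_with_sufficient_memory_for_action(action_details)
    except ValueError as err:
        logging.error("GPU selection failed: %s", err)
        gpus = []  # e.g. fall back to CPU-only scheduling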
@@ -150,20 +150,25 @@ class ResourcesTracker:
         if not has_gpu():
             return 0
         gpu_util = 0
-        result = subprocess.run(
-            ["nvidia-smi", "pmon", "-c", "1"],
-            capture_output=True,
-            text=True,
-            check=True,
-        )
-        pmon_output = result.stdout.strip().split("\n")
-        for line in pmon_output[2:]:
-            parts = line.split()
-            if len(parts) >= 8:
-                pid = parts[1]
-                gpu_usage = parts[3]
-                if pid == str(container_pid):
-                    gpu_util += float(gpu_usage) if gpu_usage != "-" else 0
+        try:
+            result = subprocess.run(
+                ["nvidia-smi", "pmon", "-c", "1"],
+                capture_output=True,
+                text=True,
+                check=True,
+                timeout=5,
+            )
+            pmon_output = result.stdout.strip().split("\n")
+            for line in pmon_output[2:]:
+                parts = line.split()
+                if len(parts) >= 8:
+                    pid = parts[1]
+                    gpu_usage = parts[3]
+                    if pid == str(container_pid):
+                        gpu_util += float(gpu_usage) if gpu_usage != "-" else 0
+        except subprocess.TimeoutExpired:
+            logging.warning("nvidia-smi pmon command timed out after 5 seconds in get_container_gpu_usage")
+            return 0
         return gpu_util
 
     @log_errors(default_return=0, raise_exception=False, log_error=False)
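The per-process parse in get_container_gpu_usage assumes the classic nvidia-smi pmon layout: a two-line header followed by rows of gpu index, pid, type, sm%, mem%, enc, dec, command (newer drivers may append extra columns, which the len(parts) >= 8 check tolerates). A worked example with a hypothetical pmon data row:

    # Hypothetical pmon row under the classic column layout:
    # gpu   pid  type  sm  mem  enc  dec  command
    line = "  0  4242   C   37   12    -    -  python"
    parts = line.split()
    # parts == ['0', '4242', 'C', '37', '12', '-', '-', 'python'] -> len(parts) == 8
    pid, gpu_usage = parts[1], parts[3]                  # '4242', '37'
    util = float(gpu_usage) if gpu_usage != "-" else 0   # 37.0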
@@ -185,19 +190,24 @@ class ResourcesTracker:
             "--format=csv,noheader,nounits",
         ]
         total_memory = 0
-        result = subprocess.run(
-            cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True,
-            check=True,
-        )
-        for line in result.stdout.splitlines():
-            parts = line.strip().split(", ")
-            if len(parts) == 2:
-                process_pid, used_memory = parts
-                if process_pid == str(container_pid):
-                    total_memory += int(used_memory)
+        try:
+            result = subprocess.run(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True,
+                timeout=5,
+            )
+            for line in result.stdout.splitlines():
+                parts = line.strip().split(", ")
+                if len(parts) == 2:
+                    process_pid, used_memory = parts
+                    if process_pid == str(container_pid):
+                        total_memory += int(used_memory)
+        except subprocess.TimeoutExpired:
+            logging.warning("nvidia-smi command timed out after 5 seconds in get_container_gpu_memory_usage")
+            return 0
         return total_memory
 
     @log_errors(default_return=(0, 0, 0, 0), raise_exception=False, log_error=True)
@@ -227,7 +237,12 @@ class ResourcesTracker:
         if not has_gpu():
             return gpu_memory_free, gpu_utilization
 
-        subprocess.check_output("nvidia-smi")
+        try:
+            subprocess.check_output("nvidia-smi", timeout=5)
+        except subprocess.TimeoutExpired:
+            logging.warning("nvidia-smi command timed out after 5 seconds in _get_gpu_resources")
+            return 0, 0.0
+
         info_list = get_gpu_info()
         for info in info_list:
             info_split = info.split(", ")
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: matrice_compute
-Version: 0.1.14
+Version: 0.1.15
 Summary: Common server utilities for Matrice.ai services
 Author-email: "Matrice.ai" <dipendra@matrice.ai>
 License-Expression: MIT
@@ -1,17 +1,17 @@
 matrice_compute/__init__.py,sha256=ZzQcFsT005VCgq9VZUh565f4upOooEb_FwZ6RgweNZs,597
-matrice_compute/action_instance.py,sha256=aYNpRySPatxFltn_ekVmCd5h69I992_YerUTZwGWyHA,59763
+matrice_compute/action_instance.py,sha256=_EYTNmUDnKKMmiRX2WCjP_MC2o5kA1RYAvHVKhadnvk,59774
 matrice_compute/actions_manager.py,sha256=5U-xM6tl_Z6x96bi-c7AJM9ru80LqTN8f5Oce8dAu_A,7780
 matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
 matrice_compute/instance_manager.py,sha256=8USyX09ZxLvnVNIrjRogbyUeMCfgWnasuRqYkkVF4tQ,10146
-matrice_compute/instance_utils.py,sha256=7jnWurSpq8PQxPGlSTc0qmpNdD5jIL8pjYKdjhVhS60,22310
+matrice_compute/instance_utils.py,sha256=cANKRUlUzfecnzVEMC6Gkg9K7GZajH9ojNPiChdJL9s,23455
 matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
 matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-matrice_compute/resources_tracker.py,sha256=2hLKVxYihROtQ6fO4V_BplTgvkN8qH2H9_qxpOIpZkc,18521
+matrice_compute/resources_tracker.py,sha256=n57IJmT5GjNEX8yQL7nbKv57bjvESYM-vRQcQ0DgQXQ,19256
 matrice_compute/scaling.py,sha256=3F8SWvy9wWczpJ6dbY5RrXWw5ByZlIzAPJklir3KIFI,35359
 matrice_compute/shutdown_manager.py,sha256=0MYV_AqygqR9NEntYf7atUC-PbWXyNkm1f-8c2aizgA,13234
 matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
-matrice_compute-0.1.14.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
-matrice_compute-0.1.14.dist-info/METADATA,sha256=u8ZIOoIX3uMEA4Lgaiuh73xsoPSdcHTZXAJuIBpn6KE,1038
-matrice_compute-0.1.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-matrice_compute-0.1.14.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
-matrice_compute-0.1.14.dist-info/RECORD,,
+matrice_compute-0.1.15.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
+matrice_compute-0.1.15.dist-info/METADATA,sha256=kHaPM9mLcbUZF0k3QxCRQ1jn1zuasO2l73S8yPlfPf0,1038
+matrice_compute-0.1.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+matrice_compute-0.1.15.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
+matrice_compute-0.1.15.dist-info/RECORD,,