matrice-compute 0.1.18__tar.gz → 0.1.20__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/PKG-INFO +1 -1
  2. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/matrice_compute.egg-info/PKG-INFO +1 -1
  3. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/action_instance.py +40 -9
  4. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/instance_utils.py +305 -94
  5. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/resources_tracker.py +125 -53
  6. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/LICENSE.txt +0 -0
  7. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/README.md +0 -0
  8. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/matrice_compute.egg-info/SOURCES.txt +0 -0
  9. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/matrice_compute.egg-info/dependency_links.txt +0 -0
  10. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/matrice_compute.egg-info/not-zip-safe +0 -0
  11. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/matrice_compute.egg-info/top_level.txt +0 -0
  12. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/pyproject.toml +0 -0
  13. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/setup.cfg +0 -0
  14. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/setup.py +0 -0
  15. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/__init__.py +0 -0
  16. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/actions_manager.py +0 -0
  17. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/actions_scaledown_manager.py +0 -0
  18. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/instance_manager.py +0 -0
  19. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/prechecks.py +0 -0
  20. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/py.typed +0 -0
  21. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/scaling.py +0 -0
  22. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/shutdown_manager.py +0 -0
  23. {matrice_compute-0.1.18 → matrice_compute-0.1.20}/src/matrice_compute/task_utils.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: matrice_compute
- Version: 0.1.18
+ Version: 0.1.20
  Summary: Common server utilities for Matrice.ai services
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
  License-Expression: MIT
matrice_compute.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: matrice_compute
- Version: 0.1.18
+ Version: 0.1.20
  Summary: Common server utilities for Matrice.ai services
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
  License-Expression: MIT
src/matrice_compute/action_instance.py
@@ -12,6 +12,7 @@ from matrice_compute.instance_utils import (
  get_gpu_with_sufficient_memory_for_action,
  get_decrypted_access_key_pair,
  get_max_file_system,
+ get_best_service_ip_and_network,
  )
  from matrice_compute.task_utils import (
  setup_workspace_and_run_task,
@@ -526,13 +527,18 @@ class ActionInstance:

  if username and password:
  login_cmd = f"docker login -u {shlex.quote(username)} -p {shlex.quote(password)}"
- subprocess.run(login_cmd, shell=True, check=True)
+ result = subprocess.run(login_cmd, shell=True, check=False, capture_output=True, text=True, timeout=30)
+ if result.returncode != 0:
+ raise Exception(f"Docker login failed with exit code {result.returncode}: {result.stderr}")
  logging.info("Docker login successful")
  else:
  logging.warning(
  "Docker credentials not available, skipping Docker login"
  )

+ except subprocess.TimeoutExpired:
+ logging.error("Docker login timed out after 30 seconds")
+ raise Exception("Docker login timed out")
  except Exception as err:
  logging.error(
  "Docker login failed: %s",
@@ -1151,9 +1157,17 @@ def inference_ws_server_execute(self: ActionInstance):
  return
  image = action_details["actionDetails"].get("docker")

-
  self.setup_action_requirements(action_details)

+ # Get the best IP and network configuration for port 8102
+ ws_host, use_host_network = get_best_service_ip_and_network(8102)
+
+ # Store ws_host in environment variable for use by other actions (e.g., fe_fs_streaming)
+ if not os.environ.get("INFERENCE_WS_HOST"):
+ os.environ["INFERENCE_WS_HOST"] = ws_host
+
+ logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
+
  # Inference WebSocket server with --net=host (Port: 8102)
  worker_cmd = (
  f"docker run -d --pull=always --net=host "
@@ -1164,7 +1178,6 @@ def inference_ws_server_execute(self: ActionInstance):
  f"{image} "
  f"./app "
  f"{self.action_record_id} "
-
  )
  logging.info("Starting inference WebSocket server (Port: 8102): %s", worker_cmd)

@@ -1185,7 +1198,13 @@ def fe_fs_streaming_execute(self: ActionInstance):
  image = action_details["actionDetails"].get("docker")

  self.setup_action_requirements(action_details)
-
+
+ # Get the ws_host from environment variable set by inference_ws_server_execute
+ ws_host = os.environ.get("INFERENCE_WS_HOST", "localhost")
+ ws_url = f"{ws_host}:8102"
+
+ logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
+
  # Frontend streaming with --net=host (Port: 3000)
  worker_cmd = (
  f"docker run -d --pull=always --net=host "
@@ -1195,9 +1214,10 @@ def fe_fs_streaming_execute(self: ActionInstance):
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
  f"-e PORT=3000 "
+ f'-e WS_HOST="{ws_url}" '
  f"{image}"
  )
- logging.info("Starting frontend streaming (Port: 3000): %s", worker_cmd)
+ logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)

  # Docker Command run
  self.start(worker_cmd, "fe_fs_streaming")
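The WS_HOST hand-off between these two actions is just a process-local environment variable: inference_ws_server_execute publishes the host it picked, and fe_fs_streaming_execute reads it back with a localhost fallback. A minimal sketch of that pattern (the 10.0.0.5 address is illustrative only):

    import os

    # Producer side (inference_ws_server_execute): publish the chosen host once.
    os.environ.setdefault("INFERENCE_WS_HOST", "10.0.0.5")

    # Consumer side (fe_fs_streaming_execute): read it back, defaulting to localhost.
    ws_url = f"{os.environ.get('INFERENCE_WS_HOST', 'localhost')}:8102"
    print(ws_url)  # 10.0.0.5:8102

Note that this hand-off only works when both actions run in the same agent process, since os.environ is not shared across processes.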
@@ -1304,6 +1324,11 @@ def redis_setup_execute(self: ActionInstance):
  action_id=action_id,
  )

+ # Get the best IP for Redis (port 6379)
+ redis_host, _ = get_best_service_ip_and_network(6379)
+
+ logging.info(f"Redis will use IP: {redis_host} on port 6379")
+
  redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")

  # Redis container with --net=host (Port: 6379)
@@ -1315,7 +1340,7 @@ def redis_setup_execute(self: ActionInstance):
  f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
  )

- logging.info("Starting Redis container (Port: 6379): %s", redis_cmd)
+ logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)

  # Start Redis container first
  redis_process = subprocess.Popen(
@@ -1324,13 +1349,13 @@ def redis_setup_execute(self: ActionInstance):
  stdout=subprocess.PIPE,
  stderr=subprocess.PIPE,
  )
- logging.info("Redis container started successfully on localhost:6379")
+ logging.info("Redis container started successfully on %s:6379", redis_host)

  # Wait for Redis to be ready
  time.sleep(5)

  env_vars = {
- "REDIS_URL": f"localhost:6379",
+ "REDIS_URL": f"{redis_host}:6379",
  "REDIS_PASSWORD": redis_password,
  }

@@ -1348,7 +1373,7 @@ def redis_setup_execute(self: ActionInstance):
  f"{self.action_record_id} "
  )

- logging.info("Starting bg-redis management (Port: 8082): %s", cmd)
+ logging.info("Starting bg-redis management (Port: 8082) with REDIS_URL=%s: %s", env_vars['REDIS_URL'], cmd)

  self.start(cmd, "redis_setup")

@@ -1386,6 +1411,12 @@ def model_deploy_execute(self: ActionInstance):
  action_id=action_id,
  )
  use_gpu = self.get_gpu_config(action_details)
+
+ gpuRequired = action_details["actionDetails"]["gpuRequired"]
+ if gpuRequired==False:
+ use_gpu = ""
+ else:
+ use_gpu = "--runtime=nvidia"
  extra_env_vars = {"INTERNAL_PORT": internal_port}
  cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
  logging.info("cmd is: %s", cmd)
src/matrice_compute/instance_utils.py
@@ -95,28 +95,72 @@ def get_instance_info(service_provider: str = None, instance_id: str = None) ->
  return str(auto_service_provider), str(auto_instance_id)


+ def _normalize_timestamp(timestamp_str: str) -> str:
+ """
+ Normalize timestamp string to handle different precision levels.
+
+ Handles nanoseconds (9 digits), microseconds (6 digits), milliseconds (3 digits),
+ and various timezone formats across different cloud providers.
+
+ Args:
+ timestamp_str (str): Timestamp string in various formats
+
+ Returns:
+ str: Normalized timestamp string compatible with fromisoformat()
+ """
+ # Replace 'Z' with '+00:00' for UTC timestamps
+ timestamp_str = timestamp_str.replace("Z", "+00:00")
+
+ # Handle fractional seconds - Python's datetime only supports up to 6 digits (microseconds)
+ # Some providers (like OCI, GCP) may return nanoseconds (9 digits)
+ if "." in timestamp_str:
+ # Split into main part and fractional part
+ if "+" in timestamp_str:
+ main_part, tz_part = timestamp_str.rsplit("+", 1)
+ tz_suffix = "+" + tz_part
+ elif timestamp_str.count("-") > 2: # Has negative timezone offset
+ main_part, tz_part = timestamp_str.rsplit("-", 1)
+ tz_suffix = "-" + tz_part
+ else:
+ main_part = timestamp_str
+ tz_suffix = ""
+
+ # Split main part into date/time and fractional seconds
+ datetime_part, fractional = main_part.rsplit(".", 1)
+
+ # Truncate fractional seconds to 6 digits (microseconds)
+ if len(fractional) > 6:
+ fractional = fractional[:6]
+
+ # Reconstruct timestamp
+ timestamp_str = f"{datetime_part}.{fractional}{tz_suffix}"
+
+ return timestamp_str
+
+
  @log_errors(default_return=0, raise_exception=False, log_error=False)
  def calculate_time_difference(start_time_str: str, finish_time_str: str) -> int:
  """
  Calculate time difference between start and finish times.
+
+ Robust handling of timestamps from different cloud providers (AWS, GCP, Azure, OCI)
+ and different precision levels (nanoseconds, microseconds, milliseconds).

  Args:
- start_time_str (str): Start time string
- finish_time_str (str): Finish time string
+ start_time_str (str): Start time string in ISO format
+ finish_time_str (str): Finish time string in ISO format

  Returns:
  int: Time difference in seconds
  """
- if os.environ["SERVICE_PROVIDER"] in [
- "AWS",
- "OCI",
- "LAMBDA",
- ]:
- start_time = datetime.fromisoformat(start_time_str.split(".")[0] + "+00:00")
- finish_time = datetime.fromisoformat(finish_time_str.split(".")[0] + "+00:00")
- else:
- start_time = datetime.fromisoformat(start_time_str.replace("Z", "+00:00"))
- finish_time = datetime.fromisoformat(finish_time_str.replace("Z", "+00:00"))
+ # Normalize both timestamps to handle different formats
+ normalized_start = _normalize_timestamp(start_time_str)
+ normalized_finish = _normalize_timestamp(finish_time_str)
+
+ # Parse the normalized timestamps
+ start_time = datetime.fromisoformat(normalized_start)
+ finish_time = datetime.fromisoformat(normalized_finish)
+
  return int((finish_time - start_time).total_seconds())


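For context, datetime.fromisoformat() on Python versions before 3.11 rejects both a trailing 'Z' and fractional seconds longer than six digits, which appears to be what _normalize_timestamp works around. A small sketch of the same normalization on a made-up OCI-style timestamp:

    from datetime import datetime

    # Made-up timestamp with 9 fractional digits (nanoseconds) and a 'Z' suffix.
    raw = "2024-05-01T12:00:00.123456789Z"

    # Same steps as _normalize_timestamp: swap 'Z' for '+00:00', then truncate
    # the fractional part to 6 digits (microseconds).
    ts = raw.replace("Z", "+00:00")
    main, tz = ts.rsplit("+", 1)
    head, frac = main.rsplit(".", 1)
    ts = f"{head}.{frac[:6]}+{tz}"

    print(ts)                          # 2024-05-01T12:00:00.123456+00:00
    print(datetime.fromisoformat(ts))  # parses cleanly on Python 3.7+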
@@ -129,14 +173,25 @@ def has_gpu() -> bool:
  bool: True if GPU is present, False otherwise
  """
  try:
- subprocess.run("nvidia-smi", timeout=5)
- return True
+ result = subprocess.run(
+ ["nvidia-smi"],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ timeout=5,
+ check=False,
+ )
+ return result.returncode == 0
  except subprocess.TimeoutExpired:
- logging.warning("nvidia-smi command timed out after 5 seconds")
+ logging.debug("nvidia-smi command timed out after 5 seconds")
+ return False
+ except FileNotFoundError:
+ logging.debug("nvidia-smi not found on this system")
+ return False
+ except Exception:
  return False


- @log_errors(default_return=0, raise_exception=False)
+ @log_errors(default_return=0, raise_exception=False, log_error=False)
  def get_gpu_memory_usage() -> float:
  """
  Get GPU memory usage percentage.
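The rewritten has_gpu() is one instance of the subprocess pattern this release applies throughout: run with check=False plus a timeout, and treat a non-zero exit, a hung command, or a missing binary as a soft failure rather than an exception. A minimal standalone sketch of that pattern (command_succeeds is an illustrative helper name, not part of the package):

    import subprocess

    def command_succeeds(cmd: list, timeout: int = 5) -> bool:
        """Return True only if cmd exits 0; never raise to the caller."""
        try:
            result = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                timeout=timeout,
                check=False,
            )
            return result.returncode == 0
        except subprocess.TimeoutExpired:
            return False  # hung command counts as failure
        except FileNotFoundError:
            return False  # binary not installed counts as failure
        except Exception:
            return False

    # has_gpu() reduces to roughly command_succeeds(["nvidia-smi"]).
    print(command_succeeds(["nvidia-smi"]))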
@@ -144,17 +199,35 @@ def get_gpu_memory_usage() -> float:
  Returns:
  float: Memory usage between 0 and 1
  """
- command = "nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader"
+ command = ["nvidia-smi", "--query-gpu=memory.used,memory.total", "--format=csv,nounits,noheader"]
  try:
- output = subprocess.check_output(command.split(), timeout=5).decode("ascii").strip().split("\n")
+ result = subprocess.run(
+ command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ timeout=5,
+ check=False,
+ )
+ if result.returncode != 0:
+ logging.debug("nvidia-smi command failed in get_gpu_memory_usage")
+ return 0
+ output = result.stdout.decode("ascii").strip().split("\n")
  memory_percentages = []
  for line in output:
- used, total = map(int, line.split(","))
- usage_percentage = used / total
- memory_percentages.append(usage_percentage)
- return min(memory_percentages)
+ if line.strip():
+ used, total = map(int, line.split(","))
+ if total > 0:
+ usage_percentage = used / total
+ memory_percentages.append(usage_percentage)
+ return min(memory_percentages) if memory_percentages else 0
  except subprocess.TimeoutExpired:
- logging.warning("nvidia-smi command timed out after 5 seconds in get_gpu_memory_usage")
+ logging.debug("nvidia-smi command timed out after 5 seconds in get_gpu_memory_usage")
+ return 0
+ except (ValueError, IndexError) as e:
+ logging.debug("Error parsing GPU memory info: %s", e)
+ return 0
+ except Exception as e:
+ logging.debug("Unexpected error in get_gpu_memory_usage: %s", e)
  return 0

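The new parsing loop in get_gpu_memory_usage() skips blank lines and zero totals before taking the minimum, i.e. the usage of the least-loaded GPU. A small sketch with fabricated nvidia-smi CSV output (values in MiB):

    # Fabricated output of:
    #   nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader
    sample = "2048, 16384\n512, 16384\n"

    memory_percentages = []
    for line in sample.strip().split("\n"):
        if line.strip():
            used, total = map(int, line.split(","))
            if total > 0:
                memory_percentages.append(used / total)

    print(min(memory_percentages) if memory_percentages else 0)  # 0.03125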
@@ -194,7 +267,7 @@ def get_mem_usage() -> float:
  return mem_usage


- @log_errors(default_return=[], raise_exception=False)
+ @log_errors(default_return=[], raise_exception=False, log_error=False)
  def get_gpu_info() -> list:
  """
  Get GPU information.
@@ -202,23 +275,34 @@ def get_gpu_info() -> list:
  Returns:
  list: GPU information strings
  """
- proc = subprocess.Popen(
- [
- "nvidia-smi",
- "--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu",
- "--format=csv,noheader,nounits",
- ],
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- )
  try:
- stdout, stderr = proc.communicate(timeout=5)
- output = stdout.decode("UTF-8")
- return output.split("\n")[:-1]
- except subprocess.TimeoutExpired:
- logging.warning("nvidia-smi command timed out after 5 seconds in get_gpu_info")
- proc.kill()
- proc.communicate() # flush output after kill
+ proc = subprocess.Popen(
+ [
+ "nvidia-smi",
+ "--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu",
+ "--format=csv,noheader,nounits",
+ ],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+ try:
+ stdout, stderr = proc.communicate(timeout=5)
+ if proc.returncode != 0:
+ logging.debug("nvidia-smi command failed in get_gpu_info")
+ return []
+ output = stdout.decode("UTF-8")
+ result = [line for line in output.split("\n") if line.strip()]
+ return result
+ except subprocess.TimeoutExpired:
+ logging.debug("nvidia-smi command timed out after 5 seconds in get_gpu_info")
+ proc.kill()
+ proc.communicate() # flush output after kill
+ return []
+ except FileNotFoundError:
+ logging.debug("nvidia-smi not found on this system")
+ return []
+ except Exception as e:
+ logging.debug("Error getting GPU info: %s", e)
  return []

@@ -241,11 +325,29 @@ def is_docker_running() -> bool:
  Returns:
  bool: True if Docker containers are running
  """
- command = "docker ps"
- docker_images = (
- subprocess.check_output(command.split()).decode("ascii").split("\n")[:-1][1:]
- )
- return bool(docker_images)
+ command = ["docker", "ps"]
+ try:
+ result = subprocess.run(
+ command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ check=False,
+ timeout=10,
+ )
+ if result.returncode != 0:
+ logging.warning("docker ps command failed")
+ return False
+ docker_images = result.stdout.decode("ascii").split("\n")[:-1][1:]
+ return bool(docker_images)
+ except subprocess.TimeoutExpired:
+ logging.warning("docker ps command timed out")
+ return False
+ except FileNotFoundError:
+ logging.warning("docker command not found")
+ return False
+ except Exception as e:
+ logging.warning("Error checking if docker is running: %s", e)
+ return False


  @log_errors(default_return=None, raise_exception=False)
@@ -502,7 +604,7 @@ def is_allowed_gpu_device(gpu_index: int) -> bool:
  return int(gpu_index) in allowed_gpus


- @log_errors(raise_exception=True)
+ @log_errors(raise_exception=True, log_error=False)
  def get_gpu_with_sufficient_memory_for_action(
  action_details: dict,
  ) -> list:
@@ -519,16 +621,38 @@ def get_gpu_with_sufficient_memory_for_action(
  ValueError: If insufficient GPU memory
  """
  required_gpu_memory = get_required_gpu_memory(action_details)
- command = "nvidia-smi --query-gpu=memory.free --format=csv"
+ command = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
  try:
- memory_free_info = subprocess.check_output(command.split(), timeout=5).decode("ascii").split("\n")
+ result = subprocess.run(
+ command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ timeout=5,
+ check=False,
+ )
+ if result.returncode != 0:
+ raise ValueError("Failed to get GPU information - nvidia-smi command failed")
+ memory_free_info = result.stdout.decode("ascii").strip().split("\n")
  except subprocess.TimeoutExpired:
- logging.error("nvidia-smi command timed out after 5 seconds in get_gpu_with_sufficient_memory_for_action")
+ logging.warning("nvidia-smi command timed out after 5 seconds in get_gpu_with_sufficient_memory_for_action")
  raise ValueError("Failed to get GPU information - nvidia-smi timed out")
+ except FileNotFoundError:
+ raise ValueError("nvidia-smi not found - no GPU support available")
+ except Exception as e:
+ logging.warning("Error running nvidia-smi: %s", e)
+ raise ValueError(f"Failed to get GPU information: {e}")

  if len(memory_free_info) < 2:
  raise ValueError("No GPU information available from nvidia-smi")
- memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:-1]]
+
+ try:
+ memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:] if x.strip()]
+ except (ValueError, IndexError) as e:
+ raise ValueError(f"Error parsing GPU memory information: {e}")
+
+ if not memory_free_values:
+ raise ValueError("No GPU devices found")
+
  if required_gpu_memory < 80000:
  try:
  return get_single_gpu_with_sufficient_memory_for_action(action_details)
@@ -546,11 +670,11 @@ def get_gpu_with_sufficient_memory_for_action(
  if total_memory >= required_gpu_memory:
  return selected_gpus
  raise ValueError(
- f"Insufficient GPU memory available. Required: {required_gpu_memory}, Available: {total_memory}"
+ f"Insufficient GPU memory available. Required: {required_gpu_memory}MB, Available: {total_memory}MB"
  )


- @log_errors(raise_exception=True)
+ @log_errors(raise_exception=True, log_error=False)
  def get_single_gpu_with_sufficient_memory_for_action(
  action_details: dict,
  ) -> list:
@@ -567,16 +691,38 @@ def get_single_gpu_with_sufficient_memory_for_action(
  ValueError: If no GPU has sufficient memory
  """
  required_gpu_memory = get_required_gpu_memory(action_details)
- command = "nvidia-smi --query-gpu=memory.free --format=csv"
+ command = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
  try:
- memory_free_info = subprocess.check_output(command.split(), timeout=5).decode("ascii").split("\n")
+ result = subprocess.run(
+ command,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ timeout=5,
+ check=False,
+ )
+ if result.returncode != 0:
+ raise ValueError("Failed to get GPU information - nvidia-smi command failed")
+ memory_free_info = result.stdout.decode("ascii").strip().split("\n")
  except subprocess.TimeoutExpired:
- logging.error("nvidia-smi command timed out after 5 seconds in get_single_gpu_with_sufficient_memory_for_action")
+ logging.warning("nvidia-smi command timed out after 5 seconds in get_single_gpu_with_sufficient_memory_for_action")
  raise ValueError("Failed to get GPU information - nvidia-smi timed out")
+ except FileNotFoundError:
+ raise ValueError("nvidia-smi not found - no GPU support available")
+ except Exception as e:
+ logging.warning("Error running nvidia-smi: %s", e)
+ raise ValueError(f"Failed to get GPU information: {e}")

  if len(memory_free_info) < 2:
  raise ValueError("No GPU information available from nvidia-smi")
- memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:-1]]
+
+ try:
+ memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:] if x.strip()]
+ except (ValueError, IndexError) as e:
+ raise ValueError(f"Error parsing GPU memory information: {e}")
+
+ if not memory_free_values:
+ raise ValueError("No GPU devices found")
+
  best_fit_gpu = None
  best_fit_memory = float("inf")
  for i, mem in enumerate(memory_free_values):
@@ -692,47 +838,112 @@ def get_encrypted_access_key_pair(

  return encoded_access_key, encoded_secret_key

- @log_errors(default_return=False, raise_exception=False)
- def check_public_port_exposure(port: int) -> bool:
+ def _get_private_ip() -> str:
  """
- Check if port is publicly accessible.
+ Get the actual private/LAN IP address using UDP socket trick.
+ This works reliably even in Docker, NAT, VPN, etc.
+
+ Returns:
+ str: Private IP address or None if not available
+ """
+ try:
+ # Use UDP socket to determine which interface would be used for external connection
+ # No actual packets are sent
+ with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
+ s.connect(("8.8.8.8", 80))
+ private_ip = s.getsockname()[0]
+ return private_ip
+ except Exception:
+ return None

- Args:
- port (int): Port number to check

- Returns:
- bool: True if port is publicly accessible
- """
- is_public_exposed = False
- is_locally_available = False
- # Check if port is publicly accessible
- public_ip = urllib.request.urlopen("https://ident.me", timeout=10).read().decode("utf8")
- with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as conn_sock:
- conn_sock.settimeout(3)
- result = conn_sock.connect_ex((public_ip, port))
- is_public_exposed = result == 0
+ def _public_ip_is_local(public_ip: str) -> bool:
+ """
+ Check if a public IP address is actually assigned to a local network interface.
+ This is true on cloud servers with real public IPs, false behind NAT.

- # Check if port is locally available
- with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as bind_sock:
- bind_sock.setsockopt(
- socket.SOL_SOCKET,
- socket.SO_REUSEADDR,
- 1,
- )
- bind_sock.bind(("", port))
- bind_sock.listen(1)
- is_locally_available = True
-
- if not is_public_exposed:
- logging.debug(
- "Port %d is not publicly exposed",
- port,
- )
+ Args:
+ public_ip (str): The public IP to check
+
+ Returns:
+ bool: True if the public IP is on a local interface
+ """
+ try:
+ for iface, addrs in psutil.net_if_addrs().items():
+ for addr in addrs:
+ if addr.family == socket.AF_INET:
+ if addr.address == public_ip:
+ return True
  return False
- if not is_locally_available:
- logging.debug(
- "Port %d is not locally available",
- port,
- )
+ except Exception:
  return False
- return True
+
+
+ @log_errors(default_return=("localhost", True), raise_exception=False)
+ def get_best_service_ip_and_network(port: int) -> tuple:
+ """
+ Determine the best IP address and network configuration for a service.
+
+ This function intelligently selects the best IP to bind a service to:
+
+ Priority:
+ 1. Public IP if it's actually on a local interface (cloud servers)
+ 2. Private/LAN IP (NAT, local network, Docker)
+ 3. localhost with --net=host (fallback)
+
+ Args:
+ port (int): Port number for the service
+
+ Returns:
+ tuple: (ip_address, use_host_network) where:
+ - ip_address: The IP address to use (public, private, or localhost)
+ - use_host_network: True if should use --net=host, False if should use port mapping
+ """
+ try:
+ # Check if port is available (not already in use)
+ try:
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as test_sock:
+ test_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+ test_sock.bind(("0.0.0.0", port))
+ test_sock.listen(1)
+ # Port is available - socket closes automatically
+ except OSError as e:
+ logging.warning(f"Port {port} is already in use or cannot be bound: {e}, will use --net=host")
+ return "localhost", True
+
+ # Get the actual private/LAN IP
+ private_ip = _get_private_ip()
+ if private_ip:
+ logging.info(f"Determined private/LAN IP: {private_ip}")
+ else:
+ logging.debug("Could not determine private IP")
+
+ # Try to get public IP from external service
+ public_ip = None
+ try:
+ public_ip = urllib.request.urlopen("https://ident.me", timeout=10).read().decode("utf8").strip()
+ # Validate it's a proper IP address
+ socket.inet_aton(public_ip)
+ logging.info(f"Determined external/public IP: {public_ip}")
+ except Exception as e:
+ logging.debug(f"Could not determine public IP: {e}")
+
+ # Decision logic: Choose the best IP
+
+ # 1. If public IP is on a local interface, use it (cloud server with real public IP)
+ if public_ip and _public_ip_is_local(public_ip):
+ logging.info(f"Public IP {public_ip} is on local interface, using it for port {port}")
+ return public_ip, False
+
+ # 2. If we have a valid private IP, use it (most common case: NAT, LAN, Docker)
+ if private_ip and not private_ip.startswith("127."):
+ logging.info(f"Using private/LAN IP {private_ip} for port {port}")
+ return private_ip, False
+
+ # 3. Fall back to localhost with --net=host
+ logging.info(f"No suitable IP found, using localhost with --net=host for port {port}")
+ return "localhost", True
+
+ except Exception as e:
+ logging.warning(f"Error determining best IP for port {port}: {e}, falling back to localhost")
+ return "localhost", True
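The private-IP detection that get_best_service_ip_and_network() relies on is the standard connected-UDP-socket trick; a minimal standalone sketch (lan_ip and probe_addr are illustrative names, not part of the package):

    import socket

    def lan_ip(probe_addr: str = "8.8.8.8") -> str:
        # Connecting a UDP socket sends no packets; it only selects the outbound
        # interface, whose address getsockname() then reports.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
            s.connect((probe_addr, 80))
            return s.getsockname()[0]

    print(lan_ip())  # e.g. 192.168.1.23 rather than 127.0.0.1

On a host with no route to probe_addr the connect() call raises, which is why the real helper wraps this in try/except and the caller falls back to localhost with --net=host.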
src/matrice_compute/resources_tracker.py
@@ -56,7 +56,7 @@ class ResourcesTracker:
  return cpu_utilization, memory_utilization
  return 0, 0

- @log_errors(default_return=(0, 0), raise_exception=False)
+ @log_errors(default_return=(0, 0), raise_exception=False, log_error=False)
  def get_container_cpu_and_memory_with_container_id(self, container_id: str) -> Tuple[float, float]:
  """
  Get CPU and memory usage for a specific container by its ID.
@@ -67,32 +67,46 @@ class ResourcesTracker:
  Returns:
  Tuple[float, float]: CPU utilization percentage and memory usage in MB.
  """
- stats_result = subprocess.run(
- [
- "docker",
- "stats",
- "--no-stream",
- "--format",
- "{{.ID}}: {{.CPUPerc}} CPU, {{.MemUsage}} RAM",
- container_id,
- ],
- capture_output=True,
- text=True,
- check=True,
- )
- stats = stats_result.stdout.strip().split(": ")[1].split(", ")
- cpu_usage = float(stats[0].replace("% CPU", "").strip())
- memory_usage = stats[1].split(" / ")[0]
- mem_value, mem_unit = memory_usage[:-3], memory_usage[-3:]
- if mem_unit == "KiB":
- memory_usage_mb = float(mem_value) / 1024
- elif mem_unit == "MiB":
- memory_usage_mb = float(mem_value)
- elif mem_unit == "GiB":
- memory_usage_mb = float(mem_value) * 1024
- else:
- memory_usage_mb = float(mem_value)
- return cpu_usage, memory_usage_mb
+ try:
+ stats_result = subprocess.run(
+ [
+ "docker",
+ "stats",
+ "--no-stream",
+ "--format",
+ "{{.ID}}: {{.CPUPerc}} CPU, {{.MemUsage}} RAM",
+ container_id,
+ ],
+ capture_output=True,
+ text=True,
+ check=False,
+ timeout=10,
+ )
+ if stats_result.returncode != 0:
+ logging.debug("docker stats command failed for container %s", container_id)
+ return 0, 0
+ stats = stats_result.stdout.strip().split(": ")[1].split(", ")
+ cpu_usage = float(stats[0].replace("% CPU", "").strip())
+ memory_usage = stats[1].split(" / ")[0]
+ mem_value, mem_unit = memory_usage[:-3], memory_usage[-3:]
+ if mem_unit == "KiB":
+ memory_usage_mb = float(mem_value) / 1024
+ elif mem_unit == "MiB":
+ memory_usage_mb = float(mem_value)
+ elif mem_unit == "GiB":
+ memory_usage_mb = float(mem_value) * 1024
+ else:
+ memory_usage_mb = float(mem_value)
+ return cpu_usage, memory_usage_mb
+ except subprocess.TimeoutExpired:
+ logging.debug("docker stats command timed out for container %s", container_id)
+ return 0, 0
+ except (ValueError, IndexError) as e:
+ logging.debug("Error parsing docker stats for container %s: %s", container_id, e)
+ return 0, 0
+ except Exception as e:
+ logging.debug("Unexpected error getting container stats for %s: %s", container_id, e)
+ return 0, 0

  @log_errors(default_return=(0, 0), raise_exception=False, log_error=False)
  def get_container_gpu_info(self, container_id: str) -> Tuple[float, int]:
@@ -110,7 +124,7 @@ class ResourcesTracker:
  gpu_mem_used = self.get_container_gpu_memory_usage(container_pid)
  return gpu_util, gpu_mem_used

- @log_errors(default_return="", raise_exception=False)
+ @log_errors(default_return="", raise_exception=False, log_error=False)
  def get_pid_id_by_container_id(self, container_id: str) -> str:
  """
  Get PID for a container ID.
@@ -121,20 +135,31 @@ class ResourcesTracker:
  Returns:
  str: PID of the container.
  """
- pid_result = subprocess.run(
- [
- "docker",
- "inspect",
- "--format",
- "{{.State.Pid}}",
- container_id,
- ],
- capture_output=True,
- text=True,
- check=True,
- )
- container_pid = pid_result.stdout.strip()
- return container_pid
+ try:
+ pid_result = subprocess.run(
+ [
+ "docker",
+ "inspect",
+ "--format",
+ "{{.State.Pid}}",
+ container_id,
+ ],
+ capture_output=True,
+ text=True,
+ check=False,
+ timeout=10,
+ )
+ if pid_result.returncode != 0:
+ logging.debug("docker inspect command failed for container %s", container_id)
+ return ""
+ container_pid = pid_result.stdout.strip()
+ return container_pid
+ except subprocess.TimeoutExpired:
+ logging.debug("docker inspect command timed out for container %s", container_id)
+ return ""
+ except Exception as e:
+ logging.debug("Error getting PID for container %s: %s", container_id, e)
+ return ""

  @log_errors(default_return=0, raise_exception=False, log_error=False)
  def get_container_gpu_usage(self, container_pid: str) -> float:
@@ -155,9 +180,12 @@ class ResourcesTracker:
  ["nvidia-smi", "pmon", "-c", "1"],
  capture_output=True,
  text=True,
- check=True,
+ check=False,
  timeout=5,
  )
+ if result.returncode != 0:
+ logging.debug("nvidia-smi pmon command failed in get_container_gpu_usage")
+ return 0
  pmon_output = result.stdout.strip().split("\n")
  for line in pmon_output[2:]:
  parts = line.split()
@@ -167,7 +195,16 @@ class ResourcesTracker:
  if pid == str(container_pid):
  gpu_util += float(gpu_usage) if gpu_usage != "-" else 0
  except subprocess.TimeoutExpired:
- logging.warning("nvidia-smi pmon command timed out after 5 seconds in get_container_gpu_usage")
+ logging.debug("nvidia-smi pmon command timed out after 5 seconds in get_container_gpu_usage")
+ return 0
+ except (ValueError, IndexError) as e:
+ logging.debug("Error parsing GPU usage info: %s", e)
+ return 0
+ except FileNotFoundError:
+ logging.debug("nvidia-smi not found on this system")
+ return 0
+ except Exception as e:
+ logging.debug("Unexpected error in get_container_gpu_usage: %s", e)
  return 0
  return gpu_util

@@ -196,9 +233,12 @@ class ResourcesTracker:
  stdout=subprocess.PIPE,
  stderr=subprocess.PIPE,
  text=True,
- check=True,
+ check=False,
  timeout=5,
  )
+ if result.returncode != 0:
+ logging.debug("nvidia-smi command failed in get_container_gpu_memory_usage")
+ return 0
  for line in result.stdout.splitlines():
  parts = line.strip().split(", ")
  if len(parts) == 2:
@@ -206,7 +246,16 @@ class ResourcesTracker:
  if process_pid == str(container_pid):
  total_memory += int(used_memory)
  except subprocess.TimeoutExpired:
- logging.warning("nvidia-smi command timed out after 5 seconds in get_container_gpu_memory_usage")
+ logging.debug("nvidia-smi command timed out after 5 seconds in get_container_gpu_memory_usage")
+ return 0
+ except (ValueError, IndexError) as e:
+ logging.debug("Error parsing GPU memory usage info: %s", e)
+ return 0
+ except FileNotFoundError:
+ logging.debug("nvidia-smi not found on this system")
+ return 0
+ except Exception as e:
+ logging.debug("Unexpected error in get_container_gpu_memory_usage: %s", e)
  return 0
  return total_memory

@@ -238,17 +287,40 @@ class ResourcesTracker:
  return gpu_memory_free, gpu_utilization

  try:
- subprocess.check_output("nvidia-smi", timeout=5)
+ result = subprocess.run(
+ ["nvidia-smi"],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ timeout=5,
+ check=False,
+ )
+ if result.returncode != 0:
+ logging.debug("nvidia-smi command failed in _get_gpu_resources")
+ return 0, 0.0
  except subprocess.TimeoutExpired:
- logging.warning("nvidia-smi command timed out after 5 seconds in _get_gpu_resources")
+ logging.debug("nvidia-smi command timed out after 5 seconds in _get_gpu_resources")
+ return 0, 0.0
+ except FileNotFoundError:
+ logging.debug("nvidia-smi not found on this system")
+ return 0, 0.0
+ except Exception as e:
+ logging.debug("Error running nvidia-smi in _get_gpu_resources: %s", e)
  return 0, 0.0

  info_list = get_gpu_info()
- for info in info_list:
- info_split = info.split(", ")
- gpu_memory_free += int(info_split[5])
- gpu_utilization += float(info_split[2])
- gpu_utilization /= len(info_list) if info_list else 1
+ if not info_list:
+ return 0, 0.0
+
+ try:
+ for info in info_list:
+ info_split = info.split(", ")
+ if len(info_split) >= 6:
+ gpu_memory_free += int(info_split[5])
+ gpu_utilization += float(info_split[2])
+ gpu_utilization /= len(info_list) if info_list else 1
+ except (ValueError, IndexError) as e:
+ logging.debug("Error parsing GPU resources: %s", e)
+ return 0, 0.0

  return gpu_memory_free, gpu_utilization