matrice-compute 0.1.13__tar.gz → 0.1.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/PKG-INFO +1 -1
  2. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/matrice_compute.egg-info/PKG-INFO +1 -1
  3. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/action_instance.py +118 -103
  4. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/instance_utils.py +46 -15
  5. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/resources_tracker.py +59 -32
  6. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/scaling.py +5 -2
  7. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/task_utils.py +2 -2
  8. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/LICENSE.txt +0 -0
  9. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/README.md +0 -0
  10. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/matrice_compute.egg-info/SOURCES.txt +0 -0
  11. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/matrice_compute.egg-info/dependency_links.txt +0 -0
  12. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/matrice_compute.egg-info/not-zip-safe +0 -0
  13. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/matrice_compute.egg-info/top_level.txt +0 -0
  14. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/pyproject.toml +0 -0
  15. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/setup.cfg +0 -0
  16. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/setup.py +0 -0
  17. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/__init__.py +0 -0
  18. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/actions_manager.py +0 -0
  19. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/actions_scaledown_manager.py +0 -0
  20. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/instance_manager.py +0 -0
  21. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/prechecks.py +0 -0
  22. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/py.typed +0 -0
  23. {matrice_compute-0.1.13 → matrice_compute-0.1.15}/src/matrice_compute/shutdown_manager.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.13
3
+ Version: 0.1.15
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.13
3
+ Version: 0.1.15
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -575,75 +575,75 @@ class ActionInstance:
575
575
  )
576
576
  raise
577
577
 
578
- @log_errors(raise_exception=False)
579
- def create_redis_container(self, redis_image=None, redis_password=None):
580
- """Create and start a Redis container using Docker.
581
-
582
- Args:
583
- redis_image (str, optional): Redis Docker image to use. Defaults to 'redis:latest'
584
-
585
- Returns:
586
- tuple: (container_info, error, message)
587
- """
588
- if redis_image is None:
589
- redis_image = "redis:latest"
590
-
591
- network_name = f"redis_network_{int(time.time())}"
592
- subprocess.run(f"docker network create {network_name}", shell=True, check=True)
593
-
594
- try:
595
- # Get an available port for Redis
596
- external_port = "6379"
597
-
598
- # Generate a unique container name and password
599
- container_name = f"redis_container_{int(time.time())}"
600
-
601
- # Build the docker command to create Redis container with password
602
- cmd = (
603
- f"docker run -d "
604
- f"--network {network_name} "
605
- f"--name {container_name} "
606
- f"-p {external_port}:6379 "
607
- f"--restart unless-stopped "
608
- f"{redis_image} "
609
- f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
610
- )
611
-
612
- logging.info("Creating Redis container with command: %s", cmd)
613
-
614
- # Execute the command
615
- result = subprocess.run(
616
- cmd, shell=True, capture_output=True, text=True, timeout=60
617
- )
618
-
619
- if result.returncode == 0:
620
- container_id = result.stdout.strip()
621
- container_info = {
622
- "container_id": container_id,
623
- "container_name": container_name,
624
- "network_name": network_name,
625
- "external_port": external_port,
626
- "internal_port": 6379,
627
- "password": redis_password,
628
- "image": redis_image,
629
- "status": "running",
630
- }
631
-
632
- logging.info("Redis container created successfully: %s", container_info)
633
- return container_info, None, "Redis container created successfully"
634
- else:
635
- error_message = f"Failed to create Redis container: {result.stderr}"
636
- logging.error(error_message)
637
- return None, "ContainerCreationError", error_message
638
-
639
- except subprocess.TimeoutExpired:
640
- error_message = "Timeout while creating Redis container"
641
- logging.error(error_message)
642
- return None, "TimeoutError", error_message
643
- except Exception as e:
644
- error_message = f"Unexpected error creating Redis container: {str(e)}"
645
- logging.error(error_message)
646
- return None, "UnexpectedError", error_message
578
+ # @log_errors(raise_exception=False)
579
+ # def create_redis_container(self, redis_image=None, redis_password=None):
580
+ # """Create and start a Redis container using Docker.
581
+
582
+ # Args:
583
+ # redis_image (str, optional): Redis Docker image to use. Defaults to 'redis:latest'
584
+
585
+ # Returns:
586
+ # tuple: (container_info, error, message)
587
+ # """
588
+ # if redis_image is None:
589
+ # redis_image = "redis:latest"
590
+
591
+ # network_name = f"redis_network_{int(time.time())}"
592
+ # subprocess.run(f"docker network create {network_name}", shell=True, check=True)
593
+
594
+ # try:
595
+ # # Get an available port for Redis
596
+ # external_port = "6379"
597
+
598
+ # # Generate a unique container name and password
599
+ # container_name = f"redis_container_{int(time.time())}"
600
+
601
+ # # Build the docker command to create Redis container with password
602
+ # cmd = (
603
+ # f"docker run -d "
604
+ # f"--network {network_name} "
605
+ # f"--name {container_name} "
606
+ # f"-p {external_port}:6379 "
607
+ # f"--restart unless-stopped "
608
+ # f"{redis_image} "
609
+ # f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
610
+ # )
611
+
612
+ # logging.info("Creating Redis container with command: %s", cmd)
613
+
614
+ # # Execute the command
615
+ # result = subprocess.run(
616
+ # cmd, shell=True, capture_output=True, text=True, timeout=60
617
+ # )
618
+
619
+ # if result.returncode == 0:
620
+ # container_id = result.stdout.strip()
621
+ # container_info = {
622
+ # "container_id": container_id,
623
+ # "container_name": container_name,
624
+ # "network_name": network_name,
625
+ # "external_port": external_port,
626
+ # "internal_port": 6379,
627
+ # "password": redis_password,
628
+ # "image": redis_image,
629
+ # "status": "running",
630
+ # }
631
+
632
+ # logging.info("Redis container created successfully: %s", container_info)
633
+ # return container_info, None, "Redis container created successfully"
634
+ # else:
635
+ # error_message = f"Failed to create Redis container: {result.stderr}"
636
+ # logging.error(error_message)
637
+ # return None, "ContainerCreationError", error_message
638
+
639
+ # except subprocess.TimeoutExpired:
640
+ # error_message = "Timeout while creating Redis container"
641
+ # logging.error(error_message)
642
+ # return None, "TimeoutError", error_message
643
+ # except Exception as e:
644
+ # error_message = f"Unexpected error creating Redis container: {str(e)}"
645
+ # logging.error(error_message)
646
+ # return None, "UnexpectedError", error_message
647
647
 
648
648
  @log_errors(raise_exception=False, log_error=False)
649
649
  def send_logs_continuously(self):
@@ -1053,7 +1053,7 @@ def database_setup_execute(self: ActionInstance):
1053
1053
  f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
1054
1054
  f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
1055
1055
  f"-e PROJECT_ID={project_id} "
1056
- f"-e ENV=dev "
1056
+ f'-e ENV="{os.environ.get("ENV", "prod")}" '
1057
1057
  f"{image} "
1058
1058
  )
1059
1059
  print("Docker command", cmd)
@@ -1117,13 +1117,14 @@ def lpr_setup_execute(self: ActionInstance):
1117
1117
 
1118
1118
  # Add worker container run command
1119
1119
  worker_cmd = (
1120
- f"docker run -d --pull=always "
1120
+ f"docker run -d --net=host --pull=always "
1121
1121
  f"--name lpr-worker "
1122
1122
  f"-p {external_port}:8082 "
1123
1123
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1124
1124
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1125
1125
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1126
1126
  f'-e ACTION_ID="{self.action_record_id}" '
1127
+ f'-e PORT={external_port} '
1127
1128
  f"{image}"
1128
1129
  )
1129
1130
  print("Worker docker run command:", worker_cmd)
@@ -1134,27 +1135,31 @@ def lpr_setup_execute(self: ActionInstance):
1134
1135
  @log_errors(raise_exception=False)
1135
1136
  def inference_ws_server_execute(self: ActionInstance):
1136
1137
  """
1137
- Creates and start inference pipline.
1138
+ Creates and start inference pipeline.
1139
+ Inference WebSocket server runs on port 8102 (localhost only with --net=host).
1138
1140
  """
1139
1141
  action_details = self.get_action_details()
1140
1142
 
1141
1143
  if not action_details:
1142
1144
  return
1143
1145
  image = action_details["actionDetails"].get("docker")
1146
+
1144
1147
 
1145
1148
  self.setup_action_requirements(action_details)
1146
1149
 
1147
- # Add worker container run command
1150
+ # Inference WebSocket server with --net=host (Port: 8102)
1148
1151
  worker_cmd = (
1149
- f"docker run -d --pull=always "
1152
+ f"docker run -d --pull=always --net=host "
1150
1153
  f"--name inference "
1151
- f"-p 8102:8102 "
1152
1154
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1153
1155
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1154
1156
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1155
- f"{image}"
1157
+ f"{image} "
1158
+ f"./app "
1159
+ f"{self.action_record_id} "
1160
+
1156
1161
  )
1157
- print("inference docker run command:", worker_cmd)
1162
+ logging.info("Starting inference WebSocket server (Port: 8102): %s", worker_cmd)
1158
1163
 
1159
1164
  # Docker Command run
1160
1165
  self.start(worker_cmd, "inference_ws_server")
@@ -1163,7 +1168,8 @@ def inference_ws_server_execute(self: ActionInstance):
1163
1168
  @log_errors(raise_exception=False)
1164
1169
  def fe_fs_streaming_execute(self: ActionInstance):
1165
1170
  """
1166
- Creates and setup the frontend for fs streaming
1171
+ Creates and setup the frontend for fs streaming.
1172
+ Frontend streaming runs on port 3000 (localhost only with --net=host).
1167
1173
  """
1168
1174
  action_details = self.get_action_details()
1169
1175
 
@@ -1173,17 +1179,16 @@ def fe_fs_streaming_execute(self: ActionInstance):
1173
1179
 
1174
1180
  self.setup_action_requirements(action_details)
1175
1181
 
1176
- # Add worker container run command
1182
+ # Frontend streaming with --net=host (Port: 3000)
1177
1183
  worker_cmd = (
1178
- f"docker run -d --pull=always "
1184
+ f"docker run -d --pull=always --net=host "
1179
1185
  f"--name fe_streaming "
1180
- f"-p 3000:3000 "
1181
1186
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1182
1187
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1183
1188
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1184
1189
  f"{image}"
1185
1190
  )
1186
- print("fe_fs_stremaing docker run command:", worker_cmd)
1191
+ logging.info("Starting frontend streaming (Port: 3000): %s", worker_cmd)
1187
1192
 
1188
1193
  # Docker Command run
1189
1194
  self.start(worker_cmd, "fe_fs_streaming")
@@ -1235,8 +1240,8 @@ def synthetic_data_setup_execute(self: ActionInstance):
1235
1240
  def redis_setup_execute(self: ActionInstance):
1236
1241
  """
1237
1242
  Creates and starts a Redis container using Docker.
1243
+ Redis runs on port 6379 (localhost only with --net=host).
1238
1244
  """
1239
- external_port = self.scaling.get_open_port()
1240
1245
  work_fs = get_max_file_system()
1241
1246
 
1242
1247
  action_details = self.get_action_details()
@@ -1248,18 +1253,6 @@ def redis_setup_execute(self: ActionInstance):
1248
1253
  "password", f"redis_pass_{int(time.time())}"
1249
1254
  )
1250
1255
 
1251
- container_info, error, message = self.create_redis_container(
1252
- action_details["actionDetails"].get("redis_image", "redis:latest"),
1253
- redis_password=redis_password,
1254
- )
1255
- if error:
1256
- logging.error(
1257
- "Error creating Redis container: %s",
1258
- message,
1259
- )
1260
- return
1261
- logging.info("Redis container created successfully: %s", container_info)
1262
-
1263
1256
  # Initialize redis container
1264
1257
  self.setup_action_requirements(
1265
1258
  action_details,
@@ -1268,17 +1261,39 @@ def redis_setup_execute(self: ActionInstance):
1268
1261
  action_id=action_id,
1269
1262
  )
1270
1263
 
1264
+ redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
1265
+
1266
+ # Redis container with --net=host (Port: 6379)
1267
+ redis_cmd = (
1268
+ f"docker run -d --net=host "
1269
+ f"--name redis_container_{int(time.time())} "
1270
+ f"--restart unless-stopped "
1271
+ f"{redis_image} "
1272
+ f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
1273
+ )
1274
+
1275
+ logging.info("Starting Redis container (Port: 6379): %s", redis_cmd)
1276
+
1277
+ # Start Redis container first
1278
+ redis_process = subprocess.Popen(
1279
+ redis_cmd,
1280
+ shell=True,
1281
+ stdout=subprocess.PIPE,
1282
+ stderr=subprocess.PIPE,
1283
+ )
1284
+ logging.info("Redis container started successfully on localhost:6379")
1285
+
1286
+ # Wait for Redis to be ready
1287
+ time.sleep(5)
1288
+
1271
1289
  env_vars = {
1272
- "REDIS_URL": f"{container_info['container_name']}:{container_info['external_port']}",
1273
- "REDIS_PASSWORD": container_info["password"],
1290
+ "REDIS_URL": f"localhost:6379",
1291
+ "REDIS_PASSWORD": redis_password,
1274
1292
  }
1275
1293
 
1276
- network_config = f" --network {container_info['network_name']} -p 8082:8082"
1277
-
1278
- # Make the docker file here
1294
+ # bg-redis management container with --net=host (Port: 8082)
1279
1295
  cmd = (
1280
- f"docker run "
1281
- f"{network_config} "
1296
+ f"docker run --net=host "
1282
1297
  f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
1283
1298
  f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
1284
1299
  f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
@@ -1290,7 +1305,7 @@ def redis_setup_execute(self: ActionInstance):
1290
1305
  f"{self.action_record_id} "
1291
1306
  )
1292
1307
 
1293
- logging.info("cmd is: %s", cmd)
1308
+ logging.info("Starting bg-redis management (Port: 8082): %s", cmd)
1294
1309
 
1295
1310
  self.start(cmd, "redis_setup")
1296
1311
 
@@ -128,8 +128,12 @@ def has_gpu() -> bool:
128
128
  Returns:
129
129
  bool: True if GPU is present, False otherwise
130
130
  """
131
- subprocess.run("nvidia-smi", check=True)
132
- return True
131
+ try:
132
+ subprocess.run("nvidia-smi", timeout=5)
133
+ return True
134
+ except subprocess.TimeoutExpired:
135
+ logging.warning("nvidia-smi command timed out after 5 seconds")
136
+ return False
133
137
 
134
138
 
135
139
  @log_errors(default_return=0, raise_exception=False)
@@ -141,13 +145,17 @@ def get_gpu_memory_usage() -> float:
141
145
  float: Memory usage between 0 and 1
142
146
  """
143
147
  command = "nvidia-smi --query-gpu=memory.used,memory.total --format=csv,nounits,noheader"
144
- output = subprocess.check_output(command.split()).decode("ascii").strip().split("\n")
145
- memory_percentages = []
146
- for line in output:
147
- used, total = map(int, line.split(","))
148
- usage_percentage = used / total
149
- memory_percentages.append(usage_percentage)
150
- return min(memory_percentages)
148
+ try:
149
+ output = subprocess.check_output(command.split(), timeout=5).decode("ascii").strip().split("\n")
150
+ memory_percentages = []
151
+ for line in output:
152
+ used, total = map(int, line.split(","))
153
+ usage_percentage = used / total
154
+ memory_percentages.append(usage_percentage)
155
+ return min(memory_percentages)
156
+ except subprocess.TimeoutExpired:
157
+ logging.warning("nvidia-smi command timed out after 5 seconds in get_gpu_memory_usage")
158
+ return 0
151
159
 
152
160
 
153
161
  @log_errors(default_return=0, raise_exception=False)
@@ -194,17 +202,24 @@ def get_gpu_info() -> list:
194
202
  Returns:
195
203
  list: GPU information strings
196
204
  """
197
- with subprocess.Popen(
205
+ proc = subprocess.Popen(
198
206
  [
199
207
  "nvidia-smi",
200
208
  "--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,display_active,display_mode,temperature.gpu",
201
209
  "--format=csv,noheader,nounits",
202
210
  ],
203
211
  stdout=subprocess.PIPE,
204
- ) as proc:
205
- stdout, _ = proc.communicate()
212
+ stderr=subprocess.PIPE,
213
+ )
214
+ try:
215
+ stdout, stderr = proc.communicate(timeout=5)
206
216
  output = stdout.decode("UTF-8")
207
217
  return output.split("\n")[:-1]
218
+ except subprocess.TimeoutExpired:
219
+ logging.warning("nvidia-smi command timed out after 5 seconds in get_gpu_info")
220
+ proc.kill()
221
+ proc.communicate() # flush output after kill
222
+ return []
208
223
 
209
224
 
210
225
  @log_errors(default_return="", raise_exception=False)
@@ -366,7 +381,13 @@ def get_max_file_system() -> str:
366
381
  max_available_filesystem,
367
382
  max_disk["available"],
368
383
  )
369
- if max_available_filesystem in ["/", ""]:
384
+ # Check if filesystem is writable, or if it's root/empty
385
+ if max_available_filesystem in ["/", ""] or not os.access(max_available_filesystem, os.W_OK):
386
+ if max_available_filesystem not in ["/", ""]:
387
+ logging.warning(
388
+ "Filesystem %s is not writable, falling back to home directory",
389
+ max_available_filesystem,
390
+ )
370
391
  home_dir = os.path.expanduser("~")
371
392
  if not os.environ.get("WORKSPACE_DIR"):
372
393
  logging.error("WORKSPACE_DIR environment variable not set")
@@ -499,7 +520,12 @@ def get_gpu_with_sufficient_memory_for_action(
499
520
  """
500
521
  required_gpu_memory = get_required_gpu_memory(action_details)
501
522
  command = "nvidia-smi --query-gpu=memory.free --format=csv"
502
- memory_free_info = subprocess.check_output(command.split()).decode("ascii").split("\n")
523
+ try:
524
+ memory_free_info = subprocess.check_output(command.split(), timeout=5).decode("ascii").split("\n")
525
+ except subprocess.TimeoutExpired:
526
+ logging.error("nvidia-smi command timed out after 5 seconds in get_gpu_with_sufficient_memory_for_action")
527
+ raise ValueError("Failed to get GPU information - nvidia-smi timed out")
528
+
503
529
  if len(memory_free_info) < 2:
504
530
  raise ValueError("No GPU information available from nvidia-smi")
505
531
  memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:-1]]
@@ -542,7 +568,12 @@ def get_single_gpu_with_sufficient_memory_for_action(
542
568
  """
543
569
  required_gpu_memory = get_required_gpu_memory(action_details)
544
570
  command = "nvidia-smi --query-gpu=memory.free --format=csv"
545
- memory_free_info = subprocess.check_output(command.split()).decode("ascii").split("\n")
571
+ try:
572
+ memory_free_info = subprocess.check_output(command.split(), timeout=5).decode("ascii").split("\n")
573
+ except subprocess.TimeoutExpired:
574
+ logging.error("nvidia-smi command timed out after 5 seconds in get_single_gpu_with_sufficient_memory_for_action")
575
+ raise ValueError("Failed to get GPU information - nvidia-smi timed out")
576
+
546
577
  if len(memory_free_info) < 2:
547
578
  raise ValueError("No GPU information available from nvidia-smi")
548
579
  memory_free_values = [int(x.split()[0]) for x in memory_free_info[1:-1]]
@@ -150,20 +150,25 @@ class ResourcesTracker:
150
150
  if not has_gpu():
151
151
  return 0
152
152
  gpu_util = 0
153
- result = subprocess.run(
154
- ["nvidia-smi", "pmon", "-c", "1"],
155
- capture_output=True,
156
- text=True,
157
- check=True,
158
- )
159
- pmon_output = result.stdout.strip().split("\n")
160
- for line in pmon_output[2:]:
161
- parts = line.split()
162
- if len(parts) >= 8:
163
- pid = parts[1]
164
- gpu_usage = parts[3]
165
- if pid == str(container_pid):
166
- gpu_util += float(gpu_usage) if gpu_usage != "-" else 0
153
+ try:
154
+ result = subprocess.run(
155
+ ["nvidia-smi", "pmon", "-c", "1"],
156
+ capture_output=True,
157
+ text=True,
158
+ check=True,
159
+ timeout=5,
160
+ )
161
+ pmon_output = result.stdout.strip().split("\n")
162
+ for line in pmon_output[2:]:
163
+ parts = line.split()
164
+ if len(parts) >= 8:
165
+ pid = parts[1]
166
+ gpu_usage = parts[3]
167
+ if pid == str(container_pid):
168
+ gpu_util += float(gpu_usage) if gpu_usage != "-" else 0
169
+ except subprocess.TimeoutExpired:
170
+ logging.warning("nvidia-smi pmon command timed out after 5 seconds in get_container_gpu_usage")
171
+ return 0
167
172
  return gpu_util
168
173
 
169
174
  @log_errors(default_return=0, raise_exception=False, log_error=False)
@@ -185,19 +190,24 @@ class ResourcesTracker:
185
190
  "--format=csv,noheader,nounits",
186
191
  ]
187
192
  total_memory = 0
188
- result = subprocess.run(
189
- cmd,
190
- stdout=subprocess.PIPE,
191
- stderr=subprocess.PIPE,
192
- text=True,
193
- check=True,
194
- )
195
- for line in result.stdout.splitlines():
196
- parts = line.strip().split(", ")
197
- if len(parts) == 2:
198
- process_pid, used_memory = parts
199
- if process_pid == str(container_pid):
200
- total_memory += int(used_memory)
193
+ try:
194
+ result = subprocess.run(
195
+ cmd,
196
+ stdout=subprocess.PIPE,
197
+ stderr=subprocess.PIPE,
198
+ text=True,
199
+ check=True,
200
+ timeout=5,
201
+ )
202
+ for line in result.stdout.splitlines():
203
+ parts = line.strip().split(", ")
204
+ if len(parts) == 2:
205
+ process_pid, used_memory = parts
206
+ if process_pid == str(container_pid):
207
+ total_memory += int(used_memory)
208
+ except subprocess.TimeoutExpired:
209
+ logging.warning("nvidia-smi command timed out after 5 seconds in get_container_gpu_memory_usage")
210
+ return 0
201
211
  return total_memory
202
212
 
203
213
  @log_errors(default_return=(0, 0, 0, 0), raise_exception=False, log_error=True)
@@ -227,7 +237,12 @@ class ResourcesTracker:
227
237
  if not has_gpu():
228
238
  return gpu_memory_free, gpu_utilization
229
239
 
230
- subprocess.check_output("nvidia-smi")
240
+ try:
241
+ subprocess.check_output("nvidia-smi", timeout=5)
242
+ except subprocess.TimeoutExpired:
243
+ logging.warning("nvidia-smi command timed out after 5 seconds in _get_gpu_resources")
244
+ return 0, 0.0
245
+
231
246
  info_list = get_gpu_info()
232
247
  for info in info_list:
233
248
  info_split = info.split(", ")
@@ -247,10 +262,14 @@ class ActionsResourcesTracker:
247
262
  self.max_actions_usage = {}
248
263
  self.resources_tracker = ResourcesTracker()
249
264
  self.client = docker.from_env()
265
+ self.logged_stopped_containers = []
250
266
 
251
267
  @log_errors(raise_exception=False, log_error=True)
252
268
  def update_actions_resources(self) -> None:
253
- """Process both running and exited containers"""
269
+ """Process both running and exited containers.
270
+
271
+ Note: Does not remove containers to keep logs. Only tracks resource usage.
272
+ """
254
273
  exited_containers = self.client.containers.list(
255
274
  filters={"status": "exited"},
256
275
  all=True,
@@ -259,8 +278,12 @@ class ActionsResourcesTracker:
259
278
  if exited_containers:
260
279
  for container in exited_containers:
261
280
  try:
281
+ if container.id in self.logged_stopped_containers:
282
+ continue
262
283
  self._update_container_action_status(container, "completed")
263
- container.remove()
284
+ self.logged_stopped_containers.append(container.id)
285
+ # COMMENTED OUT: Do not remove containers to keep logs
286
+ # container.remove()
264
287
  except Exception as err:
265
288
  logging.error(
266
289
  "Error processing exited container %s: %s",
@@ -310,7 +333,7 @@ class ActionsResourcesTracker:
310
333
  args_24 = [arg for arg in remove_quotation_marks(inspect_data["Args"]) if len(arg) == 24 and "pypi" not in arg]
311
334
  action_record_id = args_24[-1] if args_24 else None
312
335
  if not action_record_id:
313
- logging.warning("No valid action_id found for the container. Container ID: %s, Args: %s", container.id, inspect_data["Args"])
336
+ logging.debug("No valid action_id found for the container. Container ID: %s, Args: %s", container.id, inspect_data["Args"])
314
337
  duration = calculate_time_difference(start_time, finish_time)
315
338
  (
316
339
  current_gpu_utilization,
@@ -320,6 +343,8 @@ class ActionsResourcesTracker:
320
343
  ) = self.get_current_action_usage(container, status)
321
344
  sub_containers = self.get_sub_containers_by_label("action_id", action_record_id)
322
345
  for sub_container in sub_containers:
346
+ if sub_container.id in self.logged_stopped_containers:
347
+ continue
323
348
  (
324
349
  sub_container_gpu_utilization,
325
350
  sub_container_gpu_memory,
@@ -330,10 +355,12 @@ class ActionsResourcesTracker:
330
355
  current_gpu_memory += sub_container_gpu_memory
331
356
  current_cpu_utilization += sub_container_cpu_utilization
332
357
  current_memory_utilization += sub_container_memory_utilization
358
+ # COMMENTED OUT: Do not stop/remove sub-containers to keep logs
333
359
  if status == "completed":
334
360
  try:
335
361
  sub_container.stop()
336
- sub_container.remove(force=True)
362
+ self.logged_stopped_containers.append(sub_container.id)
363
+ # sub_container.remove(force=True)
337
364
  except Exception as err:
338
365
  logging.error(
339
366
  "Error removing sub-container %s: %s",
@@ -9,6 +9,7 @@ import json
9
9
  import time
10
10
  import base64
11
11
 
12
+ # TODO: update /scaling to /compute
12
13
 
13
14
  class Scaling:
14
15
 
@@ -185,6 +186,8 @@ class Scaling:
185
186
  Returns:
186
187
  Tuple of (data, error, message) from API response
187
188
  """
189
+ if not action_record_id:
190
+ return None, "Action record id is required", "Action record id is required"
188
191
  logging.info(
189
192
  "Updating action status for action %s",
190
193
  action_record_id,
@@ -499,7 +502,7 @@ class Scaling:
499
502
 
500
503
  # Using REST API directly
501
504
  try:
502
- path = f"/v1/scaling/update_available_resources/{self.instance_id}"
505
+ path = f"/v1/compute/update_available_resources/{self.instance_id}"
503
506
  resp = self.rpc.put(path=path, payload=payload)
504
507
  return self.handle_response(
505
508
  resp,
@@ -644,7 +647,7 @@ class Scaling:
644
647
  Returns:
645
648
  Tuple of (data, error, message) from API response
646
649
  """
647
- path = f"/v1/scaling/get_models_secret_keys?secret_name={secret_name}"
650
+ path = f"/v1/compute/get_models_secret_keys?secret_name={secret_name}"
648
651
  resp = self.rpc.get(path=path)
649
652
  return self.handle_response(
650
653
  resp,
@@ -29,8 +29,8 @@ def setup_workspace_and_run_task(
29
29
  workspace_dir = f"{work_fs}/{action_id}"
30
30
  codebase_zip_path = f"{workspace_dir}/file.zip"
31
31
  requirements_txt_path = f"{workspace_dir}/requirements.txt"
32
- if os.path.exists(workspace_dir):
33
- return
32
+ # if os.path.exists(workspace_dir): # don't skip if workspace already exists, override it
33
+ # return
34
34
  os.makedirs(workspace_dir, exist_ok=True)
35
35
 
36
36
  # Download codebase ZIP file