matrice-compute 0.1.35__py3-none-any.whl → 0.1.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,7 +10,6 @@ import signal
10
10
  import urllib.request
11
11
  from matrice_compute.instance_utils import (
12
12
  get_gpu_with_sufficient_memory_for_action,
13
- get_gpu_config_for_deployment,
14
13
  get_decrypted_access_key_pair,
15
14
  get_max_file_system,
16
15
  get_best_service_ip_and_network,
@@ -27,10 +26,6 @@ from matrice_common.utils import log_errors
27
26
  class ActionInstance:
28
27
  """Base class for tasks that run in Action containers."""
29
28
 
30
- # Class-level dictionary to track deployed services and their ports
31
- # Key: _idService, Value: {"triton_ports": "port1,port2,port3", "is_first": False}
32
- _deployed_services = {}
33
-
34
29
  def __init__(self, scaling: Scaling, action_info: dict):
35
30
  """Initialize an action instance.
36
31
 
@@ -90,67 +85,6 @@ class ActionInstance:
90
85
  raise ValueError(f"Unknown action type: {self.action_type}")
91
86
  self.task = self.actions_map[self.action_type]
92
87
 
93
- @classmethod
94
- def is_first_deployment_for_service(cls, service_id):
95
- """Check if this is the first deployment for a given service.
96
-
97
- Args:
98
- service_id (str): Service ID (_idService)
99
-
100
- Returns:
101
- bool: True if this is the first deployment, False otherwise
102
- """
103
- if not service_id:
104
- return False
105
- return service_id not in cls._deployed_services
106
-
107
- @classmethod
108
- def get_or_create_triton_ports(cls, service_id, scaling_instance):
109
- """Get existing TRITON_PORTS for a service or create new ones.
110
-
111
- Args:
112
- service_id (str): Service ID (_idService)
113
- scaling_instance: Scaling instance to get open ports
114
-
115
- Returns:
116
- str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
117
- """
118
- if not service_id:
119
- # No service_id, generate new ports
120
- port1 = scaling_instance.get_open_port()
121
- port2 = scaling_instance.get_open_port()
122
- port3 = scaling_instance.get_open_port()
123
- return f"{port1},{port2},{port3}"
124
-
125
- # Check if ports already exist for this service
126
- if service_id in cls._deployed_services:
127
- triton_ports = cls._deployed_services[service_id]["triton_ports"]
128
- logging.info(
129
- "Reusing TRITON_PORTS for service %s: %s",
130
- service_id,
131
- triton_ports
132
- )
133
- return triton_ports
134
-
135
- # First deployment: generate new ports and store them
136
- port1 = scaling_instance.get_open_port()
137
- port2 = scaling_instance.get_open_port()
138
- port3 = scaling_instance.get_open_port()
139
- triton_ports = f"{port1},{port2},{port3}"
140
-
141
- # Store for future use
142
- cls._deployed_services[service_id] = {
143
- "triton_ports": triton_ports,
144
- "is_first": False
145
- }
146
-
147
- logging.info(
148
- "First deployment for service %s - generated TRITON_PORTS: %s",
149
- service_id,
150
- triton_ports
151
- )
152
- return triton_ports
153
-
154
88
  @log_errors(default_return={}, raise_exception=True, log_error=False)
155
89
  def _init_credentials(self):
156
90
  """Initialize Matrice credentials.
@@ -297,7 +231,7 @@ class ActionInstance:
297
231
  getattr(self, "action_record_id", "unknown"),
298
232
  )
299
233
  else:
300
- logging.info(
234
+ logging.debug(
301
235
  "No additional logs to send for action %s",
302
236
  getattr(self, "action_record_id", "unknown"),
303
237
  )
@@ -352,13 +286,13 @@ class ActionInstance:
352
286
  ).get("gpuMemory", 0)
353
287
 
354
288
  logging.info(
355
- "Action %s requires GPU with %d MB memory - selecting GPU(s) with most free memory",
289
+ "Action %s requires GPU with %d MB memory - selecting best-fit GPU(s)",
356
290
  action_id,
357
291
  required_memory
358
292
  )
359
293
 
360
294
  try:
361
- # Get the GPU(s) with most free memory that have sufficient memory
295
+ # Get the best-fit GPU(s) with sufficient memory
362
296
  gpu_indices = get_gpu_with_sufficient_memory_for_action(
363
297
  action_details=action_details
364
298
  )
@@ -412,7 +346,6 @@ class ActionInstance:
412
346
  destination_workspace_path: str = "/usr/src/workspace",
413
347
  docker_workdir: str = "",
414
348
  extra_pkgs: list = [],
415
- container_name: str = "",
416
349
  ):
417
350
  """Build base Docker command with common options.
418
351
 
@@ -427,7 +360,6 @@ class ActionInstance:
427
360
  destination_workspace_path (str): Container workspace path
428
361
  docker_workdir (str): Docker working directory
429
362
  extra_pkgs (list): List of extra packages to install
430
- container_name (str): Docker container name (format: {action_type}_{action_id})
431
363
  Returns:
432
364
  str: Base Docker command
433
365
  """
@@ -492,20 +424,17 @@ class ActionInstance:
492
424
  ]
493
425
  )
494
426
 
495
- # Build container name option if provided
496
- name_option = f"--name {container_name}" if container_name else ""
497
-
498
427
  # if the service provider is local, then put --restart unless-stopped
499
428
  if os.environ.get("SERVICE_PROVIDER") in ("local", "LOCAL"):
500
429
  env_exports += " && export DOCKER_RESTART_POLICY='--restart unless-stopped' "
501
430
 
502
431
  cmd_parts = [
503
- f"docker run -d {use_gpu} ",
504
- name_option,
432
+ f"docker run {use_gpu} ",
505
433
  network_config,
506
434
  *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
507
435
  *volumes,
508
436
  # Container configuration and startup commands
437
+ f"--cidfile ./{self.action_record_id}.cid ",
509
438
  f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
510
439
  f'/bin/bash -c "cd {docker_workdir} && '
511
440
  f"{env_exports} && "
@@ -893,34 +822,6 @@ class ActionInstance:
893
822
  job_params=action_details["jobParams"],
894
823
  )
895
824
 
896
- @staticmethod
897
- def container_exists(container_id: str) -> bool:
898
- """Check if a Docker container exists.
899
-
900
- Args:
901
- container_id (str): Container ID or name to check
902
-
903
- Returns:
904
- bool: True if container exists, False otherwise
905
- """
906
- if not container_id:
907
- return False
908
- try:
909
- result = subprocess.run(
910
- ["docker", "inspect", container_id],
911
- capture_output=True,
912
- text=True,
913
- timeout=10
914
- )
915
- return result.returncode == 0
916
- except Exception as e:
917
- logging.warning(
918
- "Error checking if container %s exists: %s",
919
- container_id,
920
- str(e)
921
- )
922
- return False
923
-
924
825
  @log_errors(raise_exception=True)
925
826
  def start_process(self, cmd, log_name):
926
827
  """Start the process and initialize logging.
@@ -935,54 +836,60 @@ class ActionInstance:
935
836
  self.cmd = cmd
936
837
  self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
937
838
 
938
- # Run docker with -d flag to get container ID from stdout
939
- process = subprocess.Popen(
940
- shlex.split(self.cmd),
941
- stdout=subprocess.PIPE,
942
- stderr=subprocess.PIPE,
943
- text=True,
944
- env={**os.environ},
945
- )
946
-
947
- # Use a longer timeout for docker run since --pull=always may need to
948
- # download large images on first run. Default: 30 minutes (1800 seconds)
949
- # Can be configured via DOCKER_START_TIMEOUT_SECONDS environment variable
950
- docker_start_timeout = int(os.environ.get("DOCKER_START_TIMEOUT_SECONDS", 1800))
951
- logging.info(
952
- "Waiting for docker container to start for action %s (timeout: %d seconds)",
953
- self.action_record_id,
954
- docker_start_timeout,
955
- )
956
- stdout, stderr = process.communicate(timeout=docker_start_timeout)
839
+ with open(self.log_path, "wb") as out:
840
+ self.process = subprocess.Popen(
841
+ shlex.split(self.cmd),
842
+ stdout=out,
843
+ stderr=out,
844
+ env={**os.environ},
845
+ start_new_session=True,
846
+ )
957
847
 
958
- if process.returncode != 0:
848
+ self.container_id = None
849
+
850
+ cid_file_path = f"./{self.action_record_id}.cid"
851
+ max_retries = 5
852
+ retry_delay = 1 # seconds
853
+ for attempt in range(max_retries):
854
+ try:
855
+ with open(cid_file_path, "r") as cid_file:
856
+ container_id = cid_file.read().strip()
857
+ self.container_id = container_id
858
+ logging.info(
859
+ "Started process for action %s with container ID: %s",
860
+ self.action_record_id,
861
+ self.container_id,
862
+ )
863
+ break
864
+ except FileNotFoundError:
865
+ logging.warning(
866
+ "CID file not found for action %s, attempt %d/%d",
867
+ self.action_record_id,
868
+ attempt + 1,
869
+ max_retries,
870
+ )
871
+ time.sleep(retry_delay)
872
+ except Exception as e:
873
+ logging.error(
874
+ "Error reading CID file for action %s: %s",
875
+ self.action_record_id,
876
+ str(e),
877
+ )
878
+ time.sleep(retry_delay)
879
+ else:
959
880
  logging.error(
960
- "Docker run failed for action %s: %s",
881
+ "Failed to read CID file for action %s after %d attempts",
961
882
  self.action_record_id,
962
- stderr,
883
+ max_retries,
963
884
  )
964
- raise RuntimeError(f"Docker run failed: {stderr}")
965
-
966
- self.container_id = stdout.strip()
967
- logging.info(
968
- "Started container for action %s with ID: %s",
969
- self.action_record_id,
970
- self.container_id,
971
- )
972
-
973
- # Start following container logs in background
974
- self.process = subprocess.Popen(
975
- ["docker", "logs", "-f", self.container_id],
976
- stdout=open(self.log_path, "wb"),
977
- stderr=subprocess.STDOUT,
978
- start_new_session=True,
979
- )
885
+ raise Exception("Failed to start process: CID file not found")
980
886
 
981
- # Report container id to scaling service
887
+ # report container id to scaling service
982
888
  self.scaling.update_action_container_id(
983
889
  action_record_id=self.action_record_id,
984
890
  container_id=self.container_id,
985
891
  )
892
+
986
893
 
987
894
  @log_errors(raise_exception=False)
988
895
  def start_logger(self):
@@ -1143,8 +1050,7 @@ def data_preparation_execute(
1143
1050
  "Started pulling Docker image with PID: %s",
1144
1051
  process.pid,
1145
1052
  )
1146
- container_name = f"data_prep_{self.action_record_id}"
1147
- cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
1053
+ cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
1148
1054
  logging.info("cmd is: %s", cmd)
1149
1055
  self.start(cmd, "data_preparation_log")
1150
1056
 
@@ -1173,8 +1079,7 @@ def data_processing_execute(self: ActionInstance):
1173
1079
  service="bg-job-scheduler",
1174
1080
  job_params=action["jobParams"],
1175
1081
  )
1176
- container_name = f"data_processing_{self.action_record_id}"
1177
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
1082
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
1178
1083
  logging.info("cmd: %s", cmd)
1179
1084
  self.start(cmd, "data_processing_log")
1180
1085
 
@@ -1187,8 +1092,7 @@ def data_split_execute(self: ActionInstance):
1187
1092
  if not action_details:
1188
1093
  return
1189
1094
  self.setup_action_requirements(action_details, work_fs, model_family="")
1190
- container_name = f"data_split_{self.action_record_id}"
1191
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
1095
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
1192
1096
  logging.info("cmd: %s", cmd)
1193
1097
  self.start(cmd, "data_split")
1194
1098
 
@@ -1203,8 +1107,7 @@ def dataset_annotation_execute(
1203
1107
  if not action_details:
1204
1108
  return
1205
1109
  self.setup_action_requirements(action_details, work_fs)
1206
- container_name = f"dataset_annotation_{self.action_record_id}"
1207
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
1110
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
1208
1111
  logging.info("cmd: %s", cmd)
1209
1112
  self.start(cmd, "dataset_annotation")
1210
1113
 
@@ -1219,8 +1122,7 @@ def dataset_augmentation_execute(
1219
1122
  if not action_details:
1220
1123
  return
1221
1124
  self.setup_action_requirements(action_details, work_fs)
1222
- container_name = f"dataset_augmentation_{self.action_record_id}"
1223
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
1125
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
1224
1126
  logging.info("cmd: %s", cmd)
1225
1127
  self.start(cmd, "dataset_augmentation")
1226
1128
 
@@ -1236,8 +1138,7 @@ def augmentation_server_creation_execute(
1236
1138
  if not action_details:
1237
1139
  return
1238
1140
  self.setup_action_requirements(action_details, work_fs)
1239
- container_name = f"augmentation_setup_{self.action_record_id}"
1240
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
1141
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
1241
1142
  logging.info("cmd: %s", cmd)
1242
1143
  self.start(cmd, "augmentation_setup")
1243
1144
 
@@ -1258,45 +1159,30 @@ def database_setup_execute(self: ActionInstance):
1258
1159
 
1259
1160
  project_id = action_details["_idProject"]
1260
1161
 
1261
- # Define container names with action_record_id for uniqueness
1262
- mongodb_container_name = f"database_setup_{self.action_record_id}"
1263
- qdrant_container_name = f"qdrant_{self.action_record_id}"
1162
+ if action_details["actionDetails"].get("containerId"):
1163
+ logging.info(
1164
+ "Using existing container ID for inference tracker: %s",
1165
+ action_details["actionDetails"]["containerId"],
1166
+ )
1167
+ self.docker_container = action_details["actionDetails"]["containerId"]
1168
+ cmd = "docker restart " + self.docker_container
1169
+ self.start(cmd, "qdrant_setup")
1264
1170
 
1265
- existing_container_id = action_details["actionDetails"].get("containerId")
1266
- if existing_container_id:
1267
- # Check if both containers actually exist before trying to restart
1268
- mongodb_container_exists = ActionInstance.container_exists(existing_container_id)
1269
- qdrant_container_exists = ActionInstance.container_exists(qdrant_container_name)
1171
+ #qdrant restart
1172
+ qdrant_cmd = "docker restart qdrant"
1173
+ self.start(qdrant_cmd, 'qdrant_setup')
1270
1174
 
1271
- if mongodb_container_exists and qdrant_container_exists:
1272
- logging.info(
1273
- "Using existing container ID for database setup: %s",
1274
- existing_container_id,
1275
- )
1276
- self.docker_container = existing_container_id
1277
- cmd = "docker restart " + self.docker_container
1278
- self.start(cmd, "database_setup")
1175
+ return
1176
+
1279
1177
 
1280
- # qdrant restart
1281
- qdrant_cmd = f"docker restart {qdrant_container_name}"
1282
- self.start(qdrant_cmd, "qdrant_setup")
1283
- return
1284
- else:
1285
- logging.warning(
1286
- "Container(s) not found (mongodb=%s, qdrant=%s). Creating new containers.",
1287
- mongodb_container_exists,
1288
- qdrant_container_exists
1289
- )
1290
- # Fall through to create new containers
1178
+ dbPath =action_details["jobParams"].get("dbPath","/host/data/path/mongodb_data")
1291
1179
 
1292
- dbPath = action_details["jobParams"].get("dbPath", "/host/data/path/mongodb_data")
1293
1180
 
1294
1181
  # MongoDB container with --net=host (Port: 27020:27017)
1295
1182
  cmd = (
1296
1183
  f"docker run --pull=always --net=host "
1297
- f"--name {mongodb_container_name} "
1298
- f"-v matrice_myvol:/matrice_data "
1299
1184
  f"-v {dbPath}:{dbPath} "
1185
+ f"--name database_setup_{self.action_record_id} "
1300
1186
  f"-v /var/run/docker.sock:/var/run/docker.sock "
1301
1187
  f"--cidfile ./{self.action_record_id}.cid "
1302
1188
  f"-e ACTION_RECORD_ID={self.action_record_id} "
@@ -1308,23 +1194,6 @@ def database_setup_execute(self: ActionInstance):
1308
1194
  )
1309
1195
  logging.info("Starting DB container (Port: 27020:27017): %s", cmd)
1310
1196
 
1311
- # Qdrant container with --net=host (Port: 6334)
1312
- qdrant_cmd = (
1313
- f"docker run -d --pull=always --net=host "
1314
- f"--name {qdrant_container_name} "
1315
- f"-v matrice_myvol:/matrice_data "
1316
- f"qdrant/qdrant:latest "
1317
- )
1318
- logging.info("Starting Qdrant container (Port: 6334): %s", qdrant_cmd)
1319
-
1320
- # Start Qdrant container
1321
- qdrant_process = subprocess.Popen(
1322
- qdrant_cmd,
1323
- shell=True,
1324
- stdout=subprocess.PIPE,
1325
- stderr=subprocess.PIPE,
1326
- )
1327
- logging.info("Qdrant container started successfully")
1328
1197
 
1329
1198
  # Docker Command run
1330
1199
  self.start(cmd, "database_setup")
@@ -1344,32 +1213,23 @@ def facial_recognition_setup_execute(self: ActionInstance):
1344
1213
 
1345
1214
  self.setup_action_requirements(action_details)
1346
1215
 
1347
- existing_container_id = action_details["actionDetails"].get("containerId")
1348
- if existing_container_id:
1349
- # Check if container actually exists before trying to restart
1350
- if ActionInstance.container_exists(existing_container_id):
1351
- logging.info(
1352
- "Using existing container ID for facial recognition worker: %s",
1353
- existing_container_id,
1354
- )
1355
- self.docker_container = existing_container_id
1356
- cmd = "docker restart " + self.docker_container
1357
- self.start(cmd, "facial_recognition_setup")
1358
- return
1359
- else:
1360
- logging.warning(
1361
- "Container %s not found. Creating new container.",
1362
- existing_container_id
1363
- )
1364
- # Fall through to create new container
1216
+ if action_details["actionDetails"].get("containerId"):
1217
+ logging.info(
1218
+ "Using existing container ID for facial recognition worker: %s",
1219
+ action_details["actionDetails"]["containerId"],
1220
+ )
1221
+ self.docker_container = action_details["actionDetails"]["containerId"]
1222
+ cmd = "docker restart " + self.docker_container
1223
+ self.start(cmd, "facial_recognition_setup")
1224
+ return
1365
1225
 
1366
1226
  # Facial recognition worker container with --net=host (Port: 8081)
1367
- container_name = f"facial_recognition_{self.action_record_id}"
1368
1227
  worker_cmd = (
1369
1228
  f"docker run -d --pull=always --net=host "
1370
- f"--name {container_name} "
1371
- f"--cidfile ./{self.action_record_id}.cid "
1229
+ f"--name worker "
1230
+ f"--cidfile ./{self.action_record_id}.cid "
1372
1231
  f"-v matrice_myvol:/matrice_data "
1232
+ f"--cidfile ./{self.action_record_id}.cid "
1373
1233
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1374
1234
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1375
1235
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1396,30 +1256,20 @@ def lpr_setup_execute(self: ActionInstance):
1396
1256
 
1397
1257
  self.setup_action_requirements(action_details)
1398
1258
 
1399
- existing_container_id = action_details["actionDetails"].get("containerId")
1400
- if existing_container_id:
1401
- # Check if container actually exists before trying to restart
1402
- if ActionInstance.container_exists(existing_container_id):
1403
- logging.info(
1404
- "Using existing container ID for LPR worker: %s",
1405
- existing_container_id,
1406
- )
1407
- self.docker_container = existing_container_id
1408
- cmd = "docker restart " + self.docker_container
1409
- self.start(cmd, "lpr_setup")
1410
- return
1411
- else:
1412
- logging.warning(
1413
- "Container %s not found. Creating new container.",
1414
- existing_container_id
1415
- )
1416
- # Fall through to create new container
1259
+ if action_details["actionDetails"].get("containerId"):
1260
+ logging.info(
1261
+ "Using existing container ID for LPR worker: %s",
1262
+ action_details["actionDetails"]["containerId"],
1263
+ )
1264
+ self.docker_container = action_details["actionDetails"]["containerId"]
1265
+ cmd = "docker restart " + self.docker_container
1266
+ self.start(cmd, "lpr_setup")
1267
+ return
1417
1268
 
1418
1269
  # LPR worker container with --net=host (Port: 8082)
1419
- container_name = f"lpr_{self.action_record_id}"
1420
1270
  worker_cmd = (
1421
1271
  f"docker run -d --net=host --pull=always "
1422
- f"--name {container_name} "
1272
+ f"--name lpr-worker "
1423
1273
  f"--cidfile ./{self.action_record_id}.cid "
1424
1274
  f"-v matrice_myvol:/matrice_data "
1425
1275
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
@@ -1458,30 +1308,20 @@ def inference_ws_server_execute(self: ActionInstance):
1458
1308
 
1459
1309
  logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
1460
1310
 
1461
- existing_container_id = action_details["actionDetails"].get("containerId")
1462
- if existing_container_id:
1463
- # Check if container actually exists before trying to restart
1464
- if ActionInstance.container_exists(existing_container_id):
1465
- logging.info(
1466
- "Using existing container ID for inference WebSocket server: %s",
1467
- existing_container_id,
1468
- )
1469
- self.docker_container = existing_container_id
1470
- cmd = "docker restart " + self.docker_container
1471
- self.start(cmd, "inference_ws_server")
1472
- return
1473
- else:
1474
- logging.warning(
1475
- "Container %s not found. Creating new container.",
1476
- existing_container_id
1477
- )
1478
- # Fall through to create new container
1311
+ if action_details["actionDetails"].get("containerId"):
1312
+ logging.info(
1313
+ "Using existing container ID for inference WebSocket server: %s",
1314
+ action_details["actionDetails"]["containerId"],
1315
+ )
1316
+ self.docker_container = action_details["actionDetails"]["containerId"]
1317
+ cmd = "docker restart " + self.docker_container
1318
+ self.start(cmd, "inference_ws_server")
1319
+ return
1479
1320
 
1480
1321
  # Inference WebSocket server with --net=host (Port: 8102)
1481
- container_name = f"inference_ws_{self.action_record_id}"
1482
1322
  worker_cmd = (
1483
1323
  f"docker run -d --pull=always --net=host "
1484
- f"--name {container_name} "
1324
+ f"--name inference "
1485
1325
  f"--cidfile ./{self.action_record_id}.cid "
1486
1326
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1487
1327
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1517,30 +1357,20 @@ def fe_fs_streaming_execute(self: ActionInstance):
1517
1357
 
1518
1358
  logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
1519
1359
 
1520
- existing_container_id = action_details["actionDetails"].get("containerId")
1521
- if existing_container_id:
1522
- # Check if container actually exists before trying to restart
1523
- if ActionInstance.container_exists(existing_container_id):
1524
- logging.info(
1525
- "Using existing container ID for frontend streaming: %s",
1526
- existing_container_id,
1527
- )
1528
- self.docker_container = existing_container_id
1529
- cmd = "docker restart " + self.docker_container
1530
- self.start(cmd, "fe_fs_streaming")
1531
- return
1532
- else:
1533
- logging.warning(
1534
- "Container %s not found. Creating new container.",
1535
- existing_container_id
1536
- )
1537
- # Fall through to create new container
1538
-
1360
+ if action_details["actionDetails"].get("containerId"):
1361
+ logging.info(
1362
+ "Using existing container ID for frontend streaming: %s",
1363
+ action_details["actionDetails"]["containerId"],
1364
+ )
1365
+ self.docker_container = action_details["actionDetails"]["containerId"]
1366
+ cmd = "docker restart " + self.docker_container
1367
+ self.start(cmd, "fe_fs_streaming")
1368
+ return
1369
+
1539
1370
  # Frontend streaming with --net=host (Port: 3000)
1540
- container_name = f"fe_streaming_{self.action_record_id}"
1541
1371
  worker_cmd = (
1542
1372
  f"docker run -d --pull=always --net=host "
1543
- f"--name {container_name} "
1373
+ f"--name fe_streaming "
1544
1374
  f"--cidfile ./{self.action_record_id}.cid "
1545
1375
  f"-v matrice_myvol:/matrice_data "
1546
1376
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
@@ -1573,30 +1403,20 @@ def fe_analytics_service_execute(self: ActionInstance):
1573
1403
 
1574
1404
  project_id = action_details["_idProject"]
1575
1405
 
1576
- existing_container_id = action_details["actionDetails"].get("containerId")
1577
- if existing_container_id:
1578
- # Check if container actually exists before trying to restart
1579
- if ActionInstance.container_exists(existing_container_id):
1580
- logging.info(
1581
- "Using existing container ID for frontend analytics service: %s",
1582
- existing_container_id,
1583
- )
1584
- self.docker_container = existing_container_id
1585
- cmd = "docker restart " + self.docker_container
1586
- self.start(cmd, "fe_analytics_service")
1587
- return
1588
- else:
1589
- logging.warning(
1590
- "Container %s not found. Creating new container.",
1591
- existing_container_id
1592
- )
1593
- # Fall through to create new container
1594
-
1406
+ if action_details["actionDetails"].get("containerId"):
1407
+ logging.info(
1408
+ "Using existing container ID for frontend analytics service: %s",
1409
+ action_details["actionDetails"]["containerId"],
1410
+ )
1411
+ self.docker_container = action_details["actionDetails"]["containerId"]
1412
+ cmd = "docker restart " + self.docker_container
1413
+ self.start(cmd, "fe_analytics_service")
1414
+ return
1415
+
1595
1416
  # Frontend analytics service with --net=host (Port: 3001)
1596
- container_name = f"fe_analytics_{self.action_record_id}"
1597
1417
  worker_cmd = (
1598
1418
  f"docker run -d --pull=always --net=host "
1599
- f"--name {container_name} "
1419
+ f"--name fe-analytics "
1600
1420
  f"--cidfile ./{self.action_record_id}.cid "
1601
1421
  f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
1602
1422
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1629,8 +1449,7 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
1629
1449
  else:
1630
1450
  return
1631
1451
  use_gpu = self.get_gpu_config(action_details)
1632
- container_name = f"dataset_generation_{self.action_record_id}"
1633
- cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
1452
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
1634
1453
  logging.info("cmd is: %s", cmd)
1635
1454
  self.start(cmd, "dataset_generation")
1636
1455
 
@@ -1651,8 +1470,7 @@ def synthetic_data_setup_execute(self: ActionInstance):
1651
1470
  else:
1652
1471
  return
1653
1472
  use_gpu = self.get_gpu_config(action_details)
1654
- container_name = f"synthetic_data_setup_{self.action_record_id}"
1655
- cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
1473
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
1656
1474
  logging.info("cmd is: %s", cmd)
1657
1475
  self.start(cmd, "synthetic_data_setup")
1658
1476
 
@@ -1689,60 +1507,31 @@ def redis_setup_execute(self: ActionInstance):
1689
1507
 
1690
1508
  redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
1691
1509
 
1692
- # Define container names with action_record_id for uniqueness
1693
- redis_container_name = f"redis_{self.action_record_id}"
1694
1510
 
1695
- existing_container_id = action_details["actionDetails"].get("containerId")
1696
- if existing_container_id:
1697
- # Check if both containers actually exist before trying to restart
1698
- management_container_exists = ActionInstance.container_exists(existing_container_id)
1699
- redis_container_exists = ActionInstance.container_exists(redis_container_name)
1700
-
1701
- if management_container_exists and redis_container_exists:
1702
- logging.info(
1703
- "Using existing container ID for redis management: %s",
1704
- existing_container_id,
1705
- )
1706
- self.docker_container = existing_container_id
1707
- cmd = "docker restart " + self.docker_container
1708
- self.start(cmd, "redis_setup")
1511
+ if action_details["actionDetails"].get("containerId"):
1512
+ logging.info(
1513
+ "Using existing container ID for redis management: %s",
1514
+ action_details["actionDetails"]["containerId"],
1515
+ )
1516
+ self.docker_container = action_details["actionDetails"]["containerId"]
1517
+ cmd = "docker restart " + self.docker_container
1518
+ self.start(cmd, "redis_setup")
1709
1519
 
1710
- # Redis container restart
1711
- redis_restart_cmd = f"docker restart {redis_container_name}"
1712
- self.start(redis_restart_cmd, "redis")
1713
- return
1714
- else:
1715
- logging.warning(
1716
- "Container(s) not found (management=%s, redis=%s). Creating new containers.",
1717
- management_container_exists,
1718
- redis_container_exists
1719
- )
1720
- # Fall through to create new containers
1520
+ # Redis container restart
1521
+ redis_restart_cmd = "docker restart redis_container"
1522
+ self.start(redis_restart_cmd, "redis")
1721
1523
 
1524
+ return
1525
+
1722
1526
  # Redis container with --net=host (Port: 6379)
1723
1527
  redis_cmd = (
1724
1528
  f"docker run -d --net=host "
1725
- f"--name {redis_container_name} "
1529
+ f"--name redis_container "
1726
1530
  f"--restart unless-stopped "
1727
1531
  f"{redis_image} "
1728
- f"redis-server --bind 0.0.0.0 "
1729
- f"--appendonly no "
1730
- f'--save "" '
1731
- f"--maxmemory 30gb "
1732
- f"--maxmemory-policy allkeys-lru "
1733
- f"--io-threads 4 "
1734
- f"--io-threads-do-reads yes "
1735
- f"--stream-node-max-bytes 8192 "
1736
- f"--stream-node-max-entries 1000 "
1737
- f"--hz 100 "
1738
- f"--tcp-backlog 2048 "
1739
- f"--timeout 0 "
1740
- f"--lazyfree-lazy-eviction yes "
1741
- f"--lazyfree-lazy-expire yes "
1742
- f"--lazyfree-lazy-server-del yes "
1743
- f"--activedefrag yes "
1744
- f"--requirepass {redis_password}"
1532
+ f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
1745
1533
  )
1534
+
1746
1535
  logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)
1747
1536
 
1748
1537
  # Start Redis container first
@@ -1792,8 +1581,7 @@ def deploy_aggregator_execute(
1792
1581
  if not action_details:
1793
1582
  return
1794
1583
  self.setup_action_requirements(action_details, work_fs)
1795
- container_name = f"deploy_aggregator_{self.action_record_id}"
1796
- cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1584
+ cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1797
1585
  logging.info("cmd: %s", cmd)
1798
1586
  self.start(cmd, "deploy_aggregator")
1799
1587
 
@@ -1809,10 +1597,6 @@ def model_deploy_execute(self: ActionInstance):
1809
1597
  return
1810
1598
  action_id = action_details["_id"]
1811
1599
  model_family = action_details["actionDetails"]["modelFamily"]
1812
-
1813
- # Get the service ID to track deployments
1814
- service_id = action_details.get("_idService")
1815
-
1816
1600
  self.setup_action_requirements(
1817
1601
  action_details,
1818
1602
  work_fs,
@@ -1820,29 +1604,17 @@ def model_deploy_execute(self: ActionInstance):
1820
1604
  action_id=action_id,
1821
1605
  )
1822
1606
 
1823
- # Check if this is the first deployment for this service
1824
- is_first_deployment = ActionInstance.is_first_deployment_for_service(service_id)
1825
-
1826
- # Get GPU configuration (uses utility function with fail-safe fallback)
1827
- use_gpu = get_gpu_config_for_deployment(action_details, is_first_deployment)
1828
-
1829
- logging.info(
1830
- "Action %s: Model deployment GPU config: %s (first_deployment=%s)",
1831
- action_id,
1832
- use_gpu if use_gpu else "CPU-only",
1833
- is_first_deployment
1834
- )
1835
-
1836
- # Get or create TRITON_PORTS (uses utility method)
1837
- triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
1607
+ # Get GPU configuration based on requirements and availability
1608
+ # This uses the best-fit algorithm to select the most appropriate GPU(s)
1609
+ use_gpu = self.get_gpu_config(action_details)
1838
1610
 
1839
- extra_env_vars = {
1840
- "INTERNAL_PORT": internal_port,
1841
- "TRITON_PORTS": triton_ports
1842
- }
1611
+ # Override: If GPU is required, use all available GPUs
1612
+ gpuRequired = action_details["actionDetails"].get("gpuRequired", False)
1613
+ if gpuRequired:
1614
+ use_gpu = "--runtime=nvidia --gpus all"
1843
1615
 
1844
- container_name = f"model_deploy_{self.action_record_id}"
1845
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"], container_name=container_name)} python3 deploy.py {self.action_record_id} {external_port}"'
1616
+ extra_env_vars = {"INTERNAL_PORT": internal_port}
1617
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
1846
1618
  logging.info("cmd is: %s", cmd)
1847
1619
  self.start(cmd, "deploy_log")
1848
1620
 
@@ -1865,27 +1637,17 @@ def model_train_execute(self: ActionInstance):
1865
1637
  action_id=action_id,
1866
1638
  )
1867
1639
 
1868
- existing_container_id = action_details["actionDetails"].get("containerId")
1869
- if existing_container_id:
1870
- # Check if container actually exists before trying to restart
1871
- if ActionInstance.container_exists(existing_container_id):
1872
- logging.info(
1873
- "Using existing container ID for training: %s",
1874
- existing_container_id,
1875
- )
1876
- self.docker_container = existing_container_id
1877
- cmd = "docker restart " + self.docker_container
1878
- self.start(cmd, "train_log")
1879
- return
1880
- else:
1881
- logging.warning(
1882
- "Container %s not found. Creating new container.",
1883
- existing_container_id
1884
- )
1885
- # Fall through to create new container
1886
-
1887
- container_name = f"model_train_{self.action_record_id}"
1888
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
1640
+ if action_details["actionDetails"].get("containerId"):
1641
+ logging.info(
1642
+ "Using existing container ID for training: %s",
1643
+ action_details["actionDetails"]["containerId"],
1644
+ )
1645
+ self.docker_container = action_details["actionDetails"]["containerId"]
1646
+ cmd = "docker restart " + self.docker_container
1647
+ self.start(cmd, "train_log")
1648
+ return
1649
+
1650
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
1889
1651
  logging.info("cmd is: %s", cmd)
1890
1652
  self.start(cmd, "train_log")
1891
1653
 
@@ -1906,27 +1668,17 @@ def model_eval_execute(self: ActionInstance):
1906
1668
  model_family=model_family,
1907
1669
  action_id=action_id,
1908
1670
  )
1909
- existing_container_id = action_details["actionDetails"].get("containerId")
1910
- if existing_container_id:
1911
- # Check if container actually exists before trying to restart
1912
- if ActionInstance.container_exists(existing_container_id):
1913
- logging.info(
1914
- "Using existing container ID for evaluation: %s",
1915
- existing_container_id,
1916
- )
1917
- self.docker_container = existing_container_id
1918
- cmd = "docker restart " + self.docker_container
1919
- self.start(cmd, "eval_log")
1920
- return
1921
- else:
1922
- logging.warning(
1923
- "Container %s not found. Creating new container.",
1924
- existing_container_id
1925
- )
1926
- # Fall through to create new container
1927
-
1928
- container_name = f"model_eval_{self.action_record_id}"
1929
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
1671
+ if action_details["actionDetails"].get("containerId"):
1672
+ logging.info(
1673
+ "Using existing container ID for training: %s",
1674
+ action_details["actionDetails"]["containerId"],
1675
+ )
1676
+ self.docker_container = action_details["actionDetails"]["containerId"]
1677
+ cmd = "docker restart " + self.docker_container
1678
+ self.start(cmd, "eval_log")
1679
+ return
1680
+
1681
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
1930
1682
  logging.info("cmd is: %s", cmd)
1931
1683
  self.start(cmd, "eval_log")
1932
1684
 
@@ -1950,27 +1702,17 @@ def model_export_execute(self: ActionInstance):
1950
1702
  model_family=model_family,
1951
1703
  action_id=action_id,
1952
1704
  )
1953
- existing_container_id = action_details["actionDetails"].get("containerId")
1954
- if existing_container_id:
1955
- # Check if container actually exists before trying to restart
1956
- if ActionInstance.container_exists(existing_container_id):
1957
- logging.info(
1958
- "Using existing container ID for export: %s",
1959
- existing_container_id,
1960
- )
1961
- self.docker_container = existing_container_id
1962
- cmd = "docker restart " + self.docker_container
1963
- self.start(cmd, "export_log")
1964
- return
1965
- else:
1966
- logging.warning(
1967
- "Container %s not found. Creating new container.",
1968
- existing_container_id
1969
- )
1970
- # Fall through to create new container
1971
-
1972
- container_name = f"model_export_{self.action_record_id}"
1973
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
1705
+ if action_details["actionDetails"].get("containerId"):
1706
+ logging.info(
1707
+ "Using existing container ID for training: %s",
1708
+ action_details["actionDetails"]["containerId"],
1709
+ )
1710
+ self.docker_container = action_details["actionDetails"]["containerId"]
1711
+ cmd = "docker restart " + self.docker_container
1712
+ self.start(cmd, "export_log")
1713
+ return
1714
+
1715
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
1974
1716
  logging.info("cmd is: %s", cmd)
1975
1717
  self.start(cmd, "export_log")
1976
1718
 
@@ -1986,8 +1728,7 @@ def image_build_execute(self: ActionInstance):
1986
1728
  action_id = action_details["_id"]
1987
1729
  internal_api_key = self.get_internal_api_key(action_id)
1988
1730
  extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
1989
- container_name = f"image_build_{self.action_record_id}"
1990
- cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
1731
+ cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
1991
1732
  logging.info("cmd is: %s", cmd)
1992
1733
  self.start(cmd, "image_build_log")
1993
1734
 
@@ -1999,8 +1740,7 @@ def resource_clone_execute(self: ActionInstance):
1999
1740
  if not action_details:
2000
1741
  return
2001
1742
  self.setup_action_requirements(action_details)
2002
- container_name = f"resource_clone_{self.action_record_id}"
2003
- cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
1743
+ cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
2004
1744
  logging.info("cmd is: %s", cmd)
2005
1745
  self.start(cmd, "resource_clone")
2006
1746
 
@@ -2016,27 +1756,17 @@ def streaming_gateway_execute(self: ActionInstance):
2016
1756
  self.docker_container = (
2017
1757
  f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
2018
1758
  )
2019
- existing_container_id = action_details["actionDetails"].get("containerId")
2020
- if existing_container_id:
2021
- # Check if container actually exists before trying to restart
2022
- if ActionInstance.container_exists(existing_container_id):
2023
- logging.info(
2024
- "Using existing container ID for streaming gateway: %s",
2025
- existing_container_id,
2026
- )
2027
- self.docker_container = existing_container_id
2028
- cmd = "docker restart " + self.docker_container
2029
- self.start(cmd, "streaming_gateway")
2030
- return
2031
- else:
2032
- logging.warning(
2033
- "Container %s not found. Creating new container.",
2034
- existing_container_id
2035
- )
2036
- # Fall through to create new container
2037
-
2038
- container_name = f"streaming_gateway_{self.action_record_id}"
2039
- cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
1759
+ if action_details["actionDetails"].get("containerId"):
1760
+ logging.info(
1761
+ "Using existing container ID for training: %s",
1762
+ action_details["actionDetails"]["containerId"],
1763
+ )
1764
+ self.docker_container = action_details["actionDetails"]["containerId"]
1765
+ cmd = "docker restart " + self.docker_container
1766
+ self.start(cmd, "streaming_gateway")
1767
+ return
1768
+
1769
+ cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
2040
1770
  logging.info("cmd is: %s", cmd)
2041
1771
  self.start(cmd, "streaming_gateway")
2042
1772
 
@@ -2130,24 +1860,16 @@ def kafka_setup_execute(self: ActionInstance):
2130
1860
  else:
2131
1861
  pkgs = f"matrice_common matrice"
2132
1862
 
2133
- existing_container_id = action_details["actionDetails"].get("containerId")
2134
- if existing_container_id:
2135
- # Check if container actually exists before trying to restart
2136
- if ActionInstance.container_exists(existing_container_id):
2137
- logging.info(
2138
- "Using existing container ID for kafka: %s",
2139
- existing_container_id,
2140
- )
2141
- self.docker_container = existing_container_id
2142
- cmd = "docker restart " + self.docker_container
2143
- self.start(cmd, "kafka_setup")
2144
- return
2145
- else:
2146
- logging.warning(
2147
- "Container %s not found. Creating new container.",
2148
- existing_container_id
2149
- )
2150
- # Fall through to create new container
1863
+ if action_details["actionDetails"].get("containerId"):
1864
+ logging.info(
1865
+ "Using existing container ID for training: %s",
1866
+ action_details["actionDetails"]["containerId"],
1867
+ )
1868
+ self.docker_container = action_details["actionDetails"]["containerId"]
1869
+ cmd = "docker restart " + self.docker_container
1870
+ self.start(cmd, "kafka_setup")
1871
+ return
1872
+
2151
1873
 
2152
1874
  # Kafka container with --net=host (Ports: 9092, 9093)
2153
1875
  cmd = (
@@ -2184,31 +1906,21 @@ def inference_tracker_setup_execute(self: ActionInstance):
2184
1906
 
2185
1907
  self.setup_action_requirements(action_details)
2186
1908
 
2187
- existing_container_id = action_details["actionDetails"].get("containerId")
2188
- if existing_container_id:
2189
- # Check if container actually exists before trying to restart
2190
- if ActionInstance.container_exists(existing_container_id):
2191
- logging.info(
2192
- "Using existing container ID for inference tracker: %s",
2193
- existing_container_id,
2194
- )
2195
- self.docker_container = existing_container_id
2196
- cmd = "docker restart " + self.docker_container
2197
- self.start(cmd, "inference_tracker_setup")
2198
- return
2199
- else:
2200
- logging.warning(
2201
- "Container %s not found. Creating new container.",
2202
- existing_container_id
2203
- )
2204
- # Fall through to create new container
2205
-
1909
+ if action_details["actionDetails"].get("containerId"):
1910
+ logging.info(
1911
+ "Using existing container ID for inference tracker: %s",
1912
+ action_details["actionDetails"]["containerId"],
1913
+ )
1914
+ self.docker_container = action_details["actionDetails"]["containerId"]
1915
+ cmd = "docker restart " + self.docker_container
1916
+ self.start(cmd, "inference_tracker_setup")
1917
+ return
1918
+
2206
1919
  # This is the existing Docker run command
2207
- container_name = f"inference_tracker_{self.action_record_id}"
2208
1920
  worker_cmd = (
2209
1921
  f"docker run -d --pull=always --net=host "
2210
- f"--cidfile ./{self.action_record_id}.cid "
2211
- f"--name {container_name} "
1922
+ f"--cidfile ./{self.action_record_id}.cid "
1923
+ f"--name inference-tracker-worker "
2212
1924
  f"-v matrice_myvol:/matrice_data "
2213
1925
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
2214
1926
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -2256,7 +1968,7 @@ def video_storage_setup_execute(self: ActionInstance):
2256
1968
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
2257
1969
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
2258
1970
  f'-e ACTION_ID="{self.action_record_id}" '
2259
- f'--restart=unless-stopped '
1971
+ f' --restart=unless-stopped '
2260
1972
  f"{image}"
2261
1973
  )
2262
1974
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.35
3
+ Version: 0.1.36
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -1,5 +1,5 @@
1
1
  matrice_compute/__init__.py,sha256=YZhx7rQlD1TAlhBMbsU3_Xp-tpLyTAxWZDcQvqmwR2g,723
2
- matrice_compute/action_instance.py,sha256=zVsW7-O3u5QoeZlh2D9Qlo5HsQNMOrUGbaR1jY1QFFg,88286
2
+ matrice_compute/action_instance.py,sha256=03TX2dF2i2DUtMJvFJFckzvIEPsuyaJuNk9mkHWjsLM,75901
3
3
  matrice_compute/actions_manager.py,sha256=a_TulMnu462xc0t_A-Mpug5zhQTmtpjiv7mhiC_IAVw,18280
4
4
  matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
5
5
  matrice_compute/compute_operations_handler.py,sha256=amcMhmXtv2irE6qK8Vbgec_8uFqjWmVVp0VWq-73_MU,17781
@@ -11,8 +11,8 @@ matrice_compute/resources_tracker.py,sha256=DffKitGU1gran0OAuKIsfH0XeOe03xU7NGl-
11
11
  matrice_compute/scaling.py,sha256=UQDI8wN9JEKafvUVPF0Pk9XmhKlbMkeu16AZyyOuSE8,55147
12
12
  matrice_compute/shutdown_manager.py,sha256=rnP9Qes6JJKDnebmBC9rqkH__X9a8TMjhWQPWoOQKFs,13232
13
13
  matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
14
- matrice_compute-0.1.35.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
15
- matrice_compute-0.1.35.dist-info/METADATA,sha256=iP3k-FJhne2q4kbSSZx6IcTX1MPDr5upMcFeG0I_z6g,1038
16
- matrice_compute-0.1.35.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
17
- matrice_compute-0.1.35.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
18
- matrice_compute-0.1.35.dist-info/RECORD,,
14
+ matrice_compute-0.1.36.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
15
+ matrice_compute-0.1.36.dist-info/METADATA,sha256=S3V1TndESfRIbXxF4M6CMxoqNtVZ3uvnoa7WkzcSNxI,1038
16
+ matrice_compute-0.1.36.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
17
+ matrice_compute-0.1.36.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
18
+ matrice_compute-0.1.36.dist-info/RECORD,,