matrice-compute 0.1.35__tar.gz → 0.1.37__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/PKG-INFO +1 -1
  2. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/matrice_compute.egg-info/PKG-INFO +1 -1
  3. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/src/matrice_compute/action_instance.py +225 -511
  4. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/LICENSE.txt +0 -0
  5. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/README.md +0 -0
  6. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/matrice_compute.egg-info/SOURCES.txt +0 -0
  7. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/matrice_compute.egg-info/dependency_links.txt +0 -0
  8. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/matrice_compute.egg-info/not-zip-safe +0 -0
  9. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/matrice_compute.egg-info/top_level.txt +0 -0
  10. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/pyproject.toml +0 -0
  11. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/setup.cfg +0 -0
  12. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/setup.py +0 -0
  13. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/src/matrice_compute/__init__.py +0 -0
  14. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/src/matrice_compute/actions_manager.py +0 -0
  15. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/src/matrice_compute/actions_scaledown_manager.py +0 -0
  16. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/src/matrice_compute/compute_operations_handler.py +0 -0
  17. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/src/matrice_compute/instance_manager.py +0 -0
  18. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/src/matrice_compute/instance_utils.py +0 -0
  19. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/src/matrice_compute/prechecks.py +0 -0
  20. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/src/matrice_compute/py.typed +0 -0
  21. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/src/matrice_compute/resources_tracker.py +0 -0
  22. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/src/matrice_compute/scaling.py +0 -0
  23. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/src/matrice_compute/shutdown_manager.py +0 -0
  24. {matrice_compute-0.1.35 → matrice_compute-0.1.37}/src/matrice_compute/task_utils.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: matrice_compute
- Version: 0.1.35
+ Version: 0.1.37
  Summary: Common server utilities for Matrice.ai services
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
  License-Expression: MIT
matrice_compute.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: matrice_compute
- Version: 0.1.35
+ Version: 0.1.37
  Summary: Common server utilities for Matrice.ai services
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
  License-Expression: MIT
src/matrice_compute/action_instance.py
@@ -10,7 +10,6 @@ import signal
  import urllib.request
  from matrice_compute.instance_utils import (
  get_gpu_with_sufficient_memory_for_action,
- get_gpu_config_for_deployment,
  get_decrypted_access_key_pair,
  get_max_file_system,
  get_best_service_ip_and_network,
@@ -27,10 +26,6 @@ from matrice_common.utils import log_errors
  class ActionInstance:
  """Base class for tasks that run in Action containers."""

- # Class-level dictionary to track deployed services and their ports
- # Key: _idService, Value: {"triton_ports": "port1,port2,port3", "is_first": False}
- _deployed_services = {}
-
  def __init__(self, scaling: Scaling, action_info: dict):
  """Initialize an action instance.

@@ -90,67 +85,6 @@ class ActionInstance:
  raise ValueError(f"Unknown action type: {self.action_type}")
  self.task = self.actions_map[self.action_type]

- @classmethod
- def is_first_deployment_for_service(cls, service_id):
- """Check if this is the first deployment for a given service.
-
- Args:
- service_id (str): Service ID (_idService)
-
- Returns:
- bool: True if this is the first deployment, False otherwise
- """
- if not service_id:
- return False
- return service_id not in cls._deployed_services
-
- @classmethod
- def get_or_create_triton_ports(cls, service_id, scaling_instance):
- """Get existing TRITON_PORTS for a service or create new ones.
-
- Args:
- service_id (str): Service ID (_idService)
- scaling_instance: Scaling instance to get open ports
-
- Returns:
- str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
- """
- if not service_id:
- # No service_id, generate new ports
- port1 = scaling_instance.get_open_port()
- port2 = scaling_instance.get_open_port()
- port3 = scaling_instance.get_open_port()
- return f"{port1},{port2},{port3}"
-
- # Check if ports already exist for this service
- if service_id in cls._deployed_services:
- triton_ports = cls._deployed_services[service_id]["triton_ports"]
- logging.info(
- "Reusing TRITON_PORTS for service %s: %s",
- service_id,
- triton_ports
- )
- return triton_ports
-
- # First deployment: generate new ports and store them
- port1 = scaling_instance.get_open_port()
- port2 = scaling_instance.get_open_port()
- port3 = scaling_instance.get_open_port()
- triton_ports = f"{port1},{port2},{port3}"
-
- # Store for future use
- cls._deployed_services[service_id] = {
- "triton_ports": triton_ports,
- "is_first": False
- }
-
- logging.info(
- "First deployment for service %s - generated TRITON_PORTS: %s",
- service_id,
- triton_ports
- )
- return triton_ports
-
  @log_errors(default_return={}, raise_exception=True, log_error=False)
  def _init_credentials(self):
  """Initialize Matrice credentials.
@@ -297,7 +231,7 @@ class ActionInstance:
  getattr(self, "action_record_id", "unknown"),
  )
  else:
- logging.info(
+ logging.debug(
  "No additional logs to send for action %s",
  getattr(self, "action_record_id", "unknown"),
  )
@@ -352,13 +286,13 @@ class ActionInstance:
  ).get("gpuMemory", 0)

  logging.info(
- "Action %s requires GPU with %d MB memory - selecting GPU(s) with most free memory",
+ "Action %s requires GPU with %d MB memory - selecting best-fit GPU(s)",
  action_id,
  required_memory
  )

  try:
- # Get the GPU(s) with most free memory that have sufficient memory
+ # Get the best-fit GPU(s) with sufficient memory
  gpu_indices = get_gpu_with_sufficient_memory_for_action(
  action_details=action_details
  )
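The wording change above tracks a selection-policy change in `get_gpu_with_sufficient_memory_for_action`: rather than the GPU(s) with the most free memory, the utility now picks a best fit. The utility itself lives in `matrice_compute/instance_utils.py` and is not shown in this diff, so the following is an illustrative sketch only of what a best-fit picker typically looks like:

```python
# Illustrative sketch only -- the real selection logic is inside
# matrice_compute.instance_utils and is not part of this diff.
# "Best fit" = the GPU whose free memory exceeds the requirement by the
# smallest margin, leaving larger GPUs free for larger actions.

def pick_best_fit_gpu(free_mb_by_index: dict, required_mb: int):
    """Return the index of the smallest GPU that still satisfies required_mb.

    free_mb_by_index is a hypothetical input; a real implementation would
    query NVML or parse `nvidia-smi` to build it.
    """
    candidates = {i: free for i, free in free_mb_by_index.items() if free >= required_mb}
    if not candidates:
        return None  # no single GPU is large enough
    return min(candidates, key=candidates.get)

# A 10,000 MB request picks GPU 2 (12 GB free), not GPU 0 (24 GB free).
print(pick_best_fit_gpu({0: 24000, 1: 8000, 2: 12000}, 10000))  # -> 2
```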
@@ -412,7 +346,6 @@
  destination_workspace_path: str = "/usr/src/workspace",
  docker_workdir: str = "",
  extra_pkgs: list = [],
- container_name: str = "",
  ):
  """Build base Docker command with common options.

@@ -427,7 +360,6 @@
  destination_workspace_path (str): Container workspace path
  docker_workdir (str): Docker working directory
  extra_pkgs (list): List of extra packages to install
- container_name (str): Docker container name (format: {action_type}_{action_id})
  Returns:
  str: Base Docker command
  """
@@ -492,20 +424,19 @@
  ]
  )

- # Build container name option if provided
- name_option = f"--name {container_name}" if container_name else ""
-
  # if the service provider is local, then put --restart unless-stopped
  if os.environ.get("SERVICE_PROVIDER") in ("local", "LOCAL"):
- env_exports += " && export DOCKER_RESTART_POLICY='--restart unless-stopped' "
+ use_restart_policy = "--restart unless-stopped"
+ else:
+ use_restart_policy = ""

  cmd_parts = [
- f"docker run -d {use_gpu} ",
- name_option,
+ f"docker run {use_gpu} {use_restart_policy} ",
  network_config,
  *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
  *volumes,
  # Container configuration and startup commands
+ f"--cidfile ./{self.action_record_id}.cid ",
  f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
  f'/bin/bash -c "cd {docker_workdir} && '
  f"{env_exports} && "
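Two things change in the generated command: the `-d` flag is gone, so the `docker run` client now stays in the foreground and its output becomes the action log, and `--cidfile` makes Docker write the new container's ID to `./{action_record_id}.cid`, which is how the rewritten `start_process` (further down) recovers the ID. A minimal, self-contained sketch of that handshake, where the image name and paths are placeholders rather than values from the package:

```python
# Minimal sketch of the --cidfile handshake against a local Docker daemon.
# "alpine" and the paths are placeholders. Note that `docker run` refuses
# to start if the CID file already exists, so stale files must be removed.
import os
import subprocess
import time

cid_path = "./example.cid"
if os.path.exists(cid_path):
    os.remove(cid_path)

proc = subprocess.Popen(
    ["docker", "run", "--cidfile", cid_path, "alpine", "sleep", "5"],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
)

container_id = None
for _ in range(5):  # Docker writes the file as soon as the container is created
    try:
        with open(cid_path) as f:
            container_id = f.read().strip()
        break
    except FileNotFoundError:
        time.sleep(1)

print("container id:", container_id)
proc.wait()
```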
@@ -893,34 +824,6 @@ class ActionInstance:
  job_params=action_details["jobParams"],
  )

- @staticmethod
- def container_exists(container_id: str) -> bool:
- """Check if a Docker container exists.
-
- Args:
- container_id (str): Container ID or name to check
-
- Returns:
- bool: True if container exists, False otherwise
- """
- if not container_id:
- return False
- try:
- result = subprocess.run(
- ["docker", "inspect", container_id],
- capture_output=True,
- text=True,
- timeout=10
- )
- return result.returncode == 0
- except Exception as e:
- logging.warning(
- "Error checking if container %s exists: %s",
- container_id,
- str(e)
- )
- return False
-
  @log_errors(raise_exception=True)
  def start_process(self, cmd, log_name):
  """Start the process and initialize logging.
@@ -935,54 +838,60 @@
  self.cmd = cmd
  self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"

- # Run docker with -d flag to get container ID from stdout
- process = subprocess.Popen(
- shlex.split(self.cmd),
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- text=True,
- env={**os.environ},
- )
-
- # Use a longer timeout for docker run since --pull=always may need to
- # download large images on first run. Default: 30 minutes (1800 seconds)
- # Can be configured via DOCKER_START_TIMEOUT_SECONDS environment variable
- docker_start_timeout = int(os.environ.get("DOCKER_START_TIMEOUT_SECONDS", 1800))
- logging.info(
- "Waiting for docker container to start for action %s (timeout: %d seconds)",
- self.action_record_id,
- docker_start_timeout,
- )
- stdout, stderr = process.communicate(timeout=docker_start_timeout)
+ with open(self.log_path, "wb") as out:
+ self.process = subprocess.Popen(
+ shlex.split(self.cmd),
+ stdout=out,
+ stderr=out,
+ env={**os.environ},
+ start_new_session=True,
+ )

- if process.returncode != 0:
+ self.container_id = None
+
+ cid_file_path = f"./{self.action_record_id}.cid"
+ max_retries = 5
+ retry_delay = 1 # seconds
+ for attempt in range(max_retries):
+ try:
+ with open(cid_file_path, "r") as cid_file:
+ container_id = cid_file.read().strip()
+ self.container_id = container_id
+ logging.info(
+ "Started process for action %s with container ID: %s",
+ self.action_record_id,
+ self.container_id,
+ )
+ break
+ except FileNotFoundError:
+ logging.warning(
+ "CID file not found for action %s, attempt %d/%d",
+ self.action_record_id,
+ attempt + 1,
+ max_retries,
+ )
+ time.sleep(retry_delay)
+ except Exception as e:
+ logging.error(
+ "Error reading CID file for action %s: %s",
+ self.action_record_id,
+ str(e),
+ )
+ time.sleep(retry_delay)
+ else:
  logging.error(
- "Docker run failed for action %s: %s",
+ "Failed to read CID file for action %s after %d attempts",
  self.action_record_id,
- stderr,
+ max_retries,
  )
- raise RuntimeError(f"Docker run failed: {stderr}")
-
- self.container_id = stdout.strip()
- logging.info(
- "Started container for action %s with ID: %s",
- self.action_record_id,
- self.container_id,
- )
-
- # Start following container logs in background
- self.process = subprocess.Popen(
- ["docker", "logs", "-f", self.container_id],
- stdout=open(self.log_path, "wb"),
- stderr=subprocess.STDOUT,
- start_new_session=True,
- )
+ raise Exception("Failed to start process: CID file not found")

- # Report container id to scaling service
+ # report container id to scaling service
  self.scaling.update_action_container_id(
  action_record_id=self.action_record_id,
  container_id=self.container_id,
  )
+

  @log_errors(raise_exception=False)
  def start_logger(self):
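The new `start_process` gives the CID file a fixed budget of five one-second attempts. Since the command also carries `--pull=always`, a first run that must download a large image can plausibly outlive that window. A deadline-based variant of the same wait, sketched purely as an illustration of one way to loosen the budget; the function name and defaults are hypothetical and nothing here is part of matrice_compute:

```python
# Hedged sketch: a deadline-based CID-file wait. The package itself uses a
# fixed 5 x 1 s retry loop; this is only an illustrative alternative.
import os
import time

def wait_for_cid(cid_path: str, timeout_s: float = 60.0, poll_s: float = 1.0) -> str:
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        if os.path.exists(cid_path):
            with open(cid_path) as f:
                cid = f.read().strip()
            if cid:  # the file can exist briefly before the ID is flushed
                return cid
        time.sleep(poll_s)
    raise TimeoutError(f"no container ID in {cid_path} after {timeout_s}s")
```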
@@ -1143,8 +1052,7 @@ def data_preparation_execute(
  "Started pulling Docker image with PID: %s",
  process.pid,
  )
- container_name = f"data_prep_{self.action_record_id}"
- cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
+ cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
  logging.info("cmd is: %s", cmd)
  self.start(cmd, "data_preparation_log")

@@ -1173,8 +1081,7 @@ def data_processing_execute(self: ActionInstance):
  service="bg-job-scheduler",
  job_params=action["jobParams"],
  )
- container_name = f"data_processing_{self.action_record_id}"
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
  logging.info("cmd: %s", cmd)
  self.start(cmd, "data_processing_log")

@@ -1187,8 +1094,7 @@ def data_split_execute(self: ActionInstance):
  if not action_details:
  return
  self.setup_action_requirements(action_details, work_fs, model_family="")
- container_name = f"data_split_{self.action_record_id}"
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
  logging.info("cmd: %s", cmd)
  self.start(cmd, "data_split")

@@ -1203,8 +1109,7 @@ def dataset_annotation_execute(
  if not action_details:
  return
  self.setup_action_requirements(action_details, work_fs)
- container_name = f"dataset_annotation_{self.action_record_id}"
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
  logging.info("cmd: %s", cmd)
  self.start(cmd, "dataset_annotation")

@@ -1219,8 +1124,7 @@ def dataset_augmentation_execute(
  if not action_details:
  return
  self.setup_action_requirements(action_details, work_fs)
- container_name = f"dataset_augmentation_{self.action_record_id}"
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
  logging.info("cmd: %s", cmd)
  self.start(cmd, "dataset_augmentation")

@@ -1236,8 +1140,7 @@ def augmentation_server_creation_execute(
  if not action_details:
  return
  self.setup_action_requirements(action_details, work_fs)
- container_name = f"augmentation_setup_{self.action_record_id}"
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
  logging.info("cmd: %s", cmd)
  self.start(cmd, "augmentation_setup")

@@ -1258,45 +1161,30 @@ def database_setup_execute(self: ActionInstance):

  project_id = action_details["_idProject"]

- # Define container names with action_record_id for uniqueness
- mongodb_container_name = f"database_setup_{self.action_record_id}"
- qdrant_container_name = f"qdrant_{self.action_record_id}"
+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for inference tracker: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "qdrant_setup")

- existing_container_id = action_details["actionDetails"].get("containerId")
- if existing_container_id:
- # Check if both containers actually exist before trying to restart
- mongodb_container_exists = ActionInstance.container_exists(existing_container_id)
- qdrant_container_exists = ActionInstance.container_exists(qdrant_container_name)
+ #qdrant restart
+ qdrant_cmd = "docker restart qdrant"
+ self.start(qdrant_cmd, 'qdrant_setup')

- if mongodb_container_exists and qdrant_container_exists:
- logging.info(
- "Using existing container ID for database setup: %s",
- existing_container_id,
- )
- self.docker_container = existing_container_id
- cmd = "docker restart " + self.docker_container
- self.start(cmd, "database_setup")
+ return
+

- # qdrant restart
- qdrant_cmd = f"docker restart {qdrant_container_name}"
- self.start(qdrant_cmd, "qdrant_setup")
- return
- else:
- logging.warning(
- "Container(s) not found (mongodb=%s, qdrant=%s). Creating new containers.",
- mongodb_container_exists,
- qdrant_container_exists
- )
- # Fall through to create new containers
+ dbPath =action_details["jobParams"].get("dbPath","/host/data/path/mongodb_data")

- dbPath = action_details["jobParams"].get("dbPath", "/host/data/path/mongodb_data")

  # MongoDB container with --net=host (Port: 27020:27017)
  cmd = (
  f"docker run --pull=always --net=host "
- f"--name {mongodb_container_name} "
- f"-v matrice_myvol:/matrice_data "
  f"-v {dbPath}:{dbPath} "
+ f"--name database_setup_{self.action_record_id} "
  f"-v /var/run/docker.sock:/var/run/docker.sock "
  f"--cidfile ./{self.action_record_id}.cid "
  f"-e ACTION_RECORD_ID={self.action_record_id} "
@@ -1308,23 +1196,6 @@ def database_setup_execute(self: ActionInstance):
  )
  logging.info("Starting DB container (Port: 27020:27017): %s", cmd)

- # Qdrant container with --net=host (Port: 6334)
- qdrant_cmd = (
- f"docker run -d --pull=always --net=host "
- f"--name {qdrant_container_name} "
- f"-v matrice_myvol:/matrice_data "
- f"qdrant/qdrant:latest "
- )
- logging.info("Starting Qdrant container (Port: 6334): %s", qdrant_cmd)
-
- # Start Qdrant container
- qdrant_process = subprocess.Popen(
- qdrant_cmd,
- shell=True,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- )
- logging.info("Qdrant container started successfully")

  # Docker Command run
  self.start(cmd, "database_setup")
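The rewritten branch restarts the recorded `containerId` and a fixed `qdrant` container name without the existence checks that `container_exists` used to provide, so a pruned container now surfaces as a failed `docker restart` rather than a fresh deployment. For reference, the dropped check reduced to the exit code of `docker inspect`; a minimal standalone sketch of a guarded restart built on the same idea, none of which is part of 0.1.37:

```python
# Hedged sketch: guard `docker restart` with the `docker inspect` exit-code
# check that 0.1.35's ActionInstance.container_exists performed.
import logging
import subprocess

def container_exists(container_id: str) -> bool:
    if not container_id:
        return False
    result = subprocess.run(
        ["docker", "inspect", container_id],
        capture_output=True,
        timeout=10,
    )
    return result.returncode == 0  # non-zero: the daemon does not know this ID

def guarded_restart(name_or_id: str) -> bool:
    if not container_exists(name_or_id):
        logging.warning("container %s not found; a fresh deploy is needed", name_or_id)
        return False
    return subprocess.run(["docker", "restart", name_or_id]).returncode == 0
```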
@@ -1344,32 +1215,23 @@ def facial_recognition_setup_execute(self: ActionInstance):

  self.setup_action_requirements(action_details)

- existing_container_id = action_details["actionDetails"].get("containerId")
- if existing_container_id:
- # Check if container actually exists before trying to restart
- if ActionInstance.container_exists(existing_container_id):
- logging.info(
- "Using existing container ID for facial recognition worker: %s",
- existing_container_id,
- )
- self.docker_container = existing_container_id
- cmd = "docker restart " + self.docker_container
- self.start(cmd, "facial_recognition_setup")
- return
- else:
- logging.warning(
- "Container %s not found. Creating new container.",
- existing_container_id
- )
- # Fall through to create new container
+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for facial recognition worker: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "facial_recognition_setup")
+ return

  # Facial recognition worker container with --net=host (Port: 8081)
- container_name = f"facial_recognition_{self.action_record_id}"
  worker_cmd = (
  f"docker run -d --pull=always --net=host "
- f"--name {container_name} "
- f"--cidfile ./{self.action_record_id}.cid "
+ f"--name worker "
+ f"--cidfile ./{self.action_record_id}.cid "
  f"-v matrice_myvol:/matrice_data "
+ f"--cidfile ./{self.action_record_id}.cid "
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1396,30 +1258,20 @@ def lpr_setup_execute(self: ActionInstance):

  self.setup_action_requirements(action_details)

- existing_container_id = action_details["actionDetails"].get("containerId")
- if existing_container_id:
- # Check if container actually exists before trying to restart
- if ActionInstance.container_exists(existing_container_id):
- logging.info(
- "Using existing container ID for LPR worker: %s",
- existing_container_id,
- )
- self.docker_container = existing_container_id
- cmd = "docker restart " + self.docker_container
- self.start(cmd, "lpr_setup")
- return
- else:
- logging.warning(
- "Container %s not found. Creating new container.",
- existing_container_id
- )
- # Fall through to create new container
+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for LPR worker: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "lpr_setup")
+ return

  # LPR worker container with --net=host (Port: 8082)
- container_name = f"lpr_{self.action_record_id}"
  worker_cmd = (
  f"docker run -d --net=host --pull=always "
- f"--name {container_name} "
+ f"--name lpr-worker "
  f"--cidfile ./{self.action_record_id}.cid "
  f"-v matrice_myvol:/matrice_data "
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
@@ -1458,30 +1310,20 @@ def inference_ws_server_execute(self: ActionInstance):

  logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")

- existing_container_id = action_details["actionDetails"].get("containerId")
- if existing_container_id:
- # Check if container actually exists before trying to restart
- if ActionInstance.container_exists(existing_container_id):
- logging.info(
- "Using existing container ID for inference WebSocket server: %s",
- existing_container_id,
- )
- self.docker_container = existing_container_id
- cmd = "docker restart " + self.docker_container
- self.start(cmd, "inference_ws_server")
- return
- else:
- logging.warning(
- "Container %s not found. Creating new container.",
- existing_container_id
- )
- # Fall through to create new container
+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for inference WebSocket server: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "inference_ws_server")
+ return

  # Inference WebSocket server with --net=host (Port: 8102)
- container_name = f"inference_ws_{self.action_record_id}"
  worker_cmd = (
  f"docker run -d --pull=always --net=host "
- f"--name {container_name} "
+ f"--name inference "
  f"--cidfile ./{self.action_record_id}.cid "
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1517
1359
 
1518
1360
  logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
1519
1361
 
1520
- existing_container_id = action_details["actionDetails"].get("containerId")
1521
- if existing_container_id:
1522
- # Check if container actually exists before trying to restart
1523
- if ActionInstance.container_exists(existing_container_id):
1524
- logging.info(
1525
- "Using existing container ID for frontend streaming: %s",
1526
- existing_container_id,
1527
- )
1528
- self.docker_container = existing_container_id
1529
- cmd = "docker restart " + self.docker_container
1530
- self.start(cmd, "fe_fs_streaming")
1531
- return
1532
- else:
1533
- logging.warning(
1534
- "Container %s not found. Creating new container.",
1535
- existing_container_id
1536
- )
1537
- # Fall through to create new container
1538
-
1362
+ if action_details["actionDetails"].get("containerId"):
1363
+ logging.info(
1364
+ "Using existing container ID for frontend streaming: %s",
1365
+ action_details["actionDetails"]["containerId"],
1366
+ )
1367
+ self.docker_container = action_details["actionDetails"]["containerId"]
1368
+ cmd = "docker restart " + self.docker_container
1369
+ self.start(cmd, "fe_fs_streaming")
1370
+ return
1371
+
1539
1372
  # Frontend streaming with --net=host (Port: 3000)
1540
- container_name = f"fe_streaming_{self.action_record_id}"
1541
1373
  worker_cmd = (
1542
1374
  f"docker run -d --pull=always --net=host "
1543
- f"--name {container_name} "
1375
+ f"--name fe_streaming "
1544
1376
  f"--cidfile ./{self.action_record_id}.cid "
1545
1377
  f"-v matrice_myvol:/matrice_data "
1546
1378
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
@@ -1573,30 +1405,20 @@ def fe_analytics_service_execute(self: ActionInstance):
1573
1405
 
1574
1406
  project_id = action_details["_idProject"]
1575
1407
 
1576
- existing_container_id = action_details["actionDetails"].get("containerId")
1577
- if existing_container_id:
1578
- # Check if container actually exists before trying to restart
1579
- if ActionInstance.container_exists(existing_container_id):
1580
- logging.info(
1581
- "Using existing container ID for frontend analytics service: %s",
1582
- existing_container_id,
1583
- )
1584
- self.docker_container = existing_container_id
1585
- cmd = "docker restart " + self.docker_container
1586
- self.start(cmd, "fe_analytics_service")
1587
- return
1588
- else:
1589
- logging.warning(
1590
- "Container %s not found. Creating new container.",
1591
- existing_container_id
1592
- )
1593
- # Fall through to create new container
1594
-
1408
+ if action_details["actionDetails"].get("containerId"):
1409
+ logging.info(
1410
+ "Using existing container ID for frontend analytics service: %s",
1411
+ action_details["actionDetails"]["containerId"],
1412
+ )
1413
+ self.docker_container = action_details["actionDetails"]["containerId"]
1414
+ cmd = "docker restart " + self.docker_container
1415
+ self.start(cmd, "fe_analytics_service")
1416
+ return
1417
+
1595
1418
  # Frontend analytics service with --net=host (Port: 3001)
1596
- container_name = f"fe_analytics_{self.action_record_id}"
1597
1419
  worker_cmd = (
1598
1420
  f"docker run -d --pull=always --net=host "
1599
- f"--name {container_name} "
1421
+ f"--name fe-analytics "
1600
1422
  f"--cidfile ./{self.action_record_id}.cid "
1601
1423
  f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
1602
1424
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1629,8 +1451,7 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
  else:
  return
  use_gpu = self.get_gpu_config(action_details)
- container_name = f"dataset_generation_{self.action_record_id}"
- cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
  logging.info("cmd is: %s", cmd)
  self.start(cmd, "dataset_generation")

@@ -1651,8 +1472,7 @@ def synthetic_data_setup_execute(self: ActionInstance):
  else:
  return
  use_gpu = self.get_gpu_config(action_details)
- container_name = f"synthetic_data_setup_{self.action_record_id}"
- cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
  logging.info("cmd is: %s", cmd)
  self.start(cmd, "synthetic_data_setup")

@@ -1689,60 +1509,31 @@ def redis_setup_execute(self: ActionInstance):

  redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")

- # Define container names with action_record_id for uniqueness
- redis_container_name = f"redis_{self.action_record_id}"

- existing_container_id = action_details["actionDetails"].get("containerId")
- if existing_container_id:
- # Check if both containers actually exist before trying to restart
- management_container_exists = ActionInstance.container_exists(existing_container_id)
- redis_container_exists = ActionInstance.container_exists(redis_container_name)
-
- if management_container_exists and redis_container_exists:
- logging.info(
- "Using existing container ID for redis management: %s",
- existing_container_id,
- )
- self.docker_container = existing_container_id
- cmd = "docker restart " + self.docker_container
- self.start(cmd, "redis_setup")
+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for redis management: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "redis_setup")

- # Redis container restart
- redis_restart_cmd = f"docker restart {redis_container_name}"
- self.start(redis_restart_cmd, "redis")
- return
- else:
- logging.warning(
- "Container(s) not found (management=%s, redis=%s). Creating new containers.",
- management_container_exists,
- redis_container_exists
- )
- # Fall through to create new containers
+ # Redis container restart
+ redis_restart_cmd = "docker restart redis_container"
+ self.start(redis_restart_cmd, "redis")

+ return
+
  # Redis container with --net=host (Port: 6379)
  redis_cmd = (
  f"docker run -d --net=host "
- f"--name {redis_container_name} "
+ f"--name redis_container "
  f"--restart unless-stopped "
  f"{redis_image} "
- f"redis-server --bind 0.0.0.0 "
- f"--appendonly no "
- f'--save "" '
- f"--maxmemory 30gb "
- f"--maxmemory-policy allkeys-lru "
- f"--io-threads 4 "
- f"--io-threads-do-reads yes "
- f"--stream-node-max-bytes 8192 "
- f"--stream-node-max-entries 1000 "
- f"--hz 100 "
- f"--tcp-backlog 2048 "
- f"--timeout 0 "
- f"--lazyfree-lazy-eviction yes "
- f"--lazyfree-lazy-expire yes "
- f"--lazyfree-lazy-server-del yes "
- f"--activedefrag yes "
- f"--requirepass {redis_password}"
+ f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
  )
+
  logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)

  # Start Redis container first
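The new invocation drops the stream/eviction tuning and flips persistence: `--appendonly yes` enables the durable AOF log, where 0.1.35 ran fully in-memory (`--appendonly no --save ""`). A quick reachability check for the password-protected server, assuming the third-party `redis` client package is installed; the host and password below are placeholders, not values from the package:

```python
# Hedged usage sketch, assuming `pip install redis`. Host and password are
# placeholders for the values the action resolves at runtime.
import redis

client = redis.Redis(host="127.0.0.1", port=6379, password="change-me")
print(client.ping())  # True once redis_container accepts connections
```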
@@ -1792,8 +1583,7 @@ def deploy_aggregator_execute(
  if not action_details:
  return
  self.setup_action_requirements(action_details, work_fs)
- container_name = f"deploy_aggregator_{self.action_record_id}"
- cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
+ cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
  logging.info("cmd: %s", cmd)
  self.start(cmd, "deploy_aggregator")

@@ -1809,10 +1599,6 @@ def model_deploy_execute(self: ActionInstance):
  return
  action_id = action_details["_id"]
  model_family = action_details["actionDetails"]["modelFamily"]
-
- # Get the service ID to track deployments
- service_id = action_details.get("_idService")
-
  self.setup_action_requirements(
  action_details,
  work_fs,
@@ -1820,29 +1606,17 @@ def model_deploy_execute(self: ActionInstance):
  action_id=action_id,
  )

- # Check if this is the first deployment for this service
- is_first_deployment = ActionInstance.is_first_deployment_for_service(service_id)
-
- # Get GPU configuration (uses utility function with fail-safe fallback)
- use_gpu = get_gpu_config_for_deployment(action_details, is_first_deployment)
-
- logging.info(
- "Action %s: Model deployment GPU config: %s (first_deployment=%s)",
- action_id,
- use_gpu if use_gpu else "CPU-only",
- is_first_deployment
- )
-
- # Get or create TRITON_PORTS (uses utility method)
- triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
+ # Get GPU configuration based on requirements and availability
+ # This uses the best-fit algorithm to select the most appropriate GPU(s)
+ use_gpu = self.get_gpu_config(action_details)

- extra_env_vars = {
- "INTERNAL_PORT": internal_port,
- "TRITON_PORTS": triton_ports
- }
+ # Override: If GPU is required, use all available GPUs
+ gpuRequired = action_details["actionDetails"].get("gpuRequired", False)
+ if gpuRequired:
+ use_gpu = "--runtime=nvidia --gpus all"

- container_name = f"model_deploy_{self.action_record_id}"
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"], container_name=container_name)} python3 deploy.py {self.action_record_id} {external_port}"'
+ extra_env_vars = {"INTERNAL_PORT": internal_port}
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
  logging.info("cmd is: %s", cmd)
  self.start(cmd, "deploy_log")

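Deployment GPU selection is now two steps: take the best-fit result from `get_gpu_config`, then override it with every visible GPU when `gpuRequired` is set. Condensed into a standalone function as a restatement of the flow above, not new behavior; the function name is hypothetical:

```python
# Condensed restatement of the 0.1.37 decision flow shown above.
def resolve_gpu_flags(best_fit_flags: str, gpu_required: bool) -> str:
    if gpu_required:
        return "--runtime=nvidia --gpus all"  # claim all visible GPUs
    return best_fit_flags  # may be "" for CPU-only actions

print(resolve_gpu_flags('--gpus "device=2"', gpu_required=False))  # best fit kept
print(resolve_gpu_flags("", gpu_required=True))                    # override wins
```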
@@ -1865,27 +1639,17 @@ def model_train_execute(self: ActionInstance):
  action_id=action_id,
  )

- existing_container_id = action_details["actionDetails"].get("containerId")
- if existing_container_id:
- # Check if container actually exists before trying to restart
- if ActionInstance.container_exists(existing_container_id):
- logging.info(
- "Using existing container ID for training: %s",
- existing_container_id,
- )
- self.docker_container = existing_container_id
- cmd = "docker restart " + self.docker_container
- self.start(cmd, "train_log")
- return
- else:
- logging.warning(
- "Container %s not found. Creating new container.",
- existing_container_id
- )
- # Fall through to create new container
-
- container_name = f"model_train_{self.action_record_id}"
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for training: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "train_log")
+ return
+
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
  logging.info("cmd is: %s", cmd)
  self.start(cmd, "train_log")

@@ -1906,27 +1670,17 @@ def model_eval_execute(self: ActionInstance):
  model_family=model_family,
  action_id=action_id,
  )
- existing_container_id = action_details["actionDetails"].get("containerId")
- if existing_container_id:
- # Check if container actually exists before trying to restart
- if ActionInstance.container_exists(existing_container_id):
- logging.info(
- "Using existing container ID for evaluation: %s",
- existing_container_id,
- )
- self.docker_container = existing_container_id
- cmd = "docker restart " + self.docker_container
- self.start(cmd, "eval_log")
- return
- else:
- logging.warning(
- "Container %s not found. Creating new container.",
- existing_container_id
- )
- # Fall through to create new container
-
- container_name = f"model_eval_{self.action_record_id}"
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for training: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "eval_log")
+ return
+
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
  logging.info("cmd is: %s", cmd)
  self.start(cmd, "eval_log")

@@ -1950,27 +1704,17 @@ def model_export_execute(self: ActionInstance):
  model_family=model_family,
  action_id=action_id,
  )
- existing_container_id = action_details["actionDetails"].get("containerId")
- if existing_container_id:
- # Check if container actually exists before trying to restart
- if ActionInstance.container_exists(existing_container_id):
- logging.info(
- "Using existing container ID for export: %s",
- existing_container_id,
- )
- self.docker_container = existing_container_id
- cmd = "docker restart " + self.docker_container
- self.start(cmd, "export_log")
- return
- else:
- logging.warning(
- "Container %s not found. Creating new container.",
- existing_container_id
- )
- # Fall through to create new container
-
- container_name = f"model_export_{self.action_record_id}"
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for training: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "export_log")
+ return
+
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
  logging.info("cmd is: %s", cmd)
  self.start(cmd, "export_log")

@@ -1986,8 +1730,7 @@ def image_build_execute(self: ActionInstance):
  action_id = action_details["_id"]
  internal_api_key = self.get_internal_api_key(action_id)
  extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
- container_name = f"image_build_{self.action_record_id}"
- cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
+ cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
  logging.info("cmd is: %s", cmd)
  self.start(cmd, "image_build_log")

@@ -1999,8 +1742,7 @@ def resource_clone_execute(self: ActionInstance):
  if not action_details:
  return
  self.setup_action_requirements(action_details)
- container_name = f"resource_clone_{self.action_record_id}"
- cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
+ cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
  logging.info("cmd is: %s", cmd)
  self.start(cmd, "resource_clone")

@@ -2016,27 +1758,17 @@ def streaming_gateway_execute(self: ActionInstance):
  self.docker_container = (
  f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
  )
- existing_container_id = action_details["actionDetails"].get("containerId")
- if existing_container_id:
- # Check if container actually exists before trying to restart
- if ActionInstance.container_exists(existing_container_id):
- logging.info(
- "Using existing container ID for streaming gateway: %s",
- existing_container_id,
- )
- self.docker_container = existing_container_id
- cmd = "docker restart " + self.docker_container
- self.start(cmd, "streaming_gateway")
- return
- else:
- logging.warning(
- "Container %s not found. Creating new container.",
- existing_container_id
- )
- # Fall through to create new container
-
- container_name = f"streaming_gateway_{self.action_record_id}"
- cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for training: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "streaming_gateway")
+ return
+
+ cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
  logging.info("cmd is: %s", cmd)
  self.start(cmd, "streaming_gateway")

@@ -2130,24 +1862,16 @@ def kafka_setup_execute(self: ActionInstance):
  else:
  pkgs = f"matrice_common matrice"

- existing_container_id = action_details["actionDetails"].get("containerId")
- if existing_container_id:
- # Check if container actually exists before trying to restart
- if ActionInstance.container_exists(existing_container_id):
- logging.info(
- "Using existing container ID for kafka: %s",
- existing_container_id,
- )
- self.docker_container = existing_container_id
- cmd = "docker restart " + self.docker_container
- self.start(cmd, "kafka_setup")
- return
- else:
- logging.warning(
- "Container %s not found. Creating new container.",
- existing_container_id
- )
- # Fall through to create new container
+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for training: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "kafka_setup")
+ return
+

  # Kafka container with --net=host (Ports: 9092, 9093)
  cmd = (
@@ -2184,31 +1908,21 @@ def inference_tracker_setup_execute(self: ActionInstance):

  self.setup_action_requirements(action_details)

- existing_container_id = action_details["actionDetails"].get("containerId")
- if existing_container_id:
- # Check if container actually exists before trying to restart
- if ActionInstance.container_exists(existing_container_id):
- logging.info(
- "Using existing container ID for inference tracker: %s",
- existing_container_id,
- )
- self.docker_container = existing_container_id
- cmd = "docker restart " + self.docker_container
- self.start(cmd, "inference_tracker_setup")
- return
- else:
- logging.warning(
- "Container %s not found. Creating new container.",
- existing_container_id
- )
- # Fall through to create new container
-
+ if action_details["actionDetails"].get("containerId"):
+ logging.info(
+ "Using existing container ID for inference tracker: %s",
+ action_details["actionDetails"]["containerId"],
+ )
+ self.docker_container = action_details["actionDetails"]["containerId"]
+ cmd = "docker restart " + self.docker_container
+ self.start(cmd, "inference_tracker_setup")
+ return
+
  # This is the existing Docker run command
- container_name = f"inference_tracker_{self.action_record_id}"
  worker_cmd = (
  f"docker run -d --pull=always --net=host "
- f"--cidfile ./{self.action_record_id}.cid "
- f"--name {container_name} "
+ f"--cidfile ./{self.action_record_id}.cid "
+ f"--name inference-tracker-worker "
  f"-v matrice_myvol:/matrice_data "
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -2256,7 +1970,7 @@ def video_storage_setup_execute(self: ActionInstance):
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
  f'-e ACTION_ID="{self.action_record_id}" '
- f'--restart=unless-stopped '
+ f' --restart=unless-stopped '
  f"{image}"
  )