matrice-compute 0.1.39__tar.gz → 0.1.40__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/PKG-INFO +1 -1
  2. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/matrice_compute.egg-info/PKG-INFO +1 -1
  3. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/src/matrice_compute/action_instance.py +113 -239
  4. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/LICENSE.txt +0 -0
  5. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/README.md +0 -0
  6. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/matrice_compute.egg-info/SOURCES.txt +0 -0
  7. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/matrice_compute.egg-info/dependency_links.txt +0 -0
  8. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/matrice_compute.egg-info/not-zip-safe +0 -0
  9. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/matrice_compute.egg-info/top_level.txt +0 -0
  10. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/pyproject.toml +0 -0
  11. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/setup.cfg +0 -0
  12. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/setup.py +0 -0
  13. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/src/matrice_compute/__init__.py +0 -0
  14. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/src/matrice_compute/actions_manager.py +0 -0
  15. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/src/matrice_compute/actions_scaledown_manager.py +0 -0
  16. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/src/matrice_compute/compute_operations_handler.py +0 -0
  17. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/src/matrice_compute/instance_manager.py +0 -0
  18. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/src/matrice_compute/instance_utils.py +0 -0
  19. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/src/matrice_compute/prechecks.py +0 -0
  20. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/src/matrice_compute/py.typed +0 -0
  21. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/src/matrice_compute/resources_tracker.py +0 -0
  22. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/src/matrice_compute/scaling.py +0 -0
  23. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/src/matrice_compute/shutdown_manager.py +0 -0
  24. {matrice_compute-0.1.39 → matrice_compute-0.1.40}/src/matrice_compute/task_utils.py +0 -0
{matrice_compute-0.1.39 → matrice_compute-0.1.40}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: matrice_compute
-Version: 0.1.39
+Version: 0.1.40
 Summary: Common server utilities for Matrice.ai services
 Author-email: "Matrice.ai" <dipendra@matrice.ai>
 License-Expression: MIT
{matrice_compute-0.1.39 → matrice_compute-0.1.40}/matrice_compute.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: matrice_compute
-Version: 0.1.39
+Version: 0.1.40
 Summary: Common server utilities for Matrice.ai services
 Author-email: "Matrice.ai" <dipendra@matrice.ai>
 License-Expression: MIT
{matrice_compute-0.1.39 → matrice_compute-0.1.40}/src/matrice_compute/action_instance.py

@@ -26,10 +26,6 @@ from matrice_common.utils import log_errors
 class ActionInstance:
     """Base class for tasks that run in Action containers."""
 
-    # Class-level dictionary to track deployed services and their ports
-    # Key: _idService, Value: {"triton_ports": "port1,port2,port3"}
-    _deployed_services = {}
-
     def __init__(self, scaling: Scaling, action_info: dict):
         """Initialize an action instance.
 
@@ -89,52 +85,6 @@ class ActionInstance:
             raise ValueError(f"Unknown action type: {self.action_type}")
         self.task = self.actions_map[self.action_type]
 
-    @classmethod
-    def get_or_create_triton_ports(cls, service_id, scaling_instance):
-        """Get existing TRITON_PORTS for a service or create new ones.
-
-        Args:
-            service_id (str): Service ID (_idService)
-            scaling_instance: Scaling instance to get open ports
-
-        Returns:
-            str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
-        """
-        if not service_id:
-            # No service_id, generate new ports
-            port1 = scaling_instance.get_open_port()
-            port2 = scaling_instance.get_open_port()
-            port3 = scaling_instance.get_open_port()
-            return f"{port1},{port2},{port3}"
-
-        # Check if ports already exist for this service
-        if service_id in cls._deployed_services:
-            triton_ports = cls._deployed_services[service_id]["triton_ports"]
-            logging.info(
-                "Reusing TRITON_PORTS for service %s: %s",
-                service_id,
-                triton_ports
-            )
-            return triton_ports
-
-        # First deployment: generate new ports and store them
-        port1 = scaling_instance.get_open_port()
-        port2 = scaling_instance.get_open_port()
-        port3 = scaling_instance.get_open_port()
-        triton_ports = f"{port1},{port2},{port3}"
-
-        # Store for future use
-        cls._deployed_services[service_id] = {
-            "triton_ports": triton_ports,
-        }
-
-        logging.info(
-            "First deployment for service %s - generated TRITON_PORTS: %s",
-            service_id,
-            triton_ports
-        )
-        return triton_ports
-
     @log_errors(default_return={}, raise_exception=True, log_error=False)
     def _init_credentials(self):
         """Initialize Matrice credentials.
@@ -396,7 +346,6 @@ class ActionInstance:
         destination_workspace_path: str = "/usr/src/workspace",
         docker_workdir: str = "",
         extra_pkgs: list = [],
-        container_name: str = "",
    ):
        """Build base Docker command with common options.
 
@@ -411,7 +360,6 @@ class ActionInstance:
            destination_workspace_path (str): Container workspace path
            docker_workdir (str): Docker working directory
            extra_pkgs (list): List of extra packages to install
-            container_name (str): Docker container name (format: {action_type}_{action_id})
        Returns:
            str: Base Docker command
        """
@@ -482,15 +430,13 @@ class ActionInstance:
        else:
            use_restart_policy = ""
 
-        # Build container name option if provided
-        name_option = f"--name {container_name}" if container_name else ""
-
        cmd_parts = [
-            f"docker run -d {use_gpu} {use_restart_policy} ",
+            f"docker run {use_gpu} {use_restart_policy} ",
            network_config,
            *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
            *volumes,
            # Container configuration and startup commands
+            f"--cidfile ./{self.action_record_id}.cid ",
            f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
            f'/bin/bash -c "cd {docker_workdir} && '
            f"{env_exports} && "
@@ -892,50 +838,55 @@ class ActionInstance:
         self.cmd = cmd
         self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
 
-        # Run docker with -d flag to get container ID from stdout
-        process = subprocess.Popen(
-            shlex.split(self.cmd),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True,
-            env={**os.environ},
-        )
-
-        # Use a longer timeout for docker run since --pull=always may need to
-        # download large images on first run. Default: 30 minutes (1800 seconds)
-        # Can be configured via DOCKER_START_TIMEOUT_SECONDS environment variable
-        docker_start_timeout = int(os.environ.get("DOCKER_START_TIMEOUT_SECONDS", 1800))
-        logging.info(
-            "Waiting for docker container to start for action %s (timeout: %d seconds)",
-            self.action_record_id,
-            docker_start_timeout,
-        )
-        stdout, stderr = process.communicate(timeout=docker_start_timeout)
+        with open(self.log_path, "wb") as out:
+            self.process = subprocess.Popen(
+                shlex.split(self.cmd),
+                stdout=out,
+                stderr=out,
+                env={**os.environ},
+                start_new_session=True,
+            )
 
-        if process.returncode != 0:
+        self.container_id = None
+
+        cid_file_path = f"./{self.action_record_id}.cid"
+        max_retries = 5
+        retry_delay = 1  # seconds
+        for attempt in range(max_retries):
+            try:
+                with open(cid_file_path, "r") as cid_file:
+                    container_id = cid_file.read().strip()
+                    self.container_id = container_id
+                    logging.info(
+                        "Started process for action %s with container ID: %s",
+                        self.action_record_id,
+                        self.container_id,
+                    )
+                    break
+            except FileNotFoundError:
+                logging.warning(
+                    "CID file not found for action %s, attempt %d/%d",
+                    self.action_record_id,
+                    attempt + 1,
+                    max_retries,
+                )
+                time.sleep(retry_delay)
+            except Exception as e:
+                logging.error(
+                    "Error reading CID file for action %s: %s",
+                    self.action_record_id,
+                    str(e),
+                )
+                time.sleep(retry_delay)
+        else:
             logging.error(
-                "Docker run failed for action %s: %s",
+                "Failed to read CID file for action %s after %d attempts",
                 self.action_record_id,
-                stderr,
+                max_retries,
            )
-            raise RuntimeError(f"Docker run failed: {stderr}")
+            raise Exception("Failed to start process: CID file not found")
 
-        self.container_id = stdout.strip()
-        logging.info(
-            "Started container for action %s with ID: %s",
-            self.action_record_id,
-            self.container_id,
-        )
-
-        # Start following container logs in background
-        self.process = subprocess.Popen(
-            ["docker", "logs", "-f", self.container_id],
-            stdout=open(self.log_path, "wb"),
-            stderr=subprocess.STDOUT,
-            start_new_session=True,
-        )
-
-        # Report container id to scaling service
+        # report container id to scaling service
        self.scaling.update_action_container_id(
            action_record_id=self.action_record_id,
            container_id=self.container_id,
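This hunk is the other half of the `--cidfile` change: instead of parsing the container ID from the stdout of a detached `docker run -d` and tailing `docker logs -f` into the log file, `start()` now runs the container in the foreground with its output redirected straight to the log, and recovers the container ID by polling the cidfile Docker writes at creation. A minimal stand-alone sketch of that pattern, assuming a local Docker daemon and using `alpine` as a placeholder image (names and paths here are illustrative, not the package's):

```python
import os
import shlex
import subprocess
import time

def run_with_cidfile(action_id: str, image: str = "alpine") -> str:
    """Start a container in the foreground, log its output to a file,
    and read its ID back from the --cidfile Docker writes on creation."""
    cid_path = f"./{action_id}.cid"
    log_path = f"./{action_id}.log"
    # docker run refuses to start if the cidfile already exists, so a
    # stale file from a previous run must be removed first.
    if os.path.exists(cid_path):
        os.remove(cid_path)
    cmd = f"docker run --cidfile {cid_path} {image} echo hello"
    with open(log_path, "wb") as out:
        # stdout/stderr go straight to the log file; start_new_session
        # detaches the child from this process group, as in start().
        subprocess.Popen(shlex.split(cmd), stdout=out, stderr=out,
                         start_new_session=True)
    # Docker writes the cidfile once the container is created; poll briefly.
    for _ in range(5):
        try:
            with open(cid_path) as f:
                return f.read().strip()
        except FileNotFoundError:
            time.sleep(1)
    raise RuntimeError("container ID file never appeared")

# print(run_with_cidfile("demo-action"))
```

One behavioral difference worth noting: the new retry budget is about five seconds, while the removed code waited up to DOCKER_START_TIMEOUT_SECONDS (default 1800) precisely because `--pull=always` can spend minutes fetching a large image before the container, and hence the cidfile, exists. Whether five seconds suffices under a cold pull is a question about the new version, not something this sketch settles.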
@@ -1101,8 +1052,7 @@ def data_preparation_execute(
        "Started pulling Docker image with PID: %s",
        process.pid,
    )
-    container_name = f"data_prep_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "data_preparation_log")
 
@@ -1131,8 +1081,7 @@ def data_processing_execute(self: ActionInstance):
        service="bg-job-scheduler",
        job_params=action["jobParams"],
    )
-    container_name = f"data_processing_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
    logging.info("cmd: %s", cmd)
    self.start(cmd, "data_processing_log")
 
@@ -1145,8 +1094,7 @@ def data_split_execute(self: ActionInstance):
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs, model_family="")
-    container_name = f"data_split_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
    logging.info("cmd: %s", cmd)
    self.start(cmd, "data_split")
 
@@ -1161,8 +1109,7 @@ def dataset_annotation_execute(
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs)
-    container_name = f"dataset_annotation_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
    logging.info("cmd: %s", cmd)
    self.start(cmd, "dataset_annotation")
 
@@ -1177,8 +1124,7 @@ def dataset_augmentation_execute(
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs)
-    container_name = f"dataset_augmentation_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
    logging.info("cmd: %s", cmd)
    self.start(cmd, "dataset_augmentation")
 
@@ -1194,8 +1140,7 @@ def augmentation_server_creation_execute(
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs)
-    container_name = f"augmentation_setup_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
    logging.info("cmd: %s", cmd)
    self.start(cmd, "augmentation_setup")
 
@@ -1216,34 +1161,32 @@ def database_setup_execute(self: ActionInstance):
 
    project_id = action_details["_idProject"]
 
-    # Define container names with action_record_id for uniqueness
-    mongodb_container_name = f"database_setup_{self.action_record_id}"
-    qdrant_container_name = f"qdrant_{self.action_record_id}"
-
    if action_details["actionDetails"].get("containerId"):
        logging.info(
-            "Using existing container ID for database setup: %s",
+            "Using existing container ID for inference tracker: %s",
            action_details["actionDetails"]["containerId"],
        )
        self.docker_container = action_details["actionDetails"]["containerId"]
        cmd = "docker restart " + self.docker_container
-        self.start(cmd, "database_setup")
+        self.start(cmd, "qdrant_setup")
 
-        # qdrant restart
-        qdrant_cmd = f"docker restart {qdrant_container_name}"
-        self.start(qdrant_cmd, "qdrant_setup")
+        #qdrant restart
+        qdrant_cmd = "docker restart qdrant"
+        self.start(qdrant_cmd, 'qdrant_setup')
 
        return
+
+
+    dbPath =action_details["jobParams"].get("dbPath","/host/data/path/mongodb_data")
 
-    dbPath = action_details["jobParams"].get("dbPath", "/host/data/path/mongodb_data")
 
    # MongoDB container with --net=host (Port: 27020:27017)
    cmd = (
-        f"docker run -d --pull=always --net=host "
-        f"--name {mongodb_container_name} "
-        f"-v matrice_myvol:/matrice_data "
+        f"docker run --pull=always --net=host "
        f"-v {dbPath}:{dbPath} "
+        f"--name database_setup_{self.action_record_id} "
        f"-v /var/run/docker.sock:/var/run/docker.sock "
+        f"--cidfile ./{self.action_record_id}.cid "
        f"-e ACTION_RECORD_ID={self.action_record_id} "
        f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
        f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
@@ -1253,23 +1196,6 @@ def database_setup_execute(self: ActionInstance):
    )
    logging.info("Starting DB container (Port: 27020:27017): %s", cmd)
 
-    # Qdrant container with --net=host (Port: 6334)
-    qdrant_cmd = (
-        f"docker run -d --pull=always --net=host "
-        f"--name {qdrant_container_name} "
-        f"-v matrice_myvol:/matrice_data "
-        f"qdrant/qdrant:latest "
-    )
-    logging.info("Starting Qdrant container (Port: 6334): %s", qdrant_cmd)
-
-    # Start Qdrant container
-    qdrant_process = subprocess.Popen(
-        qdrant_cmd,
-        shell=True,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-    )
-    logging.info("Qdrant container started successfully")
 
    # Docker Command run
    self.start(cmd, "database_setup")
@@ -1289,8 +1215,6 @@ def facial_recognition_setup_execute(self: ActionInstance):
 
    self.setup_action_requirements(action_details)
 
-    container_name = f"facial_recognition_{self.action_record_id}"
-
    if action_details["actionDetails"].get("containerId"):
        logging.info(
            "Using existing container ID for facial recognition worker: %s",
@@ -1304,13 +1228,15 @@ def facial_recognition_setup_execute(self: ActionInstance):
    # Facial recognition worker container with --net=host (Port: 8081)
    worker_cmd = (
        f"docker run -d --pull=always --net=host "
-        f"--name {container_name} "
+        f"--name worker "
+        f"--cidfile ./{self.action_record_id}.cid "
        f"-v matrice_myvol:/matrice_data "
+        f"--cidfile ./{self.action_record_id}.cid "
        f'-e ENV="{os.environ.get("ENV", "prod")}" '
        f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
        f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
        f'-e ACTION_ID="{self.action_record_id}" '
-        f'--restart=unless-stopped '
+        f' --restart=unless-stopped '
        f"{image}"
    )
    logging.info("Starting facial recognition worker (Port: 8081): %s", worker_cmd)
@@ -1332,8 +1258,6 @@ def lpr_setup_execute(self: ActionInstance):
 
    self.setup_action_requirements(action_details)
 
-    container_name = f"lpr_{self.action_record_id}"
-
    if action_details["actionDetails"].get("containerId"):
        logging.info(
            "Using existing container ID for LPR worker: %s",
@@ -1347,14 +1271,15 @@ def lpr_setup_execute(self: ActionInstance):
    # LPR worker container with --net=host (Port: 8082)
    worker_cmd = (
        f"docker run -d --net=host --pull=always "
-        f"--name {container_name} "
+        f"--name lpr-worker "
+        f"--cidfile ./{self.action_record_id}.cid "
        f"-v matrice_myvol:/matrice_data "
        f'-e ENV="{os.environ.get("ENV", "prod")}" '
        f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
        f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
        f'-e ACTION_ID="{self.action_record_id}" '
        f'-e PORT=8082 '
-        f'--restart=unless-stopped '
+        f' --restart=unless-stopped '
        f"{image}"
    )
    logging.info("Starting LPR worker (Port: 8082): %s", worker_cmd)
@@ -1385,8 +1310,6 @@ def inference_ws_server_execute(self: ActionInstance):
 
    logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
 
-    container_name = f"inference_ws_{self.action_record_id}"
-
    if action_details["actionDetails"].get("containerId"):
        logging.info(
            "Using existing container ID for inference WebSocket server: %s",
@@ -1400,11 +1323,12 @@ def inference_ws_server_execute(self: ActionInstance):
    # Inference WebSocket server with --net=host (Port: 8102)
    worker_cmd = (
        f"docker run -d --pull=always --net=host "
-        f"--name {container_name} "
+        f"--name inference "
+        f"--cidfile ./{self.action_record_id}.cid "
        f'-e ENV="{os.environ.get("ENV", "prod")}" '
        f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
        f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
-        f'--restart=unless-stopped '
+        f' --restart=unless-stopped '
        f"{image} "
        f"./app "
        f"{self.action_record_id} "
@@ -1435,8 +1359,6 @@ def fe_fs_streaming_execute(self: ActionInstance):
 
    logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
 
-    container_name = f"fe_streaming_{self.action_record_id}"
-
    if action_details["actionDetails"].get("containerId"):
        logging.info(
            "Using existing container ID for frontend streaming: %s",
@@ -1450,14 +1372,15 @@ def fe_fs_streaming_execute(self: ActionInstance):
    # Frontend streaming with --net=host (Port: 3000)
    worker_cmd = (
        f"docker run -d --pull=always --net=host "
-        f"--name {container_name} "
+        f"--name fe_streaming "
+        f"--cidfile ./{self.action_record_id}.cid "
        f"-v matrice_myvol:/matrice_data "
        f'-e ENV="{os.environ.get("ENV", "prod")}" '
        f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
        f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
        f"-e PORT=3000 "
        f'-e WS_HOST="{ws_url}" '
-        f'--restart=unless-stopped '
+        f' --restart=unless-stopped '
        f"{image}"
    )
    logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)
@@ -1482,8 +1405,6 @@ def fe_analytics_service_execute(self: ActionInstance):
 
    project_id = action_details["_idProject"]
 
-    container_name = f"fe_analytics_{self.action_record_id}"
-
    if action_details["actionDetails"].get("containerId"):
        logging.info(
            "Using existing container ID for frontend analytics service: %s",
@@ -1497,14 +1418,15 @@ def fe_analytics_service_execute(self: ActionInstance):
    # Frontend analytics service with --net=host (Port: 3001)
    worker_cmd = (
        f"docker run -d --pull=always --net=host "
-        f"--name {container_name} "
+        f"--name fe-analytics "
+        f"--cidfile ./{self.action_record_id}.cid "
        f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
        f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
        f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
        f'-e ACTION_ID="{self.action_record_id}" '
        f"-e PORT=3001 "
        f'-e PROJECT_ID="{project_id}" '
-        f'--restart=unless-stopped '
+        f' --restart=unless-stopped '
        f"{image}"
    )
    logging.info("Starting frontend analytics service (Port: 3001): %s", worker_cmd)
@@ -1529,8 +1451,7 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
    else:
        return
    use_gpu = self.get_gpu_config(action_details)
-    container_name = f"dataset_generation_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "dataset_generation")
 
@@ -1551,8 +1472,7 @@ def synthetic_data_setup_execute(self: ActionInstance):
    else:
        return
    use_gpu = self.get_gpu_config(action_details)
-    container_name = f"synthetic_data_setup_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "synthetic_data_setup")
 
@@ -1589,8 +1509,6 @@ def redis_setup_execute(self: ActionInstance):
 
    redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
 
-    # Define container names with action_record_id for uniqueness
-    redis_container_name = f"redis_{self.action_record_id}"
 
    if action_details["actionDetails"].get("containerId"):
        logging.info(
@@ -1602,34 +1520,18 @@ def redis_setup_execute(self: ActionInstance):
        self.start(cmd, "redis_setup")
 
        # Redis container restart
-        redis_restart_cmd = f"docker restart {redis_container_name}"
+        redis_restart_cmd = "docker restart redis_container"
        self.start(redis_restart_cmd, "redis")
 
        return
 
-    # Redis container with --net=host (Port: 6379) with optimized configuration
+    # Redis container with --net=host (Port: 6379)
    redis_cmd = (
        f"docker run -d --net=host "
-        f"--name {redis_container_name} "
+        f"--name redis_container "
        f"--restart unless-stopped "
        f"{redis_image} "
-        f"redis-server --bind 0.0.0.0 "
-        f"--appendonly no "
-        f'--save "" '
-        f"--maxmemory 30gb "
-        f"--maxmemory-policy allkeys-lru "
-        f"--io-threads 4 "
-        f"--io-threads-do-reads yes "
-        f"--stream-node-max-bytes 8192 "
-        f"--stream-node-max-entries 1000 "
-        f"--hz 100 "
-        f"--tcp-backlog 2048 "
-        f"--timeout 0 "
-        f"--lazyfree-lazy-eviction yes "
-        f"--lazyfree-lazy-expire yes "
-        f"--lazyfree-lazy-server-del yes "
-        f"--activedefrag yes "
-        f"--requirepass {redis_password}"
+        f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
    )
 
    logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)
@@ -1653,9 +1555,8 @@ def redis_setup_execute(self: ActionInstance):
 
    # bg-redis management container with --net=host (Port: 8082)
    cmd = (
-        f"docker run -d --net=host "
-        f"--restart unless-stopped "
-        f"--name bg-redis_{self.action_record_id} "
+        f"docker run --net=host "
+        f"--cidfile ./{self.action_record_id}.cid "
        f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
        f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
        f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
1682
1583
  if not action_details:
1683
1584
  return
1684
1585
  self.setup_action_requirements(action_details, work_fs)
1685
- container_name = f"deploy_aggregator_{self.action_record_id}"
1686
- cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1586
+ cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1687
1587
  logging.info("cmd: %s", cmd)
1688
1588
  self.start(cmd, "deploy_aggregator")
1689
1589
 
@@ -1699,10 +1599,6 @@ def model_deploy_execute(self: ActionInstance):
        return
    action_id = action_details["_id"]
    model_family = action_details["actionDetails"]["modelFamily"]
-
-    # Get the service ID to track deployments
-    service_id = action_details.get("_idService")
-
    self.setup_action_requirements(
        action_details,
        work_fs,
@@ -1710,29 +1606,17 @@ def model_deploy_execute(self: ActionInstance):
        action_id=action_id,
    )
 
-    # Use all GPUs if GPU is required
+    # Get GPU configuration based on requirements and availability
+    # This uses the best-fit algorithm to select the most appropriate GPU(s)
+    use_gpu = self.get_gpu_config(action_details)
+
+    # Override: If GPU is required, use all available GPUs
    gpuRequired = action_details["actionDetails"].get("gpuRequired", False)
    if gpuRequired:
        use_gpu = "--runtime=nvidia --gpus all"
-    else:
-        use_gpu = ""
-
-    logging.info(
-        "Action %s: Model deployment GPU config: %s",
-        action_id,
-        use_gpu if use_gpu else "CPU-only"
-    )
-
-    # Get or create TRITON_PORTS (uses utility method)
-    triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
 
-    extra_env_vars = {
-        "INTERNAL_PORT": internal_port,
-        "TRITON_PORTS": triton_ports
-    }
-
-    container_name = f"model_deploy_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"], container_name=container_name)} python3 deploy.py {self.action_record_id} {external_port}"'
+    extra_env_vars = {"INTERNAL_PORT": internal_port}
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "deploy_log")
 
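The deploy path thus changes in two ways: TRITON_PORTS disappears from the container environment entirely (matching the removal of `get_or_create_triton_ports` above), and GPU selection now starts from `get_gpu_config()` with an all-GPUs override when `gpuRequired` is set. A toy version of the new control flow (the `best_fit_flags` argument stands in for whatever `self.get_gpu_config()` returns):

```python
def pick_gpu_flags(action_details: dict, best_fit_flags: str) -> str:
    """Toy version of the new selection: best-fit flags first, then an
    all-GPUs override when the action explicitly requires a GPU."""
    use_gpu = best_fit_flags  # stand-in for self.get_gpu_config(action_details)
    if action_details.get("actionDetails", {}).get("gpuRequired", False):
        use_gpu = "--runtime=nvidia --gpus all"
    return use_gpu

print(pick_gpu_flags({"actionDetails": {"gpuRequired": True}}, '--gpus "device=0"'))
# --runtime=nvidia --gpus all
```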
@@ -1765,8 +1649,7 @@ def model_train_execute(self: ActionInstance):
        self.start(cmd, "train_log")
        return
 
-    container_name = f"model_train_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "train_log")
 
@@ -1789,7 +1672,7 @@ def model_eval_execute(self: ActionInstance):
    )
    if action_details["actionDetails"].get("containerId"):
        logging.info(
-            "Using existing container ID for evaluation: %s",
+            "Using existing container ID for training: %s",
            action_details["actionDetails"]["containerId"],
        )
        self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1797,8 +1680,7 @@ def model_eval_execute(self: ActionInstance):
        self.start(cmd, "eval_log")
        return
 
-    container_name = f"model_eval_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "eval_log")
 
@@ -1824,7 +1706,7 @@ def model_export_execute(self: ActionInstance):
    )
    if action_details["actionDetails"].get("containerId"):
        logging.info(
-            "Using existing container ID for export: %s",
+            "Using existing container ID for training: %s",
            action_details["actionDetails"]["containerId"],
        )
        self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1832,8 +1714,7 @@ def model_export_execute(self: ActionInstance):
        self.start(cmd, "export_log")
        return
 
-    container_name = f"model_export_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "export_log")
 
@@ -1849,8 +1730,7 @@ def image_build_execute(self: ActionInstance):
    action_id = action_details["_id"]
    internal_api_key = self.get_internal_api_key(action_id)
    extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
-    container_name = f"image_build_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
+    cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "image_build_log")
 
@@ -1862,8 +1742,7 @@ def resource_clone_execute(self: ActionInstance):
    if not action_details:
        return
    self.setup_action_requirements(action_details)
-    container_name = f"resource_clone_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "resource_clone")
 
@@ -1881,7 +1760,7 @@ def streaming_gateway_execute(self: ActionInstance):
    )
    if action_details["actionDetails"].get("containerId"):
        logging.info(
-            "Using existing container ID for streaming gateway: %s",
+            "Using existing container ID for training: %s",
            action_details["actionDetails"]["containerId"],
        )
        self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1889,8 +1768,7 @@ def streaming_gateway_execute(self: ActionInstance):
        self.start(cmd, "streaming_gateway")
        return
 
-    container_name = f"streaming_gateway_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "streaming_gateway")
 
@@ -1986,7 +1864,7 @@ def kafka_setup_execute(self: ActionInstance):
 
    if action_details["actionDetails"].get("containerId"):
        logging.info(
-            "Using existing container ID for kafka: %s",
+            "Using existing container ID for training: %s",
            action_details["actionDetails"]["containerId"],
        )
        self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1994,12 +1872,10 @@ def kafka_setup_execute(self: ActionInstance):
        self.start(cmd, "kafka_setup")
        return
 
-    container_name = f"kafka_{self.action_record_id}"
 
    # Kafka container with --net=host (Ports: 9092, 9093)
    cmd = (
-        f"docker run -d --net=host "
-        f"--name {container_name} "
+        f"docker run --net=host "
        f"{env_args} "
        f"--shm-size=30G --pull=always "
        f'aiforeveryone/matrice-kafka:latest /bin/bash -c "'
@@ -2032,8 +1908,6 @@ def inference_tracker_setup_execute(self: ActionInstance):
 
    self.setup_action_requirements(action_details)
 
-    container_name = f"inference_tracker_{self.action_record_id}"
-
    if action_details["actionDetails"].get("containerId"):
        logging.info(
            "Using existing container ID for inference tracker: %s",
@@ -2047,13 +1921,14 @@ def inference_tracker_setup_execute(self: ActionInstance):
    # This is the existing Docker run command
    worker_cmd = (
        f"docker run -d --pull=always --net=host "
-        f"--name {container_name} "
+        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name inference-tracker-worker "
        f"-v matrice_myvol:/matrice_data "
        f'-e ENV="{os.environ.get("ENV", "prod")}" '
        f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
        f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
        f'-e ACTION_ID="{self.action_record_id}" '
-        f'--restart=unless-stopped '
+        f' --restart=unless-stopped '
        f"{image}"
    )
@@ -2075,11 +1950,9 @@ def video_storage_setup_execute(self: ActionInstance):
 
    self.setup_action_requirements(action_details)
 
-    container_name = f"video_storage_{self.action_record_id}"
-
    if action_details["actionDetails"].get("containerId"):
        logging.info(
-            "Using existing container ID for video storage: %s",
+            "Using existing container ID for inference tracker: %s",
            action_details["actionDetails"]["containerId"],
        )
        self.docker_container = action_details["actionDetails"]["containerId"]
@@ -2090,13 +1963,14 @@ def video_storage_setup_execute(self: ActionInstance):
    # This is the existing Docker run command
    worker_cmd = (
        f"docker run -d --pull=always --net=host "
-        f"--name {container_name} "
+        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name media_server "
        f"-v matrice_myvol:/matrice_data "
        f'-e ENV="{os.environ.get("ENV", "prod")}" '
        f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
        f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
        f'-e ACTION_ID="{self.action_record_id}" '
-        f'--restart=unless-stopped '
+        f' --restart=unless-stopped '
        f"{image}"
    )