matrice-compute 0.1.38__tar.gz → 0.1.40__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
Files changed (24)
  1. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/PKG-INFO +1 -1
  2. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/matrice_compute.egg-info/PKG-INFO +1 -1
  3. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/action_instance.py +114 -241
  4. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/LICENSE.txt +0 -0
  5. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/README.md +0 -0
  6. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/matrice_compute.egg-info/SOURCES.txt +0 -0
  7. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/matrice_compute.egg-info/dependency_links.txt +0 -0
  8. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/matrice_compute.egg-info/not-zip-safe +0 -0
  9. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/matrice_compute.egg-info/top_level.txt +0 -0
  10. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/pyproject.toml +0 -0
  11. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/setup.cfg +0 -0
  12. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/setup.py +0 -0
  13. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/__init__.py +0 -0
  14. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/actions_manager.py +0 -0
  15. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/actions_scaledown_manager.py +0 -0
  16. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/compute_operations_handler.py +0 -0
  17. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/instance_manager.py +0 -0
  18. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/instance_utils.py +0 -0
  19. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/prechecks.py +0 -0
  20. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/py.typed +0 -0
  21. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/resources_tracker.py +0 -0
  22. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/scaling.py +0 -0
  23. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/shutdown_manager.py +0 -0
  24. {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/task_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.38
3
+ Version: 0.1.40
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.38
3
+ Version: 0.1.40
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -26,10 +26,6 @@ from matrice_common.utils import log_errors
26
26
  class ActionInstance:
27
27
  """Base class for tasks that run in Action containers."""
28
28
 
29
- # Class-level dictionary to track deployed services and their ports
30
- # Key: _idService, Value: {"triton_ports": "port1,port2,port3"}
31
- _deployed_services = {}
32
-
33
29
  def __init__(self, scaling: Scaling, action_info: dict):
34
30
  """Initialize an action instance.
35
31
 
@@ -89,52 +85,6 @@ class ActionInstance:
89
85
  raise ValueError(f"Unknown action type: {self.action_type}")
90
86
  self.task = self.actions_map[self.action_type]
91
87
 
92
- @classmethod
93
- def get_or_create_triton_ports(cls, service_id, scaling_instance):
94
- """Get existing TRITON_PORTS for a service or create new ones.
95
-
96
- Args:
97
- service_id (str): Service ID (_idService)
98
- scaling_instance: Scaling instance to get open ports
99
-
100
- Returns:
101
- str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
102
- """
103
- if not service_id:
104
- # No service_id, generate new ports
105
- port1 = scaling_instance.get_open_port()
106
- port2 = scaling_instance.get_open_port()
107
- port3 = scaling_instance.get_open_port()
108
- return f"{port1},{port2},{port3}"
109
-
110
- # Check if ports already exist for this service
111
- if service_id in cls._deployed_services:
112
- triton_ports = cls._deployed_services[service_id]["triton_ports"]
113
- logging.info(
114
- "Reusing TRITON_PORTS for service %s: %s",
115
- service_id,
116
- triton_ports
117
- )
118
- return triton_ports
119
-
120
- # First deployment: generate new ports and store them
121
- port1 = scaling_instance.get_open_port()
122
- port2 = scaling_instance.get_open_port()
123
- port3 = scaling_instance.get_open_port()
124
- triton_ports = f"{port1},{port2},{port3}"
125
-
126
- # Store for future use
127
- cls._deployed_services[service_id] = {
128
- "triton_ports": triton_ports,
129
- }
130
-
131
- logging.info(
132
- "First deployment for service %s - generated TRITON_PORTS: %s",
133
- service_id,
134
- triton_ports
135
- )
136
- return triton_ports
137
-
138
88
  @log_errors(default_return={}, raise_exception=True, log_error=False)
139
89
  def _init_credentials(self):
140
90
  """Initialize Matrice credentials.
@@ -396,7 +346,6 @@ class ActionInstance:
396
346
  destination_workspace_path: str = "/usr/src/workspace",
397
347
  docker_workdir: str = "",
398
348
  extra_pkgs: list = [],
399
- container_name: str = "",
400
349
  ):
401
350
  """Build base Docker command with common options.
402
351
 
@@ -411,7 +360,6 @@ class ActionInstance:
411
360
  destination_workspace_path (str): Container workspace path
412
361
  docker_workdir (str): Docker working directory
413
362
  extra_pkgs (list): List of extra packages to install
414
- container_name (str): Docker container name (format: {action_type}_{action_id})
415
363
  Returns:
416
364
  str: Base Docker command
417
365
  """
@@ -478,20 +426,17 @@ class ActionInstance:
478
426
 
479
427
  # if the service provider is local, then put --restart unless-stopped
480
428
  if os.environ.get("SERVICE_PROVIDER") in ("local", "LOCAL"):
481
- env_exports += " && export DOCKER_RESTART_POLICY='--restart unless-stopped' "
482
- use_restart_policy = "--restart unless-stopped"
429
+ use_restart_policy = "--restart=unless-stopped "
483
430
  else:
484
431
  use_restart_policy = ""
485
432
 
486
- # Build container name option if provided
487
- name_option = f"--name {container_name}" if container_name else ""
488
-
489
433
  cmd_parts = [
490
- f"docker run -d {use_gpu} {use_restart_policy} ",
434
+ f"docker run {use_gpu} {use_restart_policy} ",
491
435
  network_config,
492
436
  *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
493
437
  *volumes,
494
438
  # Container configuration and startup commands
439
+ f"--cidfile ./{self.action_record_id}.cid ",
495
440
  f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
496
441
  f'/bin/bash -c "cd {docker_workdir} && '
497
442
  f"{env_exports} && "
@@ -893,50 +838,55 @@ class ActionInstance:
893
838
  self.cmd = cmd
894
839
  self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
895
840
 
896
- # Run docker with -d flag to get container ID from stdout
897
- process = subprocess.Popen(
898
- shlex.split(self.cmd),
899
- stdout=subprocess.PIPE,
900
- stderr=subprocess.PIPE,
901
- text=True,
902
- env={**os.environ},
903
- )
904
-
905
- # Use a longer timeout for docker run since --pull=always may need to
906
- # download large images on first run. Default: 30 minutes (1800 seconds)
907
- # Can be configured via DOCKER_START_TIMEOUT_SECONDS environment variable
908
- docker_start_timeout = int(os.environ.get("DOCKER_START_TIMEOUT_SECONDS", 1800))
909
- logging.info(
910
- "Waiting for docker container to start for action %s (timeout: %d seconds)",
911
- self.action_record_id,
912
- docker_start_timeout,
913
- )
914
- stdout, stderr = process.communicate(timeout=docker_start_timeout)
841
+ with open(self.log_path, "wb") as out:
842
+ self.process = subprocess.Popen(
843
+ shlex.split(self.cmd),
844
+ stdout=out,
845
+ stderr=out,
846
+ env={**os.environ},
847
+ start_new_session=True,
848
+ )
915
849
 
916
- if process.returncode != 0:
850
+ self.container_id = None
851
+
852
+ cid_file_path = f"./{self.action_record_id}.cid"
853
+ max_retries = 5
854
+ retry_delay = 1 # seconds
855
+ for attempt in range(max_retries):
856
+ try:
857
+ with open(cid_file_path, "r") as cid_file:
858
+ container_id = cid_file.read().strip()
859
+ self.container_id = container_id
860
+ logging.info(
861
+ "Started process for action %s with container ID: %s",
862
+ self.action_record_id,
863
+ self.container_id,
864
+ )
865
+ break
866
+ except FileNotFoundError:
867
+ logging.warning(
868
+ "CID file not found for action %s, attempt %d/%d",
869
+ self.action_record_id,
870
+ attempt + 1,
871
+ max_retries,
872
+ )
873
+ time.sleep(retry_delay)
874
+ except Exception as e:
875
+ logging.error(
876
+ "Error reading CID file for action %s: %s",
877
+ self.action_record_id,
878
+ str(e),
879
+ )
880
+ time.sleep(retry_delay)
881
+ else:
917
882
  logging.error(
918
- "Docker run failed for action %s: %s",
883
+ "Failed to read CID file for action %s after %d attempts",
919
884
  self.action_record_id,
920
- stderr,
885
+ max_retries,
921
886
  )
922
- raise RuntimeError(f"Docker run failed: {stderr}")
887
+ raise Exception("Failed to start process: CID file not found")
923
888
 
924
- self.container_id = stdout.strip()
925
- logging.info(
926
- "Started container for action %s with ID: %s",
927
- self.action_record_id,
928
- self.container_id,
929
- )
930
-
931
- # Start following container logs in background
932
- self.process = subprocess.Popen(
933
- ["docker", "logs", "-f", self.container_id],
934
- stdout=open(self.log_path, "wb"),
935
- stderr=subprocess.STDOUT,
936
- start_new_session=True,
937
- )
938
-
939
- # Report container id to scaling service
889
+ # report container id to scaling service
940
890
  self.scaling.update_action_container_id(
941
891
  action_record_id=self.action_record_id,
942
892
  container_id=self.container_id,
@@ -1102,8 +1052,7 @@ def data_preparation_execute(
1102
1052
  "Started pulling Docker image with PID: %s",
1103
1053
  process.pid,
1104
1054
  )
1105
- container_name = f"data_prep_{self.action_record_id}"
1106
- cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
1055
+ cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
1107
1056
  logging.info("cmd is: %s", cmd)
1108
1057
  self.start(cmd, "data_preparation_log")
1109
1058
 
@@ -1132,8 +1081,7 @@ def data_processing_execute(self: ActionInstance):
1132
1081
  service="bg-job-scheduler",
1133
1082
  job_params=action["jobParams"],
1134
1083
  )
1135
- container_name = f"data_processing_{self.action_record_id}"
1136
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
1084
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
1137
1085
  logging.info("cmd: %s", cmd)
1138
1086
  self.start(cmd, "data_processing_log")
1139
1087
 
@@ -1146,8 +1094,7 @@ def data_split_execute(self: ActionInstance):
1146
1094
  if not action_details:
1147
1095
  return
1148
1096
  self.setup_action_requirements(action_details, work_fs, model_family="")
1149
- container_name = f"data_split_{self.action_record_id}"
1150
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
1097
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
1151
1098
  logging.info("cmd: %s", cmd)
1152
1099
  self.start(cmd, "data_split")
1153
1100
 
@@ -1162,8 +1109,7 @@ def dataset_annotation_execute(
1162
1109
  if not action_details:
1163
1110
  return
1164
1111
  self.setup_action_requirements(action_details, work_fs)
1165
- container_name = f"dataset_annotation_{self.action_record_id}"
1166
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
1112
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
1167
1113
  logging.info("cmd: %s", cmd)
1168
1114
  self.start(cmd, "dataset_annotation")
1169
1115
 
@@ -1178,8 +1124,7 @@ def dataset_augmentation_execute(
1178
1124
  if not action_details:
1179
1125
  return
1180
1126
  self.setup_action_requirements(action_details, work_fs)
1181
- container_name = f"dataset_augmentation_{self.action_record_id}"
1182
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
1127
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
1183
1128
  logging.info("cmd: %s", cmd)
1184
1129
  self.start(cmd, "dataset_augmentation")
1185
1130
 
@@ -1195,8 +1140,7 @@ def augmentation_server_creation_execute(
1195
1140
  if not action_details:
1196
1141
  return
1197
1142
  self.setup_action_requirements(action_details, work_fs)
1198
- container_name = f"augmentation_setup_{self.action_record_id}"
1199
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
1143
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
1200
1144
  logging.info("cmd: %s", cmd)
1201
1145
  self.start(cmd, "augmentation_setup")
1202
1146
 
@@ -1217,34 +1161,32 @@ def database_setup_execute(self: ActionInstance):
1217
1161
 
1218
1162
  project_id = action_details["_idProject"]
1219
1163
 
1220
- # Define container names with action_record_id for uniqueness
1221
- mongodb_container_name = f"database_setup_{self.action_record_id}"
1222
- qdrant_container_name = f"qdrant_{self.action_record_id}"
1223
-
1224
1164
  if action_details["actionDetails"].get("containerId"):
1225
1165
  logging.info(
1226
- "Using existing container ID for database setup: %s",
1166
+ "Using existing container ID for inference tracker: %s",
1227
1167
  action_details["actionDetails"]["containerId"],
1228
1168
  )
1229
1169
  self.docker_container = action_details["actionDetails"]["containerId"]
1230
1170
  cmd = "docker restart " + self.docker_container
1231
- self.start(cmd, "database_setup")
1171
+ self.start(cmd, "qdrant_setup")
1232
1172
 
1233
- # qdrant restart
1234
- qdrant_cmd = f"docker restart {qdrant_container_name}"
1235
- self.start(qdrant_cmd, "qdrant_setup")
1173
+ #qdrant restart
1174
+ qdrant_cmd = "docker restart qdrant"
1175
+ self.start(qdrant_cmd, 'qdrant_setup')
1236
1176
 
1237
1177
  return
1178
+
1179
+
1180
+ dbPath =action_details["jobParams"].get("dbPath","/host/data/path/mongodb_data")
1238
1181
 
1239
- dbPath = action_details["jobParams"].get("dbPath", "/host/data/path/mongodb_data")
1240
1182
 
1241
1183
  # MongoDB container with --net=host (Port: 27020:27017)
1242
1184
  cmd = (
1243
- f"docker run -d --pull=always --net=host "
1244
- f"--name {mongodb_container_name} "
1245
- f"-v matrice_myvol:/matrice_data "
1185
+ f"docker run --pull=always --net=host "
1246
1186
  f"-v {dbPath}:{dbPath} "
1187
+ f"--name database_setup_{self.action_record_id} "
1247
1188
  f"-v /var/run/docker.sock:/var/run/docker.sock "
1189
+ f"--cidfile ./{self.action_record_id}.cid "
1248
1190
  f"-e ACTION_RECORD_ID={self.action_record_id} "
1249
1191
  f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
1250
1192
  f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
@@ -1254,23 +1196,6 @@ def database_setup_execute(self: ActionInstance):
1254
1196
  )
1255
1197
  logging.info("Starting DB container (Port: 27020:27017): %s", cmd)
1256
1198
 
1257
- # Qdrant container with --net=host (Port: 6334)
1258
- qdrant_cmd = (
1259
- f"docker run -d --pull=always --net=host "
1260
- f"--name {qdrant_container_name} "
1261
- f"-v matrice_myvol:/matrice_data "
1262
- f"qdrant/qdrant:latest "
1263
- )
1264
- logging.info("Starting Qdrant container (Port: 6334): %s", qdrant_cmd)
1265
-
1266
- # Start Qdrant container
1267
- qdrant_process = subprocess.Popen(
1268
- qdrant_cmd,
1269
- shell=True,
1270
- stdout=subprocess.PIPE,
1271
- stderr=subprocess.PIPE,
1272
- )
1273
- logging.info("Qdrant container started successfully")
1274
1199
 
1275
1200
  # Docker Command run
1276
1201
  self.start(cmd, "database_setup")
@@ -1290,8 +1215,6 @@ def facial_recognition_setup_execute(self: ActionInstance):
1290
1215
 
1291
1216
  self.setup_action_requirements(action_details)
1292
1217
 
1293
- container_name = f"facial_recognition_{self.action_record_id}"
1294
-
1295
1218
  if action_details["actionDetails"].get("containerId"):
1296
1219
  logging.info(
1297
1220
  "Using existing container ID for facial recognition worker: %s",
@@ -1305,13 +1228,15 @@ def facial_recognition_setup_execute(self: ActionInstance):
1305
1228
  # Facial recognition worker container with --net=host (Port: 8081)
1306
1229
  worker_cmd = (
1307
1230
  f"docker run -d --pull=always --net=host "
1308
- f"--name {container_name} "
1231
+ f"--name worker "
1232
+ f"--cidfile ./{self.action_record_id}.cid "
1309
1233
  f"-v matrice_myvol:/matrice_data "
1234
+ f"--cidfile ./{self.action_record_id}.cid "
1310
1235
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1311
1236
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1312
1237
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1313
1238
  f'-e ACTION_ID="{self.action_record_id}" '
1314
- f'--restart=unless-stopped '
1239
+ f' --restart=unless-stopped '
1315
1240
  f"{image}"
1316
1241
  )
1317
1242
  logging.info("Starting facial recognition worker (Port: 8081): %s", worker_cmd)
@@ -1333,8 +1258,6 @@ def lpr_setup_execute(self: ActionInstance):
1333
1258
 
1334
1259
  self.setup_action_requirements(action_details)
1335
1260
 
1336
- container_name = f"lpr_{self.action_record_id}"
1337
-
1338
1261
  if action_details["actionDetails"].get("containerId"):
1339
1262
  logging.info(
1340
1263
  "Using existing container ID for LPR worker: %s",
@@ -1348,14 +1271,15 @@ def lpr_setup_execute(self: ActionInstance):
1348
1271
  # LPR worker container with --net=host (Port: 8082)
1349
1272
  worker_cmd = (
1350
1273
  f"docker run -d --net=host --pull=always "
1351
- f"--name {container_name} "
1274
+ f"--name lpr-worker "
1275
+ f"--cidfile ./{self.action_record_id}.cid "
1352
1276
  f"-v matrice_myvol:/matrice_data "
1353
1277
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1354
1278
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1355
1279
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1356
1280
  f'-e ACTION_ID="{self.action_record_id}" '
1357
1281
  f'-e PORT=8082 '
1358
- f'--restart=unless-stopped '
1282
+ f' --restart=unless-stopped '
1359
1283
  f"{image}"
1360
1284
  )
1361
1285
  logging.info("Starting LPR worker (Port: 8082): %s", worker_cmd)
@@ -1386,8 +1310,6 @@ def inference_ws_server_execute(self: ActionInstance):
1386
1310
 
1387
1311
  logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
1388
1312
 
1389
- container_name = f"inference_ws_{self.action_record_id}"
1390
-
1391
1313
  if action_details["actionDetails"].get("containerId"):
1392
1314
  logging.info(
1393
1315
  "Using existing container ID for inference WebSocket server: %s",
@@ -1401,11 +1323,12 @@ def inference_ws_server_execute(self: ActionInstance):
1401
1323
  # Inference WebSocket server with --net=host (Port: 8102)
1402
1324
  worker_cmd = (
1403
1325
  f"docker run -d --pull=always --net=host "
1404
- f"--name {container_name} "
1326
+ f"--name inference "
1327
+ f"--cidfile ./{self.action_record_id}.cid "
1405
1328
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1406
1329
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1407
1330
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1408
- f'--restart=unless-stopped '
1331
+ f' --restart=unless-stopped '
1409
1332
  f"{image} "
1410
1333
  f"./app "
1411
1334
  f"{self.action_record_id} "
@@ -1436,8 +1359,6 @@ def fe_fs_streaming_execute(self: ActionInstance):
1436
1359
 
1437
1360
  logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
1438
1361
 
1439
- container_name = f"fe_streaming_{self.action_record_id}"
1440
-
1441
1362
  if action_details["actionDetails"].get("containerId"):
1442
1363
  logging.info(
1443
1364
  "Using existing container ID for frontend streaming: %s",
@@ -1451,14 +1372,15 @@ def fe_fs_streaming_execute(self: ActionInstance):
1451
1372
  # Frontend streaming with --net=host (Port: 3000)
1452
1373
  worker_cmd = (
1453
1374
  f"docker run -d --pull=always --net=host "
1454
- f"--name {container_name} "
1375
+ f"--name fe_streaming "
1376
+ f"--cidfile ./{self.action_record_id}.cid "
1455
1377
  f"-v matrice_myvol:/matrice_data "
1456
1378
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1457
1379
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1458
1380
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1459
1381
  f"-e PORT=3000 "
1460
1382
  f'-e WS_HOST="{ws_url}" '
1461
- f'--restart=unless-stopped '
1383
+ f' --restart=unless-stopped '
1462
1384
  f"{image}"
1463
1385
  )
1464
1386
  logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)
@@ -1483,8 +1405,6 @@ def fe_analytics_service_execute(self: ActionInstance):
1483
1405
 
1484
1406
  project_id = action_details["_idProject"]
1485
1407
 
1486
- container_name = f"fe_analytics_{self.action_record_id}"
1487
-
1488
1408
  if action_details["actionDetails"].get("containerId"):
1489
1409
  logging.info(
1490
1410
  "Using existing container ID for frontend analytics service: %s",
@@ -1498,14 +1418,15 @@ def fe_analytics_service_execute(self: ActionInstance):
1498
1418
  # Frontend analytics service with --net=host (Port: 3001)
1499
1419
  worker_cmd = (
1500
1420
  f"docker run -d --pull=always --net=host "
1501
- f"--name {container_name} "
1421
+ f"--name fe-analytics "
1422
+ f"--cidfile ./{self.action_record_id}.cid "
1502
1423
  f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
1503
1424
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1504
1425
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1505
1426
  f'-e ACTION_ID="{self.action_record_id}" '
1506
1427
  f"-e PORT=3001 "
1507
1428
  f'-e PROJECT_ID="{project_id}" '
1508
- f'--restart=unless-stopped '
1429
+ f' --restart=unless-stopped '
1509
1430
  f"{image}"
1510
1431
  )
1511
1432
  logging.info("Starting frontend analytics service (Port: 3001): %s", worker_cmd)
@@ -1530,8 +1451,7 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
1530
1451
  else:
1531
1452
  return
1532
1453
  use_gpu = self.get_gpu_config(action_details)
1533
- container_name = f"dataset_generation_{self.action_record_id}"
1534
- cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
1454
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
1535
1455
  logging.info("cmd is: %s", cmd)
1536
1456
  self.start(cmd, "dataset_generation")
1537
1457
 
@@ -1552,8 +1472,7 @@ def synthetic_data_setup_execute(self: ActionInstance):
1552
1472
  else:
1553
1473
  return
1554
1474
  use_gpu = self.get_gpu_config(action_details)
1555
- container_name = f"synthetic_data_setup_{self.action_record_id}"
1556
- cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
1475
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
1557
1476
  logging.info("cmd is: %s", cmd)
1558
1477
  self.start(cmd, "synthetic_data_setup")
1559
1478
 
@@ -1590,8 +1509,6 @@ def redis_setup_execute(self: ActionInstance):
1590
1509
 
1591
1510
  redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
1592
1511
 
1593
- # Define container names with action_record_id for uniqueness
1594
- redis_container_name = f"redis_{self.action_record_id}"
1595
1512
 
1596
1513
  if action_details["actionDetails"].get("containerId"):
1597
1514
  logging.info(
@@ -1603,34 +1520,18 @@ def redis_setup_execute(self: ActionInstance):
1603
1520
  self.start(cmd, "redis_setup")
1604
1521
 
1605
1522
  # Redis container restart
1606
- redis_restart_cmd = f"docker restart {redis_container_name}"
1523
+ redis_restart_cmd = "docker restart redis_container"
1607
1524
  self.start(redis_restart_cmd, "redis")
1608
1525
 
1609
1526
  return
1610
1527
 
1611
- # Redis container with --net=host (Port: 6379) with optimized configuration
1528
+ # Redis container with --net=host (Port: 6379)
1612
1529
  redis_cmd = (
1613
1530
  f"docker run -d --net=host "
1614
- f"--name {redis_container_name} "
1531
+ f"--name redis_container "
1615
1532
  f"--restart unless-stopped "
1616
1533
  f"{redis_image} "
1617
- f"redis-server --bind 0.0.0.0 "
1618
- f"--appendonly no "
1619
- f'--save "" '
1620
- f"--maxmemory 30gb "
1621
- f"--maxmemory-policy allkeys-lru "
1622
- f"--io-threads 4 "
1623
- f"--io-threads-do-reads yes "
1624
- f"--stream-node-max-bytes 8192 "
1625
- f"--stream-node-max-entries 1000 "
1626
- f"--hz 100 "
1627
- f"--tcp-backlog 2048 "
1628
- f"--timeout 0 "
1629
- f"--lazyfree-lazy-eviction yes "
1630
- f"--lazyfree-lazy-expire yes "
1631
- f"--lazyfree-lazy-server-del yes "
1632
- f"--activedefrag yes "
1633
- f"--requirepass {redis_password}"
1534
+ f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
1634
1535
  )
1635
1536
 
1636
1537
  logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)
@@ -1654,9 +1555,8 @@ def redis_setup_execute(self: ActionInstance):
1654
1555
 
1655
1556
  # bg-redis management container with --net=host (Port: 8082)
1656
1557
  cmd = (
1657
- f"docker run -d --net=host "
1658
- f"--restart unless-stopped "
1659
- f"--name bg-redis_{self.action_record_id} "
1558
+ f"docker run --net=host "
1559
+ f"--cidfile ./{self.action_record_id}.cid "
1660
1560
  f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
1661
1561
  f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
1662
1562
  f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
@@ -1683,8 +1583,7 @@ def deploy_aggregator_execute(
1683
1583
  if not action_details:
1684
1584
  return
1685
1585
  self.setup_action_requirements(action_details, work_fs)
1686
- container_name = f"deploy_aggregator_{self.action_record_id}"
1687
- cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1586
+ cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1688
1587
  logging.info("cmd: %s", cmd)
1689
1588
  self.start(cmd, "deploy_aggregator")
1690
1589
 
@@ -1700,10 +1599,6 @@ def model_deploy_execute(self: ActionInstance):
1700
1599
  return
1701
1600
  action_id = action_details["_id"]
1702
1601
  model_family = action_details["actionDetails"]["modelFamily"]
1703
-
1704
- # Get the service ID to track deployments
1705
- service_id = action_details.get("_idService")
1706
-
1707
1602
  self.setup_action_requirements(
1708
1603
  action_details,
1709
1604
  work_fs,
@@ -1711,29 +1606,17 @@ def model_deploy_execute(self: ActionInstance):
1711
1606
  action_id=action_id,
1712
1607
  )
1713
1608
 
1714
- # Use all GPUs if GPU is required
1609
+ # Get GPU configuration based on requirements and availability
1610
+ # This uses the best-fit algorithm to select the most appropriate GPU(s)
1611
+ use_gpu = self.get_gpu_config(action_details)
1612
+
1613
+ # Override: If GPU is required, use all available GPUs
1715
1614
  gpuRequired = action_details["actionDetails"].get("gpuRequired", False)
1716
1615
  if gpuRequired:
1717
1616
  use_gpu = "--runtime=nvidia --gpus all"
1718
- else:
1719
- use_gpu = ""
1720
-
1721
- logging.info(
1722
- "Action %s: Model deployment GPU config: %s",
1723
- action_id,
1724
- use_gpu if use_gpu else "CPU-only"
1725
- )
1726
-
1727
- # Get or create TRITON_PORTS (uses utility method)
1728
- triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
1729
1617
 
1730
- extra_env_vars = {
1731
- "INTERNAL_PORT": internal_port,
1732
- "TRITON_PORTS": triton_ports
1733
- }
1734
-
1735
- container_name = f"model_deploy_{self.action_record_id}"
1736
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"], container_name=container_name)} python3 deploy.py {self.action_record_id} {external_port}"'
1618
+ extra_env_vars = {"INTERNAL_PORT": internal_port}
1619
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
1737
1620
  logging.info("cmd is: %s", cmd)
1738
1621
  self.start(cmd, "deploy_log")
1739
1622
 
@@ -1766,8 +1649,7 @@ def model_train_execute(self: ActionInstance):
1766
1649
  self.start(cmd, "train_log")
1767
1650
  return
1768
1651
 
1769
- container_name = f"model_train_{self.action_record_id}"
1770
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
1652
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
1771
1653
  logging.info("cmd is: %s", cmd)
1772
1654
  self.start(cmd, "train_log")
1773
1655
 
@@ -1790,7 +1672,7 @@ def model_eval_execute(self: ActionInstance):
1790
1672
  )
1791
1673
  if action_details["actionDetails"].get("containerId"):
1792
1674
  logging.info(
1793
- "Using existing container ID for evaluation: %s",
1675
+ "Using existing container ID for training: %s",
1794
1676
  action_details["actionDetails"]["containerId"],
1795
1677
  )
1796
1678
  self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1798,8 +1680,7 @@ def model_eval_execute(self: ActionInstance):
1798
1680
  self.start(cmd, "eval_log")
1799
1681
  return
1800
1682
 
1801
- container_name = f"model_eval_{self.action_record_id}"
1802
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
1683
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
1803
1684
  logging.info("cmd is: %s", cmd)
1804
1685
  self.start(cmd, "eval_log")
1805
1686
 
@@ -1825,7 +1706,7 @@ def model_export_execute(self: ActionInstance):
1825
1706
  )
1826
1707
  if action_details["actionDetails"].get("containerId"):
1827
1708
  logging.info(
1828
- "Using existing container ID for export: %s",
1709
+ "Using existing container ID for training: %s",
1829
1710
  action_details["actionDetails"]["containerId"],
1830
1711
  )
1831
1712
  self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1833,8 +1714,7 @@ def model_export_execute(self: ActionInstance):
1833
1714
  self.start(cmd, "export_log")
1834
1715
  return
1835
1716
 
1836
- container_name = f"model_export_{self.action_record_id}"
1837
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
1717
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
1838
1718
  logging.info("cmd is: %s", cmd)
1839
1719
  self.start(cmd, "export_log")
1840
1720
 
@@ -1850,8 +1730,7 @@ def image_build_execute(self: ActionInstance):
1850
1730
  action_id = action_details["_id"]
1851
1731
  internal_api_key = self.get_internal_api_key(action_id)
1852
1732
  extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
1853
- container_name = f"image_build_{self.action_record_id}"
1854
- cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
1733
+ cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
1855
1734
  logging.info("cmd is: %s", cmd)
1856
1735
  self.start(cmd, "image_build_log")
1857
1736
 
@@ -1863,8 +1742,7 @@ def resource_clone_execute(self: ActionInstance):
1863
1742
  if not action_details:
1864
1743
  return
1865
1744
  self.setup_action_requirements(action_details)
1866
- container_name = f"resource_clone_{self.action_record_id}"
1867
- cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
1745
+ cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
1868
1746
  logging.info("cmd is: %s", cmd)
1869
1747
  self.start(cmd, "resource_clone")
1870
1748
 
@@ -1882,7 +1760,7 @@ def streaming_gateway_execute(self: ActionInstance):
1882
1760
  )
1883
1761
  if action_details["actionDetails"].get("containerId"):
1884
1762
  logging.info(
1885
- "Using existing container ID for streaming gateway: %s",
1763
+ "Using existing container ID for training: %s",
1886
1764
  action_details["actionDetails"]["containerId"],
1887
1765
  )
1888
1766
  self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1890,8 +1768,7 @@ def streaming_gateway_execute(self: ActionInstance):
1890
1768
  self.start(cmd, "streaming_gateway")
1891
1769
  return
1892
1770
 
1893
- container_name = f"streaming_gateway_{self.action_record_id}"
1894
- cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
1771
+ cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
1895
1772
  logging.info("cmd is: %s", cmd)
1896
1773
  self.start(cmd, "streaming_gateway")
1897
1774
 
@@ -1987,7 +1864,7 @@ def kafka_setup_execute(self: ActionInstance):
1987
1864
 
1988
1865
  if action_details["actionDetails"].get("containerId"):
1989
1866
  logging.info(
1990
- "Using existing container ID for kafka: %s",
1867
+ "Using existing container ID for training: %s",
1991
1868
  action_details["actionDetails"]["containerId"],
1992
1869
  )
1993
1870
  self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1995,12 +1872,10 @@ def kafka_setup_execute(self: ActionInstance):
1995
1872
  self.start(cmd, "kafka_setup")
1996
1873
  return
1997
1874
 
1998
- container_name = f"kafka_{self.action_record_id}"
1999
1875
 
2000
1876
  # Kafka container with --net=host (Ports: 9092, 9093)
2001
1877
  cmd = (
2002
- f"docker run -d --net=host "
2003
- f"--name {container_name} "
1878
+ f"docker run --net=host "
2004
1879
  f"{env_args} "
2005
1880
  f"--shm-size=30G --pull=always "
2006
1881
  f'aiforeveryone/matrice-kafka:latest /bin/bash -c "'
@@ -2033,8 +1908,6 @@ def inference_tracker_setup_execute(self: ActionInstance):
2033
1908
 
2034
1909
  self.setup_action_requirements(action_details)
2035
1910
 
2036
- container_name = f"inference_tracker_{self.action_record_id}"
2037
-
2038
1911
  if action_details["actionDetails"].get("containerId"):
2039
1912
  logging.info(
2040
1913
  "Using existing container ID for inference tracker: %s",
@@ -2048,13 +1921,14 @@ def inference_tracker_setup_execute(self: ActionInstance):
2048
1921
  # This is the existing Docker run command
2049
1922
  worker_cmd = (
2050
1923
  f"docker run -d --pull=always --net=host "
2051
- f"--name {container_name} "
1924
+ f"--cidfile ./{self.action_record_id}.cid "
1925
+ f"--name inference-tracker-worker "
2052
1926
  f"-v matrice_myvol:/matrice_data "
2053
1927
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
2054
1928
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
2055
1929
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
2056
1930
  f'-e ACTION_ID="{self.action_record_id}" '
2057
- f'--restart=unless-stopped '
1931
+ f' --restart=unless-stopped '
2058
1932
  f"{image}"
2059
1933
  )
2060
1934
 
@@ -2076,11 +1950,9 @@ def video_storage_setup_execute(self: ActionInstance):
2076
1950
 
2077
1951
  self.setup_action_requirements(action_details)
2078
1952
 
2079
- container_name = f"video_storage_{self.action_record_id}"
2080
-
2081
1953
  if action_details["actionDetails"].get("containerId"):
2082
1954
  logging.info(
2083
- "Using existing container ID for video storage: %s",
1955
+ "Using existing container ID for inference tracker: %s",
2084
1956
  action_details["actionDetails"]["containerId"],
2085
1957
  )
2086
1958
  self.docker_container = action_details["actionDetails"]["containerId"]
@@ -2091,13 +1963,14 @@ def video_storage_setup_execute(self: ActionInstance):
2091
1963
  # This is the existing Docker run command
2092
1964
  worker_cmd = (
2093
1965
  f"docker run -d --pull=always --net=host "
2094
- f"--name {container_name} "
1966
+ f"--cidfile ./{self.action_record_id}.cid "
1967
+ f"--name media_server "
2095
1968
  f"-v matrice_myvol:/matrice_data "
2096
1969
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
2097
1970
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
2098
1971
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
2099
1972
  f'-e ACTION_ID="{self.action_record_id}" '
2100
- f'--restart=unless-stopped '
1973
+ f' --restart=unless-stopped '
2101
1974
  f"{image}"
2102
1975
  )
2103
1976