matrice-compute 0.1.34__py3-none-any.whl → 0.1.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,7 +10,6 @@ import signal
10
10
  import urllib.request
11
11
  from matrice_compute.instance_utils import (
12
12
  get_gpu_with_sufficient_memory_for_action,
13
- get_gpu_config_for_deployment,
14
13
  get_decrypted_access_key_pair,
15
14
  get_max_file_system,
16
15
  get_best_service_ip_and_network,
@@ -27,10 +26,6 @@ from matrice_common.utils import log_errors
27
26
  class ActionInstance:
28
27
  """Base class for tasks that run in Action containers."""
29
28
 
30
- # Class-level dictionary to track deployed services and their ports
31
- # Key: _idService, Value: {"triton_ports": "port1,port2,port3", "is_first": False}
32
- _deployed_services = {}
33
-
34
29
  def __init__(self, scaling: Scaling, action_info: dict):
35
30
  """Initialize an action instance.
36
31
 
@@ -90,67 +85,6 @@ class ActionInstance:
90
85
  raise ValueError(f"Unknown action type: {self.action_type}")
91
86
  self.task = self.actions_map[self.action_type]
92
87
 
93
- @classmethod
94
- def is_first_deployment_for_service(cls, service_id):
95
- """Check if this is the first deployment for a given service.
96
-
97
- Args:
98
- service_id (str): Service ID (_idService)
99
-
100
- Returns:
101
- bool: True if this is the first deployment, False otherwise
102
- """
103
- if not service_id:
104
- return False
105
- return service_id not in cls._deployed_services
106
-
107
- @classmethod
108
- def get_or_create_triton_ports(cls, service_id, scaling_instance):
109
- """Get existing TRITON_PORTS for a service or create new ones.
110
-
111
- Args:
112
- service_id (str): Service ID (_idService)
113
- scaling_instance: Scaling instance to get open ports
114
-
115
- Returns:
116
- str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
117
- """
118
- if not service_id:
119
- # No service_id, generate new ports
120
- port1 = scaling_instance.get_open_port()
121
- port2 = scaling_instance.get_open_port()
122
- port3 = scaling_instance.get_open_port()
123
- return f"{port1},{port2},{port3}"
124
-
125
- # Check if ports already exist for this service
126
- if service_id in cls._deployed_services:
127
- triton_ports = cls._deployed_services[service_id]["triton_ports"]
128
- logging.info(
129
- "Reusing TRITON_PORTS for service %s: %s",
130
- service_id,
131
- triton_ports
132
- )
133
- return triton_ports
134
-
135
- # First deployment: generate new ports and store them
136
- port1 = scaling_instance.get_open_port()
137
- port2 = scaling_instance.get_open_port()
138
- port3 = scaling_instance.get_open_port()
139
- triton_ports = f"{port1},{port2},{port3}"
140
-
141
- # Store for future use
142
- cls._deployed_services[service_id] = {
143
- "triton_ports": triton_ports,
144
- "is_first": False
145
- }
146
-
147
- logging.info(
148
- "First deployment for service %s - generated TRITON_PORTS: %s",
149
- service_id,
150
- triton_ports
151
- )
152
- return triton_ports
153
-
154
88
  @log_errors(default_return={}, raise_exception=True, log_error=False)
155
89
  def _init_credentials(self):
156
90
  """Initialize Matrice credentials.
@@ -297,7 +231,7 @@ class ActionInstance:
297
231
  getattr(self, "action_record_id", "unknown"),
298
232
  )
299
233
  else:
300
- logging.info(
234
+ logging.debug(
301
235
  "No additional logs to send for action %s",
302
236
  getattr(self, "action_record_id", "unknown"),
303
237
  )
@@ -352,13 +286,13 @@ class ActionInstance:
352
286
  ).get("gpuMemory", 0)
353
287
 
354
288
  logging.info(
355
- "Action %s requires GPU with %d MB memory - selecting GPU(s) with most free memory",
289
+ "Action %s requires GPU with %d MB memory - selecting best-fit GPU(s)",
356
290
  action_id,
357
291
  required_memory
358
292
  )
359
293
 
360
294
  try:
361
- # Get the GPU(s) with most free memory that have sufficient memory
295
+ # Get the best-fit GPU(s) with sufficient memory
362
296
  gpu_indices = get_gpu_with_sufficient_memory_for_action(
363
297
  action_details=action_details
364
298
  )
@@ -412,7 +346,6 @@ class ActionInstance:
412
346
  destination_workspace_path: str = "/usr/src/workspace",
413
347
  docker_workdir: str = "",
414
348
  extra_pkgs: list = [],
415
- container_name: str = "",
416
349
  ):
417
350
  """Build base Docker command with common options.
418
351
 
@@ -427,7 +360,6 @@ class ActionInstance:
427
360
  destination_workspace_path (str): Container workspace path
428
361
  docker_workdir (str): Docker working directory
429
362
  extra_pkgs (list): List of extra packages to install
430
- container_name (str): Docker container name (format: {action_type}_{action_id})
431
363
  Returns:
432
364
  str: Base Docker command
433
365
  """
@@ -492,16 +424,17 @@ class ActionInstance:
492
424
  ]
493
425
  )
494
426
 
495
- # Build container name option if provided
496
- name_option = f"--name {container_name}" if container_name else ""
427
+ # if the service provider is local, then put --restart unless-stopped
428
+ if os.environ.get("SERVICE_PROVIDER") in ("local", "LOCAL"):
429
+ env_exports += " && export DOCKER_RESTART_POLICY='--restart unless-stopped' "
497
430
 
498
431
  cmd_parts = [
499
- f"docker run -d {use_gpu} ",
500
- name_option,
432
+ f"docker run {use_gpu} ",
501
433
  network_config,
502
434
  *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
503
435
  *volumes,
504
436
  # Container configuration and startup commands
437
+ f"--cidfile ./{self.action_record_id}.cid ",
505
438
  f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
506
439
  f'/bin/bash -c "cd {docker_workdir} && '
507
440
  f"{env_exports} && "
@@ -889,34 +822,6 @@ class ActionInstance:
889
822
  job_params=action_details["jobParams"],
890
823
  )
891
824
 
892
- @staticmethod
893
- def container_exists(container_id: str) -> bool:
894
- """Check if a Docker container exists.
895
-
896
- Args:
897
- container_id (str): Container ID or name to check
898
-
899
- Returns:
900
- bool: True if container exists, False otherwise
901
- """
902
- if not container_id:
903
- return False
904
- try:
905
- result = subprocess.run(
906
- ["docker", "inspect", container_id],
907
- capture_output=True,
908
- text=True,
909
- timeout=10
910
- )
911
- return result.returncode == 0
912
- except Exception as e:
913
- logging.warning(
914
- "Error checking if container %s exists: %s",
915
- container_id,
916
- str(e)
917
- )
918
- return False
919
-
920
825
  @log_errors(raise_exception=True)
921
826
  def start_process(self, cmd, log_name):
922
827
  """Start the process and initialize logging.
@@ -931,54 +836,60 @@ class ActionInstance:
931
836
  self.cmd = cmd
932
837
  self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
933
838
 
934
- # Run docker with -d flag to get container ID from stdout
935
- process = subprocess.Popen(
936
- shlex.split(self.cmd),
937
- stdout=subprocess.PIPE,
938
- stderr=subprocess.PIPE,
939
- text=True,
940
- env={**os.environ},
941
- )
942
-
943
- # Use a longer timeout for docker run since --pull=always may need to
944
- # download large images on first run. Default: 30 minutes (1800 seconds)
945
- # Can be configured via DOCKER_START_TIMEOUT_SECONDS environment variable
946
- docker_start_timeout = int(os.environ.get("DOCKER_START_TIMEOUT_SECONDS", 1800))
947
- logging.info(
948
- "Waiting for docker container to start for action %s (timeout: %d seconds)",
949
- self.action_record_id,
950
- docker_start_timeout,
951
- )
952
- stdout, stderr = process.communicate(timeout=docker_start_timeout)
839
+ with open(self.log_path, "wb") as out:
840
+ self.process = subprocess.Popen(
841
+ shlex.split(self.cmd),
842
+ stdout=out,
843
+ stderr=out,
844
+ env={**os.environ},
845
+ start_new_session=True,
846
+ )
953
847
 
954
- if process.returncode != 0:
848
+ self.container_id = None
849
+
850
+ cid_file_path = f"./{self.action_record_id}.cid"
851
+ max_retries = 5
852
+ retry_delay = 1 # seconds
853
+ for attempt in range(max_retries):
854
+ try:
855
+ with open(cid_file_path, "r") as cid_file:
856
+ container_id = cid_file.read().strip()
857
+ self.container_id = container_id
858
+ logging.info(
859
+ "Started process for action %s with container ID: %s",
860
+ self.action_record_id,
861
+ self.container_id,
862
+ )
863
+ break
864
+ except FileNotFoundError:
865
+ logging.warning(
866
+ "CID file not found for action %s, attempt %d/%d",
867
+ self.action_record_id,
868
+ attempt + 1,
869
+ max_retries,
870
+ )
871
+ time.sleep(retry_delay)
872
+ except Exception as e:
873
+ logging.error(
874
+ "Error reading CID file for action %s: %s",
875
+ self.action_record_id,
876
+ str(e),
877
+ )
878
+ time.sleep(retry_delay)
879
+ else:
955
880
  logging.error(
956
- "Docker run failed for action %s: %s",
881
+ "Failed to read CID file for action %s after %d attempts",
957
882
  self.action_record_id,
958
- stderr,
883
+ max_retries,
959
884
  )
960
- raise RuntimeError(f"Docker run failed: {stderr}")
961
-
962
- self.container_id = stdout.strip()
963
- logging.info(
964
- "Started container for action %s with ID: %s",
965
- self.action_record_id,
966
- self.container_id,
967
- )
968
-
969
- # Start following container logs in background
970
- self.process = subprocess.Popen(
971
- ["docker", "logs", "-f", self.container_id],
972
- stdout=open(self.log_path, "wb"),
973
- stderr=subprocess.STDOUT,
974
- start_new_session=True,
975
- )
885
+ raise Exception("Failed to start process: CID file not found")
976
886
 
977
- # Report container id to scaling service
887
+ # report container id to scaling service
978
888
  self.scaling.update_action_container_id(
979
889
  action_record_id=self.action_record_id,
980
890
  container_id=self.container_id,
981
891
  )
892
+
982
893
 
983
894
  @log_errors(raise_exception=False)
984
895
  def start_logger(self):
@@ -1139,8 +1050,7 @@ def data_preparation_execute(
1139
1050
  "Started pulling Docker image with PID: %s",
1140
1051
  process.pid,
1141
1052
  )
1142
- container_name = f"data_prep_{self.action_record_id}"
1143
- cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
1053
+ cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
1144
1054
  logging.info("cmd is: %s", cmd)
1145
1055
  self.start(cmd, "data_preparation_log")
1146
1056
 
@@ -1169,8 +1079,7 @@ def data_processing_execute(self: ActionInstance):
1169
1079
  service="bg-job-scheduler",
1170
1080
  job_params=action["jobParams"],
1171
1081
  )
1172
- container_name = f"data_processing_{self.action_record_id}"
1173
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
1082
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
1174
1083
  logging.info("cmd: %s", cmd)
1175
1084
  self.start(cmd, "data_processing_log")
1176
1085
 
@@ -1183,8 +1092,7 @@ def data_split_execute(self: ActionInstance):
1183
1092
  if not action_details:
1184
1093
  return
1185
1094
  self.setup_action_requirements(action_details, work_fs, model_family="")
1186
- container_name = f"data_split_{self.action_record_id}"
1187
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
1095
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
1188
1096
  logging.info("cmd: %s", cmd)
1189
1097
  self.start(cmd, "data_split")
1190
1098
 
@@ -1199,8 +1107,7 @@ def dataset_annotation_execute(
1199
1107
  if not action_details:
1200
1108
  return
1201
1109
  self.setup_action_requirements(action_details, work_fs)
1202
- container_name = f"dataset_annotation_{self.action_record_id}"
1203
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
1110
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
1204
1111
  logging.info("cmd: %s", cmd)
1205
1112
  self.start(cmd, "dataset_annotation")
1206
1113
 
@@ -1215,8 +1122,7 @@ def dataset_augmentation_execute(
1215
1122
  if not action_details:
1216
1123
  return
1217
1124
  self.setup_action_requirements(action_details, work_fs)
1218
- container_name = f"dataset_augmentation_{self.action_record_id}"
1219
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
1125
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
1220
1126
  logging.info("cmd: %s", cmd)
1221
1127
  self.start(cmd, "dataset_augmentation")
1222
1128
 
@@ -1232,8 +1138,7 @@ def augmentation_server_creation_execute(
1232
1138
  if not action_details:
1233
1139
  return
1234
1140
  self.setup_action_requirements(action_details, work_fs)
1235
- container_name = f"augmentation_setup_{self.action_record_id}"
1236
- cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
1141
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
1237
1142
  logging.info("cmd: %s", cmd)
1238
1143
  self.start(cmd, "augmentation_setup")
1239
1144
 
@@ -1254,42 +1159,31 @@ def database_setup_execute(self: ActionInstance):
1254
1159
 
1255
1160
  project_id = action_details["_idProject"]
1256
1161
 
1257
- # Define container names with action_record_id for uniqueness
1258
- mongodb_container_name = f"database_setup_{self.action_record_id}"
1259
- qdrant_container_name = f"qdrant_{self.action_record_id}"
1162
+ if action_details["actionDetails"].get("containerId"):
1163
+ logging.info(
1164
+ "Using existing container ID for inference tracker: %s",
1165
+ action_details["actionDetails"]["containerId"],
1166
+ )
1167
+ self.docker_container = action_details["actionDetails"]["containerId"]
1168
+ cmd = "docker restart " + self.docker_container
1169
+ self.start(cmd, "qdrant_setup")
1260
1170
 
1261
- existing_container_id = action_details["actionDetails"].get("containerId")
1262
- if existing_container_id:
1263
- # Check if both containers actually exist before trying to restart
1264
- mongodb_container_exists = ActionInstance.container_exists(existing_container_id)
1265
- qdrant_container_exists = ActionInstance.container_exists(qdrant_container_name)
1171
+ #qdrant restart
1172
+ qdrant_cmd = "docker restart qdrant"
1173
+ self.start(qdrant_cmd, 'qdrant_setup')
1266
1174
 
1267
- if mongodb_container_exists and qdrant_container_exists:
1268
- logging.info(
1269
- "Using existing container ID for database setup: %s",
1270
- existing_container_id,
1271
- )
1272
- self.docker_container = existing_container_id
1273
- cmd = "docker restart " + self.docker_container
1274
- self.start(cmd, "qdrant_setup")
1175
+ return
1176
+
1177
+
1178
+ dbPath =action_details["jobParams"].get("dbPath","/host/data/path/mongodb_data")
1275
1179
 
1276
- # qdrant restart
1277
- qdrant_cmd = f"docker restart {qdrant_container_name}"
1278
- self.start(qdrant_cmd, "qdrant_setup")
1279
- return
1280
- else:
1281
- logging.warning(
1282
- "Container(s) not found (mongodb=%s, qdrant=%s). Creating new containers.",
1283
- mongodb_container_exists,
1284
- qdrant_container_exists
1285
- )
1286
- # Fall through to create new containers
1287
1180
 
1288
1181
  # MongoDB container with --net=host (Port: 27020:27017)
1289
1182
  cmd = (
1290
1183
  f"docker run --pull=always --net=host "
1291
- f"--name {mongodb_container_name} "
1292
- f"-v matrice_myvol:/matrice_data "
1184
+ f"-v {dbPath}:{dbPath} "
1185
+ f"--name database_setup_{self.action_record_id} "
1186
+ f"-v /var/run/docker.sock:/var/run/docker.sock "
1293
1187
  f"--cidfile ./{self.action_record_id}.cid "
1294
1188
  f"-e ACTION_RECORD_ID={self.action_record_id} "
1295
1189
  f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
@@ -1298,22 +1192,12 @@ def database_setup_execute(self: ActionInstance):
1298
1192
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1299
1193
  f"{image} "
1300
1194
  )
1301
- logging.info("Starting MongoDB container (Port: 27020:27017): %s", cmd)
1195
+ logging.info("Starting DB container (Port: 27020:27017): %s", cmd)
1302
1196
 
1303
- # Qdrant container with --net=host (Port: 6334)
1304
- qdrant_cmd = (
1305
- f"docker run --pull=always --net=host "
1306
- f"--name {qdrant_container_name} "
1307
- f"-v matrice_myvol:/matrice_data "
1308
- f"{'qdrant/qdrant:latest'} "
1309
- )
1310
- logging.info("Starting Qdrant container (Port: 6334): %s", qdrant_cmd)
1311
1197
 
1312
1198
  # Docker Command run
1313
1199
  self.start(cmd, "database_setup")
1314
1200
 
1315
- # Docker for qdrant
1316
- self.start(qdrant_cmd, 'qdrant_setup')
1317
1201
 
1318
1202
  @log_errors(raise_exception=False)
1319
1203
  def facial_recognition_setup_execute(self: ActionInstance):
@@ -1329,36 +1213,28 @@ def facial_recognition_setup_execute(self: ActionInstance):
1329
1213
 
1330
1214
  self.setup_action_requirements(action_details)
1331
1215
 
1332
- existing_container_id = action_details["actionDetails"].get("containerId")
1333
- if existing_container_id:
1334
- # Check if container actually exists before trying to restart
1335
- if ActionInstance.container_exists(existing_container_id):
1336
- logging.info(
1337
- "Using existing container ID for facial recognition worker: %s",
1338
- existing_container_id,
1339
- )
1340
- self.docker_container = existing_container_id
1341
- cmd = "docker restart " + self.docker_container
1342
- self.start(cmd, "facial_recognition_setup")
1343
- return
1344
- else:
1345
- logging.warning(
1346
- "Container %s not found. Creating new container.",
1347
- existing_container_id
1348
- )
1349
- # Fall through to create new container
1216
+ if action_details["actionDetails"].get("containerId"):
1217
+ logging.info(
1218
+ "Using existing container ID for facial recognition worker: %s",
1219
+ action_details["actionDetails"]["containerId"],
1220
+ )
1221
+ self.docker_container = action_details["actionDetails"]["containerId"]
1222
+ cmd = "docker restart " + self.docker_container
1223
+ self.start(cmd, "facial_recognition_setup")
1224
+ return
1350
1225
 
1351
1226
  # Facial recognition worker container with --net=host (Port: 8081)
1352
- container_name = f"facial_recognition_{self.action_record_id}"
1353
1227
  worker_cmd = (
1354
1228
  f"docker run -d --pull=always --net=host "
1355
- f"--name {container_name} "
1356
- f"--cidfile ./{self.action_record_id}.cid "
1229
+ f"--name worker "
1230
+ f"--cidfile ./{self.action_record_id}.cid "
1357
1231
  f"-v matrice_myvol:/matrice_data "
1232
+ f"--cidfile ./{self.action_record_id}.cid "
1358
1233
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1359
1234
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1360
1235
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1361
1236
  f'-e ACTION_ID="{self.action_record_id}" '
1237
+ f' --restart=unless-stopped '
1362
1238
  f"{image}"
1363
1239
  )
1364
1240
  logging.info("Starting facial recognition worker (Port: 8081): %s", worker_cmd)
@@ -1380,30 +1256,20 @@ def lpr_setup_execute(self: ActionInstance):
1380
1256
 
1381
1257
  self.setup_action_requirements(action_details)
1382
1258
 
1383
- existing_container_id = action_details["actionDetails"].get("containerId")
1384
- if existing_container_id:
1385
- # Check if container actually exists before trying to restart
1386
- if ActionInstance.container_exists(existing_container_id):
1387
- logging.info(
1388
- "Using existing container ID for LPR worker: %s",
1389
- existing_container_id,
1390
- )
1391
- self.docker_container = existing_container_id
1392
- cmd = "docker restart " + self.docker_container
1393
- self.start(cmd, "lpr_setup")
1394
- return
1395
- else:
1396
- logging.warning(
1397
- "Container %s not found. Creating new container.",
1398
- existing_container_id
1399
- )
1400
- # Fall through to create new container
1259
+ if action_details["actionDetails"].get("containerId"):
1260
+ logging.info(
1261
+ "Using existing container ID for LPR worker: %s",
1262
+ action_details["actionDetails"]["containerId"],
1263
+ )
1264
+ self.docker_container = action_details["actionDetails"]["containerId"]
1265
+ cmd = "docker restart " + self.docker_container
1266
+ self.start(cmd, "lpr_setup")
1267
+ return
1401
1268
 
1402
1269
  # LPR worker container with --net=host (Port: 8082)
1403
- container_name = f"lpr_{self.action_record_id}"
1404
1270
  worker_cmd = (
1405
1271
  f"docker run -d --net=host --pull=always "
1406
- f"--name {container_name} "
1272
+ f"--name lpr-worker "
1407
1273
  f"--cidfile ./{self.action_record_id}.cid "
1408
1274
  f"-v matrice_myvol:/matrice_data "
1409
1275
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
@@ -1411,6 +1277,7 @@ def lpr_setup_execute(self: ActionInstance):
1411
1277
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1412
1278
  f'-e ACTION_ID="{self.action_record_id}" '
1413
1279
  f'-e PORT=8082 '
1280
+ f' --restart=unless-stopped '
1414
1281
  f"{image}"
1415
1282
  )
1416
1283
  logging.info("Starting LPR worker (Port: 8082): %s", worker_cmd)
@@ -1441,34 +1308,25 @@ def inference_ws_server_execute(self: ActionInstance):
1441
1308
 
1442
1309
  logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
1443
1310
 
1444
- existing_container_id = action_details["actionDetails"].get("containerId")
1445
- if existing_container_id:
1446
- # Check if container actually exists before trying to restart
1447
- if ActionInstance.container_exists(existing_container_id):
1448
- logging.info(
1449
- "Using existing container ID for inference WebSocket server: %s",
1450
- existing_container_id,
1451
- )
1452
- self.docker_container = existing_container_id
1453
- cmd = "docker restart " + self.docker_container
1454
- self.start(cmd, "inference_ws_server")
1455
- return
1456
- else:
1457
- logging.warning(
1458
- "Container %s not found. Creating new container.",
1459
- existing_container_id
1460
- )
1461
- # Fall through to create new container
1311
+ if action_details["actionDetails"].get("containerId"):
1312
+ logging.info(
1313
+ "Using existing container ID for inference WebSocket server: %s",
1314
+ action_details["actionDetails"]["containerId"],
1315
+ )
1316
+ self.docker_container = action_details["actionDetails"]["containerId"]
1317
+ cmd = "docker restart " + self.docker_container
1318
+ self.start(cmd, "inference_ws_server")
1319
+ return
1462
1320
 
1463
1321
  # Inference WebSocket server with --net=host (Port: 8102)
1464
- container_name = f"inference_ws_{self.action_record_id}"
1465
1322
  worker_cmd = (
1466
1323
  f"docker run -d --pull=always --net=host "
1467
- f"--name {container_name} "
1324
+ f"--name inference "
1468
1325
  f"--cidfile ./{self.action_record_id}.cid "
1469
1326
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
1470
1327
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1471
1328
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1329
+ f' --restart=unless-stopped '
1472
1330
  f"{image} "
1473
1331
  f"./app "
1474
1332
  f"{self.action_record_id} "
@@ -1499,30 +1357,20 @@ def fe_fs_streaming_execute(self: ActionInstance):
1499
1357
 
1500
1358
  logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
1501
1359
 
1502
- existing_container_id = action_details["actionDetails"].get("containerId")
1503
- if existing_container_id:
1504
- # Check if container actually exists before trying to restart
1505
- if ActionInstance.container_exists(existing_container_id):
1506
- logging.info(
1507
- "Using existing container ID for frontend streaming: %s",
1508
- existing_container_id,
1509
- )
1510
- self.docker_container = existing_container_id
1511
- cmd = "docker restart " + self.docker_container
1512
- self.start(cmd, "fe_fs_streaming")
1513
- return
1514
- else:
1515
- logging.warning(
1516
- "Container %s not found. Creating new container.",
1517
- existing_container_id
1518
- )
1519
- # Fall through to create new container
1520
-
1360
+ if action_details["actionDetails"].get("containerId"):
1361
+ logging.info(
1362
+ "Using existing container ID for frontend streaming: %s",
1363
+ action_details["actionDetails"]["containerId"],
1364
+ )
1365
+ self.docker_container = action_details["actionDetails"]["containerId"]
1366
+ cmd = "docker restart " + self.docker_container
1367
+ self.start(cmd, "fe_fs_streaming")
1368
+ return
1369
+
1521
1370
  # Frontend streaming with --net=host (Port: 3000)
1522
- container_name = f"fe_streaming_{self.action_record_id}"
1523
1371
  worker_cmd = (
1524
1372
  f"docker run -d --pull=always --net=host "
1525
- f"--name {container_name} "
1373
+ f"--name fe_streaming "
1526
1374
  f"--cidfile ./{self.action_record_id}.cid "
1527
1375
  f"-v matrice_myvol:/matrice_data "
1528
1376
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
@@ -1530,6 +1378,7 @@ def fe_fs_streaming_execute(self: ActionInstance):
1530
1378
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1531
1379
  f"-e PORT=3000 "
1532
1380
  f'-e WS_HOST="{ws_url}" '
1381
+ f' --restart=unless-stopped '
1533
1382
  f"{image}"
1534
1383
  )
1535
1384
  logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)
@@ -1554,30 +1403,20 @@ def fe_analytics_service_execute(self: ActionInstance):
1554
1403
 
1555
1404
  project_id = action_details["_idProject"]
1556
1405
 
1557
- existing_container_id = action_details["actionDetails"].get("containerId")
1558
- if existing_container_id:
1559
- # Check if container actually exists before trying to restart
1560
- if ActionInstance.container_exists(existing_container_id):
1561
- logging.info(
1562
- "Using existing container ID for frontend analytics service: %s",
1563
- existing_container_id,
1564
- )
1565
- self.docker_container = existing_container_id
1566
- cmd = "docker restart " + self.docker_container
1567
- self.start(cmd, "fe_analytics_service")
1568
- return
1569
- else:
1570
- logging.warning(
1571
- "Container %s not found. Creating new container.",
1572
- existing_container_id
1573
- )
1574
- # Fall through to create new container
1575
-
1406
+ if action_details["actionDetails"].get("containerId"):
1407
+ logging.info(
1408
+ "Using existing container ID for frontend analytics service: %s",
1409
+ action_details["actionDetails"]["containerId"],
1410
+ )
1411
+ self.docker_container = action_details["actionDetails"]["containerId"]
1412
+ cmd = "docker restart " + self.docker_container
1413
+ self.start(cmd, "fe_analytics_service")
1414
+ return
1415
+
1576
1416
  # Frontend analytics service with --net=host (Port: 3001)
1577
- container_name = f"fe_analytics_{self.action_record_id}"
1578
1417
  worker_cmd = (
1579
1418
  f"docker run -d --pull=always --net=host "
1580
- f"--name {container_name} "
1419
+ f"--name fe-analytics "
1581
1420
  f"--cidfile ./{self.action_record_id}.cid "
1582
1421
  f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
1583
1422
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1585,6 +1424,7 @@ def fe_analytics_service_execute(self: ActionInstance):
1585
1424
  f'-e ACTION_ID="{self.action_record_id}" '
1586
1425
  f"-e PORT=3001 "
1587
1426
  f'-e PROJECT_ID="{project_id}" '
1427
+ f' --restart=unless-stopped '
1588
1428
  f"{image}"
1589
1429
  )
1590
1430
  logging.info("Starting frontend analytics service (Port: 3001): %s", worker_cmd)
@@ -1609,8 +1449,7 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
1609
1449
  else:
1610
1450
  return
1611
1451
  use_gpu = self.get_gpu_config(action_details)
1612
- container_name = f"dataset_generation_{self.action_record_id}"
1613
- cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
1452
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
1614
1453
  logging.info("cmd is: %s", cmd)
1615
1454
  self.start(cmd, "dataset_generation")
1616
1455
 
@@ -1631,8 +1470,7 @@ def synthetic_data_setup_execute(self: ActionInstance):
1631
1470
  else:
1632
1471
  return
1633
1472
  use_gpu = self.get_gpu_config(action_details)
1634
- container_name = f"synthetic_data_setup_{self.action_record_id}"
1635
- cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
1473
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
1636
1474
  logging.info("cmd is: %s", cmd)
1637
1475
  self.start(cmd, "synthetic_data_setup")
1638
1476
 
@@ -1669,60 +1507,31 @@ def redis_setup_execute(self: ActionInstance):
1669
1507
 
1670
1508
  redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
1671
1509
 
1672
- # Define container names with action_record_id for uniqueness
1673
- redis_container_name = f"redis_{self.action_record_id}"
1674
-
1675
- existing_container_id = action_details["actionDetails"].get("containerId")
1676
- if existing_container_id:
1677
- # Check if both containers actually exist before trying to restart
1678
- management_container_exists = ActionInstance.container_exists(existing_container_id)
1679
- redis_container_exists = ActionInstance.container_exists(redis_container_name)
1680
1510
 
1681
- if management_container_exists and redis_container_exists:
1682
- logging.info(
1683
- "Using existing container ID for redis management: %s",
1684
- existing_container_id,
1685
- )
1686
- self.docker_container = existing_container_id
1687
- cmd = "docker restart " + self.docker_container
1688
- self.start(cmd, "redis_setup")
1511
+ if action_details["actionDetails"].get("containerId"):
1512
+ logging.info(
1513
+ "Using existing container ID for redis management: %s",
1514
+ action_details["actionDetails"]["containerId"],
1515
+ )
1516
+ self.docker_container = action_details["actionDetails"]["containerId"]
1517
+ cmd = "docker restart " + self.docker_container
1518
+ self.start(cmd, "redis_setup")
1689
1519
 
1690
- # Redis container restart
1691
- redis_restart_cmd = f"docker restart {redis_container_name}"
1692
- self.start(redis_restart_cmd, "redis")
1693
- return
1694
- else:
1695
- logging.warning(
1696
- "Container(s) not found (management=%s, redis=%s). Creating new containers.",
1697
- management_container_exists,
1698
- redis_container_exists
1699
- )
1700
- # Fall through to create new containers
1520
+ # Redis container restart
1521
+ redis_restart_cmd = "docker restart redis_container"
1522
+ self.start(redis_restart_cmd, "redis")
1701
1523
 
1524
+ return
1525
+
1702
1526
  # Redis container with --net=host (Port: 6379)
1703
1527
  redis_cmd = (
1704
1528
  f"docker run -d --net=host "
1705
- f"--name {redis_container_name} "
1529
+ f"--name redis_container "
1706
1530
  f"--restart unless-stopped "
1707
1531
  f"{redis_image} "
1708
- f"redis-server --bind 0.0.0.0 "
1709
- f"--appendonly no "
1710
- f'--save "" '
1711
- f"--maxmemory 30gb "
1712
- f"--maxmemory-policy allkeys-lru "
1713
- f"--io-threads 4 "
1714
- f"--io-threads-do-reads yes "
1715
- f"--stream-node-max-bytes 8192 "
1716
- f"--stream-node-max-entries 1000 "
1717
- f"--hz 100 "
1718
- f"--tcp-backlog 2048 "
1719
- f"--timeout 0 "
1720
- f"--lazyfree-lazy-eviction yes "
1721
- f"--lazyfree-lazy-expire yes "
1722
- f"--lazyfree-lazy-server-del yes "
1723
- f"--activedefrag yes "
1724
- f"--requirepass {redis_password}"
1532
+ f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
1725
1533
  )
1534
+
1726
1535
  logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)
1727
1536
 
1728
1537
  # Start Redis container first
@@ -1772,8 +1581,7 @@ def deploy_aggregator_execute(
1772
1581
  if not action_details:
1773
1582
  return
1774
1583
  self.setup_action_requirements(action_details, work_fs)
1775
- container_name = f"deploy_aggregator_{self.action_record_id}"
1776
- cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1584
+ cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1777
1585
  logging.info("cmd: %s", cmd)
1778
1586
  self.start(cmd, "deploy_aggregator")
1779
1587
 
@@ -1789,10 +1597,6 @@ def model_deploy_execute(self: ActionInstance):
1789
1597
  return
1790
1598
  action_id = action_details["_id"]
1791
1599
  model_family = action_details["actionDetails"]["modelFamily"]
1792
-
1793
- # Get the service ID to track deployments
1794
- service_id = action_details.get("_idService")
1795
-
1796
1600
  self.setup_action_requirements(
1797
1601
  action_details,
1798
1602
  work_fs,
@@ -1800,29 +1604,17 @@ def model_deploy_execute(self: ActionInstance):
1800
1604
  action_id=action_id,
1801
1605
  )
1802
1606
 
1803
- # Check if this is the first deployment for this service
1804
- is_first_deployment = ActionInstance.is_first_deployment_for_service(service_id)
1805
-
1806
- # Get GPU configuration (uses utility function with fail-safe fallback)
1807
- use_gpu = get_gpu_config_for_deployment(action_details, is_first_deployment)
1808
-
1809
- logging.info(
1810
- "Action %s: Model deployment GPU config: %s (first_deployment=%s)",
1811
- action_id,
1812
- use_gpu if use_gpu else "CPU-only",
1813
- is_first_deployment
1814
- )
1815
-
1816
- # Get or create TRITON_PORTS (uses utility method)
1817
- triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
1607
+ # Get GPU configuration based on requirements and availability
1608
+ # This uses the best-fit algorithm to select the most appropriate GPU(s)
1609
+ use_gpu = self.get_gpu_config(action_details)
1818
1610
 
1819
- extra_env_vars = {
1820
- "INTERNAL_PORT": internal_port,
1821
- "TRITON_PORTS": triton_ports
1822
- }
1611
+ # Override: If GPU is required, use all available GPUs
1612
+ gpuRequired = action_details["actionDetails"].get("gpuRequired", False)
1613
+ if gpuRequired:
1614
+ use_gpu = "--runtime=nvidia --gpus all"
1823
1615
 
1824
- container_name = f"model_deploy_{self.action_record_id}"
1825
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"], container_name=container_name)} python3 deploy.py {self.action_record_id} {external_port}"'
1616
+ extra_env_vars = {"INTERNAL_PORT": internal_port}
1617
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
1826
1618
  logging.info("cmd is: %s", cmd)
1827
1619
  self.start(cmd, "deploy_log")
1828
1620
 
@@ -1845,27 +1637,17 @@ def model_train_execute(self: ActionInstance):
1845
1637
  action_id=action_id,
1846
1638
  )
1847
1639
 
1848
- existing_container_id = action_details["actionDetails"].get("containerId")
1849
- if existing_container_id:
1850
- # Check if container actually exists before trying to restart
1851
- if ActionInstance.container_exists(existing_container_id):
1852
- logging.info(
1853
- "Using existing container ID for training: %s",
1854
- existing_container_id,
1855
- )
1856
- self.docker_container = existing_container_id
1857
- cmd = "docker restart " + self.docker_container
1858
- self.start(cmd, "train_log")
1859
- return
1860
- else:
1861
- logging.warning(
1862
- "Container %s not found. Creating new container.",
1863
- existing_container_id
1864
- )
1865
- # Fall through to create new container
1866
-
1867
- container_name = f"model_train_{self.action_record_id}"
1868
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
1640
+ if action_details["actionDetails"].get("containerId"):
1641
+ logging.info(
1642
+ "Using existing container ID for training: %s",
1643
+ action_details["actionDetails"]["containerId"],
1644
+ )
1645
+ self.docker_container = action_details["actionDetails"]["containerId"]
1646
+ cmd = "docker restart " + self.docker_container
1647
+ self.start(cmd, "train_log")
1648
+ return
1649
+
1650
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
1869
1651
  logging.info("cmd is: %s", cmd)
1870
1652
  self.start(cmd, "train_log")
1871
1653
 
@@ -1886,27 +1668,17 @@ def model_eval_execute(self: ActionInstance):
1886
1668
  model_family=model_family,
1887
1669
  action_id=action_id,
1888
1670
  )
1889
- existing_container_id = action_details["actionDetails"].get("containerId")
1890
- if existing_container_id:
1891
- # Check if container actually exists before trying to restart
1892
- if ActionInstance.container_exists(existing_container_id):
1893
- logging.info(
1894
- "Using existing container ID for evaluation: %s",
1895
- existing_container_id,
1896
- )
1897
- self.docker_container = existing_container_id
1898
- cmd = "docker restart " + self.docker_container
1899
- self.start(cmd, "eval_log")
1900
- return
1901
- else:
1902
- logging.warning(
1903
- "Container %s not found. Creating new container.",
1904
- existing_container_id
1905
- )
1906
- # Fall through to create new container
1907
-
1908
- container_name = f"model_eval_{self.action_record_id}"
1909
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
1671
+ if action_details["actionDetails"].get("containerId"):
1672
+ logging.info(
1673
+ "Using existing container ID for training: %s",
1674
+ action_details["actionDetails"]["containerId"],
1675
+ )
1676
+ self.docker_container = action_details["actionDetails"]["containerId"]
1677
+ cmd = "docker restart " + self.docker_container
1678
+ self.start(cmd, "eval_log")
1679
+ return
1680
+
1681
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
1910
1682
  logging.info("cmd is: %s", cmd)
1911
1683
  self.start(cmd, "eval_log")
1912
1684
 
@@ -1930,27 +1702,17 @@ def model_export_execute(self: ActionInstance):
1930
1702
  model_family=model_family,
1931
1703
  action_id=action_id,
1932
1704
  )
1933
- existing_container_id = action_details["actionDetails"].get("containerId")
1934
- if existing_container_id:
1935
- # Check if container actually exists before trying to restart
1936
- if ActionInstance.container_exists(existing_container_id):
1937
- logging.info(
1938
- "Using existing container ID for export: %s",
1939
- existing_container_id,
1940
- )
1941
- self.docker_container = existing_container_id
1942
- cmd = "docker restart " + self.docker_container
1943
- self.start(cmd, "export_log")
1944
- return
1945
- else:
1946
- logging.warning(
1947
- "Container %s not found. Creating new container.",
1948
- existing_container_id
1949
- )
1950
- # Fall through to create new container
1951
-
1952
- container_name = f"model_export_{self.action_record_id}"
1953
- cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
1705
+ if action_details["actionDetails"].get("containerId"):
1706
+ logging.info(
1707
+ "Using existing container ID for training: %s",
1708
+ action_details["actionDetails"]["containerId"],
1709
+ )
1710
+ self.docker_container = action_details["actionDetails"]["containerId"]
1711
+ cmd = "docker restart " + self.docker_container
1712
+ self.start(cmd, "export_log")
1713
+ return
1714
+
1715
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
1954
1716
  logging.info("cmd is: %s", cmd)
1955
1717
  self.start(cmd, "export_log")
1956
1718
 
@@ -1966,8 +1728,7 @@ def image_build_execute(self: ActionInstance):
1966
1728
  action_id = action_details["_id"]
1967
1729
  internal_api_key = self.get_internal_api_key(action_id)
1968
1730
  extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
1969
- container_name = f"image_build_{self.action_record_id}"
1970
- cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
1731
+ cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
1971
1732
  logging.info("cmd is: %s", cmd)
1972
1733
  self.start(cmd, "image_build_log")
1973
1734
 
@@ -1979,8 +1740,7 @@ def resource_clone_execute(self: ActionInstance):
1979
1740
  if not action_details:
1980
1741
  return
1981
1742
  self.setup_action_requirements(action_details)
1982
- container_name = f"resource_clone_{self.action_record_id}"
1983
- cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
1743
+ cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
1984
1744
  logging.info("cmd is: %s", cmd)
1985
1745
  self.start(cmd, "resource_clone")
1986
1746
 
@@ -1996,27 +1756,17 @@ def streaming_gateway_execute(self: ActionInstance):
1996
1756
  self.docker_container = (
1997
1757
  f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
1998
1758
  )
1999
- existing_container_id = action_details["actionDetails"].get("containerId")
2000
- if existing_container_id:
2001
- # Check if container actually exists before trying to restart
2002
- if ActionInstance.container_exists(existing_container_id):
2003
- logging.info(
2004
- "Using existing container ID for streaming gateway: %s",
2005
- existing_container_id,
2006
- )
2007
- self.docker_container = existing_container_id
2008
- cmd = "docker restart " + self.docker_container
2009
- self.start(cmd, "streaming_gateway")
2010
- return
2011
- else:
2012
- logging.warning(
2013
- "Container %s not found. Creating new container.",
2014
- existing_container_id
2015
- )
2016
- # Fall through to create new container
2017
-
2018
- container_name = f"streaming_gateway_{self.action_record_id}"
2019
- cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
1759
+ if action_details["actionDetails"].get("containerId"):
1760
+ logging.info(
1761
+ "Using existing container ID for training: %s",
1762
+ action_details["actionDetails"]["containerId"],
1763
+ )
1764
+ self.docker_container = action_details["actionDetails"]["containerId"]
1765
+ cmd = "docker restart " + self.docker_container
1766
+ self.start(cmd, "streaming_gateway")
1767
+ return
1768
+
1769
+ cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
2020
1770
  logging.info("cmd is: %s", cmd)
2021
1771
  self.start(cmd, "streaming_gateway")
2022
1772
 
@@ -2110,24 +1860,16 @@ def kafka_setup_execute(self: ActionInstance):
2110
1860
  else:
2111
1861
  pkgs = f"matrice_common matrice"
2112
1862
 
2113
- existing_container_id = action_details["actionDetails"].get("containerId")
2114
- if existing_container_id:
2115
- # Check if container actually exists before trying to restart
2116
- if ActionInstance.container_exists(existing_container_id):
2117
- logging.info(
2118
- "Using existing container ID for kafka: %s",
2119
- existing_container_id,
2120
- )
2121
- self.docker_container = existing_container_id
2122
- cmd = "docker restart " + self.docker_container
2123
- self.start(cmd, "kafka_setup")
2124
- return
2125
- else:
2126
- logging.warning(
2127
- "Container %s not found. Creating new container.",
2128
- existing_container_id
2129
- )
2130
- # Fall through to create new container
1863
+ if action_details["actionDetails"].get("containerId"):
1864
+ logging.info(
1865
+ "Using existing container ID for training: %s",
1866
+ action_details["actionDetails"]["containerId"],
1867
+ )
1868
+ self.docker_container = action_details["actionDetails"]["containerId"]
1869
+ cmd = "docker restart " + self.docker_container
1870
+ self.start(cmd, "kafka_setup")
1871
+ return
1872
+
2131
1873
 
2132
1874
  # Kafka container with --net=host (Ports: 9092, 9093)
2133
1875
  cmd = (
@@ -2164,36 +1906,27 @@ def inference_tracker_setup_execute(self: ActionInstance):
2164
1906
 
2165
1907
  self.setup_action_requirements(action_details)
2166
1908
 
2167
- existing_container_id = action_details["actionDetails"].get("containerId")
2168
- if existing_container_id:
2169
- # Check if container actually exists before trying to restart
2170
- if ActionInstance.container_exists(existing_container_id):
2171
- logging.info(
2172
- "Using existing container ID for inference tracker: %s",
2173
- existing_container_id,
2174
- )
2175
- self.docker_container = existing_container_id
2176
- cmd = "docker restart " + self.docker_container
2177
- self.start(cmd, "inference_tracker_setup")
2178
- return
2179
- else:
2180
- logging.warning(
2181
- "Container %s not found. Creating new container.",
2182
- existing_container_id
2183
- )
2184
- # Fall through to create new container
2185
-
1909
+ if action_details["actionDetails"].get("containerId"):
1910
+ logging.info(
1911
+ "Using existing container ID for inference tracker: %s",
1912
+ action_details["actionDetails"]["containerId"],
1913
+ )
1914
+ self.docker_container = action_details["actionDetails"]["containerId"]
1915
+ cmd = "docker restart " + self.docker_container
1916
+ self.start(cmd, "inference_tracker_setup")
1917
+ return
1918
+
2186
1919
  # This is the existing Docker run command
2187
- container_name = f"inference_tracker_{self.action_record_id}"
2188
1920
  worker_cmd = (
2189
1921
  f"docker run -d --pull=always --net=host "
2190
- f"--cidfile ./{self.action_record_id}.cid "
2191
- f"--name {container_name} "
1922
+ f"--cidfile ./{self.action_record_id}.cid "
1923
+ f"--name inference-tracker-worker "
2192
1924
  f"-v matrice_myvol:/matrice_data "
2193
1925
  f'-e ENV="{os.environ.get("ENV", "prod")}" '
2194
1926
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
2195
1927
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
2196
1928
  f'-e ACTION_ID="{self.action_record_id}" '
1929
+ f' --restart=unless-stopped '
2197
1930
  f"{image}"
2198
1931
  )
2199
1932
 
@@ -2235,6 +1968,7 @@ def video_storage_setup_execute(self: ActionInstance):
2235
1968
  f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
2236
1969
  f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
2237
1970
  f'-e ACTION_ID="{self.action_record_id}" '
1971
+ f' --restart=unless-stopped '
2238
1972
  f"{image}"
2239
1973
  )
2240
1974
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: matrice_compute
3
- Version: 0.1.34
3
+ Version: 0.1.36
4
4
  Summary: Common server utilities for Matrice.ai services
5
5
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
6
6
  License-Expression: MIT
@@ -1,5 +1,5 @@
1
1
  matrice_compute/__init__.py,sha256=YZhx7rQlD1TAlhBMbsU3_Xp-tpLyTAxWZDcQvqmwR2g,723
2
- matrice_compute/action_instance.py,sha256=GF49-yYJp_5EHZ6ZT5kY4U-y1zyPkFjjDt1xMb2BaIg,87439
2
+ matrice_compute/action_instance.py,sha256=03TX2dF2i2DUtMJvFJFckzvIEPsuyaJuNk9mkHWjsLM,75901
3
3
  matrice_compute/actions_manager.py,sha256=a_TulMnu462xc0t_A-Mpug5zhQTmtpjiv7mhiC_IAVw,18280
4
4
  matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
5
5
  matrice_compute/compute_operations_handler.py,sha256=amcMhmXtv2irE6qK8Vbgec_8uFqjWmVVp0VWq-73_MU,17781
@@ -11,8 +11,8 @@ matrice_compute/resources_tracker.py,sha256=DffKitGU1gran0OAuKIsfH0XeOe03xU7NGl-
11
11
  matrice_compute/scaling.py,sha256=UQDI8wN9JEKafvUVPF0Pk9XmhKlbMkeu16AZyyOuSE8,55147
12
12
  matrice_compute/shutdown_manager.py,sha256=rnP9Qes6JJKDnebmBC9rqkH__X9a8TMjhWQPWoOQKFs,13232
13
13
  matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
14
- matrice_compute-0.1.34.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
15
- matrice_compute-0.1.34.dist-info/METADATA,sha256=K4c_uaSlUeEbbC7yWB9RzW_qvLoxfgwGOk94BbbtaQs,1038
16
- matrice_compute-0.1.34.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
17
- matrice_compute-0.1.34.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
18
- matrice_compute-0.1.34.dist-info/RECORD,,
14
+ matrice_compute-0.1.36.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
15
+ matrice_compute-0.1.36.dist-info/METADATA,sha256=S3V1TndESfRIbXxF4M6CMxoqNtVZ3uvnoa7WkzcSNxI,1038
16
+ matrice_compute-0.1.36.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
17
+ matrice_compute-0.1.36.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
18
+ matrice_compute-0.1.36.dist-info/RECORD,,