matrice-compute 0.1.38__tar.gz → 0.1.40__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/PKG-INFO +1 -1
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/matrice_compute.egg-info/PKG-INFO +1 -1
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/action_instance.py +114 -241
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/LICENSE.txt +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/README.md +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/matrice_compute.egg-info/SOURCES.txt +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/matrice_compute.egg-info/dependency_links.txt +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/matrice_compute.egg-info/not-zip-safe +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/matrice_compute.egg-info/top_level.txt +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/pyproject.toml +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/setup.cfg +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/setup.py +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/__init__.py +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/actions_manager.py +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/actions_scaledown_manager.py +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/compute_operations_handler.py +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/instance_manager.py +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/instance_utils.py +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/prechecks.py +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/py.typed +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/resources_tracker.py +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/scaling.py +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/shutdown_manager.py +0 -0
- {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/task_utils.py +0 -0

All substantive hunks below are from {matrice_compute-0.1.38 → matrice_compute-0.1.40}/src/matrice_compute/action_instance.py (+114 -241); the remaining files are unchanged or carry only the version metadata. Reconstructed hunks:

@@ -26,10 +26,6 @@ from matrice_common.utils import log_errors
 class ActionInstance:
     """Base class for tasks that run in Action containers."""
 
-    # Class-level dictionary to track deployed services and their ports
-    # Key: _idService, Value: {"triton_ports": "port1,port2,port3"}
-    _deployed_services = {}
-
     def __init__(self, scaling: Scaling, action_info: dict):
         """Initialize an action instance.
 
@@ -89,52 +85,6 @@ class ActionInstance:
             raise ValueError(f"Unknown action type: {self.action_type}")
         self.task = self.actions_map[self.action_type]
 
-    @classmethod
-    def get_or_create_triton_ports(cls, service_id, scaling_instance):
-        """Get existing TRITON_PORTS for a service or create new ones.
-
-        Args:
-            service_id (str): Service ID (_idService)
-            scaling_instance: Scaling instance to get open ports
-
-        Returns:
-            str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
-        """
-        if not service_id:
-            # No service_id, generate new ports
-            port1 = scaling_instance.get_open_port()
-            port2 = scaling_instance.get_open_port()
-            port3 = scaling_instance.get_open_port()
-            return f"{port1},{port2},{port3}"
-
-        # Check if ports already exist for this service
-        if service_id in cls._deployed_services:
-            triton_ports = cls._deployed_services[service_id]["triton_ports"]
-            logging.info(
-                "Reusing TRITON_PORTS for service %s: %s",
-                service_id,
-                triton_ports
-            )
-            return triton_ports
-
-        # First deployment: generate new ports and store them
-        port1 = scaling_instance.get_open_port()
-        port2 = scaling_instance.get_open_port()
-        port3 = scaling_instance.get_open_port()
-        triton_ports = f"{port1},{port2},{port3}"
-
-        # Store for future use
-        cls._deployed_services[service_id] = {
-            "triton_ports": triton_ports,
-        }
-
-        logging.info(
-            "First deployment for service %s - generated TRITON_PORTS: %s",
-            service_id,
-            triton_ports
-        )
-        return triton_ports
-
     @log_errors(default_return={}, raise_exception=True, log_error=False)
     def _init_credentials(self):
         """Initialize Matrice credentials.
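The two hunks above remove the class-level `_deployed_services` registry and its `get_or_create_triton_ports` helper, so Triton port triples are no longer cached and reused per `_idService`; the model-deploy hunk further down switches to a single `INTERNAL_PORT` environment variable instead. A minimal sketch of the allocation behavior, with a hypothetical `DummyScaling` standing in for the real `Scaling` client:

```python
# Sketch only: DummyScaling is hypothetical; the real Scaling client's
# get_open_port() returns a free TCP port on the instance.
class DummyScaling:
    def __init__(self):
        self._next = 8000

    def get_open_port(self) -> int:
        self._next += 1
        return self._next


def allocate_triton_ports(scaling) -> str:
    # 0.1.38 built this same comma-separated triple but cached it per
    # service in the class-level _deployed_services dict, so redeploying
    # the same _idService reused identical ports.
    return f"{scaling.get_open_port()},{scaling.get_open_port()},{scaling.get_open_port()}"


scaling = DummyScaling()
print(allocate_triton_ports(scaling))  # e.g. "8001,8002,8003"
print(allocate_triton_ports(scaling))  # in 0.1.40 a redeploy gets fresh ports
```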
@@ -396,7 +346,6 @@ class ActionInstance:
         destination_workspace_path: str = "/usr/src/workspace",
         docker_workdir: str = "",
         extra_pkgs: list = [],
-        container_name: str = "",
     ):
         """Build base Docker command with common options.
 
@@ -411,7 +360,6 @@ class ActionInstance:
             destination_workspace_path (str): Container workspace path
             docker_workdir (str): Docker working directory
             extra_pkgs (list): List of extra packages to install
-            container_name (str): Docker container name (format: {action_type}_{action_id})
         Returns:
             str: Base Docker command
         """
@@ -478,20 +426,17 @@ class ActionInstance:
 
         # if the service provider is local, then put --restart unless-stopped
         if os.environ.get("SERVICE_PROVIDER") in ("local", "LOCAL"):
-
-            use_restart_policy = "--restart unless-stopped"
+            use_restart_policy = "--restart=unless-stopped "
         else:
            use_restart_policy = ""
 
-        # Build container name option if provided
-        name_option = f"--name {container_name}" if container_name else ""
-
         cmd_parts = [
-            f"docker run
+            f"docker run {use_gpu} {use_restart_policy} ",
             network_config,
             *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
             *volumes,
             # Container configuration and startup commands
+            f"--cidfile ./{self.action_record_id}.cid ",
             f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
             f'/bin/bash -c "cd {docker_workdir} && '
             f"{env_exports} && "
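`get_base_docker_cmd` now adds `--cidfile ./{action_record_id}.cid` to every `docker run`, which tells the Docker CLI to write the started container's full ID into that file; the rewritten `start()` in the next hunk reads the ID from there. A minimal sketch of the mechanism, assuming a local Docker daemon and the `alpine` image:

```python
import os
import subprocess
import tempfile

# docker writes the started container's ID into the --cidfile path.
# The file must not already exist, or docker refuses to start the container.
cid_path = os.path.join(tempfile.mkdtemp(), "demo.cid")
subprocess.run(
    ["docker", "run", "-d", "--cidfile", cid_path, "alpine", "sleep", "5"],
    check=True,
)
with open(cid_path) as f:
    container_id = f.read().strip()
print(container_id)  # 64-hex-character container ID
```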
@@ -893,50 +838,55 @@ class ActionInstance:
         self.cmd = cmd
         self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
 
-
-
-
-
-
-
-
-
-
-        # Use a longer timeout for docker run since --pull=always may need to
-        # download large images on first run. Default: 30 minutes (1800 seconds)
-        # Can be configured via DOCKER_START_TIMEOUT_SECONDS environment variable
-        docker_start_timeout = int(os.environ.get("DOCKER_START_TIMEOUT_SECONDS", 1800))
-        logging.info(
-            "Waiting for docker container to start for action %s (timeout: %d seconds)",
-            self.action_record_id,
-            docker_start_timeout,
-        )
-        stdout, stderr = process.communicate(timeout=docker_start_timeout)
+        with open(self.log_path, "wb") as out:
+            self.process = subprocess.Popen(
+                shlex.split(self.cmd),
+                stdout=out,
+                stderr=out,
+                env={**os.environ},
+                start_new_session=True,
+            )
 
-
+        self.container_id = None
+
+        cid_file_path = f"./{self.action_record_id}.cid"
+        max_retries = 5
+        retry_delay = 1  # seconds
+        for attempt in range(max_retries):
+            try:
+                with open(cid_file_path, "r") as cid_file:
+                    container_id = cid_file.read().strip()
+                self.container_id = container_id
+                logging.info(
+                    "Started process for action %s with container ID: %s",
+                    self.action_record_id,
+                    self.container_id,
+                )
+                break
+            except FileNotFoundError:
+                logging.warning(
+                    "CID file not found for action %s, attempt %d/%d",
+                    self.action_record_id,
+                    attempt + 1,
+                    max_retries,
+                )
+                time.sleep(retry_delay)
+            except Exception as e:
+                logging.error(
+                    "Error reading CID file for action %s: %s",
+                    self.action_record_id,
+                    str(e),
+                )
+                time.sleep(retry_delay)
+        else:
             logging.error(
-                "
+                "Failed to read CID file for action %s after %d attempts",
                 self.action_record_id,
-
+                max_retries,
             )
-            raise
+            raise Exception("Failed to start process: CID file not found")
 
-
-        logging.info(
-            "Started container for action %s with ID: %s",
-            self.action_record_id,
-            self.container_id,
-        )
-
-        # Start following container logs in background
-        self.process = subprocess.Popen(
-            ["docker", "logs", "-f", self.container_id],
-            stdout=open(self.log_path, "wb"),
-            stderr=subprocess.STDOUT,
-            start_new_session=True,
-        )
-
-        # Report container id to scaling service
+        # report container id to scaling service
         self.scaling.update_action_container_id(
             action_record_id=self.action_record_id,
             container_id=self.container_id,
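The retry loop above hinges on Python's `for ... else`: the `else` suite executes only when the loop runs to completion without `break`, i.e. only when all five attempts to read the CID file fail. A standalone illustration of the construct:

```python
# for/else: the else block runs only if the loop was NOT exited via break.
attempts = []

for attempt in range(5):
    attempts.append(attempt)
    if attempt == 2:  # succeed on the third try
        break
else:
    # Not reached here, because the loop broke out at attempt == 2.
    raise Exception("all attempts failed")

print(attempts)  # [0, 1, 2]
```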
@@ -1102,8 +1052,7 @@ def data_preparation_execute(
         "Started pulling Docker image with PID: %s",
         process.pid,
     )
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "data_preparation_log")
 
@@ -1132,8 +1081,7 @@ def data_processing_execute(self: ActionInstance):
         service="bg-job-scheduler",
         job_params=action["jobParams"],
     )
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "data_processing_log")
 
@@ -1146,8 +1094,7 @@ def data_split_execute(self: ActionInstance):
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs, model_family="")
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "data_split")
 
@@ -1162,8 +1109,7 @@ def dataset_annotation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "dataset_annotation")
 
@@ -1178,8 +1124,7 @@ def dataset_augmentation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "dataset_augmentation")
 
@@ -1195,8 +1140,7 @@ def augmentation_server_creation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "augmentation_setup")
 
@@ -1217,34 +1161,32 @@ def database_setup_execute(self: ActionInstance):
 
     project_id = action_details["_idProject"]
 
-    # Define container names with action_record_id for uniqueness
-    mongodb_container_name = f"database_setup_{self.action_record_id}"
-    qdrant_container_name = f"qdrant_{self.action_record_id}"
-
     if action_details["actionDetails"].get("containerId"):
         logging.info(
-            "Using existing container ID for
+            "Using existing container ID for inference tracker: %s",
             action_details["actionDetails"]["containerId"],
         )
         self.docker_container = action_details["actionDetails"]["containerId"]
         cmd = "docker restart " + self.docker_container
-        self.start(cmd, "
+        self.start(cmd, "qdrant_setup")
 
-        #
-        qdrant_cmd =
-        self.start(qdrant_cmd,
+        #qdrant restart
+        qdrant_cmd = "docker restart qdrant"
+        self.start(qdrant_cmd, 'qdrant_setup')
 
         return
+
+
+    dbPath =action_details["jobParams"].get("dbPath","/host/data/path/mongodb_data")
 
-    dbPath = action_details["jobParams"].get("dbPath", "/host/data/path/mongodb_data")
 
     # MongoDB container with --net=host (Port: 27020:27017)
     cmd = (
-        f"docker run
-        f"--name {mongodb_container_name} "
-        f"-v matrice_myvol:/matrice_data "
+        f"docker run --pull=always --net=host "
         f"-v {dbPath}:{dbPath} "
+        f"--name database_setup_{self.action_record_id} "
         f"-v /var/run/docker.sock:/var/run/docker.sock "
+        f"--cidfile ./{self.action_record_id}.cid "
         f"-e ACTION_RECORD_ID={self.action_record_id} "
         f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
         f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
@@ -1254,23 +1196,6 @@ def database_setup_execute(self: ActionInstance):
     )
     logging.info("Starting DB container (Port: 27020:27017): %s", cmd)
 
-    # Qdrant container with --net=host (Port: 6334)
-    qdrant_cmd = (
-        f"docker run -d --pull=always --net=host "
-        f"--name {qdrant_container_name} "
-        f"-v matrice_myvol:/matrice_data "
-        f"qdrant/qdrant:latest "
-    )
-    logging.info("Starting Qdrant container (Port: 6334): %s", qdrant_cmd)
-
-    # Start Qdrant container
-    qdrant_process = subprocess.Popen(
-        qdrant_cmd,
-        shell=True,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-    )
-    logging.info("Qdrant container started successfully")
 
     # Docker Command run
     self.start(cmd, "database_setup")
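In both hunks, re-runs that already carry a recorded `containerId` now simply `docker restart` the MongoDB container and a container literally named `qdrant`, and the per-action Qdrant container (with its `subprocess.Popen` launch) is dropped from first-time setup. A sketch of the restart-if-known-otherwise-create shape used here (the helper name and image are hypothetical; assumes the `docker` CLI):

```python
import subprocess

def ensure_container(existing_id, run_cmd):
    """Restart the recorded container if one exists, otherwise create it."""
    if existing_id:
        # Re-runs reuse the container recorded on the action.
        subprocess.run(["docker", "restart", existing_id], check=True)
    else:
        subprocess.run(run_cmd, check=True)

# Hypothetical usage mirroring database_setup_execute's branch:
# ensure_container(action_details["actionDetails"].get("containerId"),
#                  ["docker", "run", "-d", "--net=host", "mongo:latest"])
```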
@@ -1290,8 +1215,6 @@ def facial_recognition_setup_execute(self: ActionInstance):
 
     self.setup_action_requirements(action_details)
 
-    container_name = f"facial_recognition_{self.action_record_id}"
-
     if action_details["actionDetails"].get("containerId"):
         logging.info(
             "Using existing container ID for facial recognition worker: %s",
@@ -1305,13 +1228,15 @@ def facial_recognition_setup_execute(self: ActionInstance):
     # Facial recognition worker container with --net=host (Port: 8081)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
+        f"--name worker "
+        f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
+        f"--cidfile ./{self.action_record_id}.cid "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
-        f'--restart=unless-stopped '
+        f' --restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting facial recognition worker (Port: 8081): %s", worker_cmd)
@@ -1333,8 +1258,6 @@ def lpr_setup_execute(self: ActionInstance):
 
     self.setup_action_requirements(action_details)
 
-    container_name = f"lpr_{self.action_record_id}"
-
     if action_details["actionDetails"].get("containerId"):
         logging.info(
             "Using existing container ID for LPR worker: %s",
@@ -1348,14 +1271,15 @@ def lpr_setup_execute(self: ActionInstance):
     # LPR worker container with --net=host (Port: 8082)
     worker_cmd = (
         f"docker run -d --net=host --pull=always "
-        f"--name
+        f"--name lpr-worker "
+        f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
         f'-e PORT=8082 '
-        f'--restart=unless-stopped '
+        f' --restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting LPR worker (Port: 8082): %s", worker_cmd)
@@ -1386,8 +1310,6 @@ def inference_ws_server_execute(self: ActionInstance):
 
     logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
 
-    container_name = f"inference_ws_{self.action_record_id}"
-
     if action_details["actionDetails"].get("containerId"):
         logging.info(
             "Using existing container ID for inference WebSocket server: %s",
@@ -1401,11 +1323,12 @@ def inference_ws_server_execute(self: ActionInstance):
     # Inference WebSocket server with --net=host (Port: 8102)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
+        f"--name inference "
+        f"--cidfile ./{self.action_record_id}.cid "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
-        f'--restart=unless-stopped '
+        f' --restart=unless-stopped '
         f"{image} "
         f"./app "
         f"{self.action_record_id} "
@@ -1436,8 +1359,6 @@ def fe_fs_streaming_execute(self: ActionInstance):
 
     logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
 
-    container_name = f"fe_streaming_{self.action_record_id}"
-
     if action_details["actionDetails"].get("containerId"):
         logging.info(
             "Using existing container ID for frontend streaming: %s",
@@ -1451,14 +1372,15 @@ def fe_fs_streaming_execute(self: ActionInstance):
     # Frontend streaming with --net=host (Port: 3000)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
+        f"--name fe_streaming "
+        f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f"-e PORT=3000 "
         f'-e WS_HOST="{ws_url}" '
-        f'--restart=unless-stopped '
+        f' --restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)
@@ -1483,8 +1405,6 @@ def fe_analytics_service_execute(self: ActionInstance):
 
     project_id = action_details["_idProject"]
 
-    container_name = f"fe_analytics_{self.action_record_id}"
-
     if action_details["actionDetails"].get("containerId"):
         logging.info(
             "Using existing container ID for frontend analytics service: %s",
@@ -1498,14 +1418,15 @@ def fe_analytics_service_execute(self: ActionInstance):
     # Frontend analytics service with --net=host (Port: 3001)
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
+        f"--name fe-analytics "
+        f"--cidfile ./{self.action_record_id}.cid "
         f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
         f"-e PORT=3001 "
         f'-e PROJECT_ID="{project_id}" '
-        f'--restart=unless-stopped '
+        f' --restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting frontend analytics service (Port: 3001): %s", worker_cmd)
@@ -1530,8 +1451,7 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
     else:
         return
     use_gpu = self.get_gpu_config(action_details)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "dataset_generation")
 
@@ -1552,8 +1472,7 @@ def synthetic_data_setup_execute(self: ActionInstance):
     else:
         return
     use_gpu = self.get_gpu_config(action_details)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "synthetic_data_setup")
 
@@ -1590,8 +1509,6 @@ def redis_setup_execute(self: ActionInstance):
 
     redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
 
-    # Define container names with action_record_id for uniqueness
-    redis_container_name = f"redis_{self.action_record_id}"
 
     if action_details["actionDetails"].get("containerId"):
         logging.info(
@@ -1603,34 +1520,18 @@ def redis_setup_execute(self: ActionInstance):
         self.start(cmd, "redis_setup")
 
         # Redis container restart
-        redis_restart_cmd =
+        redis_restart_cmd = "docker restart redis_container"
         self.start(redis_restart_cmd, "redis")
 
         return
 
-    # Redis container with --net=host (Port: 6379)
+    # Redis container with --net=host (Port: 6379)
     redis_cmd = (
         f"docker run -d --net=host "
-        f"--name
+        f"--name redis_container "
         f"--restart unless-stopped "
         f"{redis_image} "
-        f"redis-server --bind 0.0.0.0 "
-        f"--appendonly no "
-        f'--save "" '
-        f"--maxmemory 30gb "
-        f"--maxmemory-policy allkeys-lru "
-        f"--io-threads 4 "
-        f"--io-threads-do-reads yes "
-        f"--stream-node-max-bytes 8192 "
-        f"--stream-node-max-entries 1000 "
-        f"--hz 100 "
-        f"--tcp-backlog 2048 "
-        f"--timeout 0 "
-        f"--lazyfree-lazy-eviction yes "
-        f"--lazyfree-lazy-expire yes "
-        f"--lazyfree-lazy-server-del yes "
-        f"--activedefrag yes "
-        f"--requirepass {redis_password}"
+        f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
     )
 
     logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)
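The Redis flags collapse from a tuned in-memory-cache profile (`--appendonly no`, `--save ""`, 30 GB `allkeys-lru` eviction, lazy freeing, active defrag) to a minimal persistent configuration: `--appendonly yes` turns on the append-only file, so keys now survive container restarts. A quick connectivity check with redis-py (assumes the `redis` package is installed; the password value is a placeholder):

```python
import redis

# The container binds 0.0.0.0 on the host network, port 6379, with
# --requirepass, so clients must authenticate.
client = redis.Redis(host="127.0.0.1", port=6379, password="redis_password")
client.set("healthcheck", "ok")
print(client.get("healthcheck"))  # b'ok'
```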
@@ -1654,9 +1555,8 @@ def redis_setup_execute(self: ActionInstance):
 
     # bg-redis management container with --net=host (Port: 8082)
     cmd = (
-        f"docker run
-
-        f"--name bg-redis_{self.action_record_id} "
+        f"docker run --net=host "
+        f"--cidfile ./{self.action_record_id}.cid "
         f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
         f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
         f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
@@ -1683,8 +1583,7 @@ def deploy_aggregator_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "deploy_aggregator")
 
@@ -1700,10 +1599,6 @@ def model_deploy_execute(self: ActionInstance):
         return
     action_id = action_details["_id"]
     model_family = action_details["actionDetails"]["modelFamily"]
-
-    # Get the service ID to track deployments
-    service_id = action_details.get("_idService")
-
     self.setup_action_requirements(
         action_details,
         work_fs,
@@ -1711,29 +1606,17 @@ def model_deploy_execute(self: ActionInstance):
         action_id=action_id,
     )
 
-    #
+    # Get GPU configuration based on requirements and availability
+    # This uses the best-fit algorithm to select the most appropriate GPU(s)
+    use_gpu = self.get_gpu_config(action_details)
+
+    # Override: If GPU is required, use all available GPUs
     gpuRequired = action_details["actionDetails"].get("gpuRequired", False)
     if gpuRequired:
         use_gpu = "--runtime=nvidia --gpus all"
-    else:
-        use_gpu = ""
-
-    logging.info(
-        "Action %s: Model deployment GPU config: %s",
-        action_id,
-        use_gpu if use_gpu else "CPU-only"
-    )
-
-    # Get or create TRITON_PORTS (uses utility method)
-    triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
 
-    extra_env_vars = {
-
-        "TRITON_PORTS": triton_ports
-    }
-
-    container_name = f"model_deploy_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"], container_name=container_name)} python3 deploy.py {self.action_record_id} {external_port}"'
+    extra_env_vars = {"INTERNAL_PORT": internal_port}
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "deploy_log")
 
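With the port cache gone, `model_deploy_execute` passes the deployment a single `INTERNAL_PORT` environment variable (`internal_port` is defined elsewhere in the function, outside this hunk) rather than a `TRITON_PORTS` triple. Inside the container, the entrypoint can read it the usual way; a minimal sketch:

```python
import os

# INTERNAL_PORT is injected via -e by get_base_docker_cmd's env_vars;
# the default here is only for local experimentation.
internal_port = int(os.environ.get("INTERNAL_PORT", "8000"))
print(f"serving on port {internal_port}")
```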
@@ -1766,8 +1649,7 @@ def model_train_execute(self: ActionInstance):
         self.start(cmd, "train_log")
         return
 
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "train_log")
 
@@ -1790,7 +1672,7 @@ def model_eval_execute(self: ActionInstance):
     )
     if action_details["actionDetails"].get("containerId"):
         logging.info(
-            "Using existing container ID for
+            "Using existing container ID for training: %s",
             action_details["actionDetails"]["containerId"],
         )
         self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1798,8 +1680,7 @@ def model_eval_execute(self: ActionInstance):
         self.start(cmd, "eval_log")
         return
 
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "eval_log")
 
@@ -1825,7 +1706,7 @@ def model_export_execute(self: ActionInstance):
     )
     if action_details["actionDetails"].get("containerId"):
         logging.info(
-            "Using existing container ID for
+            "Using existing container ID for training: %s",
             action_details["actionDetails"]["containerId"],
         )
         self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1833,8 +1714,7 @@ def model_export_execute(self: ActionInstance):
         self.start(cmd, "export_log")
         return
 
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "export_log")
 
@@ -1850,8 +1730,7 @@ def image_build_execute(self: ActionInstance):
     action_id = action_details["_id"]
     internal_api_key = self.get_internal_api_key(action_id)
     extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
-
-    cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
+    cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "image_build_log")
 
@@ -1863,8 +1742,7 @@ def resource_clone_execute(self: ActionInstance):
     if not action_details:
         return
     self.setup_action_requirements(action_details)
-
-    cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "resource_clone")
 
@@ -1882,7 +1760,7 @@ def streaming_gateway_execute(self: ActionInstance):
     )
     if action_details["actionDetails"].get("containerId"):
         logging.info(
-            "Using existing container ID for
+            "Using existing container ID for training: %s",
             action_details["actionDetails"]["containerId"],
         )
         self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1890,8 +1768,7 @@ def streaming_gateway_execute(self: ActionInstance):
         self.start(cmd, "streaming_gateway")
         return
 
-
-    cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "streaming_gateway")
 
@@ -1987,7 +1864,7 @@ def kafka_setup_execute(self: ActionInstance):
 
     if action_details["actionDetails"].get("containerId"):
         logging.info(
-            "Using existing container ID for
+            "Using existing container ID for training: %s",
             action_details["actionDetails"]["containerId"],
         )
         self.docker_container = action_details["actionDetails"]["containerId"]
@@ -1995,12 +1872,10 @@ def kafka_setup_execute(self: ActionInstance):
         self.start(cmd, "kafka_setup")
         return
 
-    container_name = f"kafka_{self.action_record_id}"
 
     # Kafka container with --net=host (Ports: 9092, 9093)
     cmd = (
-        f"docker run
-        f"--name {container_name} "
+        f"docker run --net=host "
         f"{env_args} "
         f"--shm-size=30G --pull=always "
         f'aiforeveryone/matrice-kafka:latest /bin/bash -c "'
@@ -2033,8 +1908,6 @@ def inference_tracker_setup_execute(self: ActionInstance):
 
     self.setup_action_requirements(action_details)
 
-    container_name = f"inference_tracker_{self.action_record_id}"
-
     if action_details["actionDetails"].get("containerId"):
         logging.info(
             "Using existing container ID for inference tracker: %s",
@@ -2048,13 +1921,14 @@ def inference_tracker_setup_execute(self: ActionInstance):
     # This is the existing Docker run command
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-
+        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name inference-tracker-worker "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
-        f'--restart=unless-stopped '
+        f' --restart=unless-stopped '
         f"{image}"
     )
 
@@ -2076,11 +1950,9 @@ def video_storage_setup_execute(self: ActionInstance):
 
     self.setup_action_requirements(action_details)
 
-    container_name = f"video_storage_{self.action_record_id}"
-
     if action_details["actionDetails"].get("containerId"):
         logging.info(
-            "Using existing container ID for
+            "Using existing container ID for inference tracker: %s",
             action_details["actionDetails"]["containerId"],
         )
         self.docker_container = action_details["actionDetails"]["containerId"]
@@ -2091,13 +1963,14 @@ def video_storage_setup_execute(self: ActionInstance):
     # This is the existing Docker run command
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-
+        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name media_server "
        f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
-        f'--restart=unless-stopped '
+        f' --restart=unless-stopped '
         f"{image}"
     )
 