matrice-compute 0.1.34__tar.gz → 0.1.36__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/PKG-INFO +1 -1
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/matrice_compute.egg-info/PKG-INFO +1 -1
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/action_instance.py +235 -501
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/LICENSE.txt +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/README.md +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/matrice_compute.egg-info/SOURCES.txt +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/matrice_compute.egg-info/dependency_links.txt +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/matrice_compute.egg-info/not-zip-safe +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/matrice_compute.egg-info/top_level.txt +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/pyproject.toml +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/setup.cfg +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/setup.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/__init__.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/actions_manager.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/actions_scaledown_manager.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/compute_operations_handler.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/instance_manager.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/instance_utils.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/prechecks.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/py.typed +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/resources_tracker.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/scaling.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/shutdown_manager.py +0 -0
- {matrice_compute-0.1.34 → matrice_compute-0.1.36}/src/matrice_compute/task_utils.py +0 -0
src/matrice_compute/action_instance.py (+235 −501):

```diff
@@ -10,7 +10,6 @@ import signal
 import urllib.request
 from matrice_compute.instance_utils import (
     get_gpu_with_sufficient_memory_for_action,
-    get_gpu_config_for_deployment,
     get_decrypted_access_key_pair,
     get_max_file_system,
     get_best_service_ip_and_network,
```
```diff
@@ -27,10 +26,6 @@ from matrice_common.utils import log_errors
 class ActionInstance:
     """Base class for tasks that run in Action containers."""

-    # Class-level dictionary to track deployed services and their ports
-    # Key: _idService, Value: {"triton_ports": "port1,port2,port3", "is_first": False}
-    _deployed_services = {}
-
     def __init__(self, scaling: Scaling, action_info: dict):
         """Initialize an action instance.

```
```diff
@@ -90,67 +85,6 @@ class ActionInstance:
             raise ValueError(f"Unknown action type: {self.action_type}")
         self.task = self.actions_map[self.action_type]

-    @classmethod
-    def is_first_deployment_for_service(cls, service_id):
-        """Check if this is the first deployment for a given service.
-
-        Args:
-            service_id (str): Service ID (_idService)
-
-        Returns:
-            bool: True if this is the first deployment, False otherwise
-        """
-        if not service_id:
-            return False
-        return service_id not in cls._deployed_services
-
-    @classmethod
-    def get_or_create_triton_ports(cls, service_id, scaling_instance):
-        """Get existing TRITON_PORTS for a service or create new ones.
-
-        Args:
-            service_id (str): Service ID (_idService)
-            scaling_instance: Scaling instance to get open ports
-
-        Returns:
-            str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
-        """
-        if not service_id:
-            # No service_id, generate new ports
-            port1 = scaling_instance.get_open_port()
-            port2 = scaling_instance.get_open_port()
-            port3 = scaling_instance.get_open_port()
-            return f"{port1},{port2},{port3}"
-
-        # Check if ports already exist for this service
-        if service_id in cls._deployed_services:
-            triton_ports = cls._deployed_services[service_id]["triton_ports"]
-            logging.info(
-                "Reusing TRITON_PORTS for service %s: %s",
-                service_id,
-                triton_ports
-            )
-            return triton_ports
-
-        # First deployment: generate new ports and store them
-        port1 = scaling_instance.get_open_port()
-        port2 = scaling_instance.get_open_port()
-        port3 = scaling_instance.get_open_port()
-        triton_ports = f"{port1},{port2},{port3}"
-
-        # Store for future use
-        cls._deployed_services[service_id] = {
-            "triton_ports": triton_ports,
-            "is_first": False
-        }
-
-        logging.info(
-            "First deployment for service %s - generated TRITON_PORTS: %s",
-            service_id,
-            triton_ports
-        )
-        return triton_ports
-
     @log_errors(default_return={}, raise_exception=True, log_error=False)
     def _init_credentials(self):
         """Initialize Matrice credentials.
```
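Note: 0.1.36 deletes the class-level `_deployed_services` registry together with both helpers, so Triton ports are no longer cached and reused per `_idService`. For orientation only, here is a minimal, hypothetical sketch of picking three free ports with the standard library, standing in for the removed `scaling_instance.get_open_port()` calls; nothing below exists in the package:

```python
# Hypothetical sketch, not package code: let the OS pick free TCP ports,
# roughly what the removed get_or_create_triton_ports() obtained from
# scaling_instance.get_open_port(). Sequential picks can collide under load.
import socket

def pick_open_port() -> int:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))  # port 0 means "any free port"
        return s.getsockname()[1]

triton_ports = ",".join(str(pick_open_port()) for _ in range(3))
print(triton_ports)  # e.g. "42815,42816,42817"
```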
```diff
@@ -297,7 +231,7 @@ class ActionInstance:
                 getattr(self, "action_record_id", "unknown"),
             )
         else:
-            logging.
+            logging.debug(
                 "No additional logs to send for action %s",
                 getattr(self, "action_record_id", "unknown"),
             )
```
```diff
@@ -352,13 +286,13 @@ class ActionInstance:
         ).get("gpuMemory", 0)

         logging.info(
-            "Action %s requires GPU with %d MB memory - selecting GPU(s)
+            "Action %s requires GPU with %d MB memory - selecting best-fit GPU(s)",
             action_id,
             required_memory
         )

         try:
-            # Get the GPU(s) with
+            # Get the best-fit GPU(s) with sufficient memory
             gpu_indices = get_gpu_with_sufficient_memory_for_action(
                 action_details=action_details
             )
```
```diff
@@ -412,7 +346,6 @@ class ActionInstance:
         destination_workspace_path: str = "/usr/src/workspace",
         docker_workdir: str = "",
         extra_pkgs: list = [],
-        container_name: str = "",
     ):
         """Build base Docker command with common options.

```
```diff
@@ -427,7 +360,6 @@ class ActionInstance:
             destination_workspace_path (str): Container workspace path
             docker_workdir (str): Docker working directory
             extra_pkgs (list): List of extra packages to install
-            container_name (str): Docker container name (format: {action_type}_{action_id})
         Returns:
             str: Base Docker command
         """
```
```diff
@@ -492,16 +424,17 @@ class ActionInstance:
             ]
         )

-        #
-
+        # if the service provider is local, then put --restart unless-stopped
+        if os.environ.get("SERVICE_PROVIDER") in ("local", "LOCAL"):
+            env_exports += " && export DOCKER_RESTART_POLICY='--restart unless-stopped' "

         cmd_parts = [
-            f"docker run
-            name_option,
+            f"docker run {use_gpu} ",
             network_config,
             *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
             *volumes,
             # Container configuration and startup commands
+            f"--cidfile ./{self.action_record_id}.cid ",
             f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
             f'/bin/bash -c "cd {docker_workdir} && '
             f"{env_exports} && "
```
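The rewritten base command now exports a `DOCKER_RESTART_POLICY` variable when `SERVICE_PROVIDER` is `local`/`LOCAL`, and always writes a `--cidfile` so the container ID can be read back (see the `start_process` hunk below). A sketch of the flag assembly, assuming the exported variable is expanded by whichever inner `docker run` consumes it; the image and paths are placeholders:

```python
# Sketch only: env-driven restart policy plus --cidfile, mirroring the
# intent of the new env_exports/cmd_parts code. Values are illustrative.
import os
import shlex

restart = (
    "--restart unless-stopped"
    if os.environ.get("SERVICE_PROVIDER") in ("local", "LOCAL")
    else ""
)
cmd = f"docker run -d {restart} --cidfile ./example.cid alpine:latest sleep 60"
print(shlex.split(cmd))  # tokenized argv, as subprocess.Popen would receive it
```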
```diff
@@ -889,34 +822,6 @@ class ActionInstance:
             job_params=action_details["jobParams"],
         )

-    @staticmethod
-    def container_exists(container_id: str) -> bool:
-        """Check if a Docker container exists.
-
-        Args:
-            container_id (str): Container ID or name to check
-
-        Returns:
-            bool: True if container exists, False otherwise
-        """
-        if not container_id:
-            return False
-        try:
-            result = subprocess.run(
-                ["docker", "inspect", container_id],
-                capture_output=True,
-                text=True,
-                timeout=10
-            )
-            return result.returncode == 0
-        except Exception as e:
-            logging.warning(
-                "Error checking if container %s exists: %s",
-                container_id,
-                str(e)
-            )
-            return False
-
     @log_errors(raise_exception=True)
     def start_process(self, cmd, log_name):
         """Start the process and initialize logging.
```
```diff
@@ -931,54 +836,60 @@ class ActionInstance:
         self.cmd = cmd
         self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"

-
-
-
-
-
-
-
-
-
-        # Use a longer timeout for docker run since --pull=always may need to
-        # download large images on first run. Default: 30 minutes (1800 seconds)
-        # Can be configured via DOCKER_START_TIMEOUT_SECONDS environment variable
-        docker_start_timeout = int(os.environ.get("DOCKER_START_TIMEOUT_SECONDS", 1800))
-        logging.info(
-            "Waiting for docker container to start for action %s (timeout: %d seconds)",
-            self.action_record_id,
-            docker_start_timeout,
-        )
-        stdout, stderr = process.communicate(timeout=docker_start_timeout)
+        with open(self.log_path, "wb") as out:
+            self.process = subprocess.Popen(
+                shlex.split(self.cmd),
+                stdout=out,
+                stderr=out,
+                env={**os.environ},
+                start_new_session=True,
+            )

-
+        self.container_id = None
+
+        cid_file_path = f"./{self.action_record_id}.cid"
+        max_retries = 5
+        retry_delay = 1  # seconds
+        for attempt in range(max_retries):
+            try:
+                with open(cid_file_path, "r") as cid_file:
+                    container_id = cid_file.read().strip()
+                self.container_id = container_id
+                logging.info(
+                    "Started process for action %s with container ID: %s",
+                    self.action_record_id,
+                    self.container_id,
+                )
+                break
+            except FileNotFoundError:
+                logging.warning(
+                    "CID file not found for action %s, attempt %d/%d",
+                    self.action_record_id,
+                    attempt + 1,
+                    max_retries,
+                )
+                time.sleep(retry_delay)
+            except Exception as e:
+                logging.error(
+                    "Error reading CID file for action %s: %s",
+                    self.action_record_id,
+                    str(e),
+                )
+                time.sleep(retry_delay)
+        else:
             logging.error(
-                "
+                "Failed to read CID file for action %s after %d attempts",
                 self.action_record_id,
-
+                max_retries,
             )
-            raise
-
-        self.container_id = stdout.strip()
-        logging.info(
-            "Started container for action %s with ID: %s",
-            self.action_record_id,
-            self.container_id,
-        )
-
-        # Start following container logs in background
-        self.process = subprocess.Popen(
-            ["docker", "logs", "-f", self.container_id],
-            stdout=open(self.log_path, "wb"),
-            stderr=subprocess.STDOUT,
-            start_new_session=True,
-        )
+            raise Exception("Failed to start process: CID file not found")

-        #
+        # report container id to scaling service
         self.scaling.update_action_container_id(
             action_record_id=self.action_record_id,
             container_id=self.container_id,
         )
+

     @log_errors(raise_exception=False)
     def start_logger(self):
```
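`start_process` no longer blocks on `process.communicate()` and then tails `docker logs -f`; it launches the command with its output redirected to the log file and recovers the container ID from the `--cidfile` the Docker CLI writes, retrying up to five times. A self-contained sketch of that handshake, with illustrative paths and image (needs a local Docker daemon to actually run):

```python
# Sketch of the new launch-then-read-CID flow; example.cid/example.log and
# the alpine image are placeholders, not the package's values.
import shlex
import subprocess
import time

cid_path = "./example.cid"
cmd = f"docker run -d --cidfile {cid_path} alpine:latest sleep 30"

with open("./example.log", "wb") as out:
    proc = subprocess.Popen(shlex.split(cmd), stdout=out, stderr=out,
                            start_new_session=True)

container_id = None
for attempt in range(5):
    try:
        with open(cid_path) as f:
            container_id = f.read().strip()
        break
    except FileNotFoundError:
        time.sleep(1)  # the CLI may not have written the file yet

print(proc.pid, container_id)
```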
```diff
@@ -1139,8 +1050,7 @@ def data_preparation_execute(
         "Started pulling Docker image with PID: %s",
         process.pid,
     )
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "data_preparation_log")
```
```diff
@@ -1169,8 +1079,7 @@ def data_processing_execute(self: ActionInstance):
         service="bg-job-scheduler",
         job_params=action["jobParams"],
     )
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "data_processing_log")
```
```diff
@@ -1183,8 +1092,7 @@ def data_split_execute(self: ActionInstance):
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs, model_family="")
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "data_split")
```
```diff
@@ -1199,8 +1107,7 @@ def dataset_annotation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "dataset_annotation")
```
```diff
@@ -1215,8 +1122,7 @@ def dataset_augmentation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "dataset_augmentation")
```
```diff
@@ -1232,8 +1138,7 @@ def augmentation_server_creation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "augmentation_setup")
```
```diff
@@ -1254,42 +1159,31 @@ def database_setup_execute(self: ActionInstance):

     project_id = action_details["_idProject"]

-
-
-
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for inference tracker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "qdrant_setup")

-
-
-
-        mongodb_container_exists = ActionInstance.container_exists(existing_container_id)
-        qdrant_container_exists = ActionInstance.container_exists(qdrant_container_name)
+        #qdrant restart
+        qdrant_cmd = "docker restart qdrant"
+        self.start(qdrant_cmd, 'qdrant_setup')

-
-
-
-
-        )
-        self.docker_container = existing_container_id
-        cmd = "docker restart " + self.docker_container
-        self.start(cmd, "qdrant_setup")
+        return
+
+
+    dbPath =action_details["jobParams"].get("dbPath","/host/data/path/mongodb_data")

-        # qdrant restart
-        qdrant_cmd = f"docker restart {qdrant_container_name}"
-        self.start(qdrant_cmd, "qdrant_setup")
-        return
-    else:
-        logging.warning(
-            "Container(s) not found (mongodb=%s, qdrant=%s). Creating new containers.",
-            mongodb_container_exists,
-            qdrant_container_exists
-        )
-        # Fall through to create new containers

     # MongoDB container with --net=host (Port: 27020:27017)
     cmd = (
         f"docker run --pull=always --net=host "
-        f"
-        f"
+        f"-v {dbPath}:{dbPath} "
+        f"--name database_setup_{self.action_record_id} "
+        f"-v /var/run/docker.sock:/var/run/docker.sock "
         f"--cidfile ./{self.action_record_id}.cid "
         f"-e ACTION_RECORD_ID={self.action_record_id} "
         f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
```
```diff
@@ -1298,22 +1192,12 @@ def database_setup_execute(self: ActionInstance):
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f"{image} "
     )
-    logging.info("Starting
+    logging.info("Starting DB container (Port: 27020:27017): %s", cmd)

-    # Qdrant container with --net=host (Port: 6334)
-    qdrant_cmd = (
-        f"docker run --pull=always --net=host "
-        f"--name {qdrant_container_name} "
-        f"-v matrice_myvol:/matrice_data "
-        f"{'qdrant/qdrant:latest'} "
-    )
-    logging.info("Starting Qdrant container (Port: 6334): %s", qdrant_cmd)

     # Docker Command run
     self.start(cmd, "database_setup")

-    # Docker for qdrant
-    self.start(qdrant_cmd, 'qdrant_setup')

 @log_errors(raise_exception=False)
 def facial_recognition_setup_execute(self: ActionInstance):
```
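All the `*_setup_execute` functions now share this branch: if the action record carries a `containerId`, issue `docker restart` and return; otherwise fall through and create containers. Unlike 0.1.34, there is no `container_exists` pre-check anymore, so a stale ID now makes the `docker restart` itself fail instead of falling through to container creation. A condensed, hypothetical restatement of the branch (`build_run_cmd` stands in for the real command construction; not package code):

```python
# Hypothetical condensation of the restart-or-create pattern used above.
def restart_or_create(action_details: dict, build_run_cmd) -> str:
    existing = action_details.get("actionDetails", {}).get("containerId")
    if existing:
        # Reuse the previously recorded container rather than creating one.
        return f"docker restart {existing}"
    return build_run_cmd()

print(restart_or_create({"actionDetails": {"containerId": "abc123"}},
                        lambda: "docker run -d redis:latest"))
# -> docker restart abc123
```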
```diff
@@ -1329,36 +1213,28 @@ def facial_recognition_setup_execute(self: ActionInstance):

     self.setup_action_requirements(action_details)

-
-
-
-
-
-
-
-
-
-        cmd = "docker restart " + self.docker_container
-        self.start(cmd, "facial_recognition_setup")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for facial recognition worker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "facial_recognition_setup")
+        return

     # Facial recognition worker container with --net=host (Port: 8081)
-    container_name = f"facial_recognition_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
-
+        f"--name worker "
+        f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
+        f"--cidfile ./{self.action_record_id}.cid "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
+        f' --restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting facial recognition worker (Port: 8081): %s", worker_cmd)
```
```diff
@@ -1380,30 +1256,20 @@ def lpr_setup_execute(self: ActionInstance):

     self.setup_action_requirements(action_details)

-
-
-
-
-
-
-
-
-
-        cmd = "docker restart " + self.docker_container
-        self.start(cmd, "lpr_setup")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for LPR worker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "lpr_setup")
+        return

     # LPR worker container with --net=host (Port: 8082)
-    container_name = f"lpr_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --net=host --pull=always "
-        f"--name
+        f"--name lpr-worker "
         f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
```
```diff
@@ -1411,6 +1277,7 @@ def lpr_setup_execute(self: ActionInstance):
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
         f'-e PORT=8082 '
+        f' --restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting LPR worker (Port: 8082): %s", worker_cmd)
```
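`--restart=unless-stopped` tells the Docker daemon to bring the container back after crashes and daemon restarts unless it was explicitly stopped; 0.1.36 adds it to every long-lived worker in this file. One way to confirm the policy took effect, using the standard `docker inspect` CLI (the container name is illustrative, matching the hunk above):

```python
# Sketch: read back a container's restart policy via docker inspect.
import json
import subprocess

out = subprocess.run(
    ["docker", "inspect", "--format", "{{json .HostConfig.RestartPolicy}}",
     "lpr-worker"],
    capture_output=True, text=True, check=False,
)
if out.returncode == 0:
    # e.g. {"Name": "unless-stopped", "MaximumRetryCount": 0}
    print(json.loads(out.stdout))
```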
```diff
@@ -1441,34 +1308,25 @@ def inference_ws_server_execute(self: ActionInstance):

     logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")

-
-
-
-
-
-
-
-
-
-        cmd = "docker restart " + self.docker_container
-        self.start(cmd, "inference_ws_server")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for inference WebSocket server: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "inference_ws_server")
+        return

     # Inference WebSocket server with --net=host (Port: 8102)
-    container_name = f"inference_ws_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
+        f"--name inference "
         f"--cidfile ./{self.action_record_id}.cid "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
+        f' --restart=unless-stopped '
         f"{image} "
         f"./app "
         f"{self.action_record_id} "
```
```diff
@@ -1499,30 +1357,20 @@ def fe_fs_streaming_execute(self: ActionInstance):

     logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")

-
-
-
-
-
-
-
-
-
-
-        self.start(cmd, "fe_fs_streaming")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for frontend streaming: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "fe_fs_streaming")
+        return
+
     # Frontend streaming with --net=host (Port: 3000)
-    container_name = f"fe_streaming_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
+        f"--name fe_streaming "
         f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
```
```diff
@@ -1530,6 +1378,7 @@ def fe_fs_streaming_execute(self: ActionInstance):
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f"-e PORT=3000 "
         f'-e WS_HOST="{ws_url}" '
+        f' --restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)
```
```diff
@@ -1554,30 +1403,20 @@ def fe_analytics_service_execute(self: ActionInstance):

     project_id = action_details["_idProject"]

-
-
-
-
-
-
-
-
-
-
-        self.start(cmd, "fe_analytics_service")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for frontend analytics service: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "fe_analytics_service")
+        return
+
     # Frontend analytics service with --net=host (Port: 3001)
-    container_name = f"fe_analytics_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
+        f"--name fe-analytics "
         f"--cidfile ./{self.action_record_id}.cid "
         f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
```
```diff
@@ -1585,6 +1424,7 @@ def fe_analytics_service_execute(self: ActionInstance):
         f'-e ACTION_ID="{self.action_record_id}" '
         f"-e PORT=3001 "
         f'-e PROJECT_ID="{project_id}" '
+        f' --restart=unless-stopped '
         f"{image}"
     )
     logging.info("Starting frontend analytics service (Port: 3001): %s", worker_cmd)
```
```diff
@@ -1609,8 +1449,7 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
     else:
         return
     use_gpu = self.get_gpu_config(action_details)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "dataset_generation")
```
```diff
@@ -1631,8 +1470,7 @@ def synthetic_data_setup_execute(self: ActionInstance):
     else:
         return
     use_gpu = self.get_gpu_config(action_details)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "synthetic_data_setup")
```
```diff
@@ -1669,60 +1507,31 @@ def redis_setup_execute(self: ActionInstance):

     redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")

-    # Define container names with action_record_id for uniqueness
-    redis_container_name = f"redis_{self.action_record_id}"
-
-    existing_container_id = action_details["actionDetails"].get("containerId")
-    if existing_container_id:
-        # Check if both containers actually exist before trying to restart
-        management_container_exists = ActionInstance.container_exists(existing_container_id)
-        redis_container_exists = ActionInstance.container_exists(redis_container_name)

-
-
-
-
-
-
-
-
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for redis management: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "redis_setup")

-
-
-
-        return
-    else:
-        logging.warning(
-            "Container(s) not found (management=%s, redis=%s). Creating new containers.",
-            management_container_exists,
-            redis_container_exists
-        )
-        # Fall through to create new containers
+        # Redis container restart
+        redis_restart_cmd = "docker restart redis_container"
+        self.start(redis_restart_cmd, "redis")

+        return
+
     # Redis container with --net=host (Port: 6379)
     redis_cmd = (
         f"docker run -d --net=host "
-        f"--name
+        f"--name redis_container "
         f"--restart unless-stopped "
         f"{redis_image} "
-        f"redis-server --bind 0.0.0.0 "
-        f"--appendonly no "
-        f'--save "" '
-        f"--maxmemory 30gb "
-        f"--maxmemory-policy allkeys-lru "
-        f"--io-threads 4 "
-        f"--io-threads-do-reads yes "
-        f"--stream-node-max-bytes 8192 "
-        f"--stream-node-max-entries 1000 "
-        f"--hz 100 "
-        f"--tcp-backlog 2048 "
-        f"--timeout 0 "
-        f"--lazyfree-lazy-eviction yes "
-        f"--lazyfree-lazy-expire yes "
-        f"--lazyfree-lazy-server-del yes "
-        f"--activedefrag yes "
-        f"--requirepass {redis_password}"
+        f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
     )
+
     logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)

     # Start Redis container first
```
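Behavioral note: the 0.1.34 command ran Redis with persistence disabled (`--appendonly no --save ""`) plus extensive memory and IO tuning (30 GB cap, LRU eviction, lazy freeing, active defrag); 0.1.36 keeps only the bind address, AOF persistence (`--appendonly yes`), and the password, so Redis falls back to its defaults for everything else. A quick check of the effective setting on the running container (the password is a placeholder):

```python
# Sketch: query the live config; redis_container matches the new --name,
# "example-password" stands in for the real requirepass value.
import subprocess

subprocess.run(
    ["docker", "exec", "redis_container",
     "redis-cli", "-a", "example-password", "config", "get", "appendonly"],
    check=False,
)  # prints: appendonly / yes
```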
```diff
@@ -1772,8 +1581,7 @@ def deploy_aggregator_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "deploy_aggregator")
```
```diff
@@ -1789,10 +1597,6 @@ def model_deploy_execute(self: ActionInstance):
         return
     action_id = action_details["_id"]
     model_family = action_details["actionDetails"]["modelFamily"]
-
-    # Get the service ID to track deployments
-    service_id = action_details.get("_idService")
-
     self.setup_action_requirements(
         action_details,
         work_fs,
```
```diff
@@ -1800,29 +1604,17 @@ def model_deploy_execute(self: ActionInstance):
         action_id=action_id,
     )

-    #
-
-
-    # Get GPU configuration (uses utility function with fail-safe fallback)
-    use_gpu = get_gpu_config_for_deployment(action_details, is_first_deployment)
-
-    logging.info(
-        "Action %s: Model deployment GPU config: %s (first_deployment=%s)",
-        action_id,
-        use_gpu if use_gpu else "CPU-only",
-        is_first_deployment
-    )
-
-    # Get or create TRITON_PORTS (uses utility method)
-    triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
+    # Get GPU configuration based on requirements and availability
+    # This uses the best-fit algorithm to select the most appropriate GPU(s)
+    use_gpu = self.get_gpu_config(action_details)

-
-
-
-
+    # Override: If GPU is required, use all available GPUs
+    gpuRequired = action_details["actionDetails"].get("gpuRequired", False)
+    if gpuRequired:
+        use_gpu = "--runtime=nvidia --gpus all"

-
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"]
+    extra_env_vars = {"INTERNAL_PORT": internal_port}
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "deploy_log")

```
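Deployment GPU selection is simplified: the per-service first-deployment bookkeeping is gone, `get_gpu_config` picks a best-fit device, and a `gpuRequired` flag overrides that with every GPU on the host. A hypothetical, self-contained restatement (`get_gpu_config` is passed in here because the real one lives on `ActionInstance`; not package code):

```python
# Hypothetical condensation of the new GPU flag resolution.
def resolve_gpu_flag(action_details: dict, get_gpu_config) -> str:
    use_gpu = get_gpu_config(action_details)  # best-fit selection
    if action_details["actionDetails"].get("gpuRequired", False):
        use_gpu = "--runtime=nvidia --gpus all"  # claim all host GPUs
    return use_gpu

print(resolve_gpu_flag({"actionDetails": {"gpuRequired": True}},
                       lambda details: ""))  # --runtime=nvidia --gpus all
```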
```diff
@@ -1845,27 +1637,17 @@ def model_train_execute(self: ActionInstance):
         action_id=action_id,
     )

-
-
-
-
-
-
-
-
-
-
-
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
-    container_name = f"model_train_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "train_log")
+        return
+
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "train_log")

```
```diff
@@ -1886,27 +1668,17 @@ def model_eval_execute(self: ActionInstance):
         model_family=model_family,
         action_id=action_id,
     )
-
-
-
-
-
-
-
-
-
-
-
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
-    container_name = f"model_eval_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "eval_log")
+        return
+
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "eval_log")

```
```diff
@@ -1930,27 +1702,17 @@ def model_export_execute(self: ActionInstance):
         model_family=model_family,
         action_id=action_id,
     )
-
-
-
-
-
-
-
-
-
-
-
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
-    container_name = f"model_export_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "export_log")
+        return
+
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "export_log")

```
```diff
@@ -1966,8 +1728,7 @@ def image_build_execute(self: ActionInstance):
     action_id = action_details["_id"]
     internal_api_key = self.get_internal_api_key(action_id)
     extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
-
-    cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
+    cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "image_build_log")
```
```diff
@@ -1979,8 +1740,7 @@ def resource_clone_execute(self: ActionInstance):
     if not action_details:
         return
     self.setup_action_requirements(action_details)
-
-    cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "resource_clone")
```
```diff
@@ -1996,27 +1756,17 @@ def streaming_gateway_execute(self: ActionInstance):
     self.docker_container = (
         f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
     )
-
-
-
-
-
-
-
-
-
-
-
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
-    container_name = f"streaming_gateway_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "streaming_gateway")
+        return
+
+    cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "streaming_gateway")

```
```diff
@@ -2110,24 +1860,16 @@ def kafka_setup_execute(self: ActionInstance):
     else:
         pkgs = f"matrice_common matrice"

-
-
-
-
-
-
-
-
-
-
-        self.start(cmd, "kafka_setup")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "kafka_setup")
+        return
+

     # Kafka container with --net=host (Ports: 9092, 9093)
     cmd = (
```
```diff
@@ -2164,36 +1906,27 @@ def inference_tracker_setup_execute(self: ActionInstance):

     self.setup_action_requirements(action_details)

-
-
-
-
-
-
-
-
-
-
-        self.start(cmd, "inference_tracker_setup")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for inference tracker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "inference_tracker_setup")
+        return
+
     # This is the existing Docker run command
-    container_name = f"inference_tracker_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-
-        f"--name
+        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name inference-tracker-worker "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
+        f' --restart=unless-stopped '
         f"{image}"
     )

```
```diff
@@ -2235,6 +1968,7 @@ def video_storage_setup_execute(self: ActionInstance):
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f'-e ACTION_ID="{self.action_record_id}" '
+        f' --restart=unless-stopped '
         f"{image}"
     )
```