matrice-compute 0.1.35__tar.gz → 0.1.36__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/PKG-INFO +1 -1
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/matrice_compute.egg-info/PKG-INFO +1 -1
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/src/matrice_compute/action_instance.py +222 -510
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/LICENSE.txt +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/README.md +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/matrice_compute.egg-info/SOURCES.txt +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/matrice_compute.egg-info/dependency_links.txt +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/matrice_compute.egg-info/not-zip-safe +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/matrice_compute.egg-info/top_level.txt +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/pyproject.toml +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/setup.cfg +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/setup.py +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/src/matrice_compute/__init__.py +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/src/matrice_compute/actions_manager.py +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/src/matrice_compute/actions_scaledown_manager.py +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/src/matrice_compute/compute_operations_handler.py +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/src/matrice_compute/instance_manager.py +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/src/matrice_compute/instance_utils.py +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/src/matrice_compute/prechecks.py +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/src/matrice_compute/py.typed +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/src/matrice_compute/resources_tracker.py +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/src/matrice_compute/scaling.py +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/src/matrice_compute/shutdown_manager.py +0 -0
- {matrice_compute-0.1.35 → matrice_compute-0.1.36}/src/matrice_compute/task_utils.py +0 -0
@@ -10,7 +10,6 @@ import signal
 import urllib.request
 from matrice_compute.instance_utils import (
     get_gpu_with_sufficient_memory_for_action,
-    get_gpu_config_for_deployment,
     get_decrypted_access_key_pair,
     get_max_file_system,
     get_best_service_ip_and_network,
@@ -27,10 +26,6 @@ from matrice_common.utils import log_errors
 class ActionInstance:
     """Base class for tasks that run in Action containers."""

-    # Class-level dictionary to track deployed services and their ports
-    # Key: _idService, Value: {"triton_ports": "port1,port2,port3", "is_first": False}
-    _deployed_services = {}
-
     def __init__(self, scaling: Scaling, action_info: dict):
         """Initialize an action instance.

@@ -90,67 +85,6 @@ class ActionInstance:
             raise ValueError(f"Unknown action type: {self.action_type}")
         self.task = self.actions_map[self.action_type]

-    @classmethod
-    def is_first_deployment_for_service(cls, service_id):
-        """Check if this is the first deployment for a given service.
-
-        Args:
-            service_id (str): Service ID (_idService)
-
-        Returns:
-            bool: True if this is the first deployment, False otherwise
-        """
-        if not service_id:
-            return False
-        return service_id not in cls._deployed_services
-
-    @classmethod
-    def get_or_create_triton_ports(cls, service_id, scaling_instance):
-        """Get existing TRITON_PORTS for a service or create new ones.
-
-        Args:
-            service_id (str): Service ID (_idService)
-            scaling_instance: Scaling instance to get open ports
-
-        Returns:
-            str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
-        """
-        if not service_id:
-            # No service_id, generate new ports
-            port1 = scaling_instance.get_open_port()
-            port2 = scaling_instance.get_open_port()
-            port3 = scaling_instance.get_open_port()
-            return f"{port1},{port2},{port3}"
-
-        # Check if ports already exist for this service
-        if service_id in cls._deployed_services:
-            triton_ports = cls._deployed_services[service_id]["triton_ports"]
-            logging.info(
-                "Reusing TRITON_PORTS for service %s: %s",
-                service_id,
-                triton_ports
-            )
-            return triton_ports
-
-        # First deployment: generate new ports and store them
-        port1 = scaling_instance.get_open_port()
-        port2 = scaling_instance.get_open_port()
-        port3 = scaling_instance.get_open_port()
-        triton_ports = f"{port1},{port2},{port3}"
-
-        # Store for future use
-        cls._deployed_services[service_id] = {
-            "triton_ports": triton_ports,
-            "is_first": False
-        }
-
-        logging.info(
-            "First deployment for service %s - generated TRITON_PORTS: %s",
-            service_id,
-            triton_ports
-        )
-        return triton_ports
-
     @log_errors(default_return={}, raise_exception=True, log_error=False)
     def _init_credentials(self):
         """Initialize Matrice credentials.
@@ -297,7 +231,7 @@
                 getattr(self, "action_record_id", "unknown"),
             )
         else:
-            logging.
+            logging.debug(
                 "No additional logs to send for action %s",
                 getattr(self, "action_record_id", "unknown"),
             )
@@ -352,13 +286,13 @@
         ).get("gpuMemory", 0)

         logging.info(
-            "Action %s requires GPU with %d MB memory - selecting GPU(s)
+            "Action %s requires GPU with %d MB memory - selecting best-fit GPU(s)",
             action_id,
             required_memory
         )

         try:
-            # Get the GPU(s) with
+            # Get the best-fit GPU(s) with sufficient memory
             gpu_indices = get_gpu_with_sufficient_memory_for_action(
                 action_details=action_details
             )
@@ -412,7 +346,6 @@
         destination_workspace_path: str = "/usr/src/workspace",
         docker_workdir: str = "",
         extra_pkgs: list = [],
-        container_name: str = "",
     ):
         """Build base Docker command with common options.

@@ -427,7 +360,6 @@
             destination_workspace_path (str): Container workspace path
             docker_workdir (str): Docker working directory
             extra_pkgs (list): List of extra packages to install
-            container_name (str): Docker container name (format: {action_type}_{action_id})
         Returns:
             str: Base Docker command
         """
@@ -492,20 +424,17 @@
             ]
         )

-        # Build container name option if provided
-        name_option = f"--name {container_name}" if container_name else ""
-
         # if the service provider is local, then put --restart unless-stopped
         if os.environ.get("SERVICE_PROVIDER") in ("local", "LOCAL"):
             env_exports += " && export DOCKER_RESTART_POLICY='--restart unless-stopped' "

         cmd_parts = [
-            f"docker run
-            name_option,
+            f"docker run {use_gpu} ",
             network_config,
             *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
             *volumes,
             # Container configuration and startup commands
+            f"--cidfile ./{self.action_record_id}.cid ",
             f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
             f'/bin/bash -c "cd {docker_workdir} && '
             f"{env_exports} && "
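The `--cidfile` option injected into the command above is the mechanism this release standardizes on for container tracking: Docker writes the new container's ID into the named file as soon as the container is created, so the caller never has to parse `docker run` output or assign fixed names. A minimal standalone sketch of that round trip, assuming a local Docker daemon (the `alpine:3` image and temp-file layout are illustrative, not part of the package):

import subprocess
import tempfile
import time
from pathlib import Path

# docker run refuses to start if the cidfile already exists, so use a fresh path
cid_path = Path(tempfile.mkdtemp()) / "demo.cid"

proc = subprocess.Popen(
    ["docker", "run", "--rm", "--cidfile", str(cid_path), "alpine:3", "sleep", "3"],
)

# poll until Docker has created the container and written its ID
container_id = None
for _ in range(30):
    if cid_path.exists() and cid_path.read_text().strip():
        container_id = cid_path.read_text().strip()
        break
    time.sleep(0.5)

print("container id:", container_id)
proc.wait()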
@@ -893,34 +822,6 @@
             job_params=action_details["jobParams"],
         )

-    @staticmethod
-    def container_exists(container_id: str) -> bool:
-        """Check if a Docker container exists.
-
-        Args:
-            container_id (str): Container ID or name to check
-
-        Returns:
-            bool: True if container exists, False otherwise
-        """
-        if not container_id:
-            return False
-        try:
-            result = subprocess.run(
-                ["docker", "inspect", container_id],
-                capture_output=True,
-                text=True,
-                timeout=10
-            )
-            return result.returncode == 0
-        except Exception as e:
-            logging.warning(
-                "Error checking if container %s exists: %s",
-                container_id,
-                str(e)
-            )
-            return False
-
     @log_errors(raise_exception=True)
     def start_process(self, cmd, log_name):
         """Start the process and initialize logging.
@@ -935,54 +836,60 @@
         self.cmd = cmd
         self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"

-
-
-
-
-
-
-
-
-
-        # Use a longer timeout for docker run since --pull=always may need to
-        # download large images on first run. Default: 30 minutes (1800 seconds)
-        # Can be configured via DOCKER_START_TIMEOUT_SECONDS environment variable
-        docker_start_timeout = int(os.environ.get("DOCKER_START_TIMEOUT_SECONDS", 1800))
-        logging.info(
-            "Waiting for docker container to start for action %s (timeout: %d seconds)",
-            self.action_record_id,
-            docker_start_timeout,
-        )
-        stdout, stderr = process.communicate(timeout=docker_start_timeout)
+        with open(self.log_path, "wb") as out:
+            self.process = subprocess.Popen(
+                shlex.split(self.cmd),
+                stdout=out,
+                stderr=out,
+                env={**os.environ},
+                start_new_session=True,
+            )

-
+        self.container_id = None
+
+        cid_file_path = f"./{self.action_record_id}.cid"
+        max_retries = 5
+        retry_delay = 1  # seconds
+        for attempt in range(max_retries):
+            try:
+                with open(cid_file_path, "r") as cid_file:
+                    container_id = cid_file.read().strip()
+                    self.container_id = container_id
+                    logging.info(
+                        "Started process for action %s with container ID: %s",
+                        self.action_record_id,
+                        self.container_id,
+                    )
+                    break
+            except FileNotFoundError:
+                logging.warning(
+                    "CID file not found for action %s, attempt %d/%d",
+                    self.action_record_id,
+                    attempt + 1,
+                    max_retries,
+                )
+                time.sleep(retry_delay)
+            except Exception as e:
+                logging.error(
+                    "Error reading CID file for action %s: %s",
+                    self.action_record_id,
+                    str(e),
+                )
+                time.sleep(retry_delay)
+        else:
             logging.error(
-                "
+                "Failed to read CID file for action %s after %d attempts",
                 self.action_record_id,
-
+                max_retries,
             )
-            raise
-
-        self.container_id = stdout.strip()
-        logging.info(
-            "Started container for action %s with ID: %s",
-            self.action_record_id,
-            self.container_id,
-        )
-
-        # Start following container logs in background
-        self.process = subprocess.Popen(
-            ["docker", "logs", "-f", self.container_id],
-            stdout=open(self.log_path, "wb"),
-            stderr=subprocess.STDOUT,
-            start_new_session=True,
-        )
+            raise Exception("Failed to start process: CID file not found")

-        #
+        # report container id to scaling service
         self.scaling.update_action_container_id(
             action_record_id=self.action_record_id,
             container_id=self.container_id,
         )
+

     @log_errors(raise_exception=False)
     def start_logger(self):
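The rewritten `start_process` above relies on Python's `for`/`else`: the `else` suite runs only when the loop finishes without `break`, which is what converts a CID file that never appears into a hard failure. The same idiom, isolated as a sketch (the function name and timings are illustrative, not package API):

import time

def read_cid_with_retries(cid_path: str, max_retries: int = 5, retry_delay: float = 1.0) -> str:
    container_id = None
    for _ in range(max_retries):
        try:
            with open(cid_path) as cid_file:
                container_id = cid_file.read().strip()
            break  # success: the else clause below is skipped
        except FileNotFoundError:
            time.sleep(retry_delay)
    else:
        # reached only when every attempt failed and the loop never broke
        raise Exception("Failed to start process: CID file not found")
    return container_id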
@@ -1143,8 +1050,7 @@ def data_preparation_execute(
         "Started pulling Docker image with PID: %s",
         process.pid,
     )
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "data_preparation_log")

@@ -1173,8 +1079,7 @@ def data_processing_execute(self: ActionInstance):
         service="bg-job-scheduler",
         job_params=action["jobParams"],
     )
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "data_processing_log")

@@ -1187,8 +1092,7 @@ def data_split_execute(self: ActionInstance):
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs, model_family="")
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "data_split")

@@ -1203,8 +1107,7 @@ def dataset_annotation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "dataset_annotation")

@@ -1219,8 +1122,7 @@ def dataset_augmentation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "dataset_augmentation")

@@ -1236,8 +1138,7 @@ def augmentation_server_creation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "augmentation_setup")

@@ -1258,45 +1159,30 @@ def database_setup_execute(self: ActionInstance):

     project_id = action_details["_idProject"]

-
-
-
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for inference tracker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "qdrant_setup")

-
-
-
-        mongodb_container_exists = ActionInstance.container_exists(existing_container_id)
-        qdrant_container_exists = ActionInstance.container_exists(qdrant_container_name)
+        #qdrant restart
+        qdrant_cmd = "docker restart qdrant"
+        self.start(qdrant_cmd, 'qdrant_setup')

-
-
-                "Using existing container ID for database setup: %s",
-                existing_container_id,
-            )
-            self.docker_container = existing_container_id
-            cmd = "docker restart " + self.docker_container
-            self.start(cmd, "database_setup")
+        return
+

-
-            qdrant_cmd = f"docker restart {qdrant_container_name}"
-            self.start(qdrant_cmd, "qdrant_setup")
-            return
-        else:
-            logging.warning(
-                "Container(s) not found (mongodb=%s, qdrant=%s). Creating new containers.",
-                mongodb_container_exists,
-                qdrant_container_exists
-            )
-            # Fall through to create new containers
+    dbPath =action_details["jobParams"].get("dbPath","/host/data/path/mongodb_data")

-    dbPath = action_details["jobParams"].get("dbPath", "/host/data/path/mongodb_data")

     # MongoDB container with --net=host (Port: 27020:27017)
     cmd = (
         f"docker run --pull=always --net=host "
-        f"--name {mongodb_container_name} "
-        f"-v matrice_myvol:/matrice_data "
         f"-v {dbPath}:{dbPath} "
+        f"--name database_setup_{self.action_record_id} "
         f"-v /var/run/docker.sock:/var/run/docker.sock "
         f"--cidfile ./{self.action_record_id}.cid "
         f"-e ACTION_RECORD_ID={self.action_record_id} "
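The restart-or-create shape introduced here recurs in most of the setup functions below: when the backend has recorded a `containerId` for the action, the worker is revived with `docker restart`; otherwise a fresh `docker run` command is assembled. Distilled as a sketch under that assumption (the helper name is illustrative, not in the package):

def build_container_cmd(action_details: dict, create_cmd: str) -> str:
    # reuse the container the backend already knows about, if any
    container_id = action_details.get("actionDetails", {}).get("containerId")
    if container_id:
        return "docker restart " + container_id
    return create_cmd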
@@ -1308,23 +1194,6 @@ def database_setup_execute(self: ActionInstance):
     )
     logging.info("Starting DB container (Port: 27020:27017): %s", cmd)

-    # Qdrant container with --net=host (Port: 6334)
-    qdrant_cmd = (
-        f"docker run -d --pull=always --net=host "
-        f"--name {qdrant_container_name} "
-        f"-v matrice_myvol:/matrice_data "
-        f"qdrant/qdrant:latest "
-    )
-    logging.info("Starting Qdrant container (Port: 6334): %s", qdrant_cmd)
-
-    # Start Qdrant container
-    qdrant_process = subprocess.Popen(
-        qdrant_cmd,
-        shell=True,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-    )
-    logging.info("Qdrant container started successfully")

     # Docker Command run
     self.start(cmd, "database_setup")
@@ -1344,32 +1213,23 @@ def facial_recognition_setup_execute(self: ActionInstance):

     self.setup_action_requirements(action_details)

-
-
-
-
-
-
-
-
-
-        cmd = "docker restart " + self.docker_container
-        self.start(cmd, "facial_recognition_setup")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for facial recognition worker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "facial_recognition_setup")
+        return

     # Facial recognition worker container with --net=host (Port: 8081)
-    container_name = f"facial_recognition_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
-
+        f"--name worker "
+        f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
+        f"--cidfile ./{self.action_record_id}.cid "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1396,30 +1256,20 @@ def lpr_setup_execute(self: ActionInstance):

     self.setup_action_requirements(action_details)

-
-
-
-
-
-
-
-
-
-        cmd = "docker restart " + self.docker_container
-        self.start(cmd, "lpr_setup")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for LPR worker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "lpr_setup")
+        return

     # LPR worker container with --net=host (Port: 8082)
-    container_name = f"lpr_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --net=host --pull=always "
-        f"--name
+        f"--name lpr-worker "
         f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
@@ -1458,30 +1308,20 @@ def inference_ws_server_execute(self: ActionInstance):

     logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")

-
-
-
-
-
-
-
-
-
-        cmd = "docker restart " + self.docker_container
-        self.start(cmd, "inference_ws_server")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for inference WebSocket server: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "inference_ws_server")
+        return

     # Inference WebSocket server with --net=host (Port: 8102)
-    container_name = f"inference_ws_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
+        f"--name inference "
         f"--cidfile ./{self.action_record_id}.cid "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1517,30 +1357,20 @@ def fe_fs_streaming_execute(self: ActionInstance):

     logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")

-
-
-
-
-
-
-
-
-
-
-        self.start(cmd, "fe_fs_streaming")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for frontend streaming: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "fe_fs_streaming")
+        return
+
     # Frontend streaming with --net=host (Port: 3000)
-    container_name = f"fe_streaming_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
+        f"--name fe_streaming "
         f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
@@ -1573,30 +1403,20 @@ def fe_analytics_service_execute(self: ActionInstance):

     project_id = action_details["_idProject"]

-
-
-
-
-
-
-
-
-
-
-        self.start(cmd, "fe_analytics_service")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for frontend analytics service: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "fe_analytics_service")
+        return
+
     # Frontend analytics service with --net=host (Port: 3001)
-    container_name = f"fe_analytics_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
+        f"--name fe-analytics "
         f"--cidfile ./{self.action_record_id}.cid "
         f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1629,8 +1449,7 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
     else:
         return
     use_gpu = self.get_gpu_config(action_details)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "dataset_generation")

@@ -1651,8 +1470,7 @@ def synthetic_data_setup_execute(self: ActionInstance):
     else:
         return
     use_gpu = self.get_gpu_config(action_details)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "synthetic_data_setup")

@@ -1689,60 +1507,31 @@ def redis_setup_execute(self: ActionInstance):

     redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")

-    # Define container names with action_record_id for uniqueness
-    redis_container_name = f"redis_{self.action_record_id}"

-
-
-
-
-
-
-
-
-                "Using existing container ID for redis management: %s",
-                existing_container_id,
-            )
-            self.docker_container = existing_container_id
-            cmd = "docker restart " + self.docker_container
-            self.start(cmd, "redis_setup")
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for redis management: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "redis_setup")

-
-
-
-            return
-        else:
-            logging.warning(
-                "Container(s) not found (management=%s, redis=%s). Creating new containers.",
-                management_container_exists,
-                redis_container_exists
-            )
-            # Fall through to create new containers
+        # Redis container restart
+        redis_restart_cmd = "docker restart redis_container"
+        self.start(redis_restart_cmd, "redis")

+        return
+
     # Redis container with --net=host (Port: 6379)
     redis_cmd = (
         f"docker run -d --net=host "
-        f"--name
+        f"--name redis_container "
         f"--restart unless-stopped "
         f"{redis_image} "
-        f"redis-server --bind 0.0.0.0 "
-        f"--appendonly no "
-        f'--save "" '
-        f"--maxmemory 30gb "
-        f"--maxmemory-policy allkeys-lru "
-        f"--io-threads 4 "
-        f"--io-threads-do-reads yes "
-        f"--stream-node-max-bytes 8192 "
-        f"--stream-node-max-entries 1000 "
-        f"--hz 100 "
-        f"--tcp-backlog 2048 "
-        f"--timeout 0 "
-        f"--lazyfree-lazy-eviction yes "
-        f"--lazyfree-lazy-expire yes "
-        f"--lazyfree-lazy-server-del yes "
-        f"--activedefrag yes "
-        f"--requirepass {redis_password}"
+        f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
     )
+
     logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)

     # Start Redis container first
@@ -1792,8 +1581,7 @@ def deploy_aggregator_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
-    cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "deploy_aggregator")

@@ -1809,10 +1597,6 @@ def model_deploy_execute(self: ActionInstance):
         return
     action_id = action_details["_id"]
     model_family = action_details["actionDetails"]["modelFamily"]
-
-    # Get the service ID to track deployments
-    service_id = action_details.get("_idService")
-
     self.setup_action_requirements(
         action_details,
         work_fs,
@@ -1820,29 +1604,17 @@ def model_deploy_execute(self: ActionInstance):
         action_id=action_id,
     )

-    #
-
-
-    # Get GPU configuration (uses utility function with fail-safe fallback)
-    use_gpu = get_gpu_config_for_deployment(action_details, is_first_deployment)
-
-    logging.info(
-        "Action %s: Model deployment GPU config: %s (first_deployment=%s)",
-        action_id,
-        use_gpu if use_gpu else "CPU-only",
-        is_first_deployment
-    )
-
-    # Get or create TRITON_PORTS (uses utility method)
-    triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
+    # Get GPU configuration based on requirements and availability
+    # This uses the best-fit algorithm to select the most appropriate GPU(s)
+    use_gpu = self.get_gpu_config(action_details)

-
-
-
-
+    # Override: If GPU is required, use all available GPUs
+    gpuRequired = action_details["actionDetails"].get("gpuRequired", False)
+    if gpuRequired:
+        use_gpu = "--runtime=nvidia --gpus all"

-
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"]
+    extra_env_vars = {"INTERNAL_PORT": internal_port}
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "deploy_log")

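Deployment now resolves GPU flags in two steps: a best-fit pick via `get_gpu_config`, then a blanket override when the action is flagged `gpuRequired`. The decision logic, isolated as a sketch (the function name is hypothetical; the flag strings are the ones used above):

def resolve_gpu_flags(action_details: dict, best_fit_flags: str) -> str:
    # gpuRequired trumps best-fit selection: claim every GPU on the host
    if action_details.get("actionDetails", {}).get("gpuRequired", False):
        return "--runtime=nvidia --gpus all"
    return best_fit_flags  # may be empty for CPU-only actions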
@@ -1865,27 +1637,17 @@ def model_train_execute(self: ActionInstance):
         action_id=action_id,
     )

-
-
-
-
-
-
-
-
-
-
-
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
-    container_name = f"model_train_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "train_log")
+        return
+
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "train_log")

|
|
|
1906
1668
|
model_family=model_family,
|
|
1907
1669
|
action_id=action_id,
|
|
1908
1670
|
)
|
|
1909
|
-
|
|
1910
|
-
|
|
1911
|
-
|
|
1912
|
-
|
|
1913
|
-
|
|
1914
|
-
|
|
1915
|
-
|
|
1916
|
-
|
|
1917
|
-
|
|
1918
|
-
|
|
1919
|
-
|
|
1920
|
-
return
|
|
1921
|
-
else:
|
|
1922
|
-
logging.warning(
|
|
1923
|
-
"Container %s not found. Creating new container.",
|
|
1924
|
-
existing_container_id
|
|
1925
|
-
)
|
|
1926
|
-
# Fall through to create new container
|
|
1927
|
-
|
|
1928
|
-
container_name = f"model_eval_{self.action_record_id}"
|
|
1929
|
-
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
|
|
1671
|
+
if action_details["actionDetails"].get("containerId"):
|
|
1672
|
+
logging.info(
|
|
1673
|
+
"Using existing container ID for training: %s",
|
|
1674
|
+
action_details["actionDetails"]["containerId"],
|
|
1675
|
+
)
|
|
1676
|
+
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
1677
|
+
cmd = "docker restart " + self.docker_container
|
|
1678
|
+
self.start(cmd, "eval_log")
|
|
1679
|
+
return
|
|
1680
|
+
|
|
1681
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
|
|
1930
1682
|
logging.info("cmd is: %s", cmd)
|
|
1931
1683
|
self.start(cmd, "eval_log")
|
|
1932
1684
|
|
|
@@ -1950,27 +1702,17 @@ def model_export_execute(self: ActionInstance):
         model_family=model_family,
         action_id=action_id,
     )
-
-
-
-
-
-
-
-
-
-
-
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
-    container_name = f"model_export_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "export_log")
+        return
+
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "export_log")

@@ -1986,8 +1728,7 @@ def image_build_execute(self: ActionInstance):
     action_id = action_details["_id"]
     internal_api_key = self.get_internal_api_key(action_id)
    extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
-
-    cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
+    cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "image_build_log")

@@ -1999,8 +1740,7 @@ def resource_clone_execute(self: ActionInstance):
     if not action_details:
         return
     self.setup_action_requirements(action_details)
-
-    cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
+    cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "resource_clone")

@@ -2016,27 +1756,17 @@ def streaming_gateway_execute(self: ActionInstance):
     self.docker_container = (
         f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
     )
-
-
-
-
-
-
-
-
-
-
-
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
-    container_name = f"streaming_gateway_{self.action_record_id}"
-    cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "streaming_gateway")
+        return
+
+    cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "streaming_gateway")

@@ -2130,24 +1860,16 @@ def kafka_setup_execute(self: ActionInstance):
     else:
         pkgs = f"matrice_common matrice"

-
-
-
-
-
-
-
-
-
-
-        self.start(cmd, "kafka_setup")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for training: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "kafka_setup")
+        return
+

     # Kafka container with --net=host (Ports: 9092, 9093)
     cmd = (
@@ -2184,31 +1906,21 @@ def inference_tracker_setup_execute(self: ActionInstance):

     self.setup_action_requirements(action_details)

-
-
-
-
-
-
-
-
-
-
-        self.start(cmd, "inference_tracker_setup")
-        return
-    else:
-        logging.warning(
-            "Container %s not found. Creating new container.",
-            existing_container_id
-        )
-        # Fall through to create new container
-
+    if action_details["actionDetails"].get("containerId"):
+        logging.info(
+            "Using existing container ID for inference tracker: %s",
+            action_details["actionDetails"]["containerId"],
+        )
+        self.docker_container = action_details["actionDetails"]["containerId"]
+        cmd = "docker restart " + self.docker_container
+        self.start(cmd, "inference_tracker_setup")
+        return
+
     # This is the existing Docker run command
-    container_name = f"inference_tracker_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-
-        f"--name
+        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name inference-tracker-worker "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -2256,7 +1968,7 @@ def video_storage_setup_execute(self: ActionInstance):
     f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
     f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
     f'-e ACTION_ID="{self.action_record_id}" '
-    f'--restart=unless-stopped '
+    f' --restart=unless-stopped '
     f"{image}"
     )
