matrice-compute 0.1.35__py3-none-any.whl → 0.1.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/action_instance.py +225 -511
- {matrice_compute-0.1.35.dist-info → matrice_compute-0.1.37.dist-info}/METADATA +1 -1
- {matrice_compute-0.1.35.dist-info → matrice_compute-0.1.37.dist-info}/RECORD +6 -6
- {matrice_compute-0.1.35.dist-info → matrice_compute-0.1.37.dist-info}/WHEEL +0 -0
- {matrice_compute-0.1.35.dist-info → matrice_compute-0.1.37.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_compute-0.1.35.dist-info → matrice_compute-0.1.37.dist-info}/top_level.txt +0 -0
|
@@ -10,7 +10,6 @@ import signal
|
|
|
10
10
|
import urllib.request
|
|
11
11
|
from matrice_compute.instance_utils import (
|
|
12
12
|
get_gpu_with_sufficient_memory_for_action,
|
|
13
|
-
get_gpu_config_for_deployment,
|
|
14
13
|
get_decrypted_access_key_pair,
|
|
15
14
|
get_max_file_system,
|
|
16
15
|
get_best_service_ip_and_network,
|
|
@@ -27,10 +26,6 @@ from matrice_common.utils import log_errors
|
|
|
27
26
|
class ActionInstance:
|
|
28
27
|
"""Base class for tasks that run in Action containers."""
|
|
29
28
|
|
|
30
|
-
# Class-level dictionary to track deployed services and their ports
|
|
31
|
-
# Key: _idService, Value: {"triton_ports": "port1,port2,port3", "is_first": False}
|
|
32
|
-
_deployed_services = {}
|
|
33
|
-
|
|
34
29
|
def __init__(self, scaling: Scaling, action_info: dict):
|
|
35
30
|
"""Initialize an action instance.
|
|
36
31
|
|
|
@@ -90,67 +85,6 @@ class ActionInstance:
|
|
|
90
85
|
raise ValueError(f"Unknown action type: {self.action_type}")
|
|
91
86
|
self.task = self.actions_map[self.action_type]
|
|
92
87
|
|
|
93
|
-
@classmethod
|
|
94
|
-
def is_first_deployment_for_service(cls, service_id):
|
|
95
|
-
"""Check if this is the first deployment for a given service.
|
|
96
|
-
|
|
97
|
-
Args:
|
|
98
|
-
service_id (str): Service ID (_idService)
|
|
99
|
-
|
|
100
|
-
Returns:
|
|
101
|
-
bool: True if this is the first deployment, False otherwise
|
|
102
|
-
"""
|
|
103
|
-
if not service_id:
|
|
104
|
-
return False
|
|
105
|
-
return service_id not in cls._deployed_services
|
|
106
|
-
|
|
107
|
-
@classmethod
|
|
108
|
-
def get_or_create_triton_ports(cls, service_id, scaling_instance):
|
|
109
|
-
"""Get existing TRITON_PORTS for a service or create new ones.
|
|
110
|
-
|
|
111
|
-
Args:
|
|
112
|
-
service_id (str): Service ID (_idService)
|
|
113
|
-
scaling_instance: Scaling instance to get open ports
|
|
114
|
-
|
|
115
|
-
Returns:
|
|
116
|
-
str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
|
|
117
|
-
"""
|
|
118
|
-
if not service_id:
|
|
119
|
-
# No service_id, generate new ports
|
|
120
|
-
port1 = scaling_instance.get_open_port()
|
|
121
|
-
port2 = scaling_instance.get_open_port()
|
|
122
|
-
port3 = scaling_instance.get_open_port()
|
|
123
|
-
return f"{port1},{port2},{port3}"
|
|
124
|
-
|
|
125
|
-
# Check if ports already exist for this service
|
|
126
|
-
if service_id in cls._deployed_services:
|
|
127
|
-
triton_ports = cls._deployed_services[service_id]["triton_ports"]
|
|
128
|
-
logging.info(
|
|
129
|
-
"Reusing TRITON_PORTS for service %s: %s",
|
|
130
|
-
service_id,
|
|
131
|
-
triton_ports
|
|
132
|
-
)
|
|
133
|
-
return triton_ports
|
|
134
|
-
|
|
135
|
-
# First deployment: generate new ports and store them
|
|
136
|
-
port1 = scaling_instance.get_open_port()
|
|
137
|
-
port2 = scaling_instance.get_open_port()
|
|
138
|
-
port3 = scaling_instance.get_open_port()
|
|
139
|
-
triton_ports = f"{port1},{port2},{port3}"
|
|
140
|
-
|
|
141
|
-
# Store for future use
|
|
142
|
-
cls._deployed_services[service_id] = {
|
|
143
|
-
"triton_ports": triton_ports,
|
|
144
|
-
"is_first": False
|
|
145
|
-
}
|
|
146
|
-
|
|
147
|
-
logging.info(
|
|
148
|
-
"First deployment for service %s - generated TRITON_PORTS: %s",
|
|
149
|
-
service_id,
|
|
150
|
-
triton_ports
|
|
151
|
-
)
|
|
152
|
-
return triton_ports
|
|
153
|
-
|
|
154
88
|
@log_errors(default_return={}, raise_exception=True, log_error=False)
|
|
155
89
|
def _init_credentials(self):
|
|
156
90
|
"""Initialize Matrice credentials.
|
|
@@ -297,7 +231,7 @@ class ActionInstance:
|
|
|
297
231
|
getattr(self, "action_record_id", "unknown"),
|
|
298
232
|
)
|
|
299
233
|
else:
|
|
300
|
-
logging.
|
|
234
|
+
logging.debug(
|
|
301
235
|
"No additional logs to send for action %s",
|
|
302
236
|
getattr(self, "action_record_id", "unknown"),
|
|
303
237
|
)
|
|
@@ -352,13 +286,13 @@ class ActionInstance:
|
|
|
352
286
|
).get("gpuMemory", 0)
|
|
353
287
|
|
|
354
288
|
logging.info(
|
|
355
|
-
"Action %s requires GPU with %d MB memory - selecting GPU(s)
|
|
289
|
+
"Action %s requires GPU with %d MB memory - selecting best-fit GPU(s)",
|
|
356
290
|
action_id,
|
|
357
291
|
required_memory
|
|
358
292
|
)
|
|
359
293
|
|
|
360
294
|
try:
|
|
361
|
-
# Get the GPU(s) with
|
|
295
|
+
# Get the best-fit GPU(s) with sufficient memory
|
|
362
296
|
gpu_indices = get_gpu_with_sufficient_memory_for_action(
|
|
363
297
|
action_details=action_details
|
|
364
298
|
)
|
|
@@ -412,7 +346,6 @@ class ActionInstance:
|
|
|
412
346
|
destination_workspace_path: str = "/usr/src/workspace",
|
|
413
347
|
docker_workdir: str = "",
|
|
414
348
|
extra_pkgs: list = [],
|
|
415
|
-
container_name: str = "",
|
|
416
349
|
):
|
|
417
350
|
"""Build base Docker command with common options.
|
|
418
351
|
|
|
@@ -427,7 +360,6 @@ class ActionInstance:
|
|
|
427
360
|
destination_workspace_path (str): Container workspace path
|
|
428
361
|
docker_workdir (str): Docker working directory
|
|
429
362
|
extra_pkgs (list): List of extra packages to install
|
|
430
|
-
container_name (str): Docker container name (format: {action_type}_{action_id})
|
|
431
363
|
Returns:
|
|
432
364
|
str: Base Docker command
|
|
433
365
|
"""
|
|
@@ -492,20 +424,19 @@ class ActionInstance:
|
|
|
492
424
|
]
|
|
493
425
|
)
|
|
494
426
|
|
|
495
|
-
# Build container name option if provided
|
|
496
|
-
name_option = f"--name {container_name}" if container_name else ""
|
|
497
|
-
|
|
498
427
|
# if the service provider is local, then put --restart unless-stopped
|
|
499
428
|
if os.environ.get("SERVICE_PROVIDER") in ("local", "LOCAL"):
|
|
500
|
-
|
|
429
|
+
use_restart_policy = "--restart unless-stopped"
|
|
430
|
+
else:
|
|
431
|
+
use_restart_policy = ""
|
|
501
432
|
|
|
502
433
|
cmd_parts = [
|
|
503
|
-
f"docker run
|
|
504
|
-
name_option,
|
|
434
|
+
f"docker run {use_gpu} {use_restart_policy} ",
|
|
505
435
|
network_config,
|
|
506
436
|
*[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
|
|
507
437
|
*volumes,
|
|
508
438
|
# Container configuration and startup commands
|
|
439
|
+
f"--cidfile ./{self.action_record_id}.cid ",
|
|
509
440
|
f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
|
|
510
441
|
f'/bin/bash -c "cd {docker_workdir} && '
|
|
511
442
|
f"{env_exports} && "
|
|
@@ -893,34 +824,6 @@ class ActionInstance:
|
|
|
893
824
|
job_params=action_details["jobParams"],
|
|
894
825
|
)
|
|
895
826
|
|
|
896
|
-
@staticmethod
|
|
897
|
-
def container_exists(container_id: str) -> bool:
|
|
898
|
-
"""Check if a Docker container exists.
|
|
899
|
-
|
|
900
|
-
Args:
|
|
901
|
-
container_id (str): Container ID or name to check
|
|
902
|
-
|
|
903
|
-
Returns:
|
|
904
|
-
bool: True if container exists, False otherwise
|
|
905
|
-
"""
|
|
906
|
-
if not container_id:
|
|
907
|
-
return False
|
|
908
|
-
try:
|
|
909
|
-
result = subprocess.run(
|
|
910
|
-
["docker", "inspect", container_id],
|
|
911
|
-
capture_output=True,
|
|
912
|
-
text=True,
|
|
913
|
-
timeout=10
|
|
914
|
-
)
|
|
915
|
-
return result.returncode == 0
|
|
916
|
-
except Exception as e:
|
|
917
|
-
logging.warning(
|
|
918
|
-
"Error checking if container %s exists: %s",
|
|
919
|
-
container_id,
|
|
920
|
-
str(e)
|
|
921
|
-
)
|
|
922
|
-
return False
|
|
923
|
-
|
|
924
827
|
@log_errors(raise_exception=True)
|
|
925
828
|
def start_process(self, cmd, log_name):
|
|
926
829
|
"""Start the process and initialize logging.
|
|
@@ -935,54 +838,60 @@ class ActionInstance:
|
|
|
935
838
|
self.cmd = cmd
|
|
936
839
|
self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
|
|
937
840
|
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
# Use a longer timeout for docker run since --pull=always may need to
|
|
948
|
-
# download large images on first run. Default: 30 minutes (1800 seconds)
|
|
949
|
-
# Can be configured via DOCKER_START_TIMEOUT_SECONDS environment variable
|
|
950
|
-
docker_start_timeout = int(os.environ.get("DOCKER_START_TIMEOUT_SECONDS", 1800))
|
|
951
|
-
logging.info(
|
|
952
|
-
"Waiting for docker container to start for action %s (timeout: %d seconds)",
|
|
953
|
-
self.action_record_id,
|
|
954
|
-
docker_start_timeout,
|
|
955
|
-
)
|
|
956
|
-
stdout, stderr = process.communicate(timeout=docker_start_timeout)
|
|
841
|
+
with open(self.log_path, "wb") as out:
|
|
842
|
+
self.process = subprocess.Popen(
|
|
843
|
+
shlex.split(self.cmd),
|
|
844
|
+
stdout=out,
|
|
845
|
+
stderr=out,
|
|
846
|
+
env={**os.environ},
|
|
847
|
+
start_new_session=True,
|
|
848
|
+
)
|
|
957
849
|
|
|
958
|
-
|
|
850
|
+
self.container_id = None
|
|
851
|
+
|
|
852
|
+
cid_file_path = f"./{self.action_record_id}.cid"
|
|
853
|
+
max_retries = 5
|
|
854
|
+
retry_delay = 1 # seconds
|
|
855
|
+
for attempt in range(max_retries):
|
|
856
|
+
try:
|
|
857
|
+
with open(cid_file_path, "r") as cid_file:
|
|
858
|
+
container_id = cid_file.read().strip()
|
|
859
|
+
self.container_id = container_id
|
|
860
|
+
logging.info(
|
|
861
|
+
"Started process for action %s with container ID: %s",
|
|
862
|
+
self.action_record_id,
|
|
863
|
+
self.container_id,
|
|
864
|
+
)
|
|
865
|
+
break
|
|
866
|
+
except FileNotFoundError:
|
|
867
|
+
logging.warning(
|
|
868
|
+
"CID file not found for action %s, attempt %d/%d",
|
|
869
|
+
self.action_record_id,
|
|
870
|
+
attempt + 1,
|
|
871
|
+
max_retries,
|
|
872
|
+
)
|
|
873
|
+
time.sleep(retry_delay)
|
|
874
|
+
except Exception as e:
|
|
875
|
+
logging.error(
|
|
876
|
+
"Error reading CID file for action %s: %s",
|
|
877
|
+
self.action_record_id,
|
|
878
|
+
str(e),
|
|
879
|
+
)
|
|
880
|
+
time.sleep(retry_delay)
|
|
881
|
+
else:
|
|
959
882
|
logging.error(
|
|
960
|
-
"
|
|
883
|
+
"Failed to read CID file for action %s after %d attempts",
|
|
961
884
|
self.action_record_id,
|
|
962
|
-
|
|
885
|
+
max_retries,
|
|
963
886
|
)
|
|
964
|
-
raise
|
|
965
|
-
|
|
966
|
-
self.container_id = stdout.strip()
|
|
967
|
-
logging.info(
|
|
968
|
-
"Started container for action %s with ID: %s",
|
|
969
|
-
self.action_record_id,
|
|
970
|
-
self.container_id,
|
|
971
|
-
)
|
|
972
|
-
|
|
973
|
-
# Start following container logs in background
|
|
974
|
-
self.process = subprocess.Popen(
|
|
975
|
-
["docker", "logs", "-f", self.container_id],
|
|
976
|
-
stdout=open(self.log_path, "wb"),
|
|
977
|
-
stderr=subprocess.STDOUT,
|
|
978
|
-
start_new_session=True,
|
|
979
|
-
)
|
|
887
|
+
raise Exception("Failed to start process: CID file not found")
|
|
980
888
|
|
|
981
|
-
#
|
|
889
|
+
# report container id to scaling service
|
|
982
890
|
self.scaling.update_action_container_id(
|
|
983
891
|
action_record_id=self.action_record_id,
|
|
984
892
|
container_id=self.container_id,
|
|
985
893
|
)
|
|
894
|
+
|
|
986
895
|
|
|
987
896
|
@log_errors(raise_exception=False)
|
|
988
897
|
def start_logger(self):
|
|
@@ -1143,8 +1052,7 @@ def data_preparation_execute(
|
|
|
1143
1052
|
"Started pulling Docker image with PID: %s",
|
|
1144
1053
|
process.pid,
|
|
1145
1054
|
)
|
|
1146
|
-
|
|
1147
|
-
cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
|
|
1055
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
|
|
1148
1056
|
logging.info("cmd is: %s", cmd)
|
|
1149
1057
|
self.start(cmd, "data_preparation_log")
|
|
1150
1058
|
|
|
@@ -1173,8 +1081,7 @@ def data_processing_execute(self: ActionInstance):
|
|
|
1173
1081
|
service="bg-job-scheduler",
|
|
1174
1082
|
job_params=action["jobParams"],
|
|
1175
1083
|
)
|
|
1176
|
-
|
|
1177
|
-
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
|
|
1084
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
|
|
1178
1085
|
logging.info("cmd: %s", cmd)
|
|
1179
1086
|
self.start(cmd, "data_processing_log")
|
|
1180
1087
|
|
|
@@ -1187,8 +1094,7 @@ def data_split_execute(self: ActionInstance):
|
|
|
1187
1094
|
if not action_details:
|
|
1188
1095
|
return
|
|
1189
1096
|
self.setup_action_requirements(action_details, work_fs, model_family="")
|
|
1190
|
-
|
|
1191
|
-
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
|
|
1097
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
|
|
1192
1098
|
logging.info("cmd: %s", cmd)
|
|
1193
1099
|
self.start(cmd, "data_split")
|
|
1194
1100
|
|
|
@@ -1203,8 +1109,7 @@ def dataset_annotation_execute(
|
|
|
1203
1109
|
if not action_details:
|
|
1204
1110
|
return
|
|
1205
1111
|
self.setup_action_requirements(action_details, work_fs)
|
|
1206
|
-
|
|
1207
|
-
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
|
|
1112
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
|
|
1208
1113
|
logging.info("cmd: %s", cmd)
|
|
1209
1114
|
self.start(cmd, "dataset_annotation")
|
|
1210
1115
|
|
|
@@ -1219,8 +1124,7 @@ def dataset_augmentation_execute(
|
|
|
1219
1124
|
if not action_details:
|
|
1220
1125
|
return
|
|
1221
1126
|
self.setup_action_requirements(action_details, work_fs)
|
|
1222
|
-
|
|
1223
|
-
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
|
|
1127
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
|
|
1224
1128
|
logging.info("cmd: %s", cmd)
|
|
1225
1129
|
self.start(cmd, "dataset_augmentation")
|
|
1226
1130
|
|
|
@@ -1236,8 +1140,7 @@ def augmentation_server_creation_execute(
|
|
|
1236
1140
|
if not action_details:
|
|
1237
1141
|
return
|
|
1238
1142
|
self.setup_action_requirements(action_details, work_fs)
|
|
1239
|
-
|
|
1240
|
-
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
|
|
1143
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
|
|
1241
1144
|
logging.info("cmd: %s", cmd)
|
|
1242
1145
|
self.start(cmd, "augmentation_setup")
|
|
1243
1146
|
|
|
@@ -1258,45 +1161,30 @@ def database_setup_execute(self: ActionInstance):
|
|
|
1258
1161
|
|
|
1259
1162
|
project_id = action_details["_idProject"]
|
|
1260
1163
|
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1164
|
+
if action_details["actionDetails"].get("containerId"):
|
|
1165
|
+
logging.info(
|
|
1166
|
+
"Using existing container ID for inference tracker: %s",
|
|
1167
|
+
action_details["actionDetails"]["containerId"],
|
|
1168
|
+
)
|
|
1169
|
+
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
1170
|
+
cmd = "docker restart " + self.docker_container
|
|
1171
|
+
self.start(cmd, "qdrant_setup")
|
|
1264
1172
|
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
mongodb_container_exists = ActionInstance.container_exists(existing_container_id)
|
|
1269
|
-
qdrant_container_exists = ActionInstance.container_exists(qdrant_container_name)
|
|
1173
|
+
#qdrant restart
|
|
1174
|
+
qdrant_cmd = "docker restart qdrant"
|
|
1175
|
+
self.start(qdrant_cmd, 'qdrant_setup')
|
|
1270
1176
|
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
"Using existing container ID for database setup: %s",
|
|
1274
|
-
existing_container_id,
|
|
1275
|
-
)
|
|
1276
|
-
self.docker_container = existing_container_id
|
|
1277
|
-
cmd = "docker restart " + self.docker_container
|
|
1278
|
-
self.start(cmd, "database_setup")
|
|
1177
|
+
return
|
|
1178
|
+
|
|
1279
1179
|
|
|
1280
|
-
|
|
1281
|
-
qdrant_cmd = f"docker restart {qdrant_container_name}"
|
|
1282
|
-
self.start(qdrant_cmd, "qdrant_setup")
|
|
1283
|
-
return
|
|
1284
|
-
else:
|
|
1285
|
-
logging.warning(
|
|
1286
|
-
"Container(s) not found (mongodb=%s, qdrant=%s). Creating new containers.",
|
|
1287
|
-
mongodb_container_exists,
|
|
1288
|
-
qdrant_container_exists
|
|
1289
|
-
)
|
|
1290
|
-
# Fall through to create new containers
|
|
1180
|
+
dbPath =action_details["jobParams"].get("dbPath","/host/data/path/mongodb_data")
|
|
1291
1181
|
|
|
1292
|
-
dbPath = action_details["jobParams"].get("dbPath", "/host/data/path/mongodb_data")
|
|
1293
1182
|
|
|
1294
1183
|
# MongoDB container with --net=host (Port: 27020:27017)
|
|
1295
1184
|
cmd = (
|
|
1296
1185
|
f"docker run --pull=always --net=host "
|
|
1297
|
-
f"--name {mongodb_container_name} "
|
|
1298
|
-
f"-v matrice_myvol:/matrice_data "
|
|
1299
1186
|
f"-v {dbPath}:{dbPath} "
|
|
1187
|
+
f"--name database_setup_{self.action_record_id} "
|
|
1300
1188
|
f"-v /var/run/docker.sock:/var/run/docker.sock "
|
|
1301
1189
|
f"--cidfile ./{self.action_record_id}.cid "
|
|
1302
1190
|
f"-e ACTION_RECORD_ID={self.action_record_id} "
|
|
@@ -1308,23 +1196,6 @@ def database_setup_execute(self: ActionInstance):
|
|
|
1308
1196
|
)
|
|
1309
1197
|
logging.info("Starting DB container (Port: 27020:27017): %s", cmd)
|
|
1310
1198
|
|
|
1311
|
-
# Qdrant container with --net=host (Port: 6334)
|
|
1312
|
-
qdrant_cmd = (
|
|
1313
|
-
f"docker run -d --pull=always --net=host "
|
|
1314
|
-
f"--name {qdrant_container_name} "
|
|
1315
|
-
f"-v matrice_myvol:/matrice_data "
|
|
1316
|
-
f"qdrant/qdrant:latest "
|
|
1317
|
-
)
|
|
1318
|
-
logging.info("Starting Qdrant container (Port: 6334): %s", qdrant_cmd)
|
|
1319
|
-
|
|
1320
|
-
# Start Qdrant container
|
|
1321
|
-
qdrant_process = subprocess.Popen(
|
|
1322
|
-
qdrant_cmd,
|
|
1323
|
-
shell=True,
|
|
1324
|
-
stdout=subprocess.PIPE,
|
|
1325
|
-
stderr=subprocess.PIPE,
|
|
1326
|
-
)
|
|
1327
|
-
logging.info("Qdrant container started successfully")
|
|
1328
1199
|
|
|
1329
1200
|
# Docker Command run
|
|
1330
1201
|
self.start(cmd, "database_setup")
|
|
@@ -1344,32 +1215,23 @@ def facial_recognition_setup_execute(self: ActionInstance):
|
|
|
1344
1215
|
|
|
1345
1216
|
self.setup_action_requirements(action_details)
|
|
1346
1217
|
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
cmd = "docker restart " + self.docker_container
|
|
1357
|
-
self.start(cmd, "facial_recognition_setup")
|
|
1358
|
-
return
|
|
1359
|
-
else:
|
|
1360
|
-
logging.warning(
|
|
1361
|
-
"Container %s not found. Creating new container.",
|
|
1362
|
-
existing_container_id
|
|
1363
|
-
)
|
|
1364
|
-
# Fall through to create new container
|
|
1218
|
+
if action_details["actionDetails"].get("containerId"):
|
|
1219
|
+
logging.info(
|
|
1220
|
+
"Using existing container ID for facial recognition worker: %s",
|
|
1221
|
+
action_details["actionDetails"]["containerId"],
|
|
1222
|
+
)
|
|
1223
|
+
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
1224
|
+
cmd = "docker restart " + self.docker_container
|
|
1225
|
+
self.start(cmd, "facial_recognition_setup")
|
|
1226
|
+
return
|
|
1365
1227
|
|
|
1366
1228
|
# Facial recognition worker container with --net=host (Port: 8081)
|
|
1367
|
-
container_name = f"facial_recognition_{self.action_record_id}"
|
|
1368
1229
|
worker_cmd = (
|
|
1369
1230
|
f"docker run -d --pull=always --net=host "
|
|
1370
|
-
f"--name
|
|
1371
|
-
|
|
1231
|
+
f"--name worker "
|
|
1232
|
+
f"--cidfile ./{self.action_record_id}.cid "
|
|
1372
1233
|
f"-v matrice_myvol:/matrice_data "
|
|
1234
|
+
f"--cidfile ./{self.action_record_id}.cid "
|
|
1373
1235
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1374
1236
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1375
1237
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
@@ -1396,30 +1258,20 @@ def lpr_setup_execute(self: ActionInstance):
|
|
|
1396
1258
|
|
|
1397
1259
|
self.setup_action_requirements(action_details)
|
|
1398
1260
|
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
cmd = "docker restart " + self.docker_container
|
|
1409
|
-
self.start(cmd, "lpr_setup")
|
|
1410
|
-
return
|
|
1411
|
-
else:
|
|
1412
|
-
logging.warning(
|
|
1413
|
-
"Container %s not found. Creating new container.",
|
|
1414
|
-
existing_container_id
|
|
1415
|
-
)
|
|
1416
|
-
# Fall through to create new container
|
|
1261
|
+
if action_details["actionDetails"].get("containerId"):
|
|
1262
|
+
logging.info(
|
|
1263
|
+
"Using existing container ID for LPR worker: %s",
|
|
1264
|
+
action_details["actionDetails"]["containerId"],
|
|
1265
|
+
)
|
|
1266
|
+
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
1267
|
+
cmd = "docker restart " + self.docker_container
|
|
1268
|
+
self.start(cmd, "lpr_setup")
|
|
1269
|
+
return
|
|
1417
1270
|
|
|
1418
1271
|
# LPR worker container with --net=host (Port: 8082)
|
|
1419
|
-
container_name = f"lpr_{self.action_record_id}"
|
|
1420
1272
|
worker_cmd = (
|
|
1421
1273
|
f"docker run -d --net=host --pull=always "
|
|
1422
|
-
f"--name
|
|
1274
|
+
f"--name lpr-worker "
|
|
1423
1275
|
f"--cidfile ./{self.action_record_id}.cid "
|
|
1424
1276
|
f"-v matrice_myvol:/matrice_data "
|
|
1425
1277
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
@@ -1458,30 +1310,20 @@ def inference_ws_server_execute(self: ActionInstance):
|
|
|
1458
1310
|
|
|
1459
1311
|
logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
|
|
1460
1312
|
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
cmd = "docker restart " + self.docker_container
|
|
1471
|
-
self.start(cmd, "inference_ws_server")
|
|
1472
|
-
return
|
|
1473
|
-
else:
|
|
1474
|
-
logging.warning(
|
|
1475
|
-
"Container %s not found. Creating new container.",
|
|
1476
|
-
existing_container_id
|
|
1477
|
-
)
|
|
1478
|
-
# Fall through to create new container
|
|
1313
|
+
if action_details["actionDetails"].get("containerId"):
|
|
1314
|
+
logging.info(
|
|
1315
|
+
"Using existing container ID for inference WebSocket server: %s",
|
|
1316
|
+
action_details["actionDetails"]["containerId"],
|
|
1317
|
+
)
|
|
1318
|
+
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
1319
|
+
cmd = "docker restart " + self.docker_container
|
|
1320
|
+
self.start(cmd, "inference_ws_server")
|
|
1321
|
+
return
|
|
1479
1322
|
|
|
1480
1323
|
# Inference WebSocket server with --net=host (Port: 8102)
|
|
1481
|
-
container_name = f"inference_ws_{self.action_record_id}"
|
|
1482
1324
|
worker_cmd = (
|
|
1483
1325
|
f"docker run -d --pull=always --net=host "
|
|
1484
|
-
f"--name
|
|
1326
|
+
f"--name inference "
|
|
1485
1327
|
f"--cidfile ./{self.action_record_id}.cid "
|
|
1486
1328
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1487
1329
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
@@ -1517,30 +1359,20 @@ def fe_fs_streaming_execute(self: ActionInstance):
|
|
|
1517
1359
|
|
|
1518
1360
|
logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
|
|
1519
1361
|
|
|
1520
|
-
|
|
1521
|
-
|
|
1522
|
-
|
|
1523
|
-
|
|
1524
|
-
|
|
1525
|
-
|
|
1526
|
-
|
|
1527
|
-
|
|
1528
|
-
|
|
1529
|
-
|
|
1530
|
-
self.start(cmd, "fe_fs_streaming")
|
|
1531
|
-
return
|
|
1532
|
-
else:
|
|
1533
|
-
logging.warning(
|
|
1534
|
-
"Container %s not found. Creating new container.",
|
|
1535
|
-
existing_container_id
|
|
1536
|
-
)
|
|
1537
|
-
# Fall through to create new container
|
|
1538
|
-
|
|
1362
|
+
if action_details["actionDetails"].get("containerId"):
|
|
1363
|
+
logging.info(
|
|
1364
|
+
"Using existing container ID for frontend streaming: %s",
|
|
1365
|
+
action_details["actionDetails"]["containerId"],
|
|
1366
|
+
)
|
|
1367
|
+
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
1368
|
+
cmd = "docker restart " + self.docker_container
|
|
1369
|
+
self.start(cmd, "fe_fs_streaming")
|
|
1370
|
+
return
|
|
1371
|
+
|
|
1539
1372
|
# Frontend streaming with --net=host (Port: 3000)
|
|
1540
|
-
container_name = f"fe_streaming_{self.action_record_id}"
|
|
1541
1373
|
worker_cmd = (
|
|
1542
1374
|
f"docker run -d --pull=always --net=host "
|
|
1543
|
-
f"--name
|
|
1375
|
+
f"--name fe_streaming "
|
|
1544
1376
|
f"--cidfile ./{self.action_record_id}.cid "
|
|
1545
1377
|
f"-v matrice_myvol:/matrice_data "
|
|
1546
1378
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
@@ -1573,30 +1405,20 @@ def fe_analytics_service_execute(self: ActionInstance):
|
|
|
1573
1405
|
|
|
1574
1406
|
project_id = action_details["_idProject"]
|
|
1575
1407
|
|
|
1576
|
-
|
|
1577
|
-
|
|
1578
|
-
|
|
1579
|
-
|
|
1580
|
-
|
|
1581
|
-
|
|
1582
|
-
|
|
1583
|
-
|
|
1584
|
-
|
|
1585
|
-
|
|
1586
|
-
self.start(cmd, "fe_analytics_service")
|
|
1587
|
-
return
|
|
1588
|
-
else:
|
|
1589
|
-
logging.warning(
|
|
1590
|
-
"Container %s not found. Creating new container.",
|
|
1591
|
-
existing_container_id
|
|
1592
|
-
)
|
|
1593
|
-
# Fall through to create new container
|
|
1594
|
-
|
|
1408
|
+
if action_details["actionDetails"].get("containerId"):
|
|
1409
|
+
logging.info(
|
|
1410
|
+
"Using existing container ID for frontend analytics service: %s",
|
|
1411
|
+
action_details["actionDetails"]["containerId"],
|
|
1412
|
+
)
|
|
1413
|
+
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
1414
|
+
cmd = "docker restart " + self.docker_container
|
|
1415
|
+
self.start(cmd, "fe_analytics_service")
|
|
1416
|
+
return
|
|
1417
|
+
|
|
1595
1418
|
# Frontend analytics service with --net=host (Port: 3001)
|
|
1596
|
-
container_name = f"fe_analytics_{self.action_record_id}"
|
|
1597
1419
|
worker_cmd = (
|
|
1598
1420
|
f"docker run -d --pull=always --net=host "
|
|
1599
|
-
f"--name
|
|
1421
|
+
f"--name fe-analytics "
|
|
1600
1422
|
f"--cidfile ./{self.action_record_id}.cid "
|
|
1601
1423
|
f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
|
|
1602
1424
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
@@ -1629,8 +1451,7 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
|
|
|
1629
1451
|
else:
|
|
1630
1452
|
return
|
|
1631
1453
|
use_gpu = self.get_gpu_config(action_details)
|
|
1632
|
-
|
|
1633
|
-
cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
|
|
1454
|
+
cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
|
|
1634
1455
|
logging.info("cmd is: %s", cmd)
|
|
1635
1456
|
self.start(cmd, "dataset_generation")
|
|
1636
1457
|
|
|
@@ -1651,8 +1472,7 @@ def synthetic_data_setup_execute(self: ActionInstance):
|
|
|
1651
1472
|
else:
|
|
1652
1473
|
return
|
|
1653
1474
|
use_gpu = self.get_gpu_config(action_details)
|
|
1654
|
-
|
|
1655
|
-
cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
|
|
1475
|
+
cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
|
|
1656
1476
|
logging.info("cmd is: %s", cmd)
|
|
1657
1477
|
self.start(cmd, "synthetic_data_setup")
|
|
1658
1478
|
|
|
@@ -1689,60 +1509,31 @@ def redis_setup_execute(self: ActionInstance):
|
|
|
1689
1509
|
|
|
1690
1510
|
redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
|
|
1691
1511
|
|
|
1692
|
-
# Define container names with action_record_id for uniqueness
|
|
1693
|
-
redis_container_name = f"redis_{self.action_record_id}"
|
|
1694
1512
|
|
|
1695
|
-
|
|
1696
|
-
|
|
1697
|
-
|
|
1698
|
-
|
|
1699
|
-
|
|
1700
|
-
|
|
1701
|
-
|
|
1702
|
-
|
|
1703
|
-
"Using existing container ID for redis management: %s",
|
|
1704
|
-
existing_container_id,
|
|
1705
|
-
)
|
|
1706
|
-
self.docker_container = existing_container_id
|
|
1707
|
-
cmd = "docker restart " + self.docker_container
|
|
1708
|
-
self.start(cmd, "redis_setup")
|
|
1513
|
+
if action_details["actionDetails"].get("containerId"):
|
|
1514
|
+
logging.info(
|
|
1515
|
+
"Using existing container ID for redis management: %s",
|
|
1516
|
+
action_details["actionDetails"]["containerId"],
|
|
1517
|
+
)
|
|
1518
|
+
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
1519
|
+
cmd = "docker restart " + self.docker_container
|
|
1520
|
+
self.start(cmd, "redis_setup")
|
|
1709
1521
|
|
|
1710
|
-
|
|
1711
|
-
|
|
1712
|
-
|
|
1713
|
-
return
|
|
1714
|
-
else:
|
|
1715
|
-
logging.warning(
|
|
1716
|
-
"Container(s) not found (management=%s, redis=%s). Creating new containers.",
|
|
1717
|
-
management_container_exists,
|
|
1718
|
-
redis_container_exists
|
|
1719
|
-
)
|
|
1720
|
-
# Fall through to create new containers
|
|
1522
|
+
# Redis container restart
|
|
1523
|
+
redis_restart_cmd = "docker restart redis_container"
|
|
1524
|
+
self.start(redis_restart_cmd, "redis")
|
|
1721
1525
|
|
|
1526
|
+
return
|
|
1527
|
+
|
|
1722
1528
|
# Redis container with --net=host (Port: 6379)
|
|
1723
1529
|
redis_cmd = (
|
|
1724
1530
|
f"docker run -d --net=host "
|
|
1725
|
-
f"--name
|
|
1531
|
+
f"--name redis_container "
|
|
1726
1532
|
f"--restart unless-stopped "
|
|
1727
1533
|
f"{redis_image} "
|
|
1728
|
-
f"redis-server --bind 0.0.0.0 "
|
|
1729
|
-
f"--appendonly no "
|
|
1730
|
-
f'--save "" '
|
|
1731
|
-
f"--maxmemory 30gb "
|
|
1732
|
-
f"--maxmemory-policy allkeys-lru "
|
|
1733
|
-
f"--io-threads 4 "
|
|
1734
|
-
f"--io-threads-do-reads yes "
|
|
1735
|
-
f"--stream-node-max-bytes 8192 "
|
|
1736
|
-
f"--stream-node-max-entries 1000 "
|
|
1737
|
-
f"--hz 100 "
|
|
1738
|
-
f"--tcp-backlog 2048 "
|
|
1739
|
-
f"--timeout 0 "
|
|
1740
|
-
f"--lazyfree-lazy-eviction yes "
|
|
1741
|
-
f"--lazyfree-lazy-expire yes "
|
|
1742
|
-
f"--lazyfree-lazy-server-del yes "
|
|
1743
|
-
f"--activedefrag yes "
|
|
1744
|
-
f"--requirepass {redis_password}"
|
|
1534
|
+
f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
|
|
1745
1535
|
)
|
|
1536
|
+
|
|
1746
1537
|
logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)
|
|
1747
1538
|
|
|
1748
1539
|
# Start Redis container first
|
|
@@ -1792,8 +1583,7 @@ def deploy_aggregator_execute(
|
|
|
1792
1583
|
if not action_details:
|
|
1793
1584
|
return
|
|
1794
1585
|
self.setup_action_requirements(action_details, work_fs)
|
|
1795
|
-
|
|
1796
|
-
cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
|
|
1586
|
+
cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
|
|
1797
1587
|
logging.info("cmd: %s", cmd)
|
|
1798
1588
|
self.start(cmd, "deploy_aggregator")
|
|
1799
1589
|
|
|
@@ -1809,10 +1599,6 @@ def model_deploy_execute(self: ActionInstance):
|
|
|
1809
1599
|
return
|
|
1810
1600
|
action_id = action_details["_id"]
|
|
1811
1601
|
model_family = action_details["actionDetails"]["modelFamily"]
|
|
1812
|
-
|
|
1813
|
-
# Get the service ID to track deployments
|
|
1814
|
-
service_id = action_details.get("_idService")
|
|
1815
|
-
|
|
1816
1602
|
self.setup_action_requirements(
|
|
1817
1603
|
action_details,
|
|
1818
1604
|
work_fs,
|
|
@@ -1820,29 +1606,17 @@ def model_deploy_execute(self: ActionInstance):
|
|
|
1820
1606
|
action_id=action_id,
|
|
1821
1607
|
)
|
|
1822
1608
|
|
|
1823
|
-
#
|
|
1824
|
-
|
|
1825
|
-
|
|
1826
|
-
# Get GPU configuration (uses utility function with fail-safe fallback)
|
|
1827
|
-
use_gpu = get_gpu_config_for_deployment(action_details, is_first_deployment)
|
|
1828
|
-
|
|
1829
|
-
logging.info(
|
|
1830
|
-
"Action %s: Model deployment GPU config: %s (first_deployment=%s)",
|
|
1831
|
-
action_id,
|
|
1832
|
-
use_gpu if use_gpu else "CPU-only",
|
|
1833
|
-
is_first_deployment
|
|
1834
|
-
)
|
|
1835
|
-
|
|
1836
|
-
# Get or create TRITON_PORTS (uses utility method)
|
|
1837
|
-
triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
|
|
1609
|
+
# Get GPU configuration based on requirements and availability
|
|
1610
|
+
# This uses the best-fit algorithm to select the most appropriate GPU(s)
|
|
1611
|
+
use_gpu = self.get_gpu_config(action_details)
|
|
1838
1612
|
|
|
1839
|
-
|
|
1840
|
-
|
|
1841
|
-
|
|
1842
|
-
|
|
1613
|
+
# Override: If GPU is required, use all available GPUs
|
|
1614
|
+
gpuRequired = action_details["actionDetails"].get("gpuRequired", False)
|
|
1615
|
+
if gpuRequired:
|
|
1616
|
+
use_gpu = "--runtime=nvidia --gpus all"
|
|
1843
1617
|
|
|
1844
|
-
|
|
1845
|
-
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"]
|
|
1618
|
+
extra_env_vars = {"INTERNAL_PORT": internal_port}
|
|
1619
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
|
|
1846
1620
|
logging.info("cmd is: %s", cmd)
|
|
1847
1621
|
self.start(cmd, "deploy_log")
|
|
1848
1622
|
|
|
@@ -1865,27 +1639,17 @@ def model_train_execute(self: ActionInstance):
|
|
|
1865
1639
|
action_id=action_id,
|
|
1866
1640
|
)
|
|
1867
1641
|
|
|
1868
|
-
|
|
1869
|
-
|
|
1870
|
-
|
|
1871
|
-
|
|
1872
|
-
|
|
1873
|
-
|
|
1874
|
-
|
|
1875
|
-
|
|
1876
|
-
|
|
1877
|
-
|
|
1878
|
-
|
|
1879
|
-
return
|
|
1880
|
-
else:
|
|
1881
|
-
logging.warning(
|
|
1882
|
-
"Container %s not found. Creating new container.",
|
|
1883
|
-
existing_container_id
|
|
1884
|
-
)
|
|
1885
|
-
# Fall through to create new container
|
|
1886
|
-
|
|
1887
|
-
container_name = f"model_train_{self.action_record_id}"
|
|
1888
|
-
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
|
|
1642
|
+
if action_details["actionDetails"].get("containerId"):
|
|
1643
|
+
logging.info(
|
|
1644
|
+
"Using existing container ID for training: %s",
|
|
1645
|
+
action_details["actionDetails"]["containerId"],
|
|
1646
|
+
)
|
|
1647
|
+
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
1648
|
+
cmd = "docker restart " + self.docker_container
|
|
1649
|
+
self.start(cmd, "train_log")
|
|
1650
|
+
return
|
|
1651
|
+
|
|
1652
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
|
|
1889
1653
|
logging.info("cmd is: %s", cmd)
|
|
1890
1654
|
self.start(cmd, "train_log")
|
|
1891
1655
|
|
|
@@ -1906,27 +1670,17 @@ def model_eval_execute(self: ActionInstance):
|
|
|
1906
1670
|
model_family=model_family,
|
|
1907
1671
|
action_id=action_id,
|
|
1908
1672
|
)
|
|
1909
|
-
|
|
1910
|
-
|
|
1911
|
-
|
|
1912
|
-
|
|
1913
|
-
|
|
1914
|
-
|
|
1915
|
-
|
|
1916
|
-
|
|
1917
|
-
|
|
1918
|
-
|
|
1919
|
-
|
|
1920
|
-
return
|
|
1921
|
-
else:
|
|
1922
|
-
logging.warning(
|
|
1923
|
-
"Container %s not found. Creating new container.",
|
|
1924
|
-
existing_container_id
|
|
1925
|
-
)
|
|
1926
|
-
# Fall through to create new container
|
|
1927
|
-
|
|
1928
|
-
container_name = f"model_eval_{self.action_record_id}"
|
|
1929
|
-
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
|
|
1673
|
+
if action_details["actionDetails"].get("containerId"):
|
|
1674
|
+
logging.info(
|
|
1675
|
+
"Using existing container ID for training: %s",
|
|
1676
|
+
action_details["actionDetails"]["containerId"],
|
|
1677
|
+
)
|
|
1678
|
+
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
1679
|
+
cmd = "docker restart " + self.docker_container
|
|
1680
|
+
self.start(cmd, "eval_log")
|
|
1681
|
+
return
|
|
1682
|
+
|
|
1683
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
|
|
1930
1684
|
logging.info("cmd is: %s", cmd)
|
|
1931
1685
|
self.start(cmd, "eval_log")
|
|
1932
1686
|
|
|
@@ -1950,27 +1704,17 @@ def model_export_execute(self: ActionInstance):
|
|
|
1950
1704
|
model_family=model_family,
|
|
1951
1705
|
action_id=action_id,
|
|
1952
1706
|
)
|
|
1953
|
-
|
|
1954
|
-
|
|
1955
|
-
|
|
1956
|
-
|
|
1957
|
-
|
|
1958
|
-
|
|
1959
|
-
|
|
1960
|
-
|
|
1961
|
-
|
|
1962
|
-
|
|
1963
|
-
|
|
1964
|
-
return
|
|
1965
|
-
else:
|
|
1966
|
-
logging.warning(
|
|
1967
|
-
"Container %s not found. Creating new container.",
|
|
1968
|
-
existing_container_id
|
|
1969
|
-
)
|
|
1970
|
-
# Fall through to create new container
|
|
1971
|
-
|
|
1972
|
-
container_name = f"model_export_{self.action_record_id}"
|
|
1973
|
-
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
|
|
1707
|
+
if action_details["actionDetails"].get("containerId"):
|
|
1708
|
+
logging.info(
|
|
1709
|
+
"Using existing container ID for training: %s",
|
|
1710
|
+
action_details["actionDetails"]["containerId"],
|
|
1711
|
+
)
|
|
1712
|
+
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
1713
|
+
cmd = "docker restart " + self.docker_container
|
|
1714
|
+
self.start(cmd, "export_log")
|
|
1715
|
+
return
|
|
1716
|
+
|
|
1717
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
|
|
1974
1718
|
logging.info("cmd is: %s", cmd)
|
|
1975
1719
|
self.start(cmd, "export_log")
|
|
1976
1720
|
|
|
@@ -1986,8 +1730,7 @@ def image_build_execute(self: ActionInstance):
|
|
|
1986
1730
|
action_id = action_details["_id"]
|
|
1987
1731
|
internal_api_key = self.get_internal_api_key(action_id)
|
|
1988
1732
|
extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
|
|
1989
|
-
|
|
1990
|
-
cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
|
|
1733
|
+
cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
|
|
1991
1734
|
logging.info("cmd is: %s", cmd)
|
|
1992
1735
|
self.start(cmd, "image_build_log")
|
|
1993
1736
|
|
|
@@ -1999,8 +1742,7 @@ def resource_clone_execute(self: ActionInstance):
|
|
|
1999
1742
|
if not action_details:
|
|
2000
1743
|
return
|
|
2001
1744
|
self.setup_action_requirements(action_details)
|
|
2002
|
-
|
|
2003
|
-
cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
|
|
1745
|
+
cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
|
|
2004
1746
|
logging.info("cmd is: %s", cmd)
|
|
2005
1747
|
self.start(cmd, "resource_clone")
|
|
2006
1748
|
|
|
@@ -2016,27 +1758,17 @@ def streaming_gateway_execute(self: ActionInstance):
|
|
|
2016
1758
|
self.docker_container = (
|
|
2017
1759
|
f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
|
|
2018
1760
|
)
|
|
2019
|
-
|
|
2020
|
-
|
|
2021
|
-
|
|
2022
|
-
|
|
2023
|
-
|
|
2024
|
-
|
|
2025
|
-
|
|
2026
|
-
|
|
2027
|
-
|
|
2028
|
-
|
|
2029
|
-
|
|
2030
|
-
return
|
|
2031
|
-
else:
|
|
2032
|
-
logging.warning(
|
|
2033
|
-
"Container %s not found. Creating new container.",
|
|
2034
|
-
existing_container_id
|
|
2035
|
-
)
|
|
2036
|
-
# Fall through to create new container
|
|
2037
|
-
|
|
2038
|
-
container_name = f"streaming_gateway_{self.action_record_id}"
|
|
2039
|
-
cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
|
|
1761
|
+
if action_details["actionDetails"].get("containerId"):
|
|
1762
|
+
logging.info(
|
|
1763
|
+
"Using existing container ID for training: %s",
|
|
1764
|
+
action_details["actionDetails"]["containerId"],
|
|
1765
|
+
)
|
|
1766
|
+
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
1767
|
+
cmd = "docker restart " + self.docker_container
|
|
1768
|
+
self.start(cmd, "streaming_gateway")
|
|
1769
|
+
return
|
|
1770
|
+
|
|
1771
|
+
cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
|
|
2040
1772
|
logging.info("cmd is: %s", cmd)
|
|
2041
1773
|
self.start(cmd, "streaming_gateway")
|
|
2042
1774
|
|
|
@@ -2130,24 +1862,16 @@ def kafka_setup_execute(self: ActionInstance):
|
|
|
2130
1862
|
else:
|
|
2131
1863
|
pkgs = f"matrice_common matrice"
|
|
2132
1864
|
|
|
2133
|
-
|
|
2134
|
-
|
|
2135
|
-
|
|
2136
|
-
|
|
2137
|
-
|
|
2138
|
-
|
|
2139
|
-
|
|
2140
|
-
|
|
2141
|
-
|
|
2142
|
-
|
|
2143
|
-
self.start(cmd, "kafka_setup")
|
|
2144
|
-
return
|
|
2145
|
-
else:
|
|
2146
|
-
logging.warning(
|
|
2147
|
-
"Container %s not found. Creating new container.",
|
|
2148
|
-
existing_container_id
|
|
2149
|
-
)
|
|
2150
|
-
# Fall through to create new container
|
|
1865
|
+
if action_details["actionDetails"].get("containerId"):
|
|
1866
|
+
logging.info(
|
|
1867
|
+
"Using existing container ID for training: %s",
|
|
1868
|
+
action_details["actionDetails"]["containerId"],
|
|
1869
|
+
)
|
|
1870
|
+
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
1871
|
+
cmd = "docker restart " + self.docker_container
|
|
1872
|
+
self.start(cmd, "kafka_setup")
|
|
1873
|
+
return
|
|
1874
|
+
|
|
2151
1875
|
|
|
2152
1876
|
# Kafka container with --net=host (Ports: 9092, 9093)
|
|
2153
1877
|
cmd = (
|
|
@@ -2184,31 +1908,21 @@ def inference_tracker_setup_execute(self: ActionInstance):
|
|
|
2184
1908
|
|
|
2185
1909
|
self.setup_action_requirements(action_details)
|
|
2186
1910
|
|
|
2187
|
-
|
|
2188
|
-
|
|
2189
|
-
|
|
2190
|
-
|
|
2191
|
-
|
|
2192
|
-
|
|
2193
|
-
|
|
2194
|
-
|
|
2195
|
-
|
|
2196
|
-
|
|
2197
|
-
self.start(cmd, "inference_tracker_setup")
|
|
2198
|
-
return
|
|
2199
|
-
else:
|
|
2200
|
-
logging.warning(
|
|
2201
|
-
"Container %s not found. Creating new container.",
|
|
2202
|
-
existing_container_id
|
|
2203
|
-
)
|
|
2204
|
-
# Fall through to create new container
|
|
2205
|
-
|
|
1911
|
+
if action_details["actionDetails"].get("containerId"):
|
|
1912
|
+
logging.info(
|
|
1913
|
+
"Using existing container ID for inference tracker: %s",
|
|
1914
|
+
action_details["actionDetails"]["containerId"],
|
|
1915
|
+
)
|
|
1916
|
+
self.docker_container = action_details["actionDetails"]["containerId"]
|
|
1917
|
+
cmd = "docker restart " + self.docker_container
|
|
1918
|
+
self.start(cmd, "inference_tracker_setup")
|
|
1919
|
+
return
|
|
1920
|
+
|
|
2206
1921
|
# This is the existing Docker run command
|
|
2207
|
-
container_name = f"inference_tracker_{self.action_record_id}"
|
|
2208
1922
|
worker_cmd = (
|
|
2209
1923
|
f"docker run -d --pull=always --net=host "
|
|
2210
|
-
|
|
2211
|
-
f"--name
|
|
1924
|
+
f"--cidfile ./{self.action_record_id}.cid "
|
|
1925
|
+
f"--name inference-tracker-worker "
|
|
2212
1926
|
f"-v matrice_myvol:/matrice_data "
|
|
2213
1927
|
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
2214
1928
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
@@ -2256,7 +1970,7 @@ def video_storage_setup_execute(self: ActionInstance):
|
|
|
2256
1970
|
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
2257
1971
|
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
2258
1972
|
f'-e ACTION_ID="{self.action_record_id}" '
|
|
2259
|
-
f'--restart=unless-stopped '
|
|
1973
|
+
f' --restart=unless-stopped '
|
|
2260
1974
|
f"{image}"
|
|
2261
1975
|
)
|
|
2262
1976
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
matrice_compute/__init__.py,sha256=YZhx7rQlD1TAlhBMbsU3_Xp-tpLyTAxWZDcQvqmwR2g,723
|
|
2
|
-
matrice_compute/action_instance.py,sha256=
|
|
2
|
+
matrice_compute/action_instance.py,sha256=AMWjva5EALh2X_rG7UpiOBnfoM7_b50gI0zuuTQMD4I,75942
|
|
3
3
|
matrice_compute/actions_manager.py,sha256=a_TulMnu462xc0t_A-Mpug5zhQTmtpjiv7mhiC_IAVw,18280
|
|
4
4
|
matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
|
|
5
5
|
matrice_compute/compute_operations_handler.py,sha256=amcMhmXtv2irE6qK8Vbgec_8uFqjWmVVp0VWq-73_MU,17781
|
|
@@ -11,8 +11,8 @@ matrice_compute/resources_tracker.py,sha256=DffKitGU1gran0OAuKIsfH0XeOe03xU7NGl-
|
|
|
11
11
|
matrice_compute/scaling.py,sha256=UQDI8wN9JEKafvUVPF0Pk9XmhKlbMkeu16AZyyOuSE8,55147
|
|
12
12
|
matrice_compute/shutdown_manager.py,sha256=rnP9Qes6JJKDnebmBC9rqkH__X9a8TMjhWQPWoOQKFs,13232
|
|
13
13
|
matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
|
|
14
|
-
matrice_compute-0.1.
|
|
15
|
-
matrice_compute-0.1.
|
|
16
|
-
matrice_compute-0.1.
|
|
17
|
-
matrice_compute-0.1.
|
|
18
|
-
matrice_compute-0.1.
|
|
14
|
+
matrice_compute-0.1.37.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
|
|
15
|
+
matrice_compute-0.1.37.dist-info/METADATA,sha256=eat7e7dCRgYrKD1LSYvQZCvZasZtC8C6vUmqB6n7fqQ,1038
|
|
16
|
+
matrice_compute-0.1.37.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
17
|
+
matrice_compute-0.1.37.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
|
|
18
|
+
matrice_compute-0.1.37.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|