matrice-compute 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/__init__.py +4 -0
- matrice_compute/action_instance.py +375 -208
- matrice_compute/actions_manager.py +1 -1
- matrice_compute/instance_manager.py +1 -1
- matrice_compute/scaling.py +1 -1
- matrice_compute/shutdown_manager.py +2 -2
- {matrice_compute-0.1.30.dist-info → matrice_compute-0.1.32.dist-info}/METADATA +1 -1
- matrice_compute-0.1.32.dist-info/RECORD +18 -0
- matrice_compute-0.1.30.dist-info/RECORD +0 -18
- {matrice_compute-0.1.30.dist-info → matrice_compute-0.1.32.dist-info}/WHEEL +0 -0
- {matrice_compute-0.1.30.dist-info → matrice_compute-0.1.32.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_compute-0.1.30.dist-info → matrice_compute-0.1.32.dist-info}/top_level.txt +0 -0
matrice_compute/__init__.py
CHANGED

@@ -1,6 +1,7 @@
 """Module providing __init__ functionality."""

 import subprocess
+import logging

 from matrice_common.utils import dependencies_check

@@ -17,4 +18,7 @@ subprocess.run( # Re-upgrade docker to avoid missing DOCKER_HOST connection erro

 from matrice_compute.instance_manager import InstanceManager # noqa: E402

+logging.getLogger("kafka").setLevel(logging.INFO)
+logging.getLogger("confluent_kafka").setLevel(logging.INFO)
+
 __all__ = ["InstanceManager"]
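
Note: the two new setLevel calls raise the threshold on the Kafka client loggers only, so their per-connection DEBUG chatter is dropped while the rest of the process keeps whatever level the root logger has. A minimal sketch of the effect (logger names as in the diff; the handler setup is illustrative):

    import logging

    logging.basicConfig(level=logging.DEBUG)           # application stays verbose
    logging.getLogger("kafka").setLevel(logging.INFO)  # client library quiets down

    logging.getLogger("kafka.conn").debug("dropped: child loggers inherit INFO")
    logging.getLogger("myapp").debug("still emitted at DEBUG")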

matrice_compute/action_instance.py
CHANGED

@@ -296,7 +296,7 @@ class ActionInstance:
                 getattr(self, "action_record_id", "unknown"),
             )
         else:
-            logging.
+            logging.info(
                 "No additional logs to send for action %s",
                 getattr(self, "action_record_id", "unknown"),
             )

@@ -411,6 +411,7 @@ class ActionInstance:
         destination_workspace_path: str = "/usr/src/workspace",
         docker_workdir: str = "",
         extra_pkgs: list = [],
+        container_name: str = "",
     ):
         """Build base Docker command with common options.

@@ -425,6 +426,7 @@ class ActionInstance:
             destination_workspace_path (str): Container workspace path
             docker_workdir (str): Docker working directory
             extra_pkgs (list): List of extra packages to install
+            container_name (str): Docker container name (format: {action_type}_{action_id})
         Returns:
             str: Base Docker command
         """

@@ -489,13 +491,16 @@ class ActionInstance:
             ]
         )

+        # Build container name option if provided
+        name_option = f"--name {container_name}" if container_name else ""
+
         cmd_parts = [
-            f"docker run {use_gpu} ",
+            f"docker run -d {use_gpu} ",
+            name_option,
             network_config,
             *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
             *volumes,
             # Container configuration and startup commands
-            f"--cidfile ./{self.action_record_id}.cid ",
             f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
             f'/bin/bash -c "cd {docker_workdir} && '
             f"{env_exports} && "
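
Note: with `-d` the container detaches and `docker run` prints the new container ID on stdout, which is why the `--cidfile` entry is dropped from `cmd_parts`. A small sketch of the optional name handling (the name value is hypothetical):

    import shlex

    container_name = "model_train_abc123"  # "" when the caller passes no name
    name_option = f"--name {container_name}" if container_name else ""
    cmd = " ".join(["docker run -d", name_option, "--shm-size=30G --pull=always my/image"])
    print(shlex.split(cmd))  # shlex.split swallows the doubled space left by an empty option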

@@ -883,6 +888,34 @@ class ActionInstance:
             job_params=action_details["jobParams"],
         )

+    @staticmethod
+    def container_exists(container_id: str) -> bool:
+        """Check if a Docker container exists.
+
+        Args:
+            container_id (str): Container ID or name to check
+
+        Returns:
+            bool: True if container exists, False otherwise
+        """
+        if not container_id:
+            return False
+        try:
+            result = subprocess.run(
+                ["docker", "inspect", container_id],
+                capture_output=True,
+                text=True,
+                timeout=10
+            )
+            return result.returncode == 0
+        except Exception as e:
+            logging.warning(
+                "Error checking if container %s exists: %s",
+                container_id,
+                str(e)
+            )
+            return False
+
     @log_errors(raise_exception=True)
     def start_process(self, cmd, log_name):
         """Start the process and initialize logging.
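
Note: the helper leans on `docker inspect` exiting 0 when the reference resolves and non-zero otherwise, so it accepts either a stored container ID or one of the new `{action_type}_{action_id}` names. A hedged usage sketch (the container name is made up):

    import subprocess

    name = "database_setup_abc123"  # hypothetical container name
    if ActionInstance.container_exists(name):
        subprocess.run(["docker", "restart", name], check=True)
    else:
        print(f"{name} is gone; a fresh container has to be created")

One caveat worth knowing: plain `docker inspect` matches any Docker object, so an image with the same reference would also return 0; `docker container inspect` would be the stricter check.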

@@ -897,60 +930,45 @@ class ActionInstance:
         self.cmd = cmd
         self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"

-        with
-
-
-
-
-
-
-
+        # Run docker with -d flag to get container ID from stdout
+        process = subprocess.Popen(
+            shlex.split(self.cmd),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            env={**os.environ},
+        )

-
-
-
-        max_retries = 5
-        retry_delay = 1 # seconds
-        for attempt in range(max_retries):
-            try:
-                with open(cid_file_path, "r") as cid_file:
-                    container_id = cid_file.read().strip()
-                self.container_id = container_id
-                logging.info(
-                    "Started process for action %s with container ID: %s",
-                    self.action_record_id,
-                    self.container_id,
-                )
-                break
-            except FileNotFoundError:
-                logging.warning(
-                    "CID file not found for action %s, attempt %d/%d",
-                    self.action_record_id,
-                    attempt + 1,
-                    max_retries,
-                )
-                time.sleep(retry_delay)
-            except Exception as e:
-                logging.error(
-                    "Error reading CID file for action %s: %s",
-                    self.action_record_id,
-                    str(e),
-                )
-                time.sleep(retry_delay)
-        else:
+        stdout, stderr = process.communicate(timeout=120)
+
+        if process.returncode != 0:
             logging.error(
-                "
+                "Docker run failed for action %s: %s",
                 self.action_record_id,
-
+                stderr,
             )
-            raise
+            raise RuntimeError(f"Docker run failed: {stderr}")

-
+        self.container_id = stdout.strip()
+        logging.info(
+            "Started container for action %s with ID: %s",
+            self.action_record_id,
+            self.container_id,
+        )
+
+        # Start following container logs in background
+        self.process = subprocess.Popen(
+            ["docker", "logs", "-f", self.container_id],
+            stdout=open(self.log_path, "wb"),
+            stderr=subprocess.STDOUT,
+            start_new_session=True,
+        )
+
+        # Report container id to scaling service
         self.scaling.update_action_container_id(
             action_record_id=self.action_record_id,
             container_id=self.container_id,
         )
-

     @log_errors(raise_exception=False)
     def start_logger(self):
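
Note: the rewrite collapses the old cid-file retry loop into one round trip: `docker run -d` reports the container ID on stdout, and a detached `docker logs -f` child streams the container output into the log file. A standalone sketch of the same flow (image and file names illustrative, assuming a local Docker daemon):

    import subprocess

    run = subprocess.run(
        ["docker", "run", "-d", "alpine", "sleep", "5"],
        capture_output=True, text=True, timeout=120,
    )
    if run.returncode != 0:
        raise RuntimeError(f"Docker run failed: {run.stderr}")
    container_id = run.stdout.strip()

    # Follow logs in the background; start_new_session detaches the child
    # from our process group so it survives signal handling in the parent.
    follower = subprocess.Popen(
        ["docker", "logs", "-f", container_id],
        stdout=open("container.log", "wb"), stderr=subprocess.STDOUT,
        start_new_session=True,
    )

One consequence of the new shape: because the command keeps `--pull=always`, the image pull happens before the ID is printed, so very large first-time pulls count against the 120-second timeout.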

@@ -1111,7 +1129,8 @@ def data_preparation_execute(
         "Started pulling Docker image with PID: %s",
         process.pid,
     )
-
+    container_name = f"data_prep_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "data_preparation_log")

@@ -1140,7 +1159,8 @@ def data_processing_execute(self: ActionInstance):
         service="bg-job-scheduler",
         job_params=action["jobParams"],
     )
-
+    container_name = f"data_processing_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/main.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "data_processing_log")

@@ -1153,7 +1173,8 @@ def data_split_execute(self: ActionInstance):
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs, model_family="")
-
+    container_name = f"data_split_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_split.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "data_split")

@@ -1168,7 +1189,8 @@ def dataset_annotation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
+    container_name = f"dataset_annotation_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "dataset_annotation")

@@ -1183,7 +1205,8 @@ def dataset_augmentation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
+    container_name = f"dataset_augmentation_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "dataset_augmentation")

@@ -1199,7 +1222,8 @@ def augmentation_server_creation_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
+    container_name = f"augmentation_setup_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "augmentation_setup")

@@ -1220,25 +1244,41 @@ def database_setup_execute(self: ActionInstance):

     project_id = action_details["_idProject"]

-
-
-
-        action_details["actionDetails"]["containerId"],
-    )
-    self.docker_container = action_details["actionDetails"]["containerId"]
-    cmd = "docker restart " + self.docker_container
-    self.start(cmd, "qdrant_setup")
+    # Define container names with action_record_id for uniqueness
+    mongodb_container_name = f"database_setup_{self.action_record_id}"
+    qdrant_container_name = f"qdrant_{self.action_record_id}"

-
-
-
+    existing_container_id = action_details["actionDetails"].get("containerId")
+    if existing_container_id:
+        # Check if both containers actually exist before trying to restart
+        mongodb_container_exists = ActionInstance.container_exists(existing_container_id)
+        qdrant_container_exists = ActionInstance.container_exists(qdrant_container_name)

-
+        if mongodb_container_exists and qdrant_container_exists:
+            logging.info(
+                "Using existing container ID for database setup: %s",
+                existing_container_id,
+            )
+            self.docker_container = existing_container_id
+            cmd = "docker restart " + self.docker_container
+            self.start(cmd, "qdrant_setup")
+
+            # qdrant restart
+            qdrant_cmd = f"docker restart {qdrant_container_name}"
+            self.start(qdrant_cmd, "qdrant_setup")
+            return
+        else:
+            logging.warning(
+                "Container(s) not found (mongodb=%s, qdrant=%s). Creating new containers.",
+                mongodb_container_exists,
+                qdrant_container_exists
+            )
+            # Fall through to create new containers

     # MongoDB container with --net=host (Port: 27020:27017)
     cmd = (
         f"docker run --pull=always --net=host "
-        f"--name
+        f"--name {mongodb_container_name} "
         f"-v matrice_myvol:/matrice_data "
         f"--cidfile ./{self.action_record_id}.cid "
         f"-e ACTION_RECORD_ID={self.action_record_id} "
@@ -1253,7 +1293,7 @@ def database_setup_execute(self: ActionInstance):
     # Qdrant container with --net=host (Port: 6334)
     qdrant_cmd = (
         f"docker run --pull=always --net=host "
-        f"--name
+        f"--name {qdrant_container_name} "
         f"-v matrice_myvol:/matrice_data "
         f"{'qdrant/qdrant:latest'} "
     )
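
Note: this guarded restart is the template repeated in every setup and execute function below: read the stored containerId, restart only when Docker still knows the container(s), otherwise fall through and create fresh, deterministically named ones. Distilled into a standalone helper (hypothetical; the package itself inlines the pattern):

    import subprocess

    def restart_or_create(container_ref: str, create_cmd: list) -> str:
        """Restart container_ref if Docker still has it; otherwise run create_cmd."""
        exists = bool(container_ref) and subprocess.run(
            ["docker", "inspect", container_ref],
            capture_output=True, text=True, timeout=10,
        ).returncode == 0
        if exists:
            subprocess.run(["docker", "restart", container_ref], check=True)
            return container_ref
        created = subprocess.run(create_cmd, capture_output=True, text=True, check=True)
        return created.stdout.strip()  # `docker run -d` prints the new container ID

    restart_or_create(
        "qdrant_abc123",  # hypothetical name
        ["docker", "run", "-d", "--name", "qdrant_abc123", "qdrant/qdrant:latest"],
    )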
@@ -1279,23 +1319,32 @@ def facial_recognition_setup_execute(self: ActionInstance):

     self.setup_action_requirements(action_details)

-
-
-
-
-
-
-
-
-
+    existing_container_id = action_details["actionDetails"].get("containerId")
+    if existing_container_id:
+        # Check if container actually exists before trying to restart
+        if ActionInstance.container_exists(existing_container_id):
+            logging.info(
+                "Using existing container ID for facial recognition worker: %s",
+                existing_container_id,
+            )
+            self.docker_container = existing_container_id
+            cmd = "docker restart " + self.docker_container
+            self.start(cmd, "facial_recognition_setup")
+            return
+        else:
+            logging.warning(
+                "Container %s not found. Creating new container.",
+                existing_container_id
+            )
+            # Fall through to create new container

     # Facial recognition worker container with --net=host (Port: 8081)
+    container_name = f"facial_recognition_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
-        f"--cidfile ./{self.action_record_id}.cid "
-        f"-v matrice_myvol:/matrice_data "
+        f"--name {container_name} "
         f"--cidfile ./{self.action_record_id}.cid "
+        f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
@@ -1321,20 +1370,30 @@ def lpr_setup_execute(self: ActionInstance):

     self.setup_action_requirements(action_details)

-
-
-
-
-
-
-
-
-
+    existing_container_id = action_details["actionDetails"].get("containerId")
+    if existing_container_id:
+        # Check if container actually exists before trying to restart
+        if ActionInstance.container_exists(existing_container_id):
+            logging.info(
+                "Using existing container ID for LPR worker: %s",
+                existing_container_id,
+            )
+            self.docker_container = existing_container_id
+            cmd = "docker restart " + self.docker_container
+            self.start(cmd, "lpr_setup")
+            return
+        else:
+            logging.warning(
+                "Container %s not found. Creating new container.",
+                existing_container_id
+            )
+            # Fall through to create new container

     # LPR worker container with --net=host (Port: 8082)
+    container_name = f"lpr_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --net=host --pull=always "
-        f"--name
+        f"--name {container_name} "
         f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
@@ -1372,20 +1431,30 @@ def inference_ws_server_execute(self: ActionInstance):

     logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")

-
-
-
-
-
-
-
-
-
+    existing_container_id = action_details["actionDetails"].get("containerId")
+    if existing_container_id:
+        # Check if container actually exists before trying to restart
+        if ActionInstance.container_exists(existing_container_id):
+            logging.info(
+                "Using existing container ID for inference WebSocket server: %s",
+                existing_container_id,
+            )
+            self.docker_container = existing_container_id
+            cmd = "docker restart " + self.docker_container
+            self.start(cmd, "inference_ws_server")
+            return
+        else:
+            logging.warning(
+                "Container %s not found. Creating new container.",
+                existing_container_id
+            )
+            # Fall through to create new container

     # Inference WebSocket server with --net=host (Port: 8102)
+    container_name = f"inference_ws_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
+        f"--name {container_name} "
         f"--cidfile ./{self.action_record_id}.cid "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1420,20 +1489,30 @@ def fe_fs_streaming_execute(self: ActionInstance):

     logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")

-
-
-
-
-
-
-
-
-
-
+    existing_container_id = action_details["actionDetails"].get("containerId")
+    if existing_container_id:
+        # Check if container actually exists before trying to restart
+        if ActionInstance.container_exists(existing_container_id):
+            logging.info(
+                "Using existing container ID for frontend streaming: %s",
+                existing_container_id,
+            )
+            self.docker_container = existing_container_id
+            cmd = "docker restart " + self.docker_container
+            self.start(cmd, "fe_fs_streaming")
+            return
+        else:
+            logging.warning(
+                "Container %s not found. Creating new container.",
+                existing_container_id
+            )
+            # Fall through to create new container
+
     # Frontend streaming with --net=host (Port: 3000)
+    container_name = f"fe_streaming_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
+        f"--name {container_name} "
         f"--cidfile ./{self.action_record_id}.cid "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
@@ -1465,20 +1544,30 @@ def fe_analytics_service_execute(self: ActionInstance):

     project_id = action_details["_idProject"]

-
-
-
-
-
-
-
-
-
-
+    existing_container_id = action_details["actionDetails"].get("containerId")
+    if existing_container_id:
+        # Check if container actually exists before trying to restart
+        if ActionInstance.container_exists(existing_container_id):
+            logging.info(
+                "Using existing container ID for frontend analytics service: %s",
+                existing_container_id,
+            )
+            self.docker_container = existing_container_id
+            cmd = "docker restart " + self.docker_container
+            self.start(cmd, "fe_analytics_service")
+            return
+        else:
+            logging.warning(
+                "Container %s not found. Creating new container.",
+                existing_container_id
+            )
+            # Fall through to create new container
+
     # Frontend analytics service with --net=host (Port: 3001)
+    container_name = f"fe_analytics_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-        f"--name
+        f"--name {container_name} "
         f"--cidfile ./{self.action_record_id}.cid "
         f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
@@ -1510,7 +1599,8 @@ def synthetic_dataset_generation_execute(self: ActionInstance):
     else:
         return
     use_gpu = self.get_gpu_config(action_details)
-
+    container_name = f"dataset_generation_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "dataset_generation")

@@ -1531,7 +1621,8 @@ def synthetic_data_setup_execute(self: ActionInstance):
     else:
         return
     use_gpu = self.get_gpu_config(action_details)
-
+    container_name = f"synthetic_data_setup_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"], container_name=container_name)} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "synthetic_data_setup")

@@ -1568,26 +1659,40 @@ def redis_setup_execute(self: ActionInstance):

     redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")

+    # Define container names with action_record_id for uniqueness
+    redis_container_name = f"redis_{self.action_record_id}"

-
-
-
-
-    )
-    self.docker_container = action_details["actionDetails"]["containerId"]
-    cmd = "docker restart " + self.docker_container
-    self.start(cmd, "redis_setup")
+    existing_container_id = action_details["actionDetails"].get("containerId")
+    if existing_container_id:
+        # Check if both containers actually exist before trying to restart
+        management_container_exists = ActionInstance.container_exists(existing_container_id)
+        redis_container_exists = ActionInstance.container_exists(redis_container_name)

-
-
-
+        if management_container_exists and redis_container_exists:
+            logging.info(
+                "Using existing container ID for redis management: %s",
+                existing_container_id,
+            )
+            self.docker_container = existing_container_id
+            cmd = "docker restart " + self.docker_container
+            self.start(cmd, "redis_setup")
+
+            # Redis container restart
+            redis_restart_cmd = f"docker restart {redis_container_name}"
+            self.start(redis_restart_cmd, "redis")
+            return
+        else:
+            logging.warning(
+                "Container(s) not found (management=%s, redis=%s). Creating new containers.",
+                management_container_exists,
+                redis_container_exists
+            )
+            # Fall through to create new containers

-    return
-
     # Redis container with --net=host (Port: 6379)
     redis_cmd = (
         f"docker run -d --net=host "
-        f"--name
+        f"--name {redis_container_name} "
         f"--restart unless-stopped "
         f"{redis_image} "
         f"redis-server --bind 0.0.0.0 "
@@ -1657,7 +1762,8 @@ def deploy_aggregator_execute(
     if not action_details:
         return
     self.setup_action_requirements(action_details, work_fs)
-
+    container_name = f"deploy_aggregator_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, container_name=container_name)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
     logging.info("cmd: %s", cmd)
     self.start(cmd, "deploy_aggregator")

@@ -1705,7 +1811,8 @@ def model_deploy_execute(self: ActionInstance):
         "TRITON_PORTS": triton_ports
     }

-
+    container_name = f"model_deploy_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"], container_name=container_name)} python3 deploy.py {self.action_record_id} {external_port}"'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "deploy_log")

@@ -1728,17 +1835,27 @@ def model_train_execute(self: ActionInstance):
         action_id=action_id,
     )

-
-
-
-
-
-
-
-
-
-
-
+    existing_container_id = action_details["actionDetails"].get("containerId")
+    if existing_container_id:
+        # Check if container actually exists before trying to restart
+        if ActionInstance.container_exists(existing_container_id):
+            logging.info(
+                "Using existing container ID for training: %s",
+                existing_container_id,
+            )
+            self.docker_container = existing_container_id
+            cmd = "docker restart " + self.docker_container
+            self.start(cmd, "train_log")
+            return
+        else:
+            logging.warning(
+                "Container %s not found. Creating new container.",
+                existing_container_id
+            )
+            # Fall through to create new container
+
+    container_name = f"model_train_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key, container_name=container_name)} python3 train.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "train_log")

@@ -1759,17 +1876,27 @@ def model_eval_execute(self: ActionInstance):
         model_family=model_family,
         action_id=action_id,
     )
-
-
-
-
-
-
-
-
-
-
-
+    existing_container_id = action_details["actionDetails"].get("containerId")
+    if existing_container_id:
+        # Check if container actually exists before trying to restart
+        if ActionInstance.container_exists(existing_container_id):
+            logging.info(
+                "Using existing container ID for evaluation: %s",
+                existing_container_id,
+            )
+            self.docker_container = existing_container_id
+            cmd = "docker restart " + self.docker_container
+            self.start(cmd, "eval_log")
+            return
+        else:
+            logging.warning(
+                "Container %s not found. Creating new container.",
+                existing_container_id
+            )
+            # Fall through to create new container
+
+    container_name = f"model_eval_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 eval.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "eval_log")

@@ -1793,17 +1920,27 @@ def model_export_execute(self: ActionInstance):
         model_family=model_family,
         action_id=action_id,
     )
-
-
-
-
-
-
-
-
-
-
-
+    existing_container_id = action_details["actionDetails"].get("containerId")
+    if existing_container_id:
+        # Check if container actually exists before trying to restart
+        if ActionInstance.container_exists(existing_container_id):
+            logging.info(
+                "Using existing container ID for export: %s",
+                existing_container_id,
+            )
+            self.docker_container = existing_container_id
+            cmd = "docker restart " + self.docker_container
+            self.start(cmd, "export_log")
+            return
+        else:
+            logging.warning(
+                "Container %s not found. Creating new container.",
+                existing_container_id
+            )
+            # Fall through to create new container
+
+    container_name = f"model_export_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, container_name=container_name)} python3 export.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "export_log")

@@ -1819,7 +1956,8 @@ def image_build_execute(self: ActionInstance):
     action_id = action_details["_id"]
     internal_api_key = self.get_internal_api_key(action_id)
     extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
-
+    container_name = f"image_build_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars, container_name=container_name)} python3 main.py {model_family_id} {action_id}"'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "image_build_log")

@@ -1831,7 +1969,8 @@ def resource_clone_execute(self: ActionInstance):
     if not action_details:
         return
     self.setup_action_requirements(action_details)
-
+    container_name = f"resource_clone_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(container_name=container_name)} python3 main.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "resource_clone")

@@ -1847,17 +1986,27 @@ def streaming_gateway_execute(self: ActionInstance):
     self.docker_container = (
         f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
     )
-
-
-
-
-
-
-
-
-
-
-
+    existing_container_id = action_details["actionDetails"].get("containerId")
+    if existing_container_id:
+        # Check if container actually exists before trying to restart
+        if ActionInstance.container_exists(existing_container_id):
+            logging.info(
+                "Using existing container ID for streaming gateway: %s",
+                existing_container_id,
+            )
+            self.docker_container = existing_container_id
+            cmd = "docker restart " + self.docker_container
+            self.start(cmd, "streaming_gateway")
+            return
+        else:
+            logging.warning(
+                "Container %s not found. Creating new container.",
+                existing_container_id
+            )
+            # Fall through to create new container
+
+    container_name = f"streaming_gateway_{self.action_record_id}"
+    cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"], container_name=container_name)} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
     logging.info("cmd is: %s", cmd)
     self.start(cmd, "streaming_gateway")

@@ -1951,16 +2100,24 @@ def kafka_setup_execute(self: ActionInstance):
     else:
         pkgs = f"matrice_common matrice"

-
-
-
-
-
-
-
-
-
-
+    existing_container_id = action_details["actionDetails"].get("containerId")
+    if existing_container_id:
+        # Check if container actually exists before trying to restart
+        if ActionInstance.container_exists(existing_container_id):
+            logging.info(
+                "Using existing container ID for kafka: %s",
+                existing_container_id,
+            )
+            self.docker_container = existing_container_id
+            cmd = "docker restart " + self.docker_container
+            self.start(cmd, "kafka_setup")
+            return
+        else:
+            logging.warning(
+                "Container %s not found. Creating new container.",
+                existing_container_id
+            )
+            # Fall through to create new container

     # Kafka container with --net=host (Ports: 9092, 9093)
     cmd = (
@@ -1997,21 +2154,31 @@ def inference_tracker_setup_execute(self: ActionInstance):

     self.setup_action_requirements(action_details)

-
-
-
-
-
-
-
-
-
-
+    existing_container_id = action_details["actionDetails"].get("containerId")
+    if existing_container_id:
+        # Check if container actually exists before trying to restart
+        if ActionInstance.container_exists(existing_container_id):
+            logging.info(
+                "Using existing container ID for inference tracker: %s",
+                existing_container_id,
+            )
+            self.docker_container = existing_container_id
+            cmd = "docker restart " + self.docker_container
+            self.start(cmd, "inference_tracker_setup")
+            return
+        else:
+            logging.warning(
+                "Container %s not found. Creating new container.",
+                existing_container_id
+            )
+            # Fall through to create new container
+
     # This is the existing Docker run command
+    container_name = f"inference_tracker_{self.action_record_id}"
     worker_cmd = (
         f"docker run -d --pull=always --net=host "
-
-        f"--name
+        f"--cidfile ./{self.action_record_id}.cid "
+        f"--name {container_name} "
         f"-v matrice_myvol:/matrice_data "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '

matrice_compute/instance_manager.py
CHANGED

@@ -404,7 +404,7 @@ class InstanceManager:
         if self.container_kafka_producer:
             try:
                 self.container_kafka_producer.send(topic_name, status_message)
-                logging.
+                logging.info("Container status monitor: Sent status for %d containers", len(containers))
             except Exception as e:
                 logging.error("Container status monitor: Failed to send to Kafka: %s", str(e))

matrice_compute/scaling.py
CHANGED

@@ -295,7 +295,7 @@ class Scaling:
         logging.warning(f"Kafka returned error for {api}, falling back to REST")

         # Kafka failed or disabled, try REST
-        logging.
+        logging.debug(f"Using REST API for {api}")
         try:
             rest_response = rest_fallback_func()

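Note: judging by the surrounding log strings, this method tries Kafka first and only then falls back to REST; the change completes a log call that had been left as a bare `logging.`. A hedged sketch of the control flow those messages imply (function names hypothetical, not the package's actual signature):

    import logging

    def call_api(api: str, kafka_func, rest_fallback_func):
        try:
            response = kafka_func()
            if response is not None:
                return response
            logging.warning(f"Kafka returned error for {api}, falling back to REST")
        except Exception as exc:
            logging.warning(f"Kafka transport failed for {api} ({exc}), falling back to REST")
        # Kafka failed or disabled, try REST
        logging.debug(f"Using REST API for {api}")
        return rest_fallback_func()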

matrice_compute/shutdown_manager.py
CHANGED

@@ -185,7 +185,7 @@ class ShutdownManager:
                 time.sleep(2)
                 return True
             except Exception as e:
-                logging.
+                logging.info("Aggressive command failed: %s", str(e))
         except Exception as e:
             logging.error("Error in aggressive shutdown methods: %s", str(e))
         return False

@@ -271,7 +271,7 @@ class ShutdownManager:
         """
         # CRITICAL: Check if this is a reserved instance that should not be shut down
         # if self.reserved_instance:
-        #     logging.
+        #     logging.info("Reserved instance detected, skipping shutdown check")
         # return

         # Update idle time tracking

matrice_compute-0.1.32.dist-info/RECORD
ADDED

@@ -0,0 +1,18 @@
+matrice_compute/__init__.py,sha256=YZhx7rQlD1TAlhBMbsU3_Xp-tpLyTAxWZDcQvqmwR2g,723
+matrice_compute/action_instance.py,sha256=j6_3OG82HT7WcdWMy6VjEWwYxELfLhGJ1Y6ZaoRgWig,85420
+matrice_compute/actions_manager.py,sha256=14DKWfdJ145oyA0x5YVaj4ylnKE5Kd6xJZ5xzk0Jres,18147
+matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
+matrice_compute/compute_operations_handler.py,sha256=amcMhmXtv2irE6qK8Vbgec_8uFqjWmVVp0VWq-73_MU,17781
+matrice_compute/instance_manager.py,sha256=9u3QRTP-MkAWmrSQMMbCKc0TfK584teAg1wWIaqMZdE,19291
+matrice_compute/instance_utils.py,sha256=N4yPDvNukFEEBngR0lEt4x_XT5hur1q0P-spM2xQIlU,42025
+matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
+matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+matrice_compute/resources_tracker.py,sha256=1jSLrIFlOh-vgyNzFrUrE2Ak2JAGCIfV7wcyEPJ0f2c,32246
+matrice_compute/scaling.py,sha256=UQDI8wN9JEKafvUVPF0Pk9XmhKlbMkeu16AZyyOuSE8,55147
+matrice_compute/shutdown_manager.py,sha256=rnP9Qes6JJKDnebmBC9rqkH__X9a8TMjhWQPWoOQKFs,13232
+matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
+matrice_compute-0.1.32.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
+matrice_compute-0.1.32.dist-info/METADATA,sha256=DMQ2-4mfoiU0aUvxsTVe7lcvhrZ5_uiIvzkIun_6sP4,1038
+matrice_compute-0.1.32.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+matrice_compute-0.1.32.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
+matrice_compute-0.1.32.dist-info/RECORD,,

matrice_compute-0.1.30.dist-info/RECORD
DELETED

@@ -1,18 +0,0 @@
-matrice_compute/__init__.py,sha256=ZzQcFsT005VCgq9VZUh565f4upOooEb_FwZ6RgweNZs,597
-matrice_compute/action_instance.py,sha256=qe_5AYptyLyFbFLyNUW9e762vAZnIRtyHTtnu_8GxPM,77359
-matrice_compute/actions_manager.py,sha256=Iex5uw0PLRR4pvIAZDxc2CypucbanKDbJ3SK8mMGXK8,18148
-matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
-matrice_compute/compute_operations_handler.py,sha256=amcMhmXtv2irE6qK8Vbgec_8uFqjWmVVp0VWq-73_MU,17781
-matrice_compute/instance_manager.py,sha256=kPZYfiq3Oevs5r1xzwvDzE27zeWF9oBBxh9KhpHJuG4,19292
-matrice_compute/instance_utils.py,sha256=N4yPDvNukFEEBngR0lEt4x_XT5hur1q0P-spM2xQIlU,42025
-matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
-matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-matrice_compute/resources_tracker.py,sha256=1jSLrIFlOh-vgyNzFrUrE2Ak2JAGCIfV7wcyEPJ0f2c,32246
-matrice_compute/scaling.py,sha256=cdEJqdVsPGDeOjkVAG85lubOn-qwDRV5qqmrNl_XpCM,55146
-matrice_compute/shutdown_manager.py,sha256=0MYV_AqygqR9NEntYf7atUC-PbWXyNkm1f-8c2aizgA,13234
-matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
-matrice_compute-0.1.30.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
-matrice_compute-0.1.30.dist-info/METADATA,sha256=kTDi915lwYwQJWMEyAT1F5GT5GfUW2SQfon5ki_X9tM,1038
-matrice_compute-0.1.30.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-matrice_compute-0.1.30.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
-matrice_compute-0.1.30.dist-info/RECORD,,

{matrice_compute-0.1.30.dist-info → matrice_compute-0.1.32.dist-info}/WHEEL
File without changes

{matrice_compute-0.1.30.dist-info → matrice_compute-0.1.32.dist-info}/licenses/LICENSE.txt
File without changes

{matrice_compute-0.1.30.dist-info → matrice_compute-0.1.32.dist-info}/top_level.txt
File without changes