matrice-compute 0.1.12-py3-none-any.whl → 0.1.14-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/__init__.py +12 -1
- matrice_compute/action_instance.py +149 -103
- matrice_compute/instance_manager.py +2 -1
- matrice_compute/instance_utils.py +7 -1
- matrice_compute/resources_tracker.py +16 -4
- matrice_compute/scaling.py +305 -220
- matrice_compute/task_utils.py +2 -2
- {matrice_compute-0.1.12.dist-info → matrice_compute-0.1.14.dist-info}/METADATA +1 -1
- matrice_compute-0.1.14.dist-info/RECORD +17 -0
- matrice_compute-0.1.12.dist-info/RECORD +0 -17
- {matrice_compute-0.1.12.dist-info → matrice_compute-0.1.14.dist-info}/WHEEL +0 -0
- {matrice_compute-0.1.12.dist-info → matrice_compute-0.1.14.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_compute-0.1.12.dist-info → matrice_compute-0.1.14.dist-info}/top_level.txt +0 -0
matrice_compute/__init__.py
CHANGED
@@ -1,9 +1,20 @@
 """Module providing __init__ functionality."""
 
+import subprocess
 
 from matrice_common.utils import dependencies_check
 
-dependencies_check(
+dependencies_check(
+    ["docker", "psutil", "cryptography", "notebook", "aiohttp", "kafka-python"]
+)
+
+subprocess.run( # Re-upgrade docker to avoid missing DOCKER_HOST connection error
+    ["pip", "install", "--upgrade", "docker"],
+    check=True,
+    stdout=subprocess.DEVNULL, # suppress normal output
+    stderr=subprocess.DEVNULL # suppress warnings/progress
+)
+
 from matrice_compute.instance_manager import InstanceManager  # noqa: E402
 
 __all__ = ["InstanceManager"]
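Note: the import-time upgrade above shells out to a bare `pip`, which assumes the `pip` on PATH targets the same environment as the interpreter importing the package. A minimal sketch of the more defensive form (an illustration, not code from this package) pins the upgrade to the running interpreter:

import subprocess
import sys

# Sketch: upgrade the "docker" client library inside the environment of the
# interpreter executing this code. "python -m pip" avoids picking up an
# unrelated pip executable that happens to be first on PATH.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "--upgrade", "docker"],
    check=True,                 # raise CalledProcessError on a non-zero exit
    stdout=subprocess.DEVNULL,  # suppress normal pip output
    stderr=subprocess.DEVNULL,  # suppress warnings/progress
)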
matrice_compute/action_instance.py
CHANGED

@@ -74,7 +74,8 @@ class ActionInstance:
             "streaming_gateway": streaming_gateway_execute,
             "facial_recognition_setup": facial_recognition_setup_execute,
             "fe_fs_streaming": fe_fs_streaming_execute,
-            "inference_ws_server": inference_ws_server_execute
+            "inference_ws_server": inference_ws_server_execute,
+            "lpr_setup": lpr_setup_execute
         }
         if self.action_type not in self.actions_map:
             raise ValueError(f"Unknown action type: {self.action_type}")
@@ -574,75 +575,75 @@ class ActionInstance:
             )
             raise
 
-    @log_errors(raise_exception=False)
-    def create_redis_container(self, redis_image=None, redis_password=None):
-        """Create and start a Redis container using Docker.
-
-        Args:
-            redis_image (str, optional): Redis Docker image to use. Defaults to 'redis:latest'
-
-        Returns:
-            tuple: (container_info, error, message)
-        """
-        if redis_image is None:
-            redis_image = "redis:latest"
-
-        network_name = f"redis_network_{int(time.time())}"
-        subprocess.run(f"docker network create {network_name}", shell=True, check=True)
-
-        try:
-            # Get an available port for Redis
-            external_port = "6379"
-
-            # Generate a unique container name and password
-            container_name = f"redis_container_{int(time.time())}"
-
-            # Build the docker command to create Redis container with password
-            cmd = (
-                f"docker run -d "
-                f"--network {network_name} "
-                f"--name {container_name} "
-                f"-p {external_port}:6379 "
-                f"--restart unless-stopped "
-                f"{redis_image} "
-                f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
-            )
-
-            logging.info("Creating Redis container with command: %s", cmd)
-
-            # Execute the command
-            result = subprocess.run(
-                cmd, shell=True, capture_output=True, text=True, timeout=60
-            )
-
-            if result.returncode == 0:
-                container_id = result.stdout.strip()
-                container_info = {
-                    "container_id": container_id,
-                    "container_name": container_name,
-                    "network_name": network_name,
-                    "external_port": external_port,
-                    "internal_port": 6379,
-                    "password": redis_password,
-                    "image": redis_image,
-                    "status": "running",
-                }
-
-                logging.info("Redis container created successfully: %s", container_info)
-                return container_info, None, "Redis container created successfully"
-            else:
-                error_message = f"Failed to create Redis container: {result.stderr}"
-                logging.error(error_message)
-                return None, "ContainerCreationError", error_message
-
-        except subprocess.TimeoutExpired:
-            error_message = "Timeout while creating Redis container"
-            logging.error(error_message)
-            return None, "TimeoutError", error_message
-        except Exception as e:
-            error_message = f"Unexpected error creating Redis container: {str(e)}"
-            logging.error(error_message)
-            return None, "UnexpectedError", error_message
+    # @log_errors(raise_exception=False)
+    # def create_redis_container(self, redis_image=None, redis_password=None):
+    #     """Create and start a Redis container using Docker.
+
+    #     Args:
+    #         redis_image (str, optional): Redis Docker image to use. Defaults to 'redis:latest'
+
+    #     Returns:
+    #         tuple: (container_info, error, message)
+    #     """
+    #     if redis_image is None:
+    #         redis_image = "redis:latest"
+
+    #     network_name = f"redis_network_{int(time.time())}"
+    #     subprocess.run(f"docker network create {network_name}", shell=True, check=True)
+
+    #     try:
+    #         # Get an available port for Redis
+    #         external_port = "6379"
+
+    #         # Generate a unique container name and password
+    #         container_name = f"redis_container_{int(time.time())}"
+
+    #         # Build the docker command to create Redis container with password
+    #         cmd = (
+    #             f"docker run -d "
+    #             f"--network {network_name} "
+    #             f"--name {container_name} "
+    #             f"-p {external_port}:6379 "
+    #             f"--restart unless-stopped "
+    #             f"{redis_image} "
+    #             f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
+    #         )
+
+    #         logging.info("Creating Redis container with command: %s", cmd)
+
+    #         # Execute the command
+    #         result = subprocess.run(
+    #             cmd, shell=True, capture_output=True, text=True, timeout=60
+    #         )
+
+    #         if result.returncode == 0:
+    #             container_id = result.stdout.strip()
+    #             container_info = {
+    #                 "container_id": container_id,
+    #                 "container_name": container_name,
+    #                 "network_name": network_name,
+    #                 "external_port": external_port,
+    #                 "internal_port": 6379,
+    #                 "password": redis_password,
+    #                 "image": redis_image,
+    #                 "status": "running",
+    #             }
+
+    #             logging.info("Redis container created successfully: %s", container_info)
+    #             return container_info, None, "Redis container created successfully"
+    #         else:
+    #             error_message = f"Failed to create Redis container: {result.stderr}"
+    #             logging.error(error_message)
+    #             return None, "ContainerCreationError", error_message
+
+    #     except subprocess.TimeoutExpired:
+    #         error_message = "Timeout while creating Redis container"
+    #         logging.error(error_message)
+    #         return None, "TimeoutError", error_message
+    #     except Exception as e:
+    #         error_message = f"Unexpected error creating Redis container: {str(e)}"
+    #         logging.error(error_message)
+    #         return None, "UnexpectedError", error_message
 
     @log_errors(raise_exception=False, log_error=False)
     def send_logs_continuously(self):
@@ -1052,7 +1053,7 @@ def database_setup_execute(self: ActionInstance):
         f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
         f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
         f"-e PROJECT_ID={project_id} "
-        f
+        f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f"{image} "
     )
     print("Docker command", cmd)
@@ -1101,29 +1102,64 @@ def facial_recognition_setup_execute(self: ActionInstance):
     self.start(worker_cmd, "facial_recognition_setup")
 
 @log_errors(raise_exception=False)
-def
+def lpr_setup_execute(self: ActionInstance):
     """
-    Creates and
+    Creates and setup the database for license plate server.
     """
     action_details = self.get_action_details()
 
     if not action_details:
         return
-    image =
+    image = self.docker_container
+    external_port = self.scaling.get_open_port()
 
     self.setup_action_requirements(action_details)
 
     # Add worker container run command
     worker_cmd = (
         f"docker run -d --pull=always "
-        f"--name
-        f"-p
+        f"--name lpr-worker "
+        f"-p {external_port}:8082 "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
+        f'-e ACTION_ID="{self.action_record_id}" '
+        f'-e PORT={external_port} '
         f"{image}"
     )
-    print("
+    print("Worker docker run command:", worker_cmd)
+
+    # Docker Command run
+    self.start(worker_cmd, "lpr_setup")
+
+@log_errors(raise_exception=False)
+def inference_ws_server_execute(self: ActionInstance):
+    """
+    Creates and start inference pipeline.
+    Inference WebSocket server runs on port 8102 (localhost only with --net=host).
+    """
+    action_details = self.get_action_details()
+
+    if not action_details:
+        return
+    image = action_details["actionDetails"].get("docker")
+
+
+    self.setup_action_requirements(action_details)
+
+    # Inference WebSocket server with --net=host (Port: 8102)
+    worker_cmd = (
+        f"docker run -d --pull=always --net=host "
+        f"--name inference "
+        f'-e ENV="{os.environ.get("ENV", "prod")}" '
+        f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
+        f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
+        f"{image} "
+        f"./app "
+        f"{self.action_record_id} "
+
+    )
+    logging.info("Starting inference WebSocket server (Port: 8102): %s", worker_cmd)
 
     # Docker Command run
     self.start(worker_cmd, "inference_ws_server")
@@ -1132,7 +1168,8 @@ def inference_ws_server_execute(self: ActionInstance):
 @log_errors(raise_exception=False)
 def fe_fs_streaming_execute(self: ActionInstance):
     """
-    Creates and setup the frontend for fs streaming
+    Creates and setup the frontend for fs streaming.
+    Frontend streaming runs on port 3000 (localhost only with --net=host).
     """
     action_details = self.get_action_details()
 
@@ -1142,17 +1179,16 @@ def fe_fs_streaming_execute(self: ActionInstance):
 
     self.setup_action_requirements(action_details)
 
-    #
+    # Frontend streaming with --net=host (Port: 3000)
     worker_cmd = (
-        f"docker run -d --pull=always "
+        f"docker run -d --pull=always --net=host "
         f"--name fe_streaming "
-        f"-p 3000:3000 "
         f'-e ENV="{os.environ.get("ENV", "prod")}" '
         f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
         f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
         f"{image}"
     )
-
+    logging.info("Starting frontend streaming (Port: 3000): %s", worker_cmd)
 
     # Docker Command run
     self.start(worker_cmd, "fe_fs_streaming")
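Note: with `--net=host` the container shares the host's network namespace, so `-p host:container` publishing is a no-op and the service listens directly on its host port; that is why the `-p 3000:3000` mapping is dropped when the flag is added. A small sketch of the command-building pattern these `*_execute` helpers share (names and image here are illustrative, not taken from the package):

import os
import shlex

def build_worker_cmd(name, image, env=None, host_network=True):
    """Sketch: assemble a `docker run` command string.

    With host networking the container binds host ports directly, so no
    -p mappings are emitted. Values are shell-quoted for safety.
    """
    env = env or {}
    parts = ["docker run -d --pull=always"]
    if host_network:
        parts.append("--net=host")  # share the host network namespace
    parts.append(f"--name {shlex.quote(name)}")
    for key, value in env.items():
        parts.append(f"-e {key}={shlex.quote(str(value))}")
    parts.append(shlex.quote(image))
    return " ".join(parts)

# Example mirroring the frontend-streaming invocation above.
cmd = build_worker_cmd(
    "fe_streaming",
    "registry.example.com/fe-streaming:latest",  # hypothetical image
    env={"ENV": os.environ.get("ENV", "prod")},
)
print(cmd)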
@@ -1204,8 +1240,8 @@ def synthetic_data_setup_execute(self: ActionInstance):
 def redis_setup_execute(self: ActionInstance):
     """
     Creates and starts a Redis container using Docker.
+    Redis runs on port 6379 (localhost only with --net=host).
     """
-    external_port = self.scaling.get_open_port()
     work_fs = get_max_file_system()
 
     action_details = self.get_action_details()
@@ -1217,18 +1253,6 @@ def redis_setup_execute(self: ActionInstance):
         "password", f"redis_pass_{int(time.time())}"
     )
 
-    container_info, error, message = self.create_redis_container(
-        action_details["actionDetails"].get("redis_image", "redis:latest"),
-        redis_password=redis_password,
-    )
-    if error:
-        logging.error(
-            "Error creating Redis container: %s",
-            message,
-        )
-        return
-    logging.info("Redis container created successfully: %s", container_info)
-
     # Initialize redis container
     self.setup_action_requirements(
         action_details,
@@ -1237,17 +1261,39 @@ def redis_setup_execute(self: ActionInstance):
         action_id=action_id,
     )
 
+    redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
+
+    # Redis container with --net=host (Port: 6379)
+    redis_cmd = (
+        f"docker run -d --net=host "
+        f"--name redis_container_{int(time.time())} "
+        f"--restart unless-stopped "
+        f"{redis_image} "
+        f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
+    )
+
+    logging.info("Starting Redis container (Port: 6379): %s", redis_cmd)
+
+    # Start Redis container first
+    redis_process = subprocess.Popen(
+        redis_cmd,
+        shell=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+    logging.info("Redis container started successfully on localhost:6379")
+
+    # Wait for Redis to be ready
+    time.sleep(5)
+
     env_vars = {
-        "REDIS_URL": f"
-        "REDIS_PASSWORD":
+        "REDIS_URL": f"localhost:6379",
+        "REDIS_PASSWORD": redis_password,
     }
 
-
-
-    # Make the docker file here
+    # bg-redis management container with --net=host (Port: 8082)
     cmd = (
-        f"docker run "
-        f"{network_config} "
+        f"docker run --net=host "
         f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
         f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
        f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
@@ -1259,7 +1305,7 @@ def redis_setup_execute(self: ActionInstance):
         f"{self.action_record_id} "
     )
 
-    logging.info("
+    logging.info("Starting bg-redis management (Port: 8082): %s", cmd)
 
     self.start(cmd, "redis_setup")
 
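Note: the fixed `time.sleep(5)` above is a best-effort wait. A sketch of an active readiness probe (illustrative only; it assumes Redis is reachable on localhost:6379, as in the host-network setup above) polls the TCP port instead of sleeping blindly:

import socket
import time

def wait_for_port(host="localhost", port=6379, timeout=30.0):
    """Sketch: block until a TCP endpoint accepts connections or timeout expires."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=1.0):
                return True  # something is accepting connections on the port
        except OSError:
            time.sleep(0.5)  # not up yet; retry shortly
    return False

if not wait_for_port():
    raise RuntimeError("Redis did not become reachable on localhost:6379")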
matrice_compute/instance_manager.py
CHANGED

@@ -153,7 +153,8 @@ class InstanceManager:
             key,
             value,
         ) in manual_instance_info.items():
-
+            if value is not None:
+                os.environ[key] = str(value)
         if not (os.environ.get("SERVICE_PROVIDER") and os.environ.get("INSTANCE_ID")):
             raise Exception(
                 "SERVICE_PROVIDER and INSTANCE_ID must be set as environment variables or passed as arguments"
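Note: `os.environ` accepts only string values; assigning `None` (or any non-string) raises `TypeError`, which is what the `value is not None` guard plus the `str(value)` coercion above protects against. A two-line illustration:

import os

os.environ["INSTANCE_ID"] = str(12345)  # fine: coerce non-strings to str first
# os.environ["INSTANCE_ID"] = None      # TypeError: str expected, not NoneType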
matrice_compute/instance_utils.py
CHANGED

@@ -366,7 +366,13 @@ def get_max_file_system() -> str:
         max_available_filesystem,
         max_disk["available"],
     )
-    if
+    # Check if filesystem is writable, or if it's root/empty
+    if max_available_filesystem in ["/", ""] or not os.access(max_available_filesystem, os.W_OK):
+        if max_available_filesystem not in ["/", ""]:
+            logging.warning(
+                "Filesystem %s is not writable, falling back to home directory",
+                max_available_filesystem,
+            )
         home_dir = os.path.expanduser("~")
         if not os.environ.get("WORKSPACE_DIR"):
             logging.error("WORKSPACE_DIR environment variable not set")
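Note: `os.access(path, os.W_OK)` checks write permission for the real uid/gid, so the branch above treats the root filesystem, an empty mount string, and any non-writable mount as equivalent and falls back to the home directory. A standalone sketch of the same fallback decision:

import os

def pick_workspace_root(candidate):
    """Sketch: prefer the candidate filesystem; fall back to $HOME when it
    is root, empty, or not writable by the current user."""
    if candidate in ("/", "") or not os.access(candidate, os.W_OK):
        return os.path.expanduser("~")  # safe default, always writable
    return candidate

print(pick_workspace_root("/mnt/data"))  # hypothetical mount point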
matrice_compute/resources_tracker.py
CHANGED

@@ -247,10 +247,14 @@ class ActionsResourcesTracker:
         self.max_actions_usage = {}
         self.resources_tracker = ResourcesTracker()
         self.client = docker.from_env()
+        self.logged_stopped_containers = []
 
     @log_errors(raise_exception=False, log_error=True)
     def update_actions_resources(self) -> None:
-        """Process both running and exited containers
+        """Process both running and exited containers.
+
+        Note: Does not remove containers to keep logs. Only tracks resource usage.
+        """
         exited_containers = self.client.containers.list(
             filters={"status": "exited"},
             all=True,
@@ -259,8 +263,12 @@ class ActionsResourcesTracker:
         if exited_containers:
             for container in exited_containers:
                 try:
+                    if container.id in self.logged_stopped_containers:
+                        continue
                     self._update_container_action_status(container, "completed")
-                    container.
+                    self.logged_stopped_containers.append(container.id)
+                    # COMMENTED OUT: Do not remove containers to keep logs
+                    # container.remove()
                 except Exception as err:
                     logging.error(
                         "Error processing exited container %s: %s",
@@ -310,7 +318,7 @@ class ActionsResourcesTracker:
         args_24 = [arg for arg in remove_quotation_marks(inspect_data["Args"]) if len(arg) == 24 and "pypi" not in arg]
         action_record_id = args_24[-1] if args_24 else None
         if not action_record_id:
-            logging.
+            logging.debug("No valid action_id found for the container. Container ID: %s, Args: %s", container.id, inspect_data["Args"])
         duration = calculate_time_difference(start_time, finish_time)
         (
             current_gpu_utilization,
@@ -320,6 +328,8 @@ class ActionsResourcesTracker:
         ) = self.get_current_action_usage(container, status)
         sub_containers = self.get_sub_containers_by_label("action_id", action_record_id)
         for sub_container in sub_containers:
+            if sub_container.id in self.logged_stopped_containers:
+                continue
             (
                 sub_container_gpu_utilization,
                 sub_container_gpu_memory,
@@ -330,10 +340,12 @@ class ActionsResourcesTracker:
             current_gpu_memory += sub_container_gpu_memory
             current_cpu_utilization += sub_container_cpu_utilization
             current_memory_utilization += sub_container_memory_utilization
+            # COMMENTED OUT: Do not stop/remove sub-containers to keep logs
             if status == "completed":
                 try:
                     sub_container.stop()
-                    sub_container.
+                    self.logged_stopped_containers.append(sub_container.id)
+                    # sub_container.remove(force=True)
                 except Exception as err:
                     logging.error(
                         "Error removing sub-container %s: %s",
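Note: `logged_stopped_containers` is a list, so each `in` check above is a linear scan that grows with the number of exited containers. A set gives O(1) membership with the same semantics; a sketch of the equivalent bookkeeping (illustrative, not the package's code):

# Sketch: track already-processed container IDs with O(1) membership tests.
seen_container_ids = set()

def mark_processed(container_id):
    """Return True the first time an ID is seen, False on repeats."""
    if container_id in seen_container_ids:  # constant-time lookup
        return False
    seen_container_ids.add(container_id)
    return True

assert mark_processed("abc123")
assert not mark_processed("abc123")  # duplicate is skipped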
matrice_compute/scaling.py
CHANGED
@@ -3,12 +3,13 @@
 import os
 import logging
 from matrice_common.utils import log_errors
-from kafka import KafkaProducer, KafkaConsumer
+# from kafka import KafkaProducer, KafkaConsumer
 import uuid
 import json
 import time
 import base64
 
+# TODO: update /scaling to /compute
 
 class Scaling:
 
@@ -37,32 +38,34 @@ class Scaling:
             "Initialized Scaling with instance_id: %s",
             instance_id,
         )
-        self.kafka_config = {
-            "bootstrap_servers": self.get_kafka_bootstrap_servers(),
-            "api_request_topic": "action_requests",
-            "api_response_topic": "action_responses",
-            "scaling_request_topic": "compute_requests",
-            "scaling_response_topic": "compute_responses"
-        }
-        self.kafka_producer = KafkaProducer(
-            bootstrap_servers=self.kafka_config["bootstrap_servers"],
-            value_serializer=lambda v: json.dumps(v).encode("utf-8"),)
+        # KAFKA TEMPORARILY DISABLED - Using REST API directly
+        # self.kafka_config = {
+        #     "bootstrap_servers": self.get_kafka_bootstrap_servers(),
+        #     "api_request_topic": "action_requests",
+        #     "api_response_topic": "action_responses",
+        #     "scaling_request_topic": "compute_requests",
+        #     "scaling_response_topic": "compute_responses"
+        # }
+        # self.kafka_producer = KafkaProducer(
+        #     bootstrap_servers=self.kafka_config["bootstrap_servers"],
+        #     value_serializer=lambda v: json.dumps(v).encode("utf-8"),)
 
 
 
-    @log_errors(default_return=(None, "Error creating Kafka producer", "Kafka producer creation failed"), log_error=True)
-    def get_kafka_bootstrap_servers(self):
-        """Get Kafka bootstrap servers from API and decode base64 fields."""
-        path = "/v1/actions/get_kafka_info"
-        response = self.rpc.get(path=path)
-        if not response or not response.get("success"):
-            raise ValueError(f"Failed to fetch Kafka config: {response.get('message', 'No response')}")
-        encoded_ip = response["data"]["ip"]
-        encoded_port = response["data"]["port"]
-        ip = base64.b64decode(encoded_ip).decode("utf-8")
-        port = base64.b64decode(encoded_port).decode("utf-8")
-        bootstrap_servers = f"{ip}:{port}"
-        return bootstrap_servers
+    # KAFKA TEMPORARILY DISABLED - Using REST API directly
+    # @log_errors(default_return=(None, "Error creating Kafka producer", "Kafka producer creation failed"), log_error=True)
+    # def get_kafka_bootstrap_servers(self):
+    #     """Get Kafka bootstrap servers from API and decode base64 fields."""
+    #     path = "/v1/actions/get_kafka_info"
+    #     response = self.rpc.get(path=path)
+    #     if not response or not response.get("success"):
+    #         raise ValueError(f"Failed to fetch Kafka config: {response.get('message', 'No response')}")
+    #     encoded_ip = response["data"]["ip"]
+    #     encoded_port = response["data"]["port"]
+    #     ip = base64.b64decode(encoded_ip).decode("utf-8")
+    #     port = base64.b64decode(encoded_port).decode("utf-8")
+    #     bootstrap_servers = f"{ip}:{port}"
+    #     return bootstrap_servers
 
     @log_errors(default_return=(None, "Error processing response", "Response processing failed"), log_error=True)
     def handle_response(self, resp, success_message, error_message):
@@ -183,6 +186,8 @@ class Scaling:
         Returns:
             Tuple of (data, error, message) from API response
         """
+        if not action_record_id:
+            return None, "Action record id is required", "Action record id is required"
         logging.info(
             "Updating action status for action %s",
             action_record_id,
@@ -285,34 +290,44 @@ class Scaling:
 
     @log_errors(log_error=True)
     def get_action_details(self, action_status_id):
-        """Get details for a specific action using
+        """Get details for a specific action using REST API.
+
+        Args:
+            action_status_id: ID of the action status to fetch
+
+        Returns:
+            Tuple of (data, error, message) from API response
+        """
         logging.info("Getting action details for action %s", action_status_id)
-
-
-        #
-        data, error, message, kafka_response_received = self._send_kafka_request(
-
-
-
-
-
-        )
-        if
-
-
-        #
+        # KAFKA TEMPORARILY DISABLED - Using REST API directly
+        # api = "get_action_details"
+        # payload = {"actionRecordId": action_status_id}
+        # data, error, message, kafka_response_received = self._send_kafka_request(
+        #     api=api,
+        #     payload=payload,
+        #     request_topic=self.kafka_config["api_request_topic"],
+        #     response_topic=self.kafka_config["api_response_topic"],
+        #     timeout=60
+        # )
+        # # Check if Kafka response was received and if it's an error, log and fallback to REST API
+        # if kafka_response_received:
+        #     if error:
+        #         logging.warning("Kafka returned error for get_action_details: %s. Falling back to REST API.", error)
+        #     else:
+        #         return data, error, message
+
+        # Using REST API directly
         try:
             path = f"/v1/actions/action/{action_status_id}/details"
             resp = self.rpc.get(path=path)
             return self.handle_response(
                 resp,
-                "Task details fetched successfully
-                "Could not fetch the task details
+                "Task details fetched successfully",
+                "Could not fetch the task details",
             )
         except Exception as e:
-            logging.error("REST
-
-            return None, f"Failed via Kafka and REST: {e}", "Cached for retry"
+            logging.error("REST API failed (get_action_details): %s", e)
+            return None, f"Failed via REST: {e}", "REST API failed"
 
 
     @log_errors(log_error=True)
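Note: every Scaling call site returns the same `(data, error, message)` triple, with `error is None` signalling success. A minimal sketch of producing and consuming that convention (the stub below is a stand-in, not the package's method):

def get_action_details_stub(action_record_id):
    """Stand-in for Scaling.get_action_details with the same (data, error, message) shape."""
    if not action_record_id:
        return None, "Action record id is required", "Action record id is required"
    return {"actionRecordId": action_record_id}, None, "Task details fetched successfully"

data, error, message = get_action_details_stub("0123456789abcdef01234567")
if error:
    print(f"lookup failed: {error} ({message})")
else:
    print(f"action details: {data}")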
@@ -327,11 +342,26 @@ class Scaling:
         service="",
         job_params=None,
     ):
-        """Update an action using
+        """Update an action using REST API.
+
+        Args:
+            id: Action ID
+            step_code: Step code
+            action_type: Type of action
+            status: Status of the action
+            sub_action: Sub-action details
+            status_description: Description of the status
+            service: Service name
+            job_params: Job parameters dictionary
+
+        Returns:
+            Tuple of (data, error, message) from API response
+        """
         if job_params is None:
             job_params = {}
         logging.info("Updating action %s", id)
-
+        # KAFKA TEMPORARILY DISABLED - Using REST API directly
+        # api = "update_action"
         payload = {
             "_id": id,
             "stepCode": step_code,
@@ -342,63 +372,85 @@ class Scaling:
             "serviceName": service,
             "jobParams": job_params,
         }
-        data, error, message, kafka_response_received = self._send_kafka_request(
-
-
-
-
-
-        )
-        if
-
+        # data, error, message, kafka_response_received = self._send_kafka_request(
+        #     api=api,
+        #     payload=payload,
+        #     request_topic=self.kafka_config["api_request_topic"],
+        #     response_topic=self.kafka_config["api_response_topic"],
+        #     timeout=60
+        # )
+        # # Check if Kafka response was received and if it's an error, log and fallback to REST API
+        # if kafka_response_received:
+        #     if error:
+        #         logging.warning("Kafka returned error for update_action: %s. Falling back to REST API.", error)
+        #     else:
+        #         return data, error, message
+
+        # Using REST API directly
         try:
             path = "/v1/actions"
             resp = self.rpc.put(path=path, payload=payload)
             return self.handle_response(
                 resp,
-                "Error logged successfully
-                "Could not log the errors
+                "Error logged successfully",
+                "Could not log the errors",
             )
         except Exception as e:
-            logging.error("REST
-
-            return None, f"Failed via Kafka and REST: {e}", "Cached for retry"
+            logging.error("REST API failed (update_action): %s", e)
+            return None, f"Failed via REST: {e}", "REST API failed"
 
 
     @log_errors(log_error=True)
     def assign_jobs(self, is_gpu):
-        """Assign jobs to the instance using
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        """Assign jobs to the instance using REST API.
+
+        Args:
+            is_gpu: Boolean or any value indicating if this is a GPU instance.
+                Will be converted to proper boolean.
+
+        Returns:
+            Tuple of (data, error, message) from API response
+        """
+        # Convert is_gpu to proper boolean
+        is_gpu_bool = bool(is_gpu)
+        logging.info("Assigning jobs for instance %s (GPU: %s)", self.instance_id, is_gpu_bool)
+
+        # KAFKA TEMPORARILY DISABLED - Using REST API directly
+        # api = "assign_jobs"
+        # payload = {
+        #     "instanceID": self.instance_id,
+        #     "isGPUInstance": is_gpu_bool,
+        # }
+
+        # data, error, message, kafka_response_received = self._send_kafka_request(
+        #     api=api,
+        #     payload=payload,
+        #     request_topic=self.kafka_config["api_request_topic"],
+        #     response_topic=self.kafka_config["api_response_topic"],
+        #     timeout=60
+        # )
+
+        # # Check if Kafka response was received and if it's an error, log and fallback to REST API
+        # if kafka_response_received:
+        #     if error:
+        #         logging.warning("Kafka returned error for assign_jobs: %s. Falling back to REST API.", error)
+        #     else:
+        #         return data, error, message
+
+        # Using REST API directly
         try:
-
+            # Convert boolean to lowercase string for API endpoint
+            is_gpu_str = str(is_gpu_bool).lower()
+            path = f"/v1/actions/assign_jobs/{is_gpu_str}/{self.instance_id}"
             resp = self.rpc.get(path=path)
             return self.handle_response(
                 resp,
-                "Pinged successfully
-                "Could not ping the scaling jobs
+                "Pinged successfully",
+                "Could not ping the scaling jobs",
            )
         except Exception as e:
-            logging.error("REST
-
-            return None, f"Failed via Kafka and REST: {e}", "Cached for retry"
+            logging.error("REST API failed (assign_jobs): %s", e)
+            return None, f"Failed via REST: {e}", "REST API failed"
 
 
     @log_errors(log_error=True)
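Note: Python renders booleans as `True`/`False`, while the endpoint path expects lowercase `true`/`false`, hence the `str(bool(is_gpu)).lower()` conversion above. A short illustration:

is_gpu = 1  # any truthy value is accepted
segment = str(bool(is_gpu)).lower()
assert segment == "true"
path = f"/v1/actions/assign_jobs/{segment}/instance-123"  # hypothetical instance ID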
@@ -409,7 +461,17 @@ class Scaling:
         availableMemory=0,
         availableGPUMemory=0,
     ):
-        """Update available resources for the instance using
+        """Update available resources for the instance using REST API.
+
+        Args:
+            availableCPU: Available CPU resources
+            availableGPU: Available GPU resources
+            availableMemory: Available memory
+            availableGPUMemory: Available GPU memory
+
+        Returns:
+            Tuple of (data, error, message) from API response
+        """
         logging.info("Updating available resources for instance %s", self.instance_id)
         payload = {
             "instance_id": self.instance_id,
@@ -418,63 +480,84 @@ class Scaling:
             "availableGPUMemory": availableGPUMemory,
             "availableGPU": availableGPU,
         }
-
-
-
-
-        data, error, message, kafka_response_received = self._send_kafka_request(
-
-
-
-
-
-        )
-
-        if
-
+        # KAFKA TEMPORARILY DISABLED - Using REST API directly
+        # api = "update_available_resources"
+        # correlation_id = str(uuid.uuid4())
+
+        # data, error, message, kafka_response_received = self._send_kafka_request(
+        #     api=api,
+        #     payload=payload,
+        #     request_topic=self.kafka_config["scaling_request_topic"],
+        #     response_topic=self.kafka_config["scaling_response_topic"],
+        #     timeout=60
+        # )
+
+        # # Check if Kafka response was received
+        # # Response format: {'correlationId': 'id', 'status': 'success'/'error', 'data': ..., 'error': 'error message'}
+        # if kafka_response_received:
+        #     if error:
+        #         logging.warning("Kafka returned error for update_available_resources: %s. Falling back to REST API.", error)
+        #     else:
+        #         return data, error, message
+
+        # Using REST API directly
         try:
-            path = f"/v1/
+            path = f"/v1/compute/update_available_resources/{self.instance_id}"
             resp = self.rpc.put(path=path, payload=payload)
             return self.handle_response(
                 resp,
-                "Resources updated successfully
-                "Could not update the resources
+                "Resources updated successfully",
+                "Could not update the resources",
             )
         except Exception as e:
-            logging.error("REST
-
-            return None, f"Failed to update available resources via Kafka and REST: {e}", "Cached for retry"
+            logging.error("REST API failed (update_available_resources): %s", e)
+            return None, f"Failed to update available resources via REST: {e}", "REST API failed"
 
     @log_errors(log_error=True)
     def update_action_docker_logs(self, action_record_id, log_content):
-        """Update docker logs for an action using
+        """Update docker logs for an action using REST API.
+
+        Args:
+            action_record_id: ID of the action record
+            log_content: Content of the logs to update
+
+        Returns:
+            Tuple of (data, error, message) from API response
+        """
         logging.info("Updating docker logs for action %s", action_record_id)
-
+        # KAFKA TEMPORARILY DISABLED - Using REST API directly
+        # api = "update_action_docker_logs"
         payload = {
             "actionRecordId": action_record_id,
             "logContent": log_content,
-
         }
-        data, error, message, kafka_response_received = self._send_kafka_request(
-
-
-
-
-
-
-
-        if
-
+        # data, error, message, kafka_response_received = self._send_kafka_request(
+        #     api=api,
+        #     payload=payload,
+        #     request_topic=self.kafka_config["api_request_topic"],
+        #     response_topic=self.kafka_config["api_response_topic"],
+        #     timeout=60
+        # )
+
+        # # Check if Kafka response was received and if it's an error, log and fallback to REST API
+        # if kafka_response_received:
+        #     if error:
+        #         logging.warning("Kafka returned error for update_action_docker_logs: %s. Falling back to REST API.", error)
+        #     else:
+        #         return data, error, message
+
+        # Using REST API directly
         try:
             path = "/v1/actions/update_action_docker_logs"
             resp = self.rpc.put(path=path, payload=payload)
             return self.handle_response(
                 resp,
-                "Docker logs updated successfully
-                "Could not update the docker logs
+                "Docker logs updated successfully",
+                "Could not update the docker logs",
             )
         except Exception as e:
-            logging.error("REST
+            logging.error("REST API failed (update_action_docker_logs): %s", e)
+            return None, f"Failed via REST: {e}", "REST API failed"
 
 
     @log_errors(log_error=True)
@@ -533,7 +616,8 @@ class Scaling:
             if port in self.used_ports:
                 continue
             self.used_ports.add(port)
-
+            ports_value = ",".join(str(p) for p in self.used_ports)
+            os.environ["USED_PORTS"] = str(ports_value)
             logging.info("Found available port: %s", port)
             return port
         logging.error(
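Note: persisting `used_ports` into the `USED_PORTS` environment variable lets processes spawned later from this one see which ports were already handed out. A sketch of the matching read side (variable names are illustrative):

import os

# Write side, as in get_open_port(): serialise the set to "8101,8102,...".
used_ports = {8101, 8102}
os.environ["USED_PORTS"] = ",".join(str(p) for p in sorted(used_ports))

# Read side: parse the same variable back into a set of ints.
restored = {int(p) for p in os.environ.get("USED_PORTS", "").split(",") if p}
assert restored == used_ports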
@@ -563,7 +647,7 @@ class Scaling:
         Returns:
             Tuple of (data, error, message) from API response
         """
-        path = f"/v1/
+        path = f"/v1/compute/get_models_secret_keys?secret_name={secret_name}"
         resp = self.rpc.get(path=path)
         return self.handle_response(
             resp,
@@ -790,98 +874,99 @@ class Scaling:
             "Could not fetch internal keys",
         )
 
-    @log_errors(log_error=True)
-    def handle_kafka_response(self, msg, success_message, error_message):
-        """
-        Helper to process Kafka response messages in a consistent way.
-        """
-        if msg.get("status") == "success":
-            data = msg.get("data")
-            error = None
-            message = success_message
-            logging.info(message)
-        else:
-            data = msg.get("data")
-            error = msg.get("error", "Unknown error")
-            message = error_message
-            logging.error("%s: %s", message, error)
-        return data, error, message
-
-    def _send_kafka_request(self, api, payload, request_topic, response_topic, timeout=60):
-        """
-        Helper to send a request to Kafka and wait for a response.
-        Returns (data, error, message, kafka_response_received) where kafka_response_received is True if a response was received (even if error), False if transport error/timeout.
-        """
-        correlation_id = str(uuid.uuid4())
-        request_message = {
-            "correlationId": correlation_id,
-            "api": api,
-            "payload": payload,
-        }
-
-        consumer = KafkaConsumer(
-            response_topic,
-            bootstrap_servers=self.kafka_config["bootstrap_servers"],
-            group_id=None,
-            value_deserializer=lambda m: json.loads(m.decode("utf-8")),
-            auto_offset_reset='latest',
-            enable_auto_commit=True,
-        )
-
-        try:
-            if hasattr(self.session.rpc, 'AUTH_TOKEN'):
-                self.session.rpc.AUTH_TOKEN.set_bearer_token()
-                auth_token = self.session.rpc.AUTH_TOKEN.bearer_token
-                auth_token = auth_token.replace("Bearer ", "")
-                headers = [("Authorization", bytes(f"{auth_token}", "utf-8"))]
-            else:
-                headers = None
-            self.kafka_producer.send(request_topic, request_message, headers=headers)
-            # self.kafka_producer.flush()
-            logging.info("Sent %s request to Kafka topic %s", api, request_topic)
-        except Exception as e:
-            logging.error("Kafka producer error: %s", e)
-            return None, f"Kafka producer error: {e}", "Kafka send failed", False
-        try:
-            start = time.time()
-            while time.time() - start < timeout:
-                # Poll for messages with a short timeout to avoid blocking forever
-                message_batch = consumer.poll(timeout_ms=1000)
-                if message_batch:
-                    for topic_partition, messages in message_batch.items():
-                        for message in messages:
-                            print("trying to fetch message")
-                            msg = message.value
-                            if msg.get("correlationId") == correlation_id:
-                                consumer.close()
-                                # Always treat a received response as final, even if error
-                                return self.handle_kafka_response(
-                                    msg,
-                                    f"Fetched via Kafka for {api}",
-                                    f"Kafka error response for {api}"
-                                ) + (True,)
-                else:
-                    print(f"No messages received, waiting... ({time.time() - start:.1f}s/{timeout}s)")
-
-            consumer.close()
-            logging.warning("Kafka response timeout for %s after %d seconds", api, timeout)
-            return None, "Kafka response timeout", "Kafka response timeout", False
-        except Exception as e:
-            logging.error("Kafka consumer error: %s", e)
-            return None, f"Kafka consumer error: {e}", "Kafka consumer error", False
-
-    def _cache_failed_request(self, api, payload):
-        """Cache the failed request for retry. Here, we use a simple file cache as a placeholder."""
-        try:
-            cache_file = os.path.join(os.path.dirname(__file__), 'request_cache.json')
-            if os.path.exists(cache_file):
-                with open(cache_file, 'r') as f:
-                    cache = json.load(f)
-            else:
-                cache = []
-            cache.append({"api": api, "payload": payload, "ts": time.time()})
-            with open(cache_file, 'w') as f:
-                json.dump(cache, f)
-            logging.info("Cached failed request for api %s", api)
-        except Exception as e:
-            logging.error("Failed to cache request: %s", e)
+    # KAFKA TEMPORARILY DISABLED - Using REST API directly
+    # @log_errors(log_error=True)
+    # def handle_kafka_response(self, msg, success_message, error_message):
+    #     """
+    #     Helper to process Kafka response messages in a consistent way.
+    #     """
+    #     if msg.get("status") == "success":
+    #         data = msg.get("data")
+    #         error = None
+    #         message = success_message
+    #         logging.info(message)
+    #     else:
+    #         data = msg.get("data")
+    #         error = msg.get("error", "Unknown error")
+    #         message = error_message
+    #         logging.error("%s: %s", message, error)
+    #     return data, error, message
+
+    # def _send_kafka_request(self, api, payload, request_topic, response_topic, timeout=60):
+    #     """
+    #     Helper to send a request to Kafka and wait for a response.
+    #     Returns (data, error, message, kafka_response_received) where kafka_response_received is True if a response was received (even if error), False if transport error/timeout.
+    #     """
+    #     correlation_id = str(uuid.uuid4())
+    #     request_message = {
+    #         "correlationId": correlation_id,
+    #         "api": api,
+    #         "payload": payload,
+    #     }
+
+    #     consumer = KafkaConsumer(
+    #         response_topic,
+    #         bootstrap_servers=self.kafka_config["bootstrap_servers"],
+    #         group_id=None,
+    #         value_deserializer=lambda m: json.loads(m.decode("utf-8")),
+    #         auto_offset_reset='latest',
+    #         enable_auto_commit=True,
+    #     )
+
+    #     try:
+    #         if hasattr(self.session.rpc, 'AUTH_TOKEN'):
+    #             self.session.rpc.AUTH_TOKEN.set_bearer_token()
+    #             auth_token = self.session.rpc.AUTH_TOKEN.bearer_token
+    #             auth_token = auth_token.replace("Bearer ", "")
+    #             headers = [("Authorization", bytes(f"{auth_token}", "utf-8"))]
+    #         else:
+    #             headers = None
+    #         self.kafka_producer.send(request_topic, request_message, headers=headers)
+    #         # self.kafka_producer.flush()
+    #         logging.info("Sent %s request to Kafka topic %s", api, request_topic)
+    #     except Exception as e:
+    #         logging.error("Kafka producer error: %s", e)
+    #         return None, f"Kafka producer error: {e}", "Kafka send failed", False
+    #     try:
+    #         start = time.time()
+    #         while time.time() - start < timeout:
+    #             # Poll for messages with a short timeout to avoid blocking forever
+    #             message_batch = consumer.poll(timeout_ms=1000)
+    #             if message_batch:
+    #                 for topic_partition, messages in message_batch.items():
+    #                     for message in messages:
+    #                         print("trying to fetch message")
+    #                         msg = message.value
+    #                         if msg.get("correlationId") == correlation_id:
+    #                             consumer.close()
+    #                             # Always treat a received response as final, even if error
+    #                             return self.handle_kafka_response(
+    #                                 msg,
+    #                                 f"Fetched via Kafka for {api}",
+    #                                 f"Kafka error response for {api}"
+    #                             ) + (True,)
+    #             else:
+    #                 print(f"No messages received, waiting... ({time.time() - start:.1f}s/{timeout}s)")
+    #
+    #         consumer.close()
+    #         logging.warning("Kafka response timeout for %s after %d seconds", api, timeout)
+    #         return None, "Kafka response timeout", "Kafka response timeout", False
+    #     except Exception as e:
+    #         logging.error("Kafka consumer error: %s", e)
+    #         return None, f"Kafka consumer error: {e}", "Kafka consumer error", False
+
+    # def _cache_failed_request(self, api, payload):
+    #     """Cache the failed request for retry. Here, we use a simple file cache as a placeholder."""
+    #     try:
+    #         cache_file = os.path.join(os.path.dirname(__file__), 'request_cache.json')
+    #         if os.path.exists(cache_file):
+    #             with open(cache_file, 'r') as f:
+    #                 cache = json.load(f)
+    #         else:
+    #             cache = []
+    #         cache.append({"api": api, "payload": payload, "ts": time.time()})
+    #         with open(cache_file, 'w') as f:
+    #             json.dump(cache, f)
+    #         logging.info("Cached failed request for api %s", api)
+    #     except Exception as e:
+    #         logging.error("Failed to cache request: %s", e)
matrice_compute/task_utils.py
CHANGED
@@ -29,8 +29,8 @@ def setup_workspace_and_run_task(
     workspace_dir = f"{work_fs}/{action_id}"
     codebase_zip_path = f"{workspace_dir}/file.zip"
     requirements_txt_path = f"{workspace_dir}/requirements.txt"
-    if os.path.exists(workspace_dir):
-        return
+    # if os.path.exists(workspace_dir): # don't skip if workspace already exists, override it
+    #     return
     os.makedirs(workspace_dir, exist_ok=True)
 
     # Download codebase ZIP file
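Note: with the early-return removed, a pre-existing workspace is reused and its files are overwritten piecemeal by the later download steps, so stale files from a previous run can survive. A sketch of a fully clean override (illustrative, not the package's behaviour):

import os
import shutil

def reset_workspace(workspace_dir):
    """Sketch: guarantee an empty workspace, discarding any previous contents."""
    if os.path.exists(workspace_dir):
        shutil.rmtree(workspace_dir)  # drop stale files from earlier runs
    os.makedirs(workspace_dir, exist_ok=True)
    return workspace_dir

reset_workspace("/tmp/example_action_workspace")  # hypothetical path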
matrice_compute-0.1.14.dist-info/RECORD
ADDED

@@ -0,0 +1,17 @@
+matrice_compute/__init__.py,sha256=ZzQcFsT005VCgq9VZUh565f4upOooEb_FwZ6RgweNZs,597
+matrice_compute/action_instance.py,sha256=aYNpRySPatxFltn_ekVmCd5h69I992_YerUTZwGWyHA,59763
+matrice_compute/actions_manager.py,sha256=5U-xM6tl_Z6x96bi-c7AJM9ru80LqTN8f5Oce8dAu_A,7780
+matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
+matrice_compute/instance_manager.py,sha256=8USyX09ZxLvnVNIrjRogbyUeMCfgWnasuRqYkkVF4tQ,10146
+matrice_compute/instance_utils.py,sha256=7jnWurSpq8PQxPGlSTc0qmpNdD5jIL8pjYKdjhVhS60,22310
+matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
+matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+matrice_compute/resources_tracker.py,sha256=2hLKVxYihROtQ6fO4V_BplTgvkN8qH2H9_qxpOIpZkc,18521
+matrice_compute/scaling.py,sha256=3F8SWvy9wWczpJ6dbY5RrXWw5ByZlIzAPJklir3KIFI,35359
+matrice_compute/shutdown_manager.py,sha256=0MYV_AqygqR9NEntYf7atUC-PbWXyNkm1f-8c2aizgA,13234
+matrice_compute/task_utils.py,sha256=3qIutiQdYPyGRxH9ZwLbqdg8sZcnp6jp08pszWCRFl0,2820
+matrice_compute-0.1.14.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
+matrice_compute-0.1.14.dist-info/METADATA,sha256=u8ZIOoIX3uMEA4Lgaiuh73xsoPSdcHTZXAJuIBpn6KE,1038
+matrice_compute-0.1.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+matrice_compute-0.1.14.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
+matrice_compute-0.1.14.dist-info/RECORD,,
matrice_compute-0.1.12.dist-info/RECORD
DELETED

@@ -1,17 +0,0 @@
-matrice_compute/__init__.py,sha256=HG5yzsY2dcQ0sGKwxMj-Sv2zDhbSC00slAdkcfS9nng,304
-matrice_compute/action_instance.py,sha256=6tyZehK7SfIu6NjXp4wFeYMY0BINShmXtoCXyimDKN0,58002
-matrice_compute/actions_manager.py,sha256=5U-xM6tl_Z6x96bi-c7AJM9ru80LqTN8f5Oce8dAu_A,7780
-matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
-matrice_compute/instance_manager.py,sha256=LhPOqrzmrs-QdorqgDOuBDHjpUkLPgCZovBdCBiVmVw,10103
-matrice_compute/instance_utils.py,sha256=tIFVUi8HJPy4GY-jtfVx2zIgmXNta7s3jCIRzBga1hI,21977
-matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
-matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-matrice_compute/resources_tracker.py,sha256=My26LPglDHcQcTkxxiXwpfdqkpEAt3clrqJ-k1fAl1M,17878
-matrice_compute/scaling.py,sha256=8HfbKMsR7EI0rrLfKl_gz6FMO2Q4sLXELxGc3DcLwz8,31743
-matrice_compute/shutdown_manager.py,sha256=0MYV_AqygqR9NEntYf7atUC-PbWXyNkm1f-8c2aizgA,13234
-matrice_compute/task_utils.py,sha256=ML9uTrYQiWgEMJitYxoGlVOa9KUXNKV_WqnousOTK6k,2762
-matrice_compute-0.1.12.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
-matrice_compute-0.1.12.dist-info/METADATA,sha256=__gJ0pkG07q5s8kOmdk8ItFWV3pfcHa-EUKDEe48xrY,1038
-matrice_compute-0.1.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-matrice_compute-0.1.12.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
-matrice_compute-0.1.12.dist-info/RECORD,,
{matrice_compute-0.1.12.dist-info → matrice_compute-0.1.14.dist-info}/WHEEL
File without changes

{matrice_compute-0.1.12.dist-info → matrice_compute-0.1.14.dist-info}/licenses/LICENSE.txt
File without changes

{matrice_compute-0.1.12.dist-info → matrice_compute-0.1.14.dist-info}/top_level.txt
File without changes