matrice_compute-0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/__init__.py +9 -0
- matrice_compute/action_instance.py +1508 -0
- matrice_compute/actions_manager.py +226 -0
- matrice_compute/actions_scaledown_manager.py +57 -0
- matrice_compute/instance_manager.py +270 -0
- matrice_compute/instance_utils.py +707 -0
- matrice_compute/prechecks.py +538 -0
- matrice_compute/py.typed +0 -0
- matrice_compute/resources_tracker.py +478 -0
- matrice_compute/scaling.py +880 -0
- matrice_compute/shutdown_manager.py +314 -0
- matrice_compute/task_utils.py +77 -0
- matrice_compute-0.1.1.dist-info/METADATA +28 -0
- matrice_compute-0.1.1.dist-info/RECORD +17 -0
- matrice_compute-0.1.1.dist-info/WHEEL +5 -0
- matrice_compute-0.1.1.dist-info/licenses/LICENSE.txt +21 -0
- matrice_compute-0.1.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1508 @@
"""Module providing action_instance functionality."""

import logging
import os
import shlex
import subprocess
import threading
import time
import signal
import urllib.request
from matrice_compute.instance_utils import (
    get_gpu_with_sufficient_memory_for_action,
    get_decrypted_access_key_pair,
    get_max_file_system,
)
from matrice_compute.task_utils import (
    setup_workspace_and_run_task,
)
from matrice_compute.scaling import (
    Scaling,
)
from matrice_common.utils import log_errors


class ActionInstance:
    """Base class for tasks that run in Action containers."""

    def __init__(self, scaling: Scaling, action_info: dict):
        """Initialize an action instance.

        Args:
            scaling (Scaling): Scaling service instance
            action_info (dict): Action information dictionary
        """
        self.scaling = scaling
        self.process: subprocess.Popen | None = None
        self.stop_thread = False
        self.log_thread: threading.Thread | None = None
        self.log_path: str | None = None
        self.cmd: str | None = None
        self.matrice_access_key_id: str | None = None
        self.matrice_secret_access_key: str | None = None
        self.action_info = action_info
        self.action_record_id = action_info["_id"]
        self.action_type = action_info["action"]
        self.action_details = action_info["actionDetails"]
        self.docker_container = self.action_details.get(
            "docker",
            self.action_details.get(
                "docker_container",
                self.scaling.get_data_processing_image(),
            ),
        )
        self.actions_map = {
            "model_train": model_train_execute,
            "model_eval": model_eval_execute,
            "model_export": model_export_execute,
            "deploy_add": model_deploy_execute,
            "data_import": data_processing_execute,
            "data_add": data_processing_execute,
            "data_split": data_split_execute,
            "data_prep": data_preparation_execute,
            "dataset_annotation": dataset_annotation_execute,
            "dataset_augmentation": dataset_augmentation_execute,
            "augmentation_setup": augmentation_server_creation_execute,
            "dataset_generation": synthetic_dataset_generation_execute,
            "synthetic_data_setup": synthetic_data_setup_execute,
            "image_build": image_build_execute,
            "resource_clone": resource_clone_execute,
            "database_setup": database_setup_execute,
            "kafka_setup": kafka_setup_execute,
            "inference_aggregator": deploy_aggregator_execute,
            "redis_setup": redis_setup_execute,
            "streaming_gateway": streaming_gateway_execute,
            "facial_recognition_setup": facial_recognition_setup_execute,
            "fe_fs_streaming": fe_fs_streaming_execute,
            "inference_ws_server": inference_ws_server_execute,
        }
        if self.action_type not in self.actions_map:
            raise ValueError(f"Unknown action type: {self.action_type}")
        self.task = self.actions_map[self.action_type]

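    # Illustrative usage sketch (editor's addition, not part of the original
    # module): how a manager loop might drive an ActionInstance. The `scaling`
    # service and `action_info` shapes are assumed from the constructor above.
    #
    #   instance = ActionInstance(scaling, action_info)
    #   instance.execute()            # dispatches through self.actions_map
    #   while instance.is_running():  # polls the child docker process
    #       time.sleep(10)
    #   instance.stop()               # terminates and flushes remaining logs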
    @log_errors(default_return={}, raise_exception=True, log_error=False)
    def _init_credentials(self):
        """Initialize Matrice credentials.

        Returns:
            dict: Dictionary containing access key ID and secret access key
        """
        self.matrice_access_key_id = self.scaling.session.access_key
        self.matrice_secret_access_key = self.scaling.session.secret_key
        if not all(
            [
                self.matrice_access_key_id,
                self.matrice_secret_access_key,
            ]
        ):
            raise ValueError(
                "Matrice credentials not found - both access key ID and secret access key are required"
            )
        return {
            "matrice_access_key_id": self.matrice_access_key_id,
            "matrice_secret_access_key": self.matrice_secret_access_key,
        }

    @log_errors(default_return="logs", raise_exception=False, log_error=False)
    def get_log_path(self):
        """Get log directory path, creating if needed.

        Returns:
            str: Path to log directory
        """
        os.makedirs("logs", exist_ok=True)
        return "logs"

    @log_errors(default_return=False, raise_exception=False, log_error=False)
    def is_running(self) -> bool:
        """Check if task process is running.

        This method performs a thorough check to determine if the process is still running:
        1. Verifies that the process attribute exists and is not None
        2. Checks if the process has terminated using poll() method
        3. Additional safeguards against zombie processes
        4. Coordinates with log monitoring to ensure all logs are sent before cleanup

        Returns:
            bool: True if process exists and is still running, False if process
                does not exist or has terminated
        """
        # Basic check if process exists
        if not hasattr(self, "process") or self.process is None:
            return False

        try:
            # Check if process has terminated
            poll_result = self.process.poll()

            # poll() returns None if the process is still running
            is_running = poll_result is None

            # If process has terminated, ensure we do proper cleanup
            if not is_running:
                # Log termination with action ID for debugging
                action_id = getattr(self, "action_record_id", "unknown")
                logging.info(
                    "Process for action %s has terminated with exit code: %s",
                    action_id,
                    poll_result,
                )

                # CRITICAL: Ensure all logs are sent before cleaning up process
                self._ensure_final_logs_sent()

                # Try to explicitly clean up the process to avoid zombies
                try:
                    # Wait for process with a short timeout to ensure it's fully terminated
                    self.process.wait(timeout=1)
                except subprocess.TimeoutExpired:
                    # If still running after timeout (unlikely at this point)
                    logging.warning(
                        f"Process for action {action_id} failed to terminate properly"
                    )

                # Set process to None to help garbage collection - BUT ONLY after logs are handled
                self.process = None

            return is_running

        except Exception as e:
            # Something went wrong while checking the process status
            logging.error(f"Error checking process status: {str(e)}")
            # Ensure logs are sent even in error cases
            self._ensure_final_logs_sent()
            # To be safe, assume process is not running when we can't check it
            self.process = None
            return False

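    # Background on the poll() check above: subprocess.Popen.poll() returns
    # None while the child is alive and the exit code once it has terminated,
    # without blocking. A minimal standalone illustration (assumes a POSIX
    # `sleep` binary; independent of this class):
    #
    #   proc = subprocess.Popen(["sleep", "1"])
    #   proc.poll()   # -> None while still running
    #   proc.wait()   # -> 0 once finished; poll() now also returns 0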
    def _ensure_final_logs_sent(self):
        """Ensure all remaining logs are sent when a process terminates.

        This method performs a final log flush to ensure no logs are lost
        when a container crashes or shuts down.
        """
        if (
            not hasattr(self, "log_path")
            or not self.log_path
            or not os.path.exists(self.log_path)
        ):
            return

        try:
            # Set flag to stop continuous logging thread
            self.stop_thread = True

            # Give log thread a moment to finish current operation
            time.sleep(1)

            # Perform final log flush
            logging.info(
                "Performing final log flush for action %s",
                getattr(self, "action_record_id", "unknown"),
            )

            # Read any remaining logs that haven't been sent
            with open(self.log_path, "rb") as log_file:
                # Get the last position that was read (if tracked)
                last_position = getattr(self, "_last_log_position", 0)
                log_file.seek(last_position)
                remaining_content = log_file.read()

            if remaining_content:
                try:
                    decoded_content = remaining_content.decode("utf-8")
                except UnicodeDecodeError:
                    decoded_content = remaining_content.decode(
                        "utf-8", errors="replace"
                    )

                # Send final logs
                self._send_logs_to_scaling(decoded_content)
                self._check_cuda(decoded_content)

                logging.info(
                    "Sent %d bytes of final logs for action %s",
                    len(remaining_content),
                    getattr(self, "action_record_id", "unknown"),
                )
            else:
                logging.debug(
                    "No additional logs to send for action %s",
                    getattr(self, "action_record_id", "unknown"),
                )

        except Exception as e:
            logging.error(
                "Error during final log flush for action %s: %s",
                getattr(self, "action_record_id", "unknown"),
                str(e),
            )

    @log_errors(default_return=None, raise_exception=False, log_error=False)
    def get_action_details(self):
        """Get action details from scaling service.

        Returns:
            dict: Action details if successful, None otherwise
        """
        resp, error, message = self.scaling.get_action_details(self.action_record_id)
        if error:
            logging.error(
                "Error getting action details: %s",
                error,
            )
            return None
        return resp

    @log_errors(default_return="", raise_exception=False)
    def get_gpu_config(self, action_details):
        """Get GPU configuration string based on available GPUs.

        Args:
            action_details (dict): Action details containing GPU requirements

        Returns:
            str: GPU configuration string
        """
        if not action_details["actionDetails"].get("gpuRequired", False):
            return ""
        gpu_indices = get_gpu_with_sufficient_memory_for_action(
            action_details=action_details
        )
        if gpu_indices:
            gpu_str = ",".join(map(str, gpu_indices))
            logging.info("Using GPUs: %s", gpu_str)
            return f'--gpus "device={gpu_str}"'
        logging.info("No GPUs with sufficient memory found.")
        return ""

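    # For reference: with gpu_indices == [0, 1] the method above returns the
    # flag '--gpus "device=0,1"', which restricts the container to those GPU
    # indices. Hand-written equivalent (illustrative image and command):
    #
    #   docker run --gpus "device=0,1" <image> nvidia-smi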
    @log_errors(default_return="", raise_exception=False)
    def get_base_docker_cmd(
        self,
        work_fs: str = "",
        use_gpu: str = "",
        mount_docker_sock: bool = False,
        action_id: str = "",
        model_key: str = "",
        extra_env_vars: dict | None = None,
        port_mapping: dict | None = None,
        network_config: str = "",
        destination_workspace_path: str = "/usr/src/workspace",
        docker_workdir: str = "",
        extra_pkgs: list | None = None,
    ):
        """Build base Docker command with common options.

        Args:
            work_fs (str): Work filesystem path
            use_gpu (str): GPU configuration string
            mount_docker_sock (bool): Whether to mount Docker socket
            action_id (str): Action ID
            model_key (str): Model key
            extra_env_vars (dict): Additional environment variables
            port_mapping (dict): Port mappings {host_port: container_port}
            network_config (str): Docker network flags; defaults to host
                networking, or to explicit -p mappings when port_mapping is given
            destination_workspace_path (str): Container workspace path
            docker_workdir (str): Docker working directory
            extra_pkgs (list): List of extra packages to install

        Returns:
            str: Base Docker command
        """
        # Normalize optional collections to avoid mutable default arguments
        extra_env_vars = extra_env_vars or {}
        port_mapping = port_mapping or {}
        extra_pkgs = extra_pkgs or []
        env = os.environ.get("ENV", "prod")
        env_vars = {
            "ENV": env,
            "MATRICE_SECRET_ACCESS_KEY": self.matrice_secret_access_key,
            "MATRICE_ACCESS_KEY_ID": self.matrice_access_key_id,
        }
        hugging_face_token = self.get_hugging_face_token(model_key)
        if hugging_face_token:
            env_vars["HUGGING_FACE_ACCESS_TOKEN"] = hugging_face_token
        if extra_env_vars:
            env_vars.update(extra_env_vars)

        if network_config == "":
            network_config = (
                "--net=host"
                if not port_mapping
                else " ".join(
                    f"-p {host}:{container}" for host, container in port_mapping.items()
                )
            )

        if not docker_workdir:
            if action_id:
                docker_workdir = f"/usr/src/{action_id}"
            else:
                docker_workdir = "."
        volumes = [
            (  # Mount workspace if work_fs is provided
                f"-v {work_fs}/workspace:{destination_workspace_path}"
                if work_fs and work_fs not in ["/"]
                else ""
            ),
            (  # Mount action directory if work_fs and action_id are provided
                f"-v {work_fs}/{action_id}:/usr/src/{action_id}"
                if work_fs and work_fs not in ["/"] and action_id
                else ""
            ),
            "-v /var/run/docker.sock:/var/run/docker.sock" if mount_docker_sock else "",
        ]
        pypi_index = f"https://{'test.' if env != 'prod' else ''}pypi.org/simple/"
        pkgs = ["matrice_common", "matrice"]
        pkgs.extend(extra_pkgs)
        pip_install_matrice = f"pip install --upgrade --force-reinstall --index-url {pypi_index} {' '.join(pkgs)}"
        pip_install_requirements = (
            "if [ -f requirements.txt ]; then pip install -r requirements.txt; fi "
        )

        # Create export statements for environment variables to ensure they're available in subshells
        env_exports = " && ".join(
            [
                f"export {key}={shlex.quote(str(value))}"
                for key, value in env_vars.items()
            ]
        )

        cmd_parts = [
            f"docker run {use_gpu} ",
            network_config,
            *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
            *volumes,
            # Container configuration and startup commands
            f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
            f'/bin/bash -c "cd {docker_workdir} && '
            f"{env_exports} && "
            f"{pip_install_requirements} && "
            f"{pip_install_matrice} && ",
        ]

        # Join all non-empty parts with spaces
        return " ".join(filter(None, cmd_parts))

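    # Rough shape of the string returned above (illustrative; the image name,
    # paths, and action id are placeholders). Note that the /bin/bash -c "..."
    # string is deliberately left open, ending in '&& ', so the caller can
    # append its entrypoint and the closing quote:
    #
    #   docker run --net=host -e ENV=prod -e MATRICE_ACCESS_KEY_ID=... \
    #       -v <work_fs>/workspace:/usr/src/workspace \
    #       --shm-size=30G --pull=always <image> \
    #       /bin/bash -c "cd /usr/src/<action_id> && export ENV=prod && ... && \
    #       if [ -f requirements.txt ]; then pip install -r requirements.txt; fi && \
    #       pip install --upgrade --force-reinstall --index-url <pypi_index> matrice_common matrice &&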
    @log_errors(default_return="", raise_exception=False)
    def get_hugging_face_token(self, model_key):
        """Get Hugging Face token for specific model keys.

        Args:
            model_key (str): Model key to check

        Returns:
            str: Hugging Face token if available, empty string otherwise
        """
        hugging_face_token = ""
        if model_key and (
            model_key.startswith("microsoft") or model_key.startswith("timm")
        ):
            secret_name = "hugging_face"
            resp, error, message = self.scaling.get_model_secret_keys(secret_name)
            if error is not None:
                logging.error(
                    "Error getting Hugging Face token: %s",
                    message,
                )
            else:
                hugging_face_token = resp["user_access_token"]
        return hugging_face_token

    @log_errors(default_return="", raise_exception=False)
    def get_hugging_face_token_for_data_generation(self):
        """Get the Hugging Face token used for data generation.

        Returns:
            str: Hugging Face token if available, empty string otherwise
        """
        hugging_face_token = ""
        secret_name = "hugging_face"
        resp, error, message = self.scaling.get_model_secret_keys(secret_name)
        if error is not None:
            logging.error(
                "Error getting Hugging Face token: %s",
                message,
            )
        else:
            hugging_face_token = resp["user_access_token"]
        return hugging_face_token

    @log_errors(default_return="", raise_exception=False)
    def get_internal_api_key(self, action_id):
        """Get internal API key for action.

        Args:
            action_id (str): Action ID

        Returns:
            str: Internal API key if available, empty string otherwise
        """
        internal_api_key = ""
        resp, error, message = self.scaling.get_internal_api_key(action_id)
        if error is not None:
            logging.error(
                "Error getting internal api key: %s",
                message,
            )
        else:
            internal_api_key = resp["internal_api_key"]
        return internal_api_key

    @log_errors(raise_exception=True)
    def setup_action_requirements(
        self,
        action_details,
        work_fs="",
        model_family="",
        action_id="",
    ):
        """Set up action requirements.

        Args:
            action_details (dict): Action details
            work_fs (str): Work filesystem path
            model_family (str): Model family name
            action_id (str): Action ID

        Raises:
            Exception: If setup fails
        """
        # Get job parameters from action_details
        job_params = action_details.get("jobParams", {})

        # Setup model codebase if model_family is provided
        if model_family:
            # Try to get model codebase URLs from action_details first
            model_codebase_url = job_params.get("model_codebase_url")
            model_requirements_url = job_params.get("model_requirements_url")

            # Fallback to API calls if not provided in action_details
            if not model_codebase_url:
                model_codebase_url, error, message = self.scaling.get_model_codebase(
                    model_family
                )
                if error:
                    logging.warning(f"Failed to get model codebase URL: {message}")
                    model_codebase_url = None

            # Handle requirements URL - use from job_params or get from API
            if model_requirements_url:
                model_codebase_requirements_url = model_requirements_url
            else:
                model_codebase_requirements_url, error, message = (
                    self.scaling.get_model_codebase_requirements(model_family)
                )
                if error:
                    logging.warning(
                        f"Failed to get model codebase requirements URL: {message}"
                    )
                    model_codebase_requirements_url = None

            # Setup workspace if we have the URLs
            if model_codebase_url:
                setup_workspace_and_run_task(
                    work_fs,
                    action_id,
                    model_codebase_url,
                    model_codebase_requirements_url,
                )

        # Setup Docker credentials
        try:
            # Try to get Docker credentials from action_details first
            docker_username = job_params.get("Username")
            docker_password = job_params.get("Password")
            if docker_username and docker_password:
                username = docker_username
                password = docker_password
                logging.info("Using Docker credentials from action_details")
            else:
                # Fallback to API call
                creds, error, message = self.scaling.get_docker_hub_credentials()
                if error:
                    raise Exception(f"Failed to get Docker credentials: {message}")
                username = creds["username"]
                password = creds["password"]
                logging.info("Using Docker credentials from API call")

            if username and password:
                login_cmd = f"docker login -u {shlex.quote(username)} -p {shlex.quote(password)}"
                subprocess.run(login_cmd, shell=True, check=True)
                logging.info("Docker login successful")
            else:
                logging.warning(
                    "Docker credentials not available, skipping Docker login"
                )

        except Exception as err:
            logging.error(
                "Docker login failed: %s",
                str(err),
            )
            raise

        # Setup user access credentials
        try:
            # Try to get access key and secret key from job_params first
            access_key = job_params.get("access_key")
            secret_key = job_params.get("secret_key")

            if access_key and secret_key:
                logging.info("Using access key and secret key from job_params")
                (
                    self.matrice_access_key_id,
                    self.matrice_secret_access_key,
                ) = get_decrypted_access_key_pair(access_key, secret_key)
            else:
                # Fallback to API call
                logging.info(
                    "Access key and secret key not found in job_params, falling back to API call"
                )
                (
                    user_access_key_pair,
                    error,
                    message,
                ) = self.scaling.get_user_access_key_pair(action_details["_idUser"])
                if error:
                    raise Exception(f"Failed to get user access key pair: {message}")
                access_key = user_access_key_pair["access_key"]
                secret_key = user_access_key_pair["secret_key"]
                (
                    self.matrice_access_key_id,
                    self.matrice_secret_access_key,
                ) = get_decrypted_access_key_pair(access_key, secret_key)

        except Exception as err:
            logging.error(
                "Failed to setup credentials: %s",
                str(err),
            )
            raise

    @log_errors(raise_exception=False)
    def create_redis_container(self, redis_image=None, redis_password=None):
        """Create and start a Redis container using Docker.

        Args:
            redis_image (str, optional): Redis Docker image to use. Defaults to 'redis:latest'
            redis_password (str, optional): Password set via --requirepass on the Redis server

        Returns:
            tuple: (container_info, error, message)
        """
        if redis_image is None:
            redis_image = "redis:latest"

        network_name = f"redis_network_{int(time.time())}"
        subprocess.run(f"docker network create {network_name}", shell=True, check=True)

        try:
            # Default Redis port (fixed rather than dynamically allocated)
            external_port = "6379"

            # Generate a unique container name
            container_name = f"redis_container_{int(time.time())}"

            # Build the docker command to create Redis container with password
            cmd = (
                f"docker run -d "
                f"--network {network_name} "
                f"--name {container_name} "
                f"-p {external_port}:6379 "
                f"--restart unless-stopped "
                f"{redis_image} "
                f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
            )

            logging.info("Creating Redis container with command: %s", cmd)

            # Execute the command
            result = subprocess.run(
                cmd, shell=True, capture_output=True, text=True, timeout=60
            )

            if result.returncode == 0:
                container_id = result.stdout.strip()
                container_info = {
                    "container_id": container_id,
                    "container_name": container_name,
                    "network_name": network_name,
                    "external_port": external_port,
                    "internal_port": 6379,
                    "password": redis_password,
                    "image": redis_image,
                    "status": "running",
                }

                logging.info("Redis container created successfully: %s", container_info)
                return container_info, None, "Redis container created successfully"
            else:
                error_message = f"Failed to create Redis container: {result.stderr}"
                logging.error(error_message)
                return None, "ContainerCreationError", error_message

        except subprocess.TimeoutExpired:
            error_message = "Timeout while creating Redis container"
            logging.error(error_message)
            return None, "TimeoutError", error_message
        except Exception as e:
            error_message = f"Unexpected error creating Redis container: {str(e)}"
            logging.error(error_message)
            return None, "UnexpectedError", error_message

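    # Hypothetical caller-side sketch for the helper above; the unpacking
    # mirrors the (container_info, error, message) tuple it returns:
    #
    #   info, err, msg = self.create_redis_container(redis_password="s3cret")
    #   if err:
    #       logging.error("Redis setup failed: %s", msg)
    #   else:
    #       logging.info("Redis listening on port %s", info["external_port"])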
    @log_errors(raise_exception=False, log_error=False)
    def send_logs_continuously(self):
        """Continuously read and send logs from the log file to the scaling service.

        Enhanced version that tracks log position and handles graceful shutdown.
        """
        last_position = 0
        self._last_log_position = 0  # Track position for final flush

        while not self.stop_thread and os.path.exists(self.log_path):
            try:
                with open(self.log_path, "rb") as log_file:
                    log_file.seek(last_position)
                    new_content = log_file.read()
                    if new_content:
                        try:
                            decoded_content = new_content.decode("utf-8")
                        except UnicodeDecodeError:
                            # Handle invalid UTF-8 bytes by replacing them
                            decoded_content = new_content.decode(
                                "utf-8",
                                errors="replace",
                            )
                        self._send_logs_to_scaling(decoded_content)
                        self._check_cuda(decoded_content)

                    # Update tracked position
                    last_position = log_file.tell()
                    self._last_log_position = last_position

            except Exception as e:
                logging.error(
                    "Error reading logs for action %s: %s",
                    getattr(self, "action_record_id", "unknown"),
                    str(e),
                )

            # Use shorter sleep interval for more responsive log monitoring
            time.sleep(10)  # Reduced from 30 to 10 seconds for better responsiveness

        # Final attempt to send any remaining logs when thread is stopping
        logging.info(
            "Log monitoring thread stopping for action %s, performing final check",
            getattr(self, "action_record_id", "unknown"),
        )

        # One more final read attempt
        try:
            if os.path.exists(self.log_path):
                with open(self.log_path, "rb") as log_file:
                    log_file.seek(last_position)
                    final_content = log_file.read()
                    if final_content:
                        try:
                            decoded_content = final_content.decode("utf-8")
                        except UnicodeDecodeError:
                            decoded_content = final_content.decode(
                                "utf-8", errors="replace"
                            )
                        self._send_logs_to_scaling(decoded_content)
                        self._check_cuda(decoded_content)
                        logging.info(
                            "Sent final %d bytes of logs for action %s",
                            len(final_content),
                            getattr(self, "action_record_id", "unknown"),
                        )
        except Exception as e:
            logging.error(
                "Error in final log read for action %s: %s",
                getattr(self, "action_record_id", "unknown"),
                str(e),
            )

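    # The loop above is a user-space "tail -f" built on seek()/tell(). Minimal
    # standalone version of the same pattern (path and sink() are hypothetical):
    #
    #   pos = 0
    #   while tailing:
    #       with open("logs/task.txt", "rb") as f:
    #           f.seek(pos)
    #           chunk = f.read()
    #           pos = f.tell()
    #       if chunk:
    #           sink(chunk.decode("utf-8", errors="replace"))
    #       time.sleep(10)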
    @log_errors(raise_exception=False, log_error=False)
    def _send_logs_to_scaling(self, log_content):
        """Send logs to the scaling service.

        Args:
            log_content (str): Log content to send
        """
        _, error, message = self.scaling.update_action_docker_logs(
            action_record_id=self.action_record_id,
            log_content=log_content,
        )
        if error:
            logging.error(
                "Error from update_action_docker_logs: %s",
                error,
            )

    @log_errors(raise_exception=False, log_error=False)
    def _check_cuda(self, log_content):
        """Check for CUDA out of memory errors in logs and update action status.

        Args:
            log_content (str): Log content to check
        """
        if "CUDA error: out of memory" in log_content:
            action_details = self.get_action_details()
            if not action_details:
                return
            self.scaling.update_action(
                id=self.action_record_id,
                step_code="ERROR",
                action_type=action_details["action"],
                status="ERROR",
                status_description="CUDA error: out of memory",
                service="bg-job-scheduler",
                job_params=action_details["jobParams"],
            )

    @log_errors(raise_exception=True)
    def start_process(self, cmd, log_name):
        """Start the process and initialize logging.

        Args:
            cmd (str): Command to execute
            log_name (str): Name for log file

        Raises:
            Exception: If process fails to start
        """
        self.cmd = cmd
        self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
        with open(self.log_path, "wb") as out:
            self.process = subprocess.Popen(
                shlex.split(self.cmd),
                stdout=out,
                stderr=out,
                env={**os.environ},
                start_new_session=True,
            )

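    # Why start_new_session=True above: the child runs in its own session (and
    # therefore its own process group), so stop() can signal the whole process
    # tree rather than just the immediate child. Hedged sketch of the pairing:
    #
    #   proc = subprocess.Popen(shlex.split(cmd), start_new_session=True)
    #   os.killpg(os.getpgid(proc.pid), signal.SIGTERM)   # reaches the tree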
    @log_errors(raise_exception=False)
    def start_logger(self):
        """Start the log monitoring thread."""
        self.log_thread = threading.Thread(
            target=self.send_logs_continuously,
            daemon=False,  # CRITICAL: Make thread non-daemon to ensure it completes
        )
        self.log_thread.start()

    @log_errors(raise_exception=False)
    def start(self, cmd: str = "", log_name: str = ""):
        """Start the process and log monitoring thread.

        Args:
            cmd (str): Command to execute
            log_name (str): Name for log file
        """
        self.start_process(cmd, log_name)
        self.start_logger()
        self.scaling.update_status(
            self.action_record_id,
            self.action_type,
            "bg-job-scheduler",
            "DKR_CMD",
            "OK",
            f"Start docker container with command: {cmd.replace(self.matrice_access_key_id, 'MATRICE_ACCESS_KEY_ID').replace(self.matrice_secret_access_key, 'MATRICE_SECRET_ACCESS_KEY')}",
        )

    @log_errors(raise_exception=False, log_error=False)
    def stop(self):
        """Stop the process and log monitoring thread.

        Enhanced version that ensures proper cleanup sequencing and log completion.
        """
        logging.info("Stopping action %s", getattr(self, "action_record_id", "unknown"))

        # Step 1: Signal log thread to stop
        self.stop_thread = True

        # Step 2: Stop the process
        try:
            if self.process:
                logging.info(
                    "Terminating process for action %s",
                    getattr(self, "action_record_id", "unknown"),
                )
                os.killpg(
                    os.getpgid(self.process.pid),
                    signal.SIGTERM,
                )
                # Give process time to terminate gracefully
                try:
                    self.process.wait(timeout=15)
                    logging.info(
                        "Process terminated gracefully for action %s",
                        getattr(self, "action_record_id", "unknown"),
                    )
                except subprocess.TimeoutExpired:
                    logging.warning(
                        "Process didn't terminate gracefully, forcing kill for action %s",
                        getattr(self, "action_record_id", "unknown"),
                    )
                    try:
                        os.killpg(os.getpgid(self.process.pid), signal.SIGKILL)
                        self.process.wait(timeout=5)
                    except Exception as kill_err:
                        logging.error(
                            "Error force-killing process for action %s: %s",
                            getattr(self, "action_record_id", "unknown"),
                            str(kill_err),
                        )
        except Exception as proc_err:
            logging.error(
                "Error stopping process for action %s: %s",
                getattr(self, "action_record_id", "unknown"),
                str(proc_err),
            )

        # Step 3: Ensure final logs are sent
        self._ensure_final_logs_sent()

        # Step 4: Wait for log thread to complete
        if self.log_thread and self.log_thread.is_alive():
            logging.info(
                "Waiting for log thread to complete for action %s",
                getattr(self, "action_record_id", "unknown"),
            )
            try:
                self.log_thread.join(
                    timeout=30
                )  # Wait up to 30 seconds for logs to complete
                if self.log_thread.is_alive():
                    logging.warning(
                        "Log thread didn't complete within timeout for action %s",
                        getattr(self, "action_record_id", "unknown"),
                    )
                else:
                    logging.info(
                        "Log thread completed successfully for action %s",
                        getattr(self, "action_record_id", "unknown"),
                    )
            except Exception as thread_err:
                logging.error(
                    "Error waiting for log thread for action %s: %s",
                    getattr(self, "action_record_id", "unknown"),
                    str(thread_err),
                )

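    # Teardown ordering used by stop() above, condensed: flag the log thread,
    # SIGTERM the process group, escalate to SIGKILL after the grace period,
    # flush unsent log bytes, then join the non-daemon log thread. The
    # escalation step in isolation (illustrative):
    #
    #   try:
    #       proc.wait(timeout=15)
    #   except subprocess.TimeoutExpired:
    #       os.killpg(os.getpgid(proc.pid), signal.SIGKILL)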
    @log_errors(raise_exception=False)
    def execute(self):
        """Execute the task."""
        self.task(self)


@log_errors(raise_exception=False)
def data_preparation_execute(
    self: ActionInstance,
):
    """Execute data preparation task."""
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs, model_family="")
    action = {"jobParams": action_details["jobParams"]}
    dataset_id_version = (
        action_details["jobParams"]["dataset_id"]
        + action_details["jobParams"]["dataset_version"]
    )
    action["jobParams"].update(
        {
            "dataset_host_path_map": {dataset_id_version: f"{work_fs}/workspace"},
            "dataset_local_path_map": {dataset_id_version: "/usr/src/app/workspace"},
            "host_file_system": work_fs,
        }
    )
    self.scaling.update_action(
        id=self.action_record_id,
        step_code="DCK_LNCH",
        action_type=action_details["action"],
        status=action_details["status"],
        sub_action=action_details["subAction"],
        status_description="Job is assigned to docker",
        service="bg-job-scheduler",
        job_params=action["jobParams"],
    )
    if action["jobParams"].get("model_train_docker"):
        logging.info("Pulling the docker image")
        pull_cmd = f"docker pull {action['jobParams']['model_train_docker']}"
        process = subprocess.Popen(
            pull_cmd,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        logging.info(
            "Started pulling Docker image with PID: %s",
            process.pid,
        )
    cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "data_preparation_log")

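# Note on the trailing '"' in the cmd strings above and below: get_base_docker_cmd()
# leaves the /bin/bash -c "... string unterminated, and each *_execute function
# closes it after appending its own entrypoint. Schematic (placeholders only):
#
#   base = self.get_base_docker_cmd(work_fs)   # ends with '... && '
#   cmd = f'{base} python3 /usr/src/app/main.py {action_id} "'
#   #                                   closing double quote ^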
@log_errors(raise_exception=False)
def data_processing_execute(self: ActionInstance):
    """Execute data processing task."""
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs, model_family="")
    action = {"jobParams": action_details["jobParams"]}
    action["jobParams"].update(
        {
            "dp_dv_host_paths": [f"{work_fs}/workspace"],
            "dp_dv_local_paths": ["/usr/src/app/workspace"],
        }
    )
    self.scaling.update_action(
        id=self.action_record_id,
        step_code="DCK_LNCH",
        action_type=action_details["action"],
        status="ACK",
        status_description="Job is assigned to docker",
        service="bg-job-scheduler",
        job_params=action["jobParams"],
    )
    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
    logging.info("cmd: %s", cmd)
    self.start(cmd, "data_processing_log")


@log_errors(raise_exception=False)
def data_split_execute(self: ActionInstance):
    """Execute data split task."""
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs, model_family="")
    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
    logging.info("cmd: %s", cmd)
    self.start(cmd, "data_split")


@log_errors(raise_exception=False)
def dataset_annotation_execute(
    self: ActionInstance,
):
    """Execute dataset annotation task."""
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs)
    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
    logging.info("cmd: %s", cmd)
    self.start(cmd, "dataset_annotation")


@log_errors(raise_exception=False)
def dataset_augmentation_execute(
    self: ActionInstance,
):
    """Execute dataset augmentation task."""
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs)
    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
    logging.info("cmd: %s", cmd)
    self.start(cmd, "dataset_augmentation")


@log_errors(raise_exception=False)
def augmentation_server_creation_execute(
    self: ActionInstance,
):
    """Create an augmentation server."""
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    external_port = self.scaling.get_open_port()
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs)
    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
    logging.info("cmd: %s", cmd)
    self.start(cmd, "augmentation_setup")


@log_errors(raise_exception=False)
def database_setup_execute(self: ActionInstance):
    """Create and set up the database for the facial recognition server."""
    action_details = self.get_action_details()
    if not action_details:
        return
    image = action_details["actionDetails"].get("docker")

    self.setup_action_requirements(action_details)

    project_id = action_details["_idProject"]

    # Build the MongoDB container run command
    cmd = (
        f"docker run --pull=always -p 27020:27017 "
        f"--name mongodbdatabase "
        f"-e ACTION_RECORD_ID={self.action_record_id} "
        f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
        f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
        f"-e PROJECT_ID={project_id} "
        f"-e ENV=dev "
        f"{image} "
    )
    logging.info("Docker command: %s", cmd)

    qdrant_cmd = (
        f"docker run --pull=always "
        f"--name qdrant "
        f"-p 6333:6333 "
        f"-p 6334:6334 "
        f"qdrant/qdrant:latest "
    )

    # Run the database container
    self.start(cmd, "database_setup")

    # Run the Qdrant container
    self.start(qdrant_cmd, "qdrant_setup")


@log_errors(raise_exception=False)
def facial_recognition_setup_execute(self: ActionInstance):
    """Create and set up the worker container for the facial recognition server."""
    action_details = self.get_action_details()

    if not action_details:
        return
    image = action_details["actionDetails"].get("docker")

    self.setup_action_requirements(action_details)

    # Add worker container run command
    worker_cmd = (
        f"docker run -d --pull=always "
        f"--name worker "
        f"-p 8081:8081 "
        f'-e ENV="{os.environ.get("ENV", "prod")}" '
        f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
        f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
        f'-e ACTION_ID="{self.action_record_id}" '
        f"{image}"
    )
    logging.info("Worker docker run command: %s", worker_cmd)

    # Run the worker container
    self.start(worker_cmd, "facial_recognition_setup")


@log_errors(raise_exception=False)
def inference_ws_server_execute(self: ActionInstance):
    """Create and start the inference pipeline."""
    action_details = self.get_action_details()

    if not action_details:
        return
    image = action_details["actionDetails"].get("docker")

    self.setup_action_requirements(action_details)

    # Add inference container run command
    worker_cmd = (
        f"docker run -d --pull=always "
        f"--name inference "
        f"-p 8102:8102 "
        f'-e ENV="{os.environ.get("ENV", "prod")}" '
        f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
        f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
        f"{image}"
    )
    logging.info("Inference docker run command: %s", worker_cmd)

    # Run the inference container
    self.start(worker_cmd, "inference_ws_server")


@log_errors(raise_exception=False)
def fe_fs_streaming_execute(self: ActionInstance):
    """Create and set up the frontend for FS streaming."""
    action_details = self.get_action_details()

    if not action_details:
        return
    image = action_details["actionDetails"].get("docker")

    self.setup_action_requirements(action_details)

    # Add frontend container run command
    worker_cmd = (
        f"docker run -d --pull=always "
        f"--name fe_streaming "
        f"-p 3000:3000 "
        f'-e ENV="{os.environ.get("ENV", "prod")}" '
        f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
        f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
        f"{image}"
    )
    logging.info("fe_fs_streaming docker run command: %s", worker_cmd)

    # Run the frontend container
    self.start(worker_cmd, "fe_fs_streaming")


@log_errors(raise_exception=False)
def synthetic_dataset_generation_execute(self: ActionInstance):
    """Execute synthetic dataset generation task."""
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs)
    extra_env_vars = {}
    hf_token = self.get_hugging_face_token_for_data_generation()
    if hf_token:
        extra_env_vars["HUGGING_FACE_ACCESS_TOKEN"] = hf_token
    else:
        return
    use_gpu = self.get_gpu_config(action_details)
    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "dataset_generation")


@log_errors(raise_exception=False)
def synthetic_data_setup_execute(self: ActionInstance):
    """Execute synthetic data setup task."""
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    external_port = self.scaling.get_open_port()
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs)
    extra_env_vars = {}
    hf_token = self.get_hugging_face_token_for_data_generation()
    if hf_token:
        extra_env_vars["HUGGING_FACE_ACCESS_TOKEN"] = hf_token
    else:
        return
    use_gpu = self.get_gpu_config(action_details)
    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "synthetic_data_setup")


@log_errors(raise_exception=False)
def redis_setup_execute(self: ActionInstance):
    """Create and start a Redis container using Docker."""
    external_port = self.scaling.get_open_port()
    work_fs = get_max_file_system()

    action_details = self.get_action_details()
    if not action_details:
        return
    action_id = action_details["_id"]

    redis_password = action_details["jobParams"].get(
        "password", f"redis_pass_{int(time.time())}"
    )

    container_info, error, message = self.create_redis_container(
        action_details["actionDetails"].get("redis_image", "redis:latest"),
        redis_password=redis_password,
    )
    if error:
        logging.error(
            "Error creating Redis container: %s",
            message,
        )
        return
    logging.info("Redis container created successfully: %s", container_info)

    # Set up credentials and workspace for the app container
    self.setup_action_requirements(
        action_details,
        work_fs,
        model_family="",
        action_id=action_id,
    )

    env_vars = {
        "REDIS_URL": f"{container_info['container_name']}:{container_info['external_port']}",
        "REDIS_PASSWORD": container_info["password"],
    }

    network_config = f" --network {container_info['network_name']} -p 8082:8082"

    # Build the docker run command
    cmd = (
        f"docker run "
        f"{network_config} "
        f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
        f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
        f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
        f"-e MATRICE_SECRET_ACCESS_KEY={shlex.quote(self.matrice_secret_access_key)} "
        f"-e ENV={shlex.quote(os.environ.get('ENV', 'prod'))} "
        f"-v /var/run/docker.sock:/var/run/docker.sock "
        f"--shm-size=30G --pull=always "
        f"{self.docker_container} "
        f"{self.action_record_id} "
    )

    logging.info("cmd is: %s", cmd)

    self.start(cmd, "redis_setup")

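# Networking note for redis_setup_execute above: the app container joins the
# same user-defined Docker network as the Redis container, so REDIS_URL can
# address Redis by container name (Docker's embedded DNS resolves it there).
# Hand-run equivalent with illustrative names:
#
#   docker network create redis_network_X
#   docker run -d --network redis_network_X --name redis_container_X redis:latest
#   docker run --network redis_network_X -e REDIS_URL=redis_container_X:6379 <app-image>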
|
|
1262
|
+
@log_errors(raise_exception=False)
|
|
1263
|
+
def deploy_aggregator_execute(
|
|
1264
|
+
self: ActionInstance,
|
|
1265
|
+
):
|
|
1266
|
+
"""Execute deploy aggregator task."""
|
|
1267
|
+
work_fs = get_max_file_system()
|
|
1268
|
+
action_details = self.get_action_details()
|
|
1269
|
+
if not action_details:
|
|
1270
|
+
return
|
|
1271
|
+
self.setup_action_requirements(action_details, work_fs)
|
|
1272
|
+
cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
|
|
1273
|
+
logging.info("cmd: %s", cmd)
|
|
1274
|
+
self.start(cmd, "deploy_aggregator")
|
|
1275
|
+
|
|
1276
|
+
|
|
1277
|
+
@log_errors(raise_exception=False)
|
|
1278
|
+
def model_deploy_execute(self: ActionInstance):
|
|
1279
|
+
"""Execute model deployment task."""
|
|
1280
|
+
external_port = self.scaling.get_open_port()
|
|
1281
|
+
internal_port = self.scaling.get_open_port()
|
|
1282
|
+
work_fs = get_max_file_system()
|
|
1283
|
+
action_details = self.get_action_details()
|
|
1284
|
+
if not action_details:
|
|
1285
|
+
return
|
|
1286
|
+
action_id = action_details["_id"]
|
|
1287
|
+
model_family = action_details["actionDetails"]["modelFamily"]
|
|
1288
|
+
self.setup_action_requirements(
|
|
1289
|
+
action_details,
|
|
1290
|
+
work_fs,
|
|
1291
|
+
model_family=model_family,
|
|
1292
|
+
action_id=action_id,
|
|
1293
|
+
)
|
|
1294
|
+
use_gpu = self.get_gpu_config(action_details)
|
|
1295
|
+
extra_env_vars = {"INTERNAL_PORT": internal_port}
|
|
1296
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
|
|
1297
|
+
logging.info("cmd is: %s", cmd)
|
|
1298
|
+
self.start(cmd, "deploy_log")
|
|
1299
|
+
|
|
1300
|
+
|
|
1301
|
+
@log_errors(raise_exception=False)
|
|
1302
|
+
def model_train_execute(self: ActionInstance):
|
|
1303
|
+
"""Execute model training task."""
|
|
1304
|
+
action_details = self.get_action_details()
|
|
1305
|
+
if not action_details:
|
|
1306
|
+
return
|
|
1307
|
+
action_id = action_details["_id"]
|
|
1308
|
+
use_gpu = self.get_gpu_config(action_details)
|
|
1309
|
+
work_fs = action_details["jobParams"]["host_file_system"]
|
|
1310
|
+
model_key = action_details["actionDetails"]["modelKey"]
|
|
1311
|
+
model_family = action_details["actionDetails"]["modelFamily"]
|
|
1312
|
+
self.setup_action_requirements(
|
|
1313
|
+
action_details,
|
|
1314
|
+
work_fs,
|
|
1315
|
+
model_family=model_family,
|
|
1316
|
+
action_id=action_id,
|
|
1317
|
+
)
|
|
1318
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
|
|
1319
|
+
logging.info("cmd is: %s", cmd)
|
|
1320
|
+
self.start(cmd, "train_log")
|
|
1321
|
+
|
|
1322
|
+
|
|
1323
|
+
@log_errors(raise_exception=False)
|
|
1324
|
+
def model_eval_execute(self: ActionInstance):
|
|
1325
|
+
"""Execute model evaluation task."""
|
|
1326
|
+
action_details = self.get_action_details()
|
|
1327
|
+
if not action_details:
|
|
1328
|
+
return
|
|
1329
|
+
action_id = action_details["_id"]
|
|
1330
|
+
work_fs = action_details["jobParams"]["host_file_system"]
|
|
1331
|
+
model_family = action_details["actionDetails"]["modelFamily"]
|
|
1332
|
+
use_gpu = self.get_gpu_config(action_details)
|
|
1333
|
+
self.setup_action_requirements(
|
|
1334
|
+
action_details,
|
|
1335
|
+
work_fs,
|
|
1336
|
+
model_family=model_family,
|
|
1337
|
+
action_id=action_id,
|
|
1338
|
+
)
|
|
1339
|
+
cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
|
|
1340
|
+
logging.info("cmd is: %s", cmd)
|
|
1341
|
+
self.start(cmd, "eval_log")
|
|
1342
|
+
|
|
1343
|
+
|
|


@log_errors(raise_exception=False)
def model_export_execute(self: ActionInstance):
    """Execute model export task."""
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    if not action_details:
        return
    action_id = action_details["_id"]
    if "host_file_system" in action_details["jobParams"]:
        work_fs = action_details["jobParams"]["host_file_system"]
    logging.info("host_file_system: %s", work_fs)
    use_gpu = self.get_gpu_config(action_details)
    model_family = action_details["actionDetails"]["modelFamily"]
    self.setup_action_requirements(
        action_details,
        work_fs,
        model_family=model_family,
        action_id=action_id,
    )
    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "export_log")


@log_errors(raise_exception=False)
def image_build_execute(self: ActionInstance):
    """Execute image building task."""
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details)
    model_family_id = action_details["_idService"]
    action_id = action_details["_id"]
    internal_api_key = self.get_internal_api_key(action_id)
    extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
    cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "image_build_log")


@log_errors(raise_exception=False)
def resource_clone_execute(self: ActionInstance):
    """Execute resource clone task."""
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details)
    cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "resource_clone")


@log_errors(raise_exception=False)
def streaming_gateway_execute(self: ActionInstance):
    """Execute streaming gateway task."""
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details)
    if not self.docker_container:
        self.docker_container = (
            f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
        )
    cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "streaming_gateway")


@log_errors(raise_exception=False)
def kafka_setup_execute(self: ActionInstance):
    """Execute kafka server task."""
    action_details = self.get_action_details()
    if not action_details:
        return
    host_port = self.scaling.get_open_port()
    host_ip = (
        urllib.request.urlopen("https://ident.me", timeout=10).read().decode("utf8")
    )
    container_port = 9092
    # Setup credentials
    self.setup_action_requirements(action_details)

    # Get Docker disk usage to calculate log retention
    from matrice_compute.instance_utils import get_docker_disk_space_usage

    docker_disk_usage = get_docker_disk_space_usage()
    log_retention_bytes = 0
    if docker_disk_usage:
        # Use 90% of the available Docker disk space for log retention
        available_disk_gb = docker_disk_usage["available"]
        log_retention_bytes = int(
            available_disk_gb * 0.9 * 1024 * 1024 * 1024
        )  # Convert GB to bytes
        logging.info(
            "Kafka log retention set to %d bytes (90%% of %f GB available Docker disk)",
            log_retention_bytes,
            available_disk_gb,
        )
    else:
        # Fallback if Docker disk usage cannot be determined
        log_retention_bytes = 500 * 1024 * 1024 * 1024  # 500GB default
        logging.warning(
            "Could not determine Docker disk usage, using default 500GB log retention"
        )
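
    # Worked example of the arithmetic above (illustrative numbers, not from this diff):
    # with 500 GB reported available, retention is int(500 * 0.9 * 1024**3)
    # = 483_183_820_800 bytes (~450 GiB), and the segment size chosen below is
    # min(1_073_741_824, 483_183_820_800 // 10) = 1_073_741_824, i.e. the 1 GiB cap.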

    # Prepare environment variables for Kafka
    env = os.environ.get("ENV", "prod")
    env_vars = {
        "ENV": env,
        "MATRICE_SECRET_ACCESS_KEY": self.matrice_secret_access_key,
        "MATRICE_ACCESS_KEY_ID": self.matrice_access_key_id,
        "KAFKA_NODE_ID": 1,
        "KAFKA_PROCESS_ROLES": "broker,controller",
        "KAFKA_LISTENERS": "SASL_PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093",
        "KAFKA_ADVERTISED_LISTENERS": f"SASL_PLAINTEXT://{host_ip}:{host_port}",
        "KAFKA_LISTENER_SECURITY_PROTOCOL_MAP": "CONTROLLER:PLAINTEXT,SASL_PLAINTEXT:SASL_PLAINTEXT",
        "KAFKA_CONTROLLER_LISTENER_NAMES": "CONTROLLER",
        "KAFKA_CONTROLLER_QUORUM_VOTERS": "1@localhost:9093",
        "KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR": 1,
        "KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR": 1,
        "KAFKA_TRANSACTION_STATE_LOG_MIN_ISR": 1,
        "KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS": 0,
        "KAFKA_NUM_PARTITIONS": 5,
        "KAFKA_SASL_ENABLED_MECHANISMS": "SCRAM-SHA-256",
        "KAFKA_SASL_MECHANISM_INTER_BROKER_PROTOCOL": "SCRAM-SHA-256",
        "KAFKA_INTER_BROKER_LISTENER_NAME": "SASL_PLAINTEXT",
        "KAFKA_MESSAGE_MAX_BYTES": 25000000,
        "KAFKA_HEAP_OPTS": "-Xms2G -Xmx8G",
        "KAFKA_NUM_NETWORK_THREADS": 6,
        "KAFKA_NUM_IO_THREADS": 8,
        "KAFKA_REPLICA_FETCH_MAX_BYTES": 25000000,
        "KAFKA_FETCH_MESSAGE_MAX_BYTES": 25000000,
        "KAFKA_REPLICA_FETCH_RESPONSE_MAX_BYTES": 25000000,
        # Log retention settings based on Docker disk space
        "KAFKA_LOG_RETENTION_BYTES": log_retention_bytes,
        "KAFKA_LOG_SEGMENT_BYTES": min(
            1073741824, log_retention_bytes // 10
        ),  # 1GB or 10% of retention, whichever is smaller
    }

    # Build environment variable command parts
    env_args = " ".join(
        [f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()]
    )
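
    # For illustration (hypothetical rendering): the join above yields a flat string like
    #   -e ENV=prod -e KAFKA_NODE_ID=1 -e KAFKA_HEAP_OPTS='-Xms2G -Xmx8G' ...
    # shlex.quote() only wraps a value in quotes when it contains spaces or other shell
    # metacharacters, so plain values pass through unquoted.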

    # Build the docker run command for the Kafka image directly, rather than via
    # get_base_docker_cmd()
    pypi_index = f"https://{'test.' if env != 'prod' else ''}pypi.org/simple/"

    cmd = (
        f"docker run -p {host_port}:{container_port} "
        f"{env_args} "
        f"--shm-size=30G --pull=always "
        f'aiforeveryone/matrice-kafka:latest /bin/bash -c "'
        f"cd /opt/kafka/bin && "
        f"source venv/bin/activate && "
        f"/opt/kafka/bin/startup.sh & "
        f"if [ -f requirements.txt ]; then venv/bin/python3 -m pip install -r requirements.txt; fi && "
        f"venv/bin/python3 -m pip install --upgrade --force-reinstall --index-url {pypi_index} matrice_common matrice && "
        f"sleep 20 && "
        f'venv/bin/python3 main.py {self.action_record_id} {host_port}"'
    )

    logging.info("cmd is: %s", cmd)
    self.start(cmd, "kafka_setup")
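
# Reading of the bash -c chain above (inferred from the command text, not stated in
# this diff): `&&` binds tighter than `&`, so the whole `cd ... && source ... &&
# startup.sh` list is backgrounded as one job, letting the Kafka broker boot while
# the foreground runs the pip installs, `sleep 20`, and finally main.py. Since the
# backgrounded `cd` happens in its own subshell, the foreground's relative paths
# (requirements.txt, venv/bin/python3) resolve against the image's default working
# directory, which this reading assumes is also /opt/kafka/bin.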