matrice-compute 0.1.29 (matrice_compute-0.1.29-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/__init__.py +20 -0
- matrice_compute/action_instance.py +2023 -0
- matrice_compute/actions_manager.py +467 -0
- matrice_compute/actions_scaledown_manager.py +57 -0
- matrice_compute/compute_operations_handler.py +490 -0
- matrice_compute/instance_manager.py +470 -0
- matrice_compute/instance_utils.py +1266 -0
- matrice_compute/prechecks.py +538 -0
- matrice_compute/py.typed +0 -0
- matrice_compute/resources_tracker.py +842 -0
- matrice_compute/scaling.py +1395 -0
- matrice_compute/shutdown_manager.py +314 -0
- matrice_compute/task_utils.py +77 -0
- matrice_compute-0.1.29.dist-info/METADATA +28 -0
- matrice_compute-0.1.29.dist-info/RECORD +18 -0
- matrice_compute-0.1.29.dist-info/WHEEL +5 -0
- matrice_compute-0.1.29.dist-info/licenses/LICENSE.txt +21 -0
- matrice_compute-0.1.29.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2023 @@
"""Module providing action_instance functionality."""

import logging
import os
import shlex
import subprocess
import threading
import time
import signal
import urllib.request
from matrice_compute.instance_utils import (
    get_gpu_with_sufficient_memory_for_action,
    get_gpu_config_for_deployment,
    get_decrypted_access_key_pair,
    get_max_file_system,
    get_best_service_ip_and_network,
)
from matrice_compute.task_utils import (
    setup_workspace_and_run_task,
)
from matrice_compute.scaling import (
    Scaling,
)
from matrice_common.utils import log_errors


class ActionInstance:
    """Base class for tasks that run in Action containers."""

    # Class-level dictionary to track deployed services and their ports
    # Key: _idService, Value: {"triton_ports": "port1,port2,port3", "is_first": False}
    _deployed_services = {}

    def __init__(self, scaling: Scaling, action_info: dict):
        """Initialize an action instance.

        Args:
            scaling (Scaling): Scaling service instance
            action_info (dict): Action information dictionary
        """
        self.scaling = scaling
        self.process: subprocess.Popen | None = None
        self.stop_thread = False
        self.log_thread: threading.Thread | None = None
        self.log_path: str | None = None
        self.cmd: str | None = None
        self.matrice_access_key_id: str | None = None
        self.matrice_secret_access_key: str | None = None
        self.action_info = action_info
        self.action_record_id = action_info["_id"]
        self.action_type = action_info["action"]
        self.action_details = action_info["actionDetails"]
        self.docker_container = self.action_details.get(
            "docker",
            self.action_details.get(
                "docker_container",
                self.scaling.get_data_processing_image(),
            ),
        )
        self.actions_map = {
            "model_train": model_train_execute,
            "model_eval": model_eval_execute,
            "model_export": model_export_execute,
            "deploy_add": model_deploy_execute,
            "data_import": data_processing_execute,
            "data_add": data_processing_execute,
            "data_split": data_split_execute,
            "data_prep": data_preparation_execute,
            "dataset_annotation": dataset_annotation_execute,
            "dataset_augmentation": dataset_augmentation_execute,
            "augmentation_setup": augmentation_server_creation_execute,
            "dataset_generation": synthetic_dataset_generation_execute,
            "synthetic_data_setup": synthetic_data_setup_execute,  # start
            "image_build": image_build_execute,
            "resource_clone": resource_clone_execute,
            "database_setup": database_setup_execute,
            "kafka_setup": kafka_setup_execute,
            "inference_aggregator": deploy_aggregator_execute,
            "redis_setup": redis_setup_execute,
            "streaming_gateway": streaming_gateway_execute,
            "facial_recognition_setup": facial_recognition_setup_execute,
            "fe_fs_streaming": fe_fs_streaming_execute,
            "inference_ws_server": inference_ws_server_execute,
            "fe_analytics_service": fe_analytics_service_execute,
            "lpr_setup": lpr_setup_execute,
            "inference_tracker_server": inference_tracker_setup_execute,
        }
        if self.action_type not in self.actions_map:
            raise ValueError(f"Unknown action type: {self.action_type}")
        self.task = self.actions_map[self.action_type]

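    # Illustrative shape of `action_info` (hypothetical values; only the keys
    # read above are shown):
    #   {
    #       "_id": "<action record id>",
    #       "action": "data_prep",
    #       "actionDetails": {"docker": "<image ref>", ...},
    #   }
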
    @classmethod
    def is_first_deployment_for_service(cls, service_id):
        """Check if this is the first deployment for a given service.

        Args:
            service_id (str): Service ID (_idService)

        Returns:
            bool: True if this is the first deployment, False otherwise
        """
        if not service_id:
            return False
        return service_id not in cls._deployed_services

    @classmethod
    def get_or_create_triton_ports(cls, service_id, scaling_instance):
        """Get existing TRITON_PORTS for a service or create new ones.

        Args:
            service_id (str): Service ID (_idService)
            scaling_instance: Scaling instance to get open ports

        Returns:
            str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
        """
        if not service_id:
            # No service_id, generate new ports
            port1 = scaling_instance.get_open_port()
            port2 = scaling_instance.get_open_port()
            port3 = scaling_instance.get_open_port()
            return f"{port1},{port2},{port3}"

        # Check if ports already exist for this service
        if service_id in cls._deployed_services:
            triton_ports = cls._deployed_services[service_id]["triton_ports"]
            logging.info(
                "Reusing TRITON_PORTS for service %s: %s",
                service_id,
                triton_ports,
            )
            return triton_ports

        # First deployment: generate new ports and store them
        port1 = scaling_instance.get_open_port()
        port2 = scaling_instance.get_open_port()
        port3 = scaling_instance.get_open_port()
        triton_ports = f"{port1},{port2},{port3}"

        # Store for future use
        cls._deployed_services[service_id] = {
            "triton_ports": triton_ports,
            "is_first": False,
        }

        logging.info(
            "First deployment for service %s - generated TRITON_PORTS: %s",
            service_id,
            triton_ports,
        )
        return triton_ports

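    # Sketch of the port-caching behavior (hypothetical service id and ports):
    #   ActionInstance.get_or_create_triton_ports("svc-1", scaling)  # -> "8001,8002,8003" (generated)
    #   ActionInstance.get_or_create_triton_ports("svc-1", scaling)  # -> "8001,8002,8003" (reused)
    #   ActionInstance._deployed_services
    #   # {"svc-1": {"triton_ports": "8001,8002,8003", "is_first": False}}
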
    @log_errors(default_return={}, raise_exception=True, log_error=False)
    def _init_credentials(self):
        """Initialize Matrice credentials.

        Returns:
            dict: Dictionary containing access key ID and secret access key
        """
        self.matrice_access_key_id = self.scaling.session.access_key
        self.matrice_secret_access_key = self.scaling.session.secret_key
        if not all(
            [
                self.matrice_access_key_id,
                self.matrice_secret_access_key,
            ]
        ):
            raise ValueError(
                "Matrice credentials not found - both access key ID and secret access key are required"
            )
        return {
            "matrice_access_key_id": self.matrice_access_key_id,
            "matrice_secret_access_key": self.matrice_secret_access_key,
        }

    @log_errors(default_return="logs", raise_exception=False, log_error=False)
    def get_log_path(self):
        """Get log directory path, creating if needed.

        Returns:
            str: Path to log directory
        """
        os.makedirs("logs", exist_ok=True)
        return "logs"

    @log_errors(default_return=False, raise_exception=False, log_error=False)
    def is_running(self) -> bool:
        """Check if task process is running.

        This method performs a thorough check to determine if the process is still running:
        1. Verifies that the process attribute exists and is not None
        2. Checks if the process has terminated using poll() method
        3. Additional safeguards against zombie processes
        4. Coordinates with log monitoring to ensure all logs are sent before cleanup

        Returns:
            bool: True if process exists and is still running, False if process
                does not exist or has terminated
        """
        # Basic check if process exists
        if not hasattr(self, "process") or self.process is None:
            return False

        try:
            # Check if process has terminated
            poll_result = self.process.poll()

            # poll() returns None if the process is still running
            is_running = poll_result is None

            # If process has terminated, ensure we do proper cleanup
            if not is_running:
                # Log termination with action ID for debugging
                action_id = getattr(self, "action_record_id", "unknown")
                logging.info(
                    "Process for action %s has terminated with exit code: %s",
                    action_id,
                    poll_result,
                )

                # CRITICAL: Ensure all logs are sent before cleaning up process
                self._ensure_final_logs_sent()

                # Try to explicitly clean up the process to avoid zombies
                try:
                    # Wait for process with a short timeout to ensure it's fully terminated
                    self.process.wait(timeout=1)
                except subprocess.TimeoutExpired:
                    # If still running after timeout (unlikely at this point)
                    logging.warning(
                        f"Process for action {action_id} failed to terminate properly"
                    )

                # Set process to None to help garbage collection - BUT ONLY after logs are handled
                self.process = None

            return is_running

        except Exception as e:
            # Something went wrong while checking the process status
            logging.error(f"Error checking process status: {str(e)}")
            # Ensure logs are sent even in error cases
            self._ensure_final_logs_sent()
            # To be safe, assume process is not running when we can't check it
            self.process = None
            return False

    def _ensure_final_logs_sent(self):
        """Ensure all remaining logs are sent when a process terminates.

        This method performs a final log flush to ensure no logs are lost
        when a container crashes or shuts down.
        """
        if (
            not hasattr(self, "log_path")
            or not self.log_path
            or not os.path.exists(self.log_path)
        ):
            return

        try:
            # Set flag to stop continuous logging thread
            self.stop_thread = True

            # Give log thread a moment to finish current operation
            time.sleep(1)

            # Perform final log flush
            logging.info(
                "Performing final log flush for action %s",
                getattr(self, "action_record_id", "unknown"),
            )

            # Read any remaining logs that haven't been sent
            with open(self.log_path, "rb") as log_file:
                # Get the last position that was read (if tracked)
                last_position = getattr(self, "_last_log_position", 0)
                log_file.seek(last_position)
                remaining_content = log_file.read()

                if remaining_content:
                    try:
                        decoded_content = remaining_content.decode("utf-8")
                    except UnicodeDecodeError:
                        decoded_content = remaining_content.decode(
                            "utf-8", errors="replace"
                        )

                    # Send final logs
                    self._send_logs_to_scaling(decoded_content)
                    self._check_cuda(decoded_content)

                    logging.info(
                        "Sent %d bytes of final logs for action %s",
                        len(remaining_content),
                        getattr(self, "action_record_id", "unknown"),
                    )
                else:
                    logging.debug(
                        "No additional logs to send for action %s",
                        getattr(self, "action_record_id", "unknown"),
                    )

        except Exception as e:
            logging.error(
                "Error during final log flush for action %s: %s",
                getattr(self, "action_record_id", "unknown"),
                str(e),
            )

    @log_errors(default_return=None, raise_exception=False, log_error=False)
    def get_action_details(self):
        """Get action details from scaling service.

        Returns:
            dict: Action details if successful, None otherwise
        """
        resp, error, message = self.scaling.get_action_details(self.action_record_id)
        if error:
            logging.error(
                "Error getting action details: %s",
                error,
            )
            return None
        return resp

@log_errors(default_return="", raise_exception=False)
|
|
328
|
+
def get_gpu_config(self, action_details):
|
|
329
|
+
"""Get GPU configuration string based on available GPUs.
|
|
330
|
+
|
|
331
|
+
Args:
|
|
332
|
+
action_details (dict): Action details containing GPU requirements
|
|
333
|
+
|
|
334
|
+
Returns:
|
|
335
|
+
str: GPU configuration string
|
|
336
|
+
"""
|
|
337
|
+
action_id = action_details.get("_id", "unknown")
|
|
338
|
+
|
|
339
|
+
# Check if GPU is required
|
|
340
|
+
gpu_required = action_details["actionDetails"].get("gpuRequired", False)
|
|
341
|
+
if not gpu_required:
|
|
342
|
+
logging.info(
|
|
343
|
+
"Action %s does not require GPU - will run on CPU",
|
|
344
|
+
action_id
|
|
345
|
+
)
|
|
346
|
+
return ""
|
|
347
|
+
|
|
348
|
+
# Get required GPU memory for logging
|
|
349
|
+
required_memory = action_details.get("actionDetails", {}).get(
|
|
350
|
+
"expectedResources", {}
|
|
351
|
+
).get("gpuMemory", 0)
|
|
352
|
+
|
|
353
|
+
logging.info(
|
|
354
|
+
"Action %s requires GPU with %d MB memory - selecting GPU(s) with most free memory",
|
|
355
|
+
action_id,
|
|
356
|
+
required_memory
|
|
357
|
+
)
|
|
358
|
+
|
|
359
|
+
try:
|
|
360
|
+
# Get the GPU(s) with most free memory that have sufficient memory
|
|
361
|
+
gpu_indices = get_gpu_with_sufficient_memory_for_action(
|
|
362
|
+
action_details=action_details
|
|
363
|
+
)
|
|
364
|
+
|
|
365
|
+
if gpu_indices:
|
|
366
|
+
gpu_str = ",".join(map(str, gpu_indices))
|
|
367
|
+
logging.info(
|
|
368
|
+
"Action %s: Selected GPU device(s): %s (required memory: %d MB)",
|
|
369
|
+
action_id,
|
|
370
|
+
gpu_str,
|
|
371
|
+
required_memory
|
|
372
|
+
)
|
|
373
|
+
|
|
374
|
+
# Return Docker GPU configuration
|
|
375
|
+
# Format: --gpus "device=0" or --gpus "device=0,1,2"
|
|
376
|
+
return f'--gpus "device={gpu_str}"'
|
|
377
|
+
else:
|
|
378
|
+
logging.warning(
|
|
379
|
+
"Action %s: No GPUs with sufficient memory found (required: %d MB)",
|
|
380
|
+
action_id,
|
|
381
|
+
required_memory
|
|
382
|
+
)
|
|
383
|
+
return ""
|
|
384
|
+
|
|
385
|
+
except ValueError as e:
|
|
386
|
+
logging.error(
|
|
387
|
+
"Action %s: Error selecting GPU - %s",
|
|
388
|
+
action_id,
|
|
389
|
+
str(e)
|
|
390
|
+
)
|
|
391
|
+
return ""
|
|
392
|
+
except Exception as e:
|
|
393
|
+
logging.error(
|
|
394
|
+
"Action %s: Unexpected error in GPU selection - %s",
|
|
395
|
+
action_id,
|
|
396
|
+
str(e)
|
|
397
|
+
)
|
|
398
|
+
return ""
|
|
399
|
+
|
|
400
|
+
@log_errors(default_return="", raise_exception=False)
|
|
401
|
+
def get_base_docker_cmd(
|
|
402
|
+
self,
|
|
403
|
+
work_fs: str = "",
|
|
404
|
+
use_gpu: str = "",
|
|
405
|
+
mount_docker_sock: bool = False,
|
|
406
|
+
action_id: str = "",
|
|
407
|
+
model_key: str = "",
|
|
408
|
+
extra_env_vars: dict = {},
|
|
409
|
+
port_mapping: dict = {},
|
|
410
|
+
network_config: str = "",
|
|
411
|
+
destination_workspace_path: str = "/usr/src/workspace",
|
|
412
|
+
docker_workdir: str = "",
|
|
413
|
+
extra_pkgs: list = [],
|
|
414
|
+
):
|
|
415
|
+
"""Build base Docker command with common options.
|
|
416
|
+
|
|
417
|
+
Args:
|
|
418
|
+
work_fs (str): Work filesystem path
|
|
419
|
+
use_gpu (str): GPU configuration string
|
|
420
|
+
mount_docker_sock (bool): Whether to mount Docker socket
|
|
421
|
+
action_id (str): Action ID
|
|
422
|
+
model_key (str): Model key
|
|
423
|
+
extra_env_vars (dict): Additional environment variables
|
|
424
|
+
port_mapping (dict): Port mappings {host_port: container_port}
|
|
425
|
+
destination_workspace_path (str): Container workspace path
|
|
426
|
+
docker_workdir (str): Docker working directory
|
|
427
|
+
extra_pkgs (list): List of extra packages to install
|
|
428
|
+
Returns:
|
|
429
|
+
str: Base Docker command
|
|
430
|
+
"""
|
|
431
|
+
env = os.environ.get("ENV", "prod")
|
|
432
|
+
env_vars = {
|
|
433
|
+
"ENV": env,
|
|
434
|
+
"MATRICE_SECRET_ACCESS_KEY": self.matrice_secret_access_key,
|
|
435
|
+
"MATRICE_ACCESS_KEY_ID": self.matrice_access_key_id,
|
|
436
|
+
}
|
|
437
|
+
if self.get_hugging_face_token(model_key):
|
|
438
|
+
env_vars["HUGGING_FACE_ACCESS_TOKEN"] = self.get_hugging_face_token(
|
|
439
|
+
model_key
|
|
440
|
+
)
|
|
441
|
+
if extra_env_vars:
|
|
442
|
+
env_vars.update(extra_env_vars)
|
|
443
|
+
|
|
444
|
+
if network_config == "":
|
|
445
|
+
network_config = (
|
|
446
|
+
"--net=host"
|
|
447
|
+
if not port_mapping
|
|
448
|
+
else " ".join(
|
|
449
|
+
f"-p {host}:{container}" for host, container in port_mapping.items()
|
|
450
|
+
)
|
|
451
|
+
)
|
|
452
|
+
|
|
453
|
+
if not docker_workdir:
|
|
454
|
+
if action_id:
|
|
455
|
+
docker_workdir = f"/usr/src/{action_id}"
|
|
456
|
+
else:
|
|
457
|
+
docker_workdir = "."
|
|
458
|
+
volumes = [
|
|
459
|
+
( # Mount workspace if work_fs is provided
|
|
460
|
+
f"-v {work_fs}/workspace:{destination_workspace_path}"
|
|
461
|
+
if work_fs and work_fs not in ["/"]
|
|
462
|
+
else ""
|
|
463
|
+
),
|
|
464
|
+
( # Mount action directory if work_fs and action_id are provided
|
|
465
|
+
f"-v {work_fs}/{action_id}:/usr/src/{action_id}"
|
|
466
|
+
if work_fs and work_fs not in ["/"] and action_id
|
|
467
|
+
else ""
|
|
468
|
+
),
|
|
469
|
+
"-v /var/run/docker.sock:/var/run/docker.sock" if mount_docker_sock else "",
|
|
470
|
+
]
|
|
471
|
+
pypi_index = f"https://{'test.' if env != 'prod' else ''}pypi.org/simple/"
|
|
472
|
+
|
|
473
|
+
pkgs = ["matrice_common", "matrice"]
|
|
474
|
+
pkgs.extend(extra_pkgs)
|
|
475
|
+
if env == 'dev':
|
|
476
|
+
pkgs = [pkg + ">=1.0.0" for pkg in pkgs]
|
|
477
|
+
pip_install_matrice = f"pip install --pre --upgrade --force-reinstall --index-url {pypi_index} {' '.join(pkgs)}"
|
|
478
|
+
else:
|
|
479
|
+
pip_install_matrice = f"pip install --upgrade --force-reinstall --index-url {pypi_index} {' '.join(pkgs)}"
|
|
480
|
+
pip_install_requirements = (
|
|
481
|
+
"if [ -f requirements.txt ]; then pip install -r requirements.txt; fi "
|
|
482
|
+
)
|
|
483
|
+
|
|
484
|
+
# Create export statements for environment variables to ensure they're available in subshells
|
|
485
|
+
env_exports = " && ".join(
|
|
486
|
+
[
|
|
487
|
+
f"export {key}={shlex.quote(str(value))}"
|
|
488
|
+
for key, value in env_vars.items()
|
|
489
|
+
]
|
|
490
|
+
)
|
|
491
|
+
|
|
492
|
+
cmd_parts = [
|
|
493
|
+
f"docker run {use_gpu} ",
|
|
494
|
+
network_config,
|
|
495
|
+
*[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
|
|
496
|
+
*volumes,
|
|
497
|
+
# Container configuration and startup commands
|
|
498
|
+
f"--cidfile ./{self.action_record_id}.cid ",
|
|
499
|
+
f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
|
|
500
|
+
f'/bin/bash -c "cd {docker_workdir} && '
|
|
501
|
+
f"{env_exports} && "
|
|
502
|
+
f"{pip_install_requirements} && "
|
|
503
|
+
f"{pip_install_matrice} && ",
|
|
504
|
+
]
|
|
505
|
+
|
|
506
|
+
# Join all non-empty parts with spaces
|
|
507
|
+
return " ".join(filter(None, cmd_parts))
|
|
508
|
+
|
|
509
|
+
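    # Illustrative only (hypothetical action id, image, and mount paths): with no
    # port_mapping the assembled base command looks roughly like
    #   docker run --net=host -e ENV=prod -e MATRICE_SECRET_ACCESS_KEY=... \
    #     -v /data/workspace:/usr/src/workspace --cidfile ./<action_id>.cid \
    #     --shm-size=30G --pull=always <image> /bin/bash -c "cd <workdir> && \
    #     export ENV=prod && ... && pip install ... &&
    # Note the trailing "&& " and the unbalanced double quote: each caller appends
    # its own entrypoint (e.g. python3 /usr/src/app/main.py <action_id>) plus the
    # closing '"'.
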
@log_errors(default_return="", raise_exception=False)
|
|
510
|
+
def get_hugging_face_token(self, model_key):
|
|
511
|
+
"""Get Hugging Face token for specific model keys.
|
|
512
|
+
|
|
513
|
+
Args:
|
|
514
|
+
model_key (str): Model key to check
|
|
515
|
+
|
|
516
|
+
Returns:
|
|
517
|
+
str: Hugging Face token if available, empty string otherwise
|
|
518
|
+
"""
|
|
519
|
+
hugging_face_token = ""
|
|
520
|
+
if model_key and (
|
|
521
|
+
model_key.startswith("microsoft") or model_key.startswith("timm")
|
|
522
|
+
):
|
|
523
|
+
secret_name = "hugging_face"
|
|
524
|
+
resp, error, message = self.scaling.get_model_secret_keys(secret_name)
|
|
525
|
+
if error is not None:
|
|
526
|
+
logging.error(
|
|
527
|
+
"Error getting Hugging Face token: %s",
|
|
528
|
+
message,
|
|
529
|
+
)
|
|
530
|
+
else:
|
|
531
|
+
hugging_face_token = resp["user_access_token"]
|
|
532
|
+
return hugging_face_token
|
|
533
|
+
|
|
534
|
+
@log_errors(default_return="", raise_exception=False)
|
|
535
|
+
def get_hugging_face_token_for_data_generation(self):
|
|
536
|
+
secret_name = "hugging_face"
|
|
537
|
+
resp, error, message = self.scaling.get_model_secret_keys(secret_name)
|
|
538
|
+
if error is not None:
|
|
539
|
+
logging.error(
|
|
540
|
+
"Error getting Hugging Face token: %s",
|
|
541
|
+
message,
|
|
542
|
+
)
|
|
543
|
+
else:
|
|
544
|
+
hugging_face_token = resp["user_access_token"]
|
|
545
|
+
return hugging_face_token
|
|
546
|
+
|
|
547
|
+
@log_errors(default_return="", raise_exception=False)
|
|
548
|
+
def get_internal_api_key(self, action_id):
|
|
549
|
+
"""Get internal API key for action.
|
|
550
|
+
|
|
551
|
+
Args:
|
|
552
|
+
action_id (str): Action ID
|
|
553
|
+
|
|
554
|
+
Returns:
|
|
555
|
+
str: Internal API key if available, empty string otherwise
|
|
556
|
+
"""
|
|
557
|
+
internal_api_key = ""
|
|
558
|
+
resp, error, message = self.scaling.get_internal_api_key(action_id)
|
|
559
|
+
if error is not None:
|
|
560
|
+
logging.error(
|
|
561
|
+
"Error getting internal api key: %s",
|
|
562
|
+
message,
|
|
563
|
+
)
|
|
564
|
+
else:
|
|
565
|
+
internal_api_key = resp["internal_api_key"]
|
|
566
|
+
return internal_api_key
|
|
567
|
+
|
|
568
|
+
    @log_errors(raise_exception=True)
    def setup_action_requirements(
        self,
        action_details,
        work_fs="",
        model_family="",
        action_id="",
    ):
        """Set up action requirements.

        Args:
            action_details (dict): Action details
            work_fs (str): Work filesystem path
            model_family (str): Model family name
            action_id (str): Action ID

        Raises:
            Exception: If setup fails
        """
        # Get job parameters from action_details
        job_params = action_details.get("jobParams", {})

        # Setup model codebase if model_family is provided
        if model_family:
            # Try to get model codebase URLs from action_details first
            model_codebase_url = job_params.get("model_codebase_url")
            model_requirements_url = job_params.get("model_requirements_url")
            dockerId = job_params.get("_idDocker")

            # Fallback to API calls if not provided in action_details
            if not model_codebase_url:
                model_codebase_url, error, message = self.scaling.get_model_codebase(
                    dockerId
                )
                if error:
                    logging.warning(f"Failed to get model codebase URL: {message}")
                    model_codebase_url = None

            # Handle requirements URL - use from job_params or get from API
            if model_requirements_url:
                model_codebase_requirements_url = model_requirements_url
            else:
                model_codebase_requirements_url, error, message = (
                    self.scaling.get_model_codebase_requirements(dockerId)
                )
                if error:
                    logging.warning(
                        f"Failed to get model codebase requirements URL: {message}"
                    )
                    model_codebase_requirements_url = None

            # Setup workspace if we have the URLs
            if model_codebase_url:
                setup_workspace_and_run_task(
                    work_fs,
                    action_id,
                    model_codebase_url,
                    model_codebase_requirements_url,
                )

        # Setup Docker credentials
        try:
            # Try to get Docker credentials from action_details first
            docker_username = job_params.get("Username")
            docker_password = job_params.get("Password")
            if docker_username and docker_password:
                username = docker_username
                password = docker_password
                logging.info("Using Docker credentials from action_details")
            else:
                # Fallback to API call
                creds, error, message = self.scaling.get_docker_hub_credentials()
                if error:
                    raise Exception(f"Failed to get Docker credentials: {message}")
                username = creds["username"]
                password = creds["password"]
                logging.info("Using Docker credentials from API call")

            if username and password:
                login_cmd = f"docker login -u {shlex.quote(username)} -p {shlex.quote(password)}"
                result = subprocess.run(login_cmd, shell=True, check=False, capture_output=True, text=True, timeout=30)
                if result.returncode != 0:
                    raise Exception(f"Docker login failed with exit code {result.returncode}: {result.stderr}")
                logging.info("Docker login successful")
            else:
                logging.warning(
                    "Docker credentials not available, skipping Docker login"
                )

        except subprocess.TimeoutExpired:
            logging.error("Docker login timed out after 30 seconds")
            raise Exception("Docker login timed out")
        except Exception as err:
            logging.error(
                "Docker login failed: %s",
                str(err),
            )
            raise

        # Setup user access credentials
        try:
            # Try to get access key and secret key from job_params first
            access_key = job_params.get("access_key")
            secret_key = job_params.get("secret_key")

            if access_key and secret_key:
                logging.info("Using access key and secret key from job_params")
                (
                    self.matrice_access_key_id,
                    self.matrice_secret_access_key,
                ) = get_decrypted_access_key_pair(access_key, secret_key)
            else:
                # Fallback to API call
                logging.info(
                    "Access key and secret key not found in job_params, falling back to API call"
                )
                (
                    user_access_key_pair,
                    error,
                    message,
                ) = self.scaling.get_user_access_key_pair(action_details["_idUser"])
                if error:
                    raise Exception(f"Failed to get user access key pair: {message}")
                access_key = user_access_key_pair["access_key"]
                secret_key = user_access_key_pair["secret_key"]
                (
                    self.matrice_access_key_id,
                    self.matrice_secret_access_key,
                ) = get_decrypted_access_key_pair(access_key, secret_key)

        except Exception as err:
            logging.error(
                "Failed to setup credentials: %s",
                str(err),
            )
            raise

    # @log_errors(raise_exception=False)
    # def create_redis_container(self, redis_image=None, redis_password=None):
    #     """Create and start a Redis container using Docker.
    #
    #     Args:
    #         redis_image (str, optional): Redis Docker image to use. Defaults to 'redis:latest'
    #
    #     Returns:
    #         tuple: (container_info, error, message)
    #     """
    #     if redis_image is None:
    #         redis_image = "redis:latest"
    #
    #     network_name = f"redis_network_{int(time.time())}"
    #     subprocess.run(f"docker network create {network_name}", shell=True, check=True)
    #
    #     try:
    #         # Get an available port for Redis
    #         external_port = "6379"
    #
    #         # Generate a unique container name and password
    #         container_name = f"redis_container_{int(time.time())}"
    #
    #         # Build the docker command to create Redis container with password
    #         cmd = (
    #             f"docker run -d "
    #             f"--network {network_name} "
    #             f"--name {container_name} "
    #             f"-p {external_port}:6379 "
    #             f"--restart unless-stopped "
    #             f"{redis_image} "
    #             f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
    #         )
    #
    #         logging.info("Creating Redis container with command: %s", cmd)
    #
    #         # Execute the command
    #         result = subprocess.run(
    #             cmd, shell=True, capture_output=True, text=True, timeout=60
    #         )
    #
    #         if result.returncode == 0:
    #             container_id = result.stdout.strip()
    #             container_info = {
    #                 "container_id": container_id,
    #                 "container_name": container_name,
    #                 "network_name": network_name,
    #                 "external_port": external_port,
    #                 "internal_port": 6379,
    #                 "password": redis_password,
    #                 "image": redis_image,
    #                 "status": "running",
    #             }
    #
    #             logging.info("Redis container created successfully: %s", container_info)
    #             return container_info, None, "Redis container created successfully"
    #         else:
    #             error_message = f"Failed to create Redis container: {result.stderr}"
    #             logging.error(error_message)
    #             return None, "ContainerCreationError", error_message
    #
    #     except subprocess.TimeoutExpired:
    #         error_message = "Timeout while creating Redis container"
    #         logging.error(error_message)
    #         return None, "TimeoutError", error_message
    #     except Exception as e:
    #         error_message = f"Unexpected error creating Redis container: {str(e)}"
    #         logging.error(error_message)
    #         return None, "UnexpectedError", error_message

    @log_errors(raise_exception=False, log_error=False)
    def send_logs_continuously(self):
        """Continuously read and send logs from the log file to the scaling service.

        Enhanced version that tracks log position and handles graceful shutdown.
        """
        last_position = 0
        self._last_log_position = 0  # Track position for final flush

        while not self.stop_thread and os.path.exists(self.log_path):
            try:
                with open(self.log_path, "rb") as log_file:
                    log_file.seek(last_position)
                    new_content = log_file.read()
                    if new_content:
                        try:
                            decoded_content = new_content.decode("utf-8")
                        except UnicodeDecodeError:
                            # Handle invalid UTF-8 bytes by replacing them
                            decoded_content = new_content.decode(
                                "utf-8",
                                errors="replace",
                            )
                        self._send_logs_to_scaling(decoded_content)
                        self._check_cuda(decoded_content)

                    # Update tracked position
                    last_position = log_file.tell()
                    self._last_log_position = last_position

            except Exception as e:
                logging.error(
                    "Error reading logs for action %s: %s",
                    getattr(self, "action_record_id", "unknown"),
                    str(e),
                )

            # Use shorter sleep interval for more responsive log monitoring
            time.sleep(10)  # Reduced from 30 to 10 seconds for better responsiveness

        # Final attempt to send any remaining logs when thread is stopping
        logging.info(
            "Log monitoring thread stopping for action %s, performing final check",
            getattr(self, "action_record_id", "unknown"),
        )

        # One more final read attempt
        try:
            if os.path.exists(self.log_path):
                with open(self.log_path, "rb") as log_file:
                    log_file.seek(last_position)
                    final_content = log_file.read()
                    if final_content:
                        try:
                            decoded_content = final_content.decode("utf-8")
                        except UnicodeDecodeError:
                            decoded_content = final_content.decode(
                                "utf-8", errors="replace"
                            )
                        self._send_logs_to_scaling(decoded_content)
                        self._check_cuda(decoded_content)
                        logging.info(
                            "Sent final %d bytes of logs for action %s",
                            len(final_content),
                            getattr(self, "action_record_id", "unknown"),
                        )
        except Exception as e:
            logging.error(
                "Error in final log read for action %s: %s",
                getattr(self, "action_record_id", "unknown"),
                str(e),
            )

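    # Coordination note: the read offset is mirrored into self._last_log_position
    # so _ensure_final_logs_sent() can resume from where this thread stopped; in
    # the worst case a chunk is sent twice, but nothing is silently dropped.
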
    @log_errors(raise_exception=False, log_error=False)
    def _send_logs_to_scaling(self, log_content):
        """Send logs to the scaling service.

        Args:
            log_content (str): Log content to send
        """
        _, error, message = self.scaling.update_action_docker_logs(
            action_record_id=self.action_record_id,
            log_content=log_content,
        )
        if error:
            logging.error(
                "Error from update_action_docker_logs: %s",
                error,
            )

    @log_errors(raise_exception=False, log_error=False)
    def _check_cuda(self, log_content):
        """Check for CUDA out of memory errors in logs and update action status.

        Args:
            log_content (str): Log content to check
        """
        if "CUDA error: out of memory" in log_content:
            action_details = self.get_action_details()
            if not action_details:
                return
            self.scaling.update_action(
                id=self.action_record_id,
                step_code="ERROR",
                action_type=action_details["action"],
                status="ERROR",
                status_description="CUDA error: out of memory",
                service="bg-job-scheduler",
                job_params=action_details["jobParams"],
            )

    @log_errors(raise_exception=True)
    def start_process(self, cmd, log_name):
        """Start the process and initialize logging.

        Args:
            cmd (str): Command to execute
            log_name (str): Name for log file

        Raises:
            Exception: If process fails to start
        """
        self.cmd = cmd
        self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"

        with open(self.log_path, "wb") as out:
            self.process = subprocess.Popen(
                shlex.split(self.cmd),
                stdout=out,
                stderr=out,
                env={**os.environ},
                start_new_session=True,
            )

        self.container_id = None

        cid_file_path = f"./{self.action_record_id}.cid"
        max_retries = 5
        retry_delay = 1  # seconds
        for attempt in range(max_retries):
            try:
                with open(cid_file_path, "r") as cid_file:
                    container_id = cid_file.read().strip()
                self.container_id = container_id
                logging.info(
                    "Started process for action %s with container ID: %s",
                    self.action_record_id,
                    self.container_id,
                )
                break
            except FileNotFoundError:
                logging.warning(
                    "CID file not found for action %s, attempt %d/%d",
                    self.action_record_id,
                    attempt + 1,
                    max_retries,
                )
                time.sleep(retry_delay)
            except Exception as e:
                logging.error(
                    "Error reading CID file for action %s: %s",
                    self.action_record_id,
                    str(e),
                )
                time.sleep(retry_delay)
        else:
            logging.error(
                "Failed to read CID file for action %s after %d attempts",
                self.action_record_id,
                max_retries,
            )
            raise Exception("Failed to start process: CID file not found")

        # Report container id to scaling service
        self.scaling.update_action_container_id(
            action_record_id=self.action_record_id,
            container_id=self.container_id,
        )

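    # CID handshake: get_base_docker_cmd() adds `--cidfile ./<action_id>.cid`, so
    # once Docker creates the container it writes the full 64-character container
    # id to that file; the retry loop above polls for it while `docker run` keeps
    # running in the foreground under the Popen started here.
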
    @log_errors(raise_exception=False)
    def start_logger(self):
        """Start the log monitoring thread."""
        self.log_thread = threading.Thread(
            target=self.send_logs_continuously,
            daemon=False,  # CRITICAL: Make thread non-daemon to ensure it completes
        )
        self.log_thread.start()

    @log_errors(raise_exception=False)
    def start(self, cmd: str = "", log_name: str = ""):
        """Start the process and log monitoring thread.

        Args:
            cmd (str): Command to execute
            log_name (str): Name for log file
        """
        self.start_process(cmd, log_name)
        self.start_logger()
        self.scaling.update_status(
            self.action_record_id,
            self.action_type,
            "bg-job-scheduler",
            "DKR_CMD",
            "OK",
            f"Start docker container with command: {cmd.replace(self.matrice_access_key_id, 'MATRICE_ACCESS_KEY_ID').replace(self.matrice_secret_access_key, 'MATRICE_SECRET_ACCESS_KEY')}",
        )

    @log_errors(raise_exception=False, log_error=False)
    def stop(self):
        """Stop the process and log monitoring thread.

        Enhanced version that ensures proper cleanup sequencing and log completion.
        """
        logging.info("Stopping action %s", getattr(self, "action_record_id", "unknown"))

        # Step 1: Signal log thread to stop
        self.stop_thread = True

        # Step 2: Stop the process
        try:
            if self.process:
                logging.info(
                    "Terminating process for action %s",
                    getattr(self, "action_record_id", "unknown"),
                )
                os.killpg(
                    os.getpgid(self.process.pid),
                    signal.SIGTERM,
                )
                # Give process time to terminate gracefully
                try:
                    self.process.wait(timeout=15)
                    logging.info(
                        "Process terminated gracefully for action %s",
                        getattr(self, "action_record_id", "unknown"),
                    )
                except subprocess.TimeoutExpired:
                    logging.warning(
                        "Process didn't terminate gracefully, forcing kill for action %s",
                        getattr(self, "action_record_id", "unknown"),
                    )
                    try:
                        os.killpg(os.getpgid(self.process.pid), signal.SIGKILL)
                        self.process.wait(timeout=5)
                    except Exception as kill_err:
                        logging.error(
                            "Error force-killing process for action %s: %s",
                            getattr(self, "action_record_id", "unknown"),
                            str(kill_err),
                        )
        except Exception as proc_err:
            logging.error(
                "Error stopping process for action %s: %s",
                getattr(self, "action_record_id", "unknown"),
                str(proc_err),
            )

        # Step 3: Ensure final logs are sent
        self._ensure_final_logs_sent()

        # Step 4: Wait for log thread to complete
        if self.log_thread and self.log_thread.is_alive():
            logging.info(
                "Waiting for log thread to complete for action %s",
                getattr(self, "action_record_id", "unknown"),
            )
            try:
                self.log_thread.join(
                    timeout=30
                )  # Wait up to 30 seconds for logs to complete
                if self.log_thread.is_alive():
                    logging.warning(
                        "Log thread didn't complete within timeout for action %s",
                        getattr(self, "action_record_id", "unknown"),
                    )
                else:
                    logging.info(
                        "Log thread completed successfully for action %s",
                        getattr(self, "action_record_id", "unknown"),
                    )
            except Exception as thread_err:
                logging.error(
                    "Error waiting for log thread for action %s: %s",
                    getattr(self, "action_record_id", "unknown"),
                    str(thread_err),
                )

    @log_errors(raise_exception=False)
    def execute(self):
        """Execute the task."""
        self.task(self)


@log_errors(raise_exception=False)
def data_preparation_execute(
    self: ActionInstance,
):
    """Execute data preparation task."""
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs, model_family="")
    action = {"jobParams": action_details["jobParams"]}
    dataset_id_version = (
        action_details["jobParams"]["dataset_id"]
        + action_details["jobParams"]["dataset_version"]
    )
    action["jobParams"].update(
        {
            "dataset_host_path_map": {dataset_id_version: f"{work_fs}/workspace"},
            "dataset_local_path_map": {dataset_id_version: "/usr/src/app/workspace"},
            "host_file_system": work_fs,
        }
    )
    self.scaling.update_action(
        id=self.action_record_id,
        step_code="DCK_LNCH",
        action_type=action_details["action"],
        status=action_details["status"],
        sub_action=action_details["subAction"],
        status_description="Job is assigned to docker",
        service="bg-job-scheduler",
        job_params=action["jobParams"],
    )
    if action["jobParams"].get("model_train_docker"):
        logging.info("Pulling the docker image")
        pull_cmd = f"docker pull {action['jobParams']['model_train_docker']}"
        process = subprocess.Popen(
            pull_cmd,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        logging.info(
            "Started pulling Docker image with PID: %s",
            process.pid,
        )
    cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "data_preparation_log")


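# Illustrative only: the final `cmd` is the base command from get_base_docker_cmd()
# with the task entrypoint appended, roughly (abridged, hypothetical paths):
#   docker run ... /bin/bash -c "cd /usr/src/app/workspace && export ... &&
#     pip install ... && python3 /usr/src/app/data_preparation.py <action_id> "
# The trailing double quote closes the `/bin/bash -c "` opened by the base command.

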
@log_errors(raise_exception=False)
def data_processing_execute(self: ActionInstance):
    """Execute data processing task."""
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs, model_family="")
    action = {"jobParams": action_details["jobParams"]}
    action["jobParams"].update(
        {
            "dp_dv_host_paths": [f"{work_fs}/workspace"],
            "dp_dv_local_paths": ["/usr/src/app/workspace"],
        }
    )
    self.scaling.update_action(
        id=self.action_record_id,
        step_code="DCK_LNCH",
        action_type=action_details["action"],
        status="ACK",
        status_description="Job is assigned to docker",
        service="bg-job-scheduler",
        job_params=action["jobParams"],
    )
    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
    logging.info("cmd: %s", cmd)
    self.start(cmd, "data_processing_log")


@log_errors(raise_exception=False)
def data_split_execute(self: ActionInstance):
    """Execute data split task."""
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs, model_family="")
    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
    logging.info("cmd: %s", cmd)
    self.start(cmd, "data_split")


@log_errors(raise_exception=False)
def dataset_annotation_execute(
    self: ActionInstance,
):
    """Execute dataset annotation task."""
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs)
    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
    logging.info("cmd: %s", cmd)
    self.start(cmd, "dataset_annotation")


@log_errors(raise_exception=False)
def dataset_augmentation_execute(
    self: ActionInstance,
):
    """Execute dataset augmentation task."""
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs)
    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
    logging.info("cmd: %s", cmd)
    self.start(cmd, "dataset_augmentation")


@log_errors(raise_exception=False)
def augmentation_server_creation_execute(
    self: ActionInstance,
):
    """Create Augmentation Server."""
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    external_port = self.scaling.get_open_port()
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs)
    cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
    logging.info("cmd: %s", cmd)
    self.start(cmd, "augmentation_setup")


@log_errors(raise_exception=False)
def database_setup_execute(self: ActionInstance):
    """Create and set up the database for the facial recognition server.

    MongoDB runs on port 27020:27017 (localhost only with --net=host).
    Qdrant runs on port 6334 (localhost only with --net=host).
    """
    action_details = self.get_action_details()
    if not action_details:
        return
    image = action_details["actionDetails"].get("docker")

    self.setup_action_requirements(action_details)

    project_id = action_details["_idProject"]

    if action_details["actionDetails"].get("containerId"):
        logging.info(
            "Using existing container ID for inference tracker: %s",
            action_details["actionDetails"]["containerId"],
        )
        self.docker_container = action_details["actionDetails"]["containerId"]
        cmd = "docker restart " + self.docker_container
        self.start(cmd, "qdrant_setup")

        # Restart the Qdrant container as well
        qdrant_cmd = "docker restart qdrant"
        self.start(qdrant_cmd, "qdrant_setup")

        return

    # MongoDB container with --net=host (Port: 27020:27017)
    cmd = (
        f"docker run --pull=always --net=host "
        f"--name mongodbdatabase "
        f"-v matrice_myvol:/matrice_data "
        f"--cidfile ./{self.action_record_id}.cid "
        f"-e ACTION_RECORD_ID={self.action_record_id} "
        f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
        f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
        f"-e PROJECT_ID={project_id} "
        f'-e ENV="{os.environ.get("ENV", "prod")}" '
        f"{image} "
    )
    logging.info("Starting MongoDB container (Port: 27020:27017): %s", cmd)

    # Qdrant container with --net=host (Port: 6334)
    qdrant_cmd = (
        f"docker run --pull=always --net=host "
        f"--name qdrant "
        f"-v matrice_myvol:/matrice_data "
        f"{'qdrant/qdrant:latest'} "
    )
    logging.info("Starting Qdrant container (Port: 6334): %s", qdrant_cmd)

    # Run the MongoDB container
    self.start(cmd, "database_setup")

    # Run the Qdrant container
    self.start(qdrant_cmd, "qdrant_setup")


@log_errors(raise_exception=False)
def facial_recognition_setup_execute(self: ActionInstance):
    """Create and set up the facial recognition worker server.

    The facial recognition worker runs on port 8081 (localhost only with --net=host).
    """
    action_details = self.get_action_details()

    if not action_details:
        return
    image = action_details["actionDetails"].get("docker")

    self.setup_action_requirements(action_details)

    if action_details["actionDetails"].get("containerId"):
        logging.info(
            "Using existing container ID for facial recognition worker: %s",
            action_details["actionDetails"]["containerId"],
        )
        self.docker_container = action_details["actionDetails"]["containerId"]
        cmd = "docker restart " + self.docker_container
        self.start(cmd, "facial_recognition_setup")
        return

    # Facial recognition worker container with --net=host (Port: 8081)
    worker_cmd = (
        f"docker run -d --pull=always --net=host "
        f"--name worker "
        f"--cidfile ./{self.action_record_id}.cid "
        f"-v matrice_myvol:/matrice_data "
        f'-e ENV="{os.environ.get("ENV", "prod")}" '
        f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
        f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
        f'-e ACTION_ID="{self.action_record_id}" '
        f"{image}"
    )
    logging.info("Starting facial recognition worker (Port: 8081): %s", worker_cmd)

    # Run the worker container
    self.start(worker_cmd, "facial_recognition_setup")


@log_errors(raise_exception=False)
def lpr_setup_execute(self: ActionInstance):
    """Create and set up the license plate recognition server.

    The LPR worker runs on port 8082 (localhost only with --net=host).
    """
    action_details = self.get_action_details()

    if not action_details:
        return
    image = self.docker_container

    self.setup_action_requirements(action_details)

    if action_details["actionDetails"].get("containerId"):
        logging.info(
            "Using existing container ID for LPR worker: %s",
            action_details["actionDetails"]["containerId"],
        )
        self.docker_container = action_details["actionDetails"]["containerId"]
        cmd = "docker restart " + self.docker_container
        self.start(cmd, "lpr_setup")
        return

    # LPR worker container with --net=host (Port: 8082)
    worker_cmd = (
        f"docker run -d --net=host --pull=always "
        f"--name lpr-worker "
        f"--cidfile ./{self.action_record_id}.cid "
        f"-v matrice_myvol:/matrice_data "
        f'-e ENV="{os.environ.get("ENV", "prod")}" '
        f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
        f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
        f'-e ACTION_ID="{self.action_record_id}" '
        f"-e PORT=8082 "
        f"{image}"
    )
    logging.info("Starting LPR worker (Port: 8082): %s", worker_cmd)

    # Run the worker container
    self.start(worker_cmd, "lpr_setup")


@log_errors(raise_exception=False)
def inference_ws_server_execute(self: ActionInstance):
    """
    Creates and starts the inference pipeline.
    The inference WebSocket server runs on port 8102 (localhost only with --net=host).
    """
    action_details = self.get_action_details()

    if not action_details:
        return
    image = action_details["actionDetails"].get("docker")

    self.setup_action_requirements(action_details)

    # Get the best IP and network configuration for port 8102
    ws_host, use_host_network = get_best_service_ip_and_network(8102)

    # Store ws_host in an environment variable for use by other actions (e.g., fe_fs_streaming)
    if not os.environ.get("INFERENCE_WS_HOST"):
        os.environ["INFERENCE_WS_HOST"] = ws_host

    logging.info(
        "Inference WebSocket server will use IP: %s on port 8102 (use_host_network=%s)",
        ws_host,
        use_host_network,
    )

    if action_details["actionDetails"].get("containerId"):
        logging.info(
            "Using existing container ID for inference WebSocket server: %s",
            action_details["actionDetails"]["containerId"],
        )
        self.docker_container = action_details["actionDetails"]["containerId"]
        cmd = "docker restart " + self.docker_container
        self.start(cmd, "inference_ws_server")
        return

    # Inference WebSocket server with --net=host (Port: 8102)
    worker_cmd = (
        f"docker run -d --pull=always --net=host "
        f"--name inference "
        f"--cidfile ./{self.action_record_id}.cid "
        f'-e ENV="{os.environ.get("ENV", "prod")}" '
        f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
        f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
        f"{image} "
        f"./app "
        f"{self.action_record_id} "
    )
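    # "./app <action_id>" is passed as the container command; this assumes the
    # image's entrypoint (or default shell) executes ./app with the action ID as
    # its first argument.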
    logging.info("Starting inference WebSocket server (Port: 8102): %s", worker_cmd)

    # Run the docker command
    self.start(worker_cmd, "inference_ws_server")

@log_errors(raise_exception=False)
def fe_fs_streaming_execute(self: ActionInstance):
    """
    Creates and sets up the frontend for fs streaming.
    The frontend streaming service runs on port 3000 (localhost only with --net=host).
    """
    action_details = self.get_action_details()

    if not action_details:
        return
    image = action_details["actionDetails"].get("docker")

    self.setup_action_requirements(action_details)

    # Get the ws_host from the environment variable set by inference_ws_server_execute
    ws_host = os.environ.get("INFERENCE_WS_HOST", "localhost")
    ws_url = f"{ws_host}:8102"
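    # If inference_ws_server_execute has not run in this process, INFERENCE_WS_HOST
    # is unset and this falls back to localhost:8102, which should still work when
    # both containers share the host network (--net=host) on the same machine.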

    logging.info("Frontend streaming will connect to WebSocket at: %s", ws_url)

    if action_details["actionDetails"].get("containerId"):
        logging.info(
            "Using existing container ID for frontend streaming: %s",
            action_details["actionDetails"]["containerId"],
        )
        self.docker_container = action_details["actionDetails"]["containerId"]
        cmd = "docker restart " + self.docker_container
        self.start(cmd, "fe_fs_streaming")
        return

    # Frontend streaming with --net=host (Port: 3000)
    worker_cmd = (
        f"docker run -d --pull=always --net=host "
        f"--name fe_streaming "
        f"--cidfile ./{self.action_record_id}.cid "
        f"-v matrice_myvol:/matrice_data "
        f'-e ENV="{os.environ.get("ENV", "prod")}" '
        f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
        f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
        f"-e PORT=3000 "
        f'-e WS_HOST="{ws_url}" '
        f"{image}"
    )
    logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)

    # Run the docker command
    self.start(worker_cmd, "fe_fs_streaming")

@log_errors(raise_exception=False)
def fe_analytics_service_execute(self: ActionInstance):
    """
    Creates and sets up the frontend analytics service.
    The frontend analytics service runs on port 3001 (localhost only with --net=host).
    """
    action_details = self.get_action_details()

    if not action_details:
        return
    image = action_details["actionDetails"].get("docker")

    self.setup_action_requirements(action_details)

    project_id = action_details["_idProject"]

    if action_details["actionDetails"].get("containerId"):
        logging.info(
            "Using existing container ID for frontend analytics service: %s",
            action_details["actionDetails"]["containerId"],
        )
        self.docker_container = action_details["actionDetails"]["containerId"]
        cmd = "docker restart " + self.docker_container
        self.start(cmd, "fe_analytics_service")
        return

    # Frontend analytics service with --net=host (Port: 3001)
    worker_cmd = (
        f"docker run -d --pull=always --net=host "
        f"--name fe-analytics "
        f"--cidfile ./{self.action_record_id}.cid "
        f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
        f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
        f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
        f'-e ACTION_ID="{self.action_record_id}" '
        f"-e PORT=3001 "
        f'-e PROJECT_ID="{project_id}" '
        f"{image}"
    )
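    # NEXT_PUBLIC_DEPLOYMENT_ENV follows the Next.js convention for publicly
    # exposed variables, which suggests (but does not guarantee) that this
    # frontend is a Next.js app.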
    logging.info("Starting frontend analytics service (Port: 3001): %s", worker_cmd)

    # Run the docker command
    self.start(worker_cmd, "fe_analytics_service")

@log_errors(raise_exception=False)
def synthetic_dataset_generation_execute(self: ActionInstance):
    """Execute synthetic dataset generation task."""
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs)
    extra_env_vars = {}
    hf_token = self.get_hugging_face_token_for_data_generation()
    if hf_token:
        extra_env_vars["HUGGING_FACE_ACCESS_TOKEN"] = hf_token
    else:
        return
    use_gpu = self.get_gpu_config(action_details)
    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "dataset_generation")

@log_errors(raise_exception=False)
def synthetic_data_setup_execute(self: ActionInstance):
    """Execute synthetic data setup task."""
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    external_port = self.scaling.get_open_port()
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs)
    extra_env_vars = {}
    hf_token = self.get_hugging_face_token_for_data_generation()
    if hf_token:
        extra_env_vars["HUGGING_FACE_ACCESS_TOKEN"] = hf_token
    else:
        return
    use_gpu = self.get_gpu_config(action_details)
    cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "synthetic_data_setup")

@log_errors(raise_exception=False)
def redis_setup_execute(self: ActionInstance):
    """
    Creates and starts a Redis container using Docker.
    Redis runs on port 6379 (localhost only with --net=host).
    """
    work_fs = get_max_file_system()

    action_details = self.get_action_details()
    if not action_details:
        return
    action_id = action_details["_id"]

    redis_password = action_details["jobParams"].get(
        "password", f"redis_pass_{int(time.time())}"
    )

    # Initialize the redis container
    self.setup_action_requirements(
        action_details,
        work_fs,
        model_family="",
        action_id=action_id,
    )

    # Get the best IP for Redis (port 6379)
    redis_host, _ = get_best_service_ip_and_network(6379)

    logging.info("Redis will use IP: %s on port 6379", redis_host)

    redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")

    if action_details["actionDetails"].get("containerId"):
        logging.info(
            "Using existing container ID for redis management: %s",
            action_details["actionDetails"]["containerId"],
        )
        self.docker_container = action_details["actionDetails"]["containerId"]
        cmd = "docker restart " + self.docker_container
        self.start(cmd, "redis_setup")

        # Restart the Redis container as well
        redis_restart_cmd = "docker restart redis_container"
        self.start(redis_restart_cmd, "redis")

        return

    # Redis container with --net=host (Port: 6379)
    redis_cmd = (
        f"docker run -d --net=host "
        f"--name redis_container "
        f"--restart unless-stopped "
        f"{redis_image} "
        f"redis-server --bind 0.0.0.0 "
        f"--appendonly no "
        f'--save "" '
        f"--maxmemory 30gb "
        f"--maxmemory-policy allkeys-lru "
        f"--io-threads 4 "
        f"--io-threads-do-reads yes "
        f"--stream-node-max-bytes 8192 "
        f"--stream-node-max-entries 1000 "
        f"--hz 100 "
        f"--tcp-backlog 2048 "
        f"--timeout 0 "
        f"--lazyfree-lazy-eviction yes "
        f"--lazyfree-lazy-expire yes "
        f"--lazyfree-lazy-server-del yes "
        f"--activedefrag yes "
        f"--requirepass {redis_password}"
    )
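    # The flags above configure Redis as a non-persistent cache: persistence is
    # disabled (--appendonly no, --save ""), memory is capped at 30 GB with LRU
    # eviction, and the threading/stream limits are tuned for stream-heavy use.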
    logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)

    # Start the Redis container first
    redis_process = subprocess.Popen(
        redis_cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    logging.info("Redis container start command issued on %s:6379", redis_host)

    # Wait for Redis to be ready
    time.sleep(5)

    env_vars = {
        "REDIS_URL": f"{redis_host}:6379",
        "REDIS_PASSWORD": redis_password,
    }

    # bg-redis management container with --net=host (Port: 8082)
    cmd = (
        f"docker run --net=host "
        f"--cidfile ./{self.action_record_id}.cid "
        f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
        f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
        f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
        f"-e MATRICE_SECRET_ACCESS_KEY={shlex.quote(self.matrice_secret_access_key)} "
        f"-e ENV={shlex.quote(os.environ.get('ENV', 'prod'))} "
        f"-v /var/run/docker.sock:/var/run/docker.sock "
        f"--shm-size=30G --pull=always "
        f"{self.docker_container} "
        f"{self.action_record_id} "
    )
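    # shlex.quote() protects the generated password and keys from shell
    # word-splitting here; the f-string interpolation used for the other worker
    # commands relies on embedded double quotes instead.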

    logging.info("Starting bg-redis management (Port: 8082) with REDIS_URL=%s: %s", env_vars["REDIS_URL"], cmd)

    self.start(cmd, "redis_setup")

@log_errors(raise_exception=False)
def deploy_aggregator_execute(
    self: ActionInstance,
):
    """Execute deploy aggregator task."""
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs)
    cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
    logging.info("cmd: %s", cmd)
    self.start(cmd, "deploy_aggregator")

@log_errors(raise_exception=False)
def model_deploy_execute(self: ActionInstance):
    """Execute model deployment task."""
    external_port = self.scaling.get_open_port()
    internal_port = self.scaling.get_open_port()
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    if not action_details:
        return
    action_id = action_details["_id"]
    model_family = action_details["actionDetails"]["modelFamily"]

    # Get the service ID to track deployments
    service_id = action_details.get("_idService")

    self.setup_action_requirements(
        action_details,
        work_fs,
        model_family=model_family,
        action_id=action_id,
    )

    # Check if this is the first deployment for this service
    is_first_deployment = ActionInstance.is_first_deployment_for_service(service_id)

    # Get GPU configuration (uses utility function with fail-safe fallback)
    use_gpu = get_gpu_config_for_deployment(action_details, is_first_deployment)

    logging.info(
        "Action %s: Model deployment GPU config: %s (first_deployment=%s)",
        action_id,
        use_gpu if use_gpu else "CPU-only",
        is_first_deployment,
    )

    # Get or create TRITON_PORTS (uses utility method)
    triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)

    extra_env_vars = {
        "INTERNAL_PORT": internal_port,
        "TRITON_PORTS": triton_ports,
    }
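    # Replicas of the same service (keyed by _idService) reuse one TRITON_PORTS
    # value, so only the first deployment allocates ports and later replicas
    # attach to the same Triton endpoints; the value is assumed to be a
    # comma-separated port list.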

    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "deploy_log")

@log_errors(raise_exception=False)
def model_train_execute(self: ActionInstance):
    """Execute model training task."""
    action_details = self.get_action_details()
    if not action_details:
        return
    action_id = action_details["_id"]
    use_gpu = self.get_gpu_config(action_details)
    work_fs = action_details["jobParams"]["host_file_system"]
    model_key = action_details["actionDetails"]["modelKey"]
    model_family = action_details["actionDetails"]["modelFamily"]
    self.setup_action_requirements(
        action_details,
        work_fs,
        model_family=model_family,
        action_id=action_id,
    )

    if action_details["actionDetails"].get("containerId"):
        logging.info(
            "Using existing container ID for training: %s",
            action_details["actionDetails"]["containerId"],
        )
        self.docker_container = action_details["actionDetails"]["containerId"]
        cmd = "docker restart " + self.docker_container
        self.start(cmd, "train_log")
        return

    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "train_log")

@log_errors(raise_exception=False)
def model_eval_execute(self: ActionInstance):
    """Execute model evaluation task."""
    action_details = self.get_action_details()
    if not action_details:
        return
    action_id = action_details["_id"]
    work_fs = action_details["jobParams"]["host_file_system"]
    model_family = action_details["actionDetails"]["modelFamily"]
    use_gpu = self.get_gpu_config(action_details)
    self.setup_action_requirements(
        action_details,
        work_fs,
        model_family=model_family,
        action_id=action_id,
    )
    if action_details["actionDetails"].get("containerId"):
        logging.info(
            "Using existing container ID for evaluation: %s",
            action_details["actionDetails"]["containerId"],
        )
        self.docker_container = action_details["actionDetails"]["containerId"]
        cmd = "docker restart " + self.docker_container
        self.start(cmd, "eval_log")
        return

    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "eval_log")

@log_errors(raise_exception=False)
def model_export_execute(self: ActionInstance):
    """Execute model export task."""
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    if not action_details:
        return
    action_id = action_details["_id"]
    if "host_file_system" in action_details["jobParams"]:
        work_fs = action_details["jobParams"]["host_file_system"]
        logging.info("host_file_system: %s", work_fs)
    use_gpu = self.get_gpu_config(action_details)
    model_family = action_details["actionDetails"]["modelFamily"]
    self.setup_action_requirements(
        action_details,
        work_fs,
        model_family=model_family,
        action_id=action_id,
    )
    if action_details["actionDetails"].get("containerId"):
        logging.info(
            "Using existing container ID for export: %s",
            action_details["actionDetails"]["containerId"],
        )
        self.docker_container = action_details["actionDetails"]["containerId"]
        cmd = "docker restart " + self.docker_container
        self.start(cmd, "export_log")
        return

    cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "export_log")

@log_errors(raise_exception=False)
def image_build_execute(self: ActionInstance):
    """Execute image building task."""
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details)
    model_family_id = action_details["_idService"]
    action_id = action_details["_id"]
    internal_api_key = self.get_internal_api_key(action_id)
    extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
    cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "image_build_log")

@log_errors(raise_exception=False)
def resource_clone_execute(self: ActionInstance):
    """Execute resource clone task."""
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details)
    cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "resource_clone")

@log_errors(raise_exception=False)
def streaming_gateway_execute(self: ActionInstance):
    """Execute streaming gateway task."""
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details)
    if not self.docker_container:
        self.docker_container = (
            f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
        )
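    # Falls back to the environment-tagged gateway image (e.g. the "prod" tag)
    # when no explicit image was supplied for this action.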
    if action_details["actionDetails"].get("containerId"):
        logging.info(
            "Using existing container ID for streaming gateway: %s",
            action_details["actionDetails"]["containerId"],
        )
        self.docker_container = action_details["actionDetails"]["containerId"]
        cmd = "docker restart " + self.docker_container
        self.start(cmd, "streaming_gateway")
        return

    cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "streaming_gateway")

@log_errors(raise_exception=False)
def kafka_setup_execute(self: ActionInstance):
    """
    Execute the Kafka server task.
    Kafka runs on ports 9092 (SASL_PLAINTEXT) and 9093 (CONTROLLER) - localhost only with --net=host.
    """
    action_details = self.get_action_details()
    if not action_details:
        return
    host_port = self.scaling.get_open_port()
    host_ip = (
        urllib.request.urlopen("https://ident.me", timeout=10).read().decode("utf8")
    )
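    # ident.me returns this machine's public IP; it is used below for
    # KAFKA_ADVERTISED_LISTENERS so external clients connect via the public
    # address, while the broker itself binds to 0.0.0.0.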
    # Setup credentials
    self.setup_action_requirements(action_details)

    # Get Docker disk usage to calculate log retention
    from matrice_compute.instance_utils import get_docker_disk_space_usage

    docker_disk_usage = get_docker_disk_space_usage()
    log_retention_bytes = 0
    if docker_disk_usage:
        # Use 90% of the available Docker disk space, converted from GB to bytes
        available_disk_gb = docker_disk_usage["available"]
        log_retention_bytes = int(
            available_disk_gb * 0.9 * 1024 * 1024 * 1024
        )
        logging.info(
            "Kafka log retention set to %d bytes (90%% of %f GB Docker disk)",
            log_retention_bytes,
            available_disk_gb,
        )
    else:
        # Fallback if Docker disk usage cannot be determined: 500GB default
        log_retention_bytes = 500 * 1024 * 1024 * 1024
        logging.warning(
            "Could not determine Docker disk usage, using default 500GB log retention"
        )
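    # Worked example: with 100 GB available, retention = int(100 * 0.9 * 1024**3)
    # = 96,636,764,160 bytes, and KAFKA_LOG_SEGMENT_BYTES below becomes
    # min(1 GiB, ~9.66 GB) = 1 GiB.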

    # Prepare environment variables for Kafka
    env = os.environ.get("ENV", "prod")
    env_vars = {
        "ENV": env,
        "MATRICE_SECRET_ACCESS_KEY": self.matrice_secret_access_key,
        "MATRICE_ACCESS_KEY_ID": self.matrice_access_key_id,
        "KAFKA_NODE_ID": 1,
        "KAFKA_PROCESS_ROLES": "broker,controller",
        "KAFKA_LISTENERS": "SASL_PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093",
        "KAFKA_ADVERTISED_LISTENERS": f"SASL_PLAINTEXT://{host_ip}:{host_port}",
        "KAFKA_LISTENER_SECURITY_PROTOCOL_MAP": "CONTROLLER:PLAINTEXT,SASL_PLAINTEXT:SASL_PLAINTEXT",
        "KAFKA_CONTROLLER_LISTENER_NAMES": "CONTROLLER",
        "KAFKA_CONTROLLER_QUORUM_VOTERS": "1@localhost:9093",
        "KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR": 1,
        "KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR": 1,
        "KAFKA_TRANSACTION_STATE_LOG_MIN_ISR": 1,
        "KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS": 0,
        "KAFKA_NUM_PARTITIONS": 5,
        "KAFKA_SASL_ENABLED_MECHANISMS": "SCRAM-SHA-256",
        "KAFKA_SASL_MECHANISM_INTER_BROKER_PROTOCOL": "SCRAM-SHA-256",
        "KAFKA_INTER_BROKER_LISTENER_NAME": "SASL_PLAINTEXT",
        "KAFKA_MESSAGE_MAX_BYTES": 25000000,
        "KAFKA_HEAP_OPTS": "-Xms2G -Xmx8G",
        "KAFKA_NUM_NETWORK_THREADS": 6,
        "KAFKA_NUM_IO_THREADS": 8,
        "KAFKA_REPLICA_FETCH_MAX_BYTES": 25000000,
        "KAFKA_FETCH_MESSAGE_MAX_BYTES": 25000000,
        "KAFKA_REPLICA_FETCH_RESPONSE_MAX_BYTES": 25000000,
        # Log retention settings based on Docker disk space
        "KAFKA_LOG_RETENTION_BYTES": log_retention_bytes,
        "KAFKA_LOG_SEGMENT_BYTES": min(
            1073741824, log_retention_bytes // 10
        ),  # 1GB or 10% of retention, whichever is smaller
    }

    # Build environment variable command parts
    env_args = " ".join(
        [f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()]
    )
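    # env_args flattens the dict into repeated -e flags, e.g. (illustrative):
    #   -e ENV=prod -e KAFKA_NODE_ID=1 -e KAFKA_PROCESS_ROLES=broker,controller ...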

    # Build the docker command with --net=host
    pypi_index = f"https://{'test.' if env != 'prod' else ''}pypi.org/simple/"

    if env == 'dev':
        pypi_index = "https://test.pypi.org/simple/ --pre"
        pkgs = "matrice_common>=1.0.0 matrice>=1.0.0"
    else:
        pkgs = "matrice_common matrice"

    if action_details["actionDetails"].get("containerId"):
        logging.info(
            "Using existing container ID for Kafka: %s",
            action_details["actionDetails"]["containerId"],
        )
        self.docker_container = action_details["actionDetails"]["containerId"]
        cmd = "docker restart " + self.docker_container
        self.start(cmd, "kafka_setup")
        return

    # Kafka container with --net=host (Ports: 9092, 9093)
    cmd = (
        f"docker run --net=host "
        f"{env_args} "
        f"--shm-size=30G --pull=always "
        f'aiforeveryone/matrice-kafka:latest /bin/bash -c "'
        f"cd /opt/kafka/bin && "
        f"source venv/bin/activate && "
        f"/opt/kafka/bin/startup.sh & "
        f"if [ -f requirements.txt ]; then venv/bin/python3 -m pip install -r requirements.txt; fi && "
        f"venv/bin/python3 -m pip install --upgrade --force-reinstall --index-url {pypi_index} {pkgs} && "
        f"sleep 20 && "
        f'venv/bin/python3 main.py {self.action_record_id} {host_port}"'
    )

    logging.info("Starting Kafka container (Ports: 9092, 9093): %s", cmd)
    self.start(cmd, "kafka_setup")

@log_errors(raise_exception=False)
def inference_tracker_setup_execute(self: ActionInstance):
    """
    Creates and starts the inference tracker.
    The inference tracker runs on port 8110 (localhost only with --net=host).
    """
    action_details = self.get_action_details()
    if not action_details:
        return

    image = self.docker_container

    self.setup_action_requirements(action_details)

    if action_details["actionDetails"].get("containerId"):
        logging.info(
            "Using existing container ID for inference tracker: %s",
            action_details["actionDetails"]["containerId"],
        )
        self.docker_container = action_details["actionDetails"]["containerId"]
        cmd = "docker restart " + self.docker_container
        self.start(cmd, "inference_tracker_setup")
        return

    # Inference tracker worker container with --net=host (Port: 8110)
    worker_cmd = (
        f"docker run -d --pull=always --net=host "
        f"--cidfile ./{self.action_record_id}.cid "
        f"--name inference-tracker-worker "
        f"-v matrice_myvol:/matrice_data "
        f'-e ENV="{os.environ.get("ENV", "prod")}" '
        f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
        f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
        f'-e ACTION_ID="{self.action_record_id}" '
        f"{image}"
    )

    self.start(worker_cmd, "inference_tracker_setup")