matrice-compute 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1508 @@
1
+ """Module providing action_instance functionality."""
2
+
3
+ import logging
4
+ import os
5
+ import shlex
6
+ import subprocess
7
+ import threading
8
+ import time
9
+ import signal
10
+ import urllib.request
11
+ from matrice_compute.instance_utils import (
12
+ get_gpu_with_sufficient_memory_for_action,
13
+ get_decrypted_access_key_pair,
14
+ get_max_file_system,
15
+ )
16
+ from matrice_compute.task_utils import (
17
+ setup_workspace_and_run_task,
18
+ )
19
+ from matrice_compute.scaling import (
20
+ Scaling,
21
+ )
22
+ from matrice_common.utils import log_errors
23
+
24
+
25
class ActionInstance:
    """Base class for tasks that run in Action containers.

    Wraps a single action record: it resolves the docker image to use,
    dispatches to the per-action-type handler function, launches the docker
    command as a subprocess, and streams its log file back to the scaling
    service on a background thread.
    """

    def __init__(self, scaling: Scaling, action_info: dict):
        """Initialize an action instance.

        Args:
            scaling (Scaling): Scaling service instance used for all backend calls.
            action_info (dict): Action information dictionary; must contain the
                keys "_id", "action" and "actionDetails".

        Raises:
            ValueError: If action_info["action"] has no registered handler.
        """
        self.scaling = scaling
        # Populated by start_process(); None until a subprocess is launched.
        self.process: subprocess.Popen | None = None
        # Signals the log-forwarding thread (send_logs_continuously) to exit.
        self.stop_thread = False
        self.log_thread: threading.Thread | None = None
        self.log_path: str | None = None
        self.cmd: str | None = None
        # Filled in by setup_action_requirements() / _init_credentials().
        self.matrice_access_key_id: str | None = None
        self.matrice_secret_access_key: str | None = None
        self.action_info = action_info
        self.action_record_id = action_info["_id"]
        self.action_type = action_info["action"]
        self.action_details = action_info["actionDetails"]
        # Image precedence: "docker" key, then "docker_container", then the
        # default data-processing image from the scaling service.
        self.docker_container = self.action_details.get(
            "docker",
            self.action_details.get(
                "docker_container",
                self.scaling.get_data_processing_image(),
            ),
        )
        # Dispatch table: action type -> module-level handler function.
        # Handlers are defined later in this module, which is fine because
        # the names are resolved when the module has fully loaded.
        self.actions_map = {
            "model_train": model_train_execute,
            "model_eval": model_eval_execute,
            "model_export": model_export_execute,
            "deploy_add": model_deploy_execute,
            "data_import": data_processing_execute,
            "data_add": data_processing_execute,
            "data_split": data_split_execute,
            "data_prep": data_preparation_execute,
            "dataset_annotation": dataset_annotation_execute,
            "dataset_augmentation": dataset_augmentation_execute,
            "augmentation_setup": augmentation_server_creation_execute,
            "dataset_generation": synthetic_dataset_generation_execute,
            "synthetic_data_setup": synthetic_data_setup_execute,  # start
            "image_build": image_build_execute,
            "resource_clone": resource_clone_execute,
            "database_setup": database_setup_execute,
            "kafka_setup": kafka_setup_execute,
            "inference_aggregator": deploy_aggregator_execute,
            "redis_setup": redis_setup_execute,
            "streaming_gateway": streaming_gateway_execute,
            "facial_recognition_setup": facial_recognition_setup_execute,
            "fe_fs_streaming": fe_fs_streaming_execute,
            "inference_ws_server": inference_ws_server_execute
        }
        if self.action_type not in self.actions_map:
            raise ValueError(f"Unknown action type: {self.action_type}")
        self.task = self.actions_map[self.action_type]
82
+
83
+ @log_errors(default_return={}, raise_exception=True, log_error=False)
84
+ def _init_credentials(self):
85
+ """Initialize Matrice credentials.
86
+
87
+ Returns:
88
+ dict: Dictionary containing access key ID and secret access key
89
+ """
90
+ self.matrice_access_key_id = self.scaling.session.access_key
91
+ self.matrice_secret_access_key = self.scaling.session.secret_key
92
+ if not all(
93
+ [
94
+ self.matrice_access_key_id,
95
+ self.matrice_secret_access_key,
96
+ ]
97
+ ):
98
+ raise ValueError(
99
+ "Matrice credentials not found - both access key ID and secret access key are required"
100
+ )
101
+ return {
102
+ "matrice_access_key_id": self.matrice_access_key_id,
103
+ "matrice_secret_access_key": self.matrice_secret_access_key,
104
+ }
105
+
106
+ @log_errors(default_return="logs", raise_exception=False, log_error=False)
107
+ def get_log_path(self):
108
+ """Get log directory path, creating if needed.
109
+
110
+ Returns:
111
+ str: Path to log directory
112
+ """
113
+ os.makedirs("logs", exist_ok=True)
114
+ return "logs"
115
+
116
    @log_errors(default_return=False, raise_exception=False, log_error=False)
    def is_running(self) -> bool:
        """Check if task process is running.

        This method performs a thorough check to determine if the process is still running:
        1. Verifies that the process attribute exists and is not None
        2. Checks if the process has terminated using poll() method
        3. Additional safeguards against zombie processes
        4. Coordinates with log monitoring to ensure all logs are sent before cleanup

        Side effects: on termination it flushes remaining logs via
        _ensure_final_logs_sent(), reaps the child with wait(), and clears
        ``self.process`` — so subsequent calls return False cheaply.

        Returns:
            bool: True if process exists and is still running, False if process
                does not exist or has terminated
        """
        # Basic check if process exists
        if not hasattr(self, "process") or self.process is None:
            return False

        try:
            # Check if process has terminated
            poll_result = self.process.poll()

            # poll() returns None if the process is still running
            is_running = poll_result is None

            # If process has terminated, ensure we do proper cleanup
            if not is_running:
                # Log termination with action ID for debugging
                action_id = getattr(self, "action_record_id", "unknown")
                logging.info(
                    "Process for action %s has terminated with exit code: %s",
                    action_id,
                    poll_result,
                )

                # CRITICAL: Ensure all logs are sent before cleaning up process
                self._ensure_final_logs_sent()

                # Try to explicitly clean up the process to avoid zombies
                try:
                    # Wait for process with a short timeout to ensure it's fully terminated
                    self.process.wait(timeout=1)
                except subprocess.TimeoutExpired:
                    # If still running after timeout (unlikely at this point)
                    logging.warning(
                        f"Process for action {action_id} failed to terminate properly"
                    )

                # Set process to None to help garbage collection - BUT ONLY after logs are handled
                self.process = None

            return is_running

        except Exception as e:
            # Something went wrong while checking the process status
            logging.error(f"Error checking process status: {str(e)}")
            # Ensure logs are sent even in error cases
            self._ensure_final_logs_sent()
            # To be safe, assume process is not running when we can't check it
            self.process = None
            return False
177
+
178
    def _ensure_final_logs_sent(self):
        """Ensure all remaining logs are sent when a process terminates.

        This method performs a final log flush to ensure no logs are lost
        when a container crashes or shuts down. It stops the continuous log
        thread, then reads the log file from the last forwarded position
        (``self._last_log_position``, maintained by send_logs_continuously)
        and sends any remaining bytes to the scaling service.

        All failures are caught and logged; this method never raises.
        """
        # Nothing to flush if no log file was ever created for this action.
        if (
            not hasattr(self, "log_path")
            or not self.log_path
            or not os.path.exists(self.log_path)
        ):
            return

        try:
            # Set flag to stop continuous logging thread
            self.stop_thread = True

            # Give log thread a moment to finish current operation
            time.sleep(1)

            # Perform final log flush
            logging.info(
                "Performing final log flush for action %s",
                getattr(self, "action_record_id", "unknown"),
            )

            # Read any remaining logs that haven't been sent
            with open(self.log_path, "rb") as log_file:
                # Get the last position that was read (if tracked)
                last_position = getattr(self, "_last_log_position", 0)
                log_file.seek(last_position)
                remaining_content = log_file.read()

                if remaining_content:
                    try:
                        decoded_content = remaining_content.decode("utf-8")
                    except UnicodeDecodeError:
                        # Replace invalid bytes rather than losing the tail of the log.
                        decoded_content = remaining_content.decode(
                            "utf-8", errors="replace"
                        )

                    # Send final logs
                    self._send_logs_to_scaling(decoded_content)
                    self._check_cuda(decoded_content)

                    logging.info(
                        "Sent %d bytes of final logs for action %s",
                        len(remaining_content),
                        getattr(self, "action_record_id", "unknown"),
                    )
                else:
                    logging.debug(
                        "No additional logs to send for action %s",
                        getattr(self, "action_record_id", "unknown"),
                    )

        except Exception as e:
            logging.error(
                "Error during final log flush for action %s: %s",
                getattr(self, "action_record_id", "unknown"),
                str(e),
            )
240
+
241
+ @log_errors(default_return=None, raise_exception=False, log_error=False)
242
+ def get_action_details(self):
243
+ """Get action details from scaling service.
244
+
245
+ Returns:
246
+ dict: Action details if successful, None otherwise
247
+ """
248
+ resp, error, message = self.scaling.get_action_details(self.action_record_id)
249
+ if error:
250
+ logging.error(
251
+ "Error getting action details: %s",
252
+ error,
253
+ )
254
+ return None
255
+ return resp
256
+
257
+ @log_errors(default_return="", raise_exception=False)
258
+ def get_gpu_config(self, action_details):
259
+ """Get GPU configuration string based on available GPUs.
260
+
261
+ Args:
262
+ action_details (dict): Action details containing GPU requirements
263
+
264
+ Returns:
265
+ str: GPU configuration string
266
+ """
267
+ if not action_details["actionDetails"].get("gpuRequired", False):
268
+ return ""
269
+ gpu_indices = get_gpu_with_sufficient_memory_for_action(
270
+ action_details=action_details
271
+ )
272
+ if gpu_indices:
273
+ gpu_str = ",".join(map(str, gpu_indices))
274
+ logging.info("Using GPUs: %s", gpu_str)
275
+ return f'--gpus "device={gpu_str}"'
276
+ logging.info("No GPUs with sufficient memory found.")
277
+ return ""
278
+
279
+ @log_errors(default_return="", raise_exception=False)
280
+ def get_base_docker_cmd(
281
+ self,
282
+ work_fs: str = "",
283
+ use_gpu: str = "",
284
+ mount_docker_sock: bool = False,
285
+ action_id: str = "",
286
+ model_key: str = "",
287
+ extra_env_vars: dict = {},
288
+ port_mapping: dict = {},
289
+ network_config: str = "",
290
+ destination_workspace_path: str = "/usr/src/workspace",
291
+ docker_workdir: str = "",
292
+ extra_pkgs: list = [],
293
+ ):
294
+ """Build base Docker command with common options.
295
+
296
+ Args:
297
+ work_fs (str): Work filesystem path
298
+ use_gpu (str): GPU configuration string
299
+ mount_docker_sock (bool): Whether to mount Docker socket
300
+ action_id (str): Action ID
301
+ model_key (str): Model key
302
+ extra_env_vars (dict): Additional environment variables
303
+ port_mapping (dict): Port mappings {host_port: container_port}
304
+ destination_workspace_path (str): Container workspace path
305
+ docker_workdir (str): Docker working directory
306
+ extra_pkgs (list): List of extra packages to install
307
+ Returns:
308
+ str: Base Docker command
309
+ """
310
+ env = os.environ.get("ENV", "prod")
311
+ env_vars = {
312
+ "ENV": env,
313
+ "MATRICE_SECRET_ACCESS_KEY": self.matrice_secret_access_key,
314
+ "MATRICE_ACCESS_KEY_ID": self.matrice_access_key_id,
315
+ }
316
+ if self.get_hugging_face_token(model_key):
317
+ env_vars["HUGGING_FACE_ACCESS_TOKEN"] = self.get_hugging_face_token(
318
+ model_key
319
+ )
320
+ if extra_env_vars:
321
+ env_vars.update(extra_env_vars)
322
+
323
+ if network_config == "":
324
+ network_config = (
325
+ "--net=host"
326
+ if not port_mapping
327
+ else " ".join(
328
+ f"-p {host}:{container}" for host, container in port_mapping.items()
329
+ )
330
+ )
331
+
332
+ if not docker_workdir:
333
+ if action_id:
334
+ docker_workdir = f"/usr/src/{action_id}"
335
+ else:
336
+ docker_workdir = "."
337
+ volumes = [
338
+ ( # Mount workspace if work_fs is provided
339
+ f"-v {work_fs}/workspace:{destination_workspace_path}"
340
+ if work_fs and work_fs not in ["/"]
341
+ else ""
342
+ ),
343
+ ( # Mount action directory if work_fs and action_id are provided
344
+ f"-v {work_fs}/{action_id}:/usr/src/{action_id}"
345
+ if work_fs and work_fs not in ["/"] and action_id
346
+ else ""
347
+ ),
348
+ "-v /var/run/docker.sock:/var/run/docker.sock" if mount_docker_sock else "",
349
+ ]
350
+ pypi_index = f"https://{'test.' if env != 'prod' else ''}pypi.org/simple/"
351
+ pkgs = ["matrice_common", "matrice"]
352
+ pkgs.extend(extra_pkgs)
353
+ pip_install_matrice = f"pip install --upgrade --force-reinstall --index-url {pypi_index} {' '.join(pkgs)}"
354
+ pip_install_requirements = (
355
+ "if [ -f requirements.txt ]; then pip install -r requirements.txt; fi "
356
+ )
357
+
358
+ # Create export statements for environment variables to ensure they're available in subshells
359
+ env_exports = " && ".join(
360
+ [
361
+ f"export {key}={shlex.quote(str(value))}"
362
+ for key, value in env_vars.items()
363
+ ]
364
+ )
365
+
366
+ cmd_parts = [
367
+ f"docker run {use_gpu} ",
368
+ network_config,
369
+ *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
370
+ *volumes,
371
+ # Container configuration and startup commands
372
+ f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
373
+ f'/bin/bash -c "cd {docker_workdir} && '
374
+ f"{env_exports} && "
375
+ f"{pip_install_requirements} && "
376
+ f"{pip_install_matrice} && ",
377
+ ]
378
+
379
+ # Join all non-empty parts with spaces
380
+ return " ".join(filter(None, cmd_parts))
381
+
382
+ @log_errors(default_return="", raise_exception=False)
383
+ def get_hugging_face_token(self, model_key):
384
+ """Get Hugging Face token for specific model keys.
385
+
386
+ Args:
387
+ model_key (str): Model key to check
388
+
389
+ Returns:
390
+ str: Hugging Face token if available, empty string otherwise
391
+ """
392
+ hugging_face_token = ""
393
+ if model_key and (
394
+ model_key.startswith("microsoft") or model_key.startswith("timm")
395
+ ):
396
+ secret_name = "hugging_face"
397
+ resp, error, message = self.scaling.get_model_secret_keys(secret_name)
398
+ if error is not None:
399
+ logging.error(
400
+ "Error getting Hugging Face token: %s",
401
+ message,
402
+ )
403
+ else:
404
+ hugging_face_token = resp["user_access_token"]
405
+ return hugging_face_token
406
+
407
+ @log_errors(default_return="", raise_exception=False)
408
+ def get_hugging_face_token_for_data_generation(self):
409
+ secret_name = "hugging_face"
410
+ resp, error, message = self.scaling.get_model_secret_keys(secret_name)
411
+ if error is not None:
412
+ logging.error(
413
+ "Error getting Hugging Face token: %s",
414
+ message,
415
+ )
416
+ else:
417
+ hugging_face_token = resp["user_access_token"]
418
+ return hugging_face_token
419
+
420
+ @log_errors(default_return="", raise_exception=False)
421
+ def get_internal_api_key(self, action_id):
422
+ """Get internal API key for action.
423
+
424
+ Args:
425
+ action_id (str): Action ID
426
+
427
+ Returns:
428
+ str: Internal API key if available, empty string otherwise
429
+ """
430
+ internal_api_key = ""
431
+ resp, error, message = self.scaling.get_internal_api_key(action_id)
432
+ if error is not None:
433
+ logging.error(
434
+ "Error getting internal api key: %s",
435
+ message,
436
+ )
437
+ else:
438
+ internal_api_key = resp["internal_api_key"]
439
+ return internal_api_key
440
+
441
    @log_errors(raise_exception=True)
    def setup_action_requirements(
        self,
        action_details,
        work_fs="",
        model_family="",
        action_id="",
    ):
        """Setup action requirements.

        Three sequential phases, each preferring values embedded in the
        action's jobParams and falling back to scaling-service API calls:
        1. model codebase download/workspace setup (only when model_family given)
        2. Docker Hub login
        3. Matrice user credentials (stored on self for later docker commands)

        Args:
            action_details (dict): Action details
            work_fs (str): Work filesystem path
            model_family (str): Model family name
            action_id (str): Action ID

        Raises:
            Exception: If Docker login or credential setup fails (codebase
                lookup failures are only warned about, not raised).
        """
        # Get job parameters from action_details
        job_params = action_details.get("jobParams", {})

        # Phase 1: setup model codebase if model_family is provided
        if model_family:
            # Try to get model codebase URLs from action_details first
            model_codebase_url = job_params.get("model_codebase_url")
            model_requirements_url = job_params.get("model_requirements_url")

            # Fallback to API calls if not provided in action_details
            if not model_codebase_url:
                model_codebase_url, error, message = self.scaling.get_model_codebase(
                    model_family
                )
                if error:
                    logging.warning(f"Failed to get model codebase URL: {message}")
                    model_codebase_url = None

            # Handle requirements URL - use from job_params or get from API
            if model_requirements_url:
                model_codebase_requirements_url = model_requirements_url
            else:
                model_codebase_requirements_url, error, message = (
                    self.scaling.get_model_codebase_requirements(model_family)
                )
                if error:
                    logging.warning(
                        f"Failed to get model codebase requirements URL: {message}"
                    )
                    model_codebase_requirements_url = None

            # Setup workspace if we have the URLs
            if model_codebase_url:
                setup_workspace_and_run_task(
                    work_fs,
                    action_id,
                    model_codebase_url,
                    model_codebase_requirements_url,
                )

        # Phase 2: setup Docker credentials
        try:
            # Try to get Docker credentials from action_details first
            docker_username = job_params.get("Username")
            docker_password = job_params.get("Password")
            if docker_username and docker_password:
                username = docker_username
                password = docker_password
                logging.info("Using Docker credentials from action_details")
            else:
                # Fallback to API call
                creds, error, message = self.scaling.get_docker_hub_credentials()
                if error:
                    raise Exception(f"Failed to get Docker credentials: {message}")
                username = creds["username"]
                password = creds["password"]
                logging.info("Using Docker credentials from API call")

            if username and password:
                # shlex.quote protects against shell metacharacters in credentials.
                login_cmd = f"docker login -u {shlex.quote(username)} -p {shlex.quote(password)}"
                subprocess.run(login_cmd, shell=True, check=True)
                logging.info("Docker login successful")
            else:
                logging.warning(
                    "Docker credentials not available, skipping Docker login"
                )

        except Exception as err:
            logging.error(
                "Docker login failed: %s",
                str(err),
            )
            raise

        # Phase 3: setup user access credentials (decrypted keys stored on self)
        try:
            # Try to get access key and secret key from job_params first
            access_key = job_params.get("access_key")
            secret_key = job_params.get("secret_key")

            if access_key and secret_key:
                logging.info("Using access key and secret key from job_params")
                (
                    self.matrice_access_key_id,
                    self.matrice_secret_access_key,
                ) = get_decrypted_access_key_pair(access_key, secret_key)
            else:
                # Fallback to API call
                logging.info(
                    "Access key and secret key not found in job_params, falling back to API call"
                )
                (
                    user_access_key_pair,
                    error,
                    message,
                ) = self.scaling.get_user_access_key_pair(action_details["_idUser"])
                if error:
                    raise Exception(f"Failed to get user access key pair: {message}")
                access_key = user_access_key_pair["access_key"]
                secret_key = user_access_key_pair["secret_key"]
                (
                    self.matrice_access_key_id,
                    self.matrice_secret_access_key,
                ) = get_decrypted_access_key_pair(access_key, secret_key)

        except Exception as err:
            logging.error(
                "Failed to setup credentials: %s",
                str(err),
            )
            raise
571
+
572
+ @log_errors(raise_exception=False)
573
+ def create_redis_container(self, redis_image=None, redis_password=None):
574
+ """Create and start a Redis container using Docker.
575
+
576
+ Args:
577
+ redis_image (str, optional): Redis Docker image to use. Defaults to 'redis:latest'
578
+
579
+ Returns:
580
+ tuple: (container_info, error, message)
581
+ """
582
+ if redis_image is None:
583
+ redis_image = "redis:latest"
584
+
585
+ network_name = f"redis_network_{int(time.time())}"
586
+ subprocess.run(f"docker network create {network_name}", shell=True, check=True)
587
+
588
+ try:
589
+ # Get an available port for Redis
590
+ external_port = "6379"
591
+
592
+ # Generate a unique container name and password
593
+ container_name = f"redis_container_{int(time.time())}"
594
+
595
+ # Build the docker command to create Redis container with password
596
+ cmd = (
597
+ f"docker run -d "
598
+ f"--network {network_name} "
599
+ f"--name {container_name} "
600
+ f"-p {external_port}:6379 "
601
+ f"--restart unless-stopped "
602
+ f"{redis_image} "
603
+ f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
604
+ )
605
+
606
+ logging.info("Creating Redis container with command: %s", cmd)
607
+
608
+ # Execute the command
609
+ result = subprocess.run(
610
+ cmd, shell=True, capture_output=True, text=True, timeout=60
611
+ )
612
+
613
+ if result.returncode == 0:
614
+ container_id = result.stdout.strip()
615
+ container_info = {
616
+ "container_id": container_id,
617
+ "container_name": container_name,
618
+ "network_name": network_name,
619
+ "external_port": external_port,
620
+ "internal_port": 6379,
621
+ "password": redis_password,
622
+ "image": redis_image,
623
+ "status": "running",
624
+ }
625
+
626
+ logging.info("Redis container created successfully: %s", container_info)
627
+ return container_info, None, "Redis container created successfully"
628
+ else:
629
+ error_message = f"Failed to create Redis container: {result.stderr}"
630
+ logging.error(error_message)
631
+ return None, "ContainerCreationError", error_message
632
+
633
+ except subprocess.TimeoutExpired:
634
+ error_message = "Timeout while creating Redis container"
635
+ logging.error(error_message)
636
+ return None, "TimeoutError", error_message
637
+ except Exception as e:
638
+ error_message = f"Unexpected error creating Redis container: {str(e)}"
639
+ logging.error(error_message)
640
+ return None, "UnexpectedError", error_message
641
+
642
    @log_errors(raise_exception=False, log_error=False)
    def send_logs_continuously(self):
        """Continuously read and send logs from the log file to the scaling service.

        Runs on the thread started by start_logger(). Polls self.log_path every
        10 seconds, forwarding only the bytes appended since the last read.
        The read offset is mirrored into ``self._last_log_position`` so that
        _ensure_final_logs_sent() can flush the tail after the loop exits.
        Exits when ``self.stop_thread`` is set or the log file disappears,
        then performs one last read to catch anything written in between.
        """
        last_position = 0
        self._last_log_position = 0  # Track position for final flush

        while not self.stop_thread and os.path.exists(self.log_path):
            try:
                with open(self.log_path, "rb") as log_file:
                    log_file.seek(last_position)
                    new_content = log_file.read()
                    if new_content:
                        try:
                            decoded_content = new_content.decode("utf-8")
                        except UnicodeDecodeError:
                            # Handle invalid UTF-8 bytes by replacing them
                            decoded_content = new_content.decode(
                                "utf-8",
                                errors="replace",
                            )
                        self._send_logs_to_scaling(decoded_content)
                        self._check_cuda(decoded_content)

                    # Update tracked position
                    last_position = log_file.tell()
                    self._last_log_position = last_position

            except Exception as e:
                logging.error(
                    "Error reading logs for action %s: %s",
                    getattr(self, "action_record_id", "unknown"),
                    str(e),
                )

            # Use shorter sleep interval for more responsive log monitoring
            time.sleep(10)  # Reduced from 30 to 10 seconds for better responsiveness

        # Final attempt to send any remaining logs when thread is stopping
        logging.info(
            "Log monitoring thread stopping for action %s, performing final check",
            getattr(self, "action_record_id", "unknown"),
        )

        # One more final read attempt
        try:
            if os.path.exists(self.log_path):
                with open(self.log_path, "rb") as log_file:
                    log_file.seek(last_position)
                    final_content = log_file.read()
                    if final_content:
                        try:
                            decoded_content = final_content.decode("utf-8")
                        except UnicodeDecodeError:
                            decoded_content = final_content.decode(
                                "utf-8", errors="replace"
                            )
                        self._send_logs_to_scaling(decoded_content)
                        self._check_cuda(decoded_content)
                        logging.info(
                            "Sent final %d bytes of logs for action %s",
                            len(final_content),
                            getattr(self, "action_record_id", "unknown"),
                        )
        except Exception as e:
            logging.error(
                "Error in final log read for action %s: %s",
                getattr(self, "action_record_id", "unknown"),
                str(e),
            )
714
+
715
+ @log_errors(raise_exception=False, log_error=False)
716
+ def _send_logs_to_scaling(self, log_content):
717
+ """Send logs to the scaling service.
718
+
719
+ Args:
720
+ log_content (str): Log content to send
721
+ """
722
+ _, error, message = self.scaling.update_action_docker_logs(
723
+ action_record_id=self.action_record_id,
724
+ log_content=log_content,
725
+ )
726
+ if error:
727
+ logging.error(
728
+ "Error from update_action_docker_logs: %s",
729
+ error,
730
+ )
731
+
732
+ @log_errors(raise_exception=False, log_error=False)
733
+ def _check_cuda(self, log_content):
734
+ """Check for CUDA out of memory errors in logs and update action status.
735
+
736
+ Args:
737
+ log_content (str): Log content to check
738
+ """
739
+ if "CUDA error: out of memory" in log_content:
740
+ action_details = self.get_action_details()
741
+ if not action_details:
742
+ return
743
+ self.scaling.update_action(
744
+ id=self.action_record_id,
745
+ step_code="ERROR",
746
+ action_type=action_details["action"],
747
+ status="ERROR",
748
+ status_description="CUDA error: out of memory",
749
+ service="bg-job-scheduler",
750
+ job_params=action_details["jobParams"],
751
+ )
752
+
753
+ @log_errors(raise_exception=True)
754
+ def start_process(self, cmd, log_name):
755
+ """Start the process and initialize logging.
756
+
757
+ Args:
758
+ cmd (str): Command to execute
759
+ log_name (str): Name for log file
760
+
761
+ Raises:
762
+ Exception: If process fails to start
763
+ """
764
+ self.cmd = cmd
765
+ self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"
766
+ with open(self.log_path, "wb") as out:
767
+ self.process = subprocess.Popen(
768
+ shlex.split(self.cmd),
769
+ stdout=out,
770
+ stderr=out,
771
+ env={**os.environ},
772
+ start_new_session=True,
773
+ )
774
+
775
+ @log_errors(raise_exception=False)
776
+ def start_logger(self):
777
+ """Start the log monitoring thread."""
778
+ self.log_thread = threading.Thread(
779
+ target=self.send_logs_continuously,
780
+ daemon=False, # CRITICAL: Make thread non-daemon to ensure it completes
781
+ )
782
+ self.log_thread.start()
783
+
784
+ @log_errors(raise_exception=False)
785
+ def start(self, cmd: str = "", log_name: str = ""):
786
+ """Start the process and log monitoring thread.
787
+
788
+ Args:
789
+ cmd (str): Command to execute
790
+ log_name (str): Name for log file
791
+ """
792
+ self.start_process(cmd, log_name)
793
+ self.start_logger()
794
+ self.scaling.update_status(
795
+ self.action_record_id,
796
+ self.action_type,
797
+ "bg-job-scheduler",
798
+ "DKR_CMD",
799
+ "OK",
800
+ f"Start docker container with command: {cmd.replace(self.matrice_access_key_id, 'MATRICE_ACCESS_KEY_ID').replace(self.matrice_secret_access_key, 'MATRICE_SECRET_ACCESS_KEY')}",
801
+ )
802
+
803
    @log_errors(raise_exception=False, log_error=False)
    def stop(self):
        """Stop the process and log monitoring thread.

        Cleanup runs in a strict order so no logs are lost:
        1. signal the log thread to stop,
        2. SIGTERM the whole process group (escalating to SIGKILL after 15s),
        3. flush any unsent log bytes,
        4. join the (non-daemon) log thread with a 30s timeout.
        Every step catches and logs its own failures; this method never raises.
        """
        logging.info("Stopping action %s", getattr(self, "action_record_id", "unknown"))

        # Step 1: Signal log thread to stop
        self.stop_thread = True

        # Step 2: Stop the process
        try:
            if self.process:
                logging.info(
                    "Terminating process for action %s",
                    getattr(self, "action_record_id", "unknown"),
                )
                # Signal the whole group (possible because start_process used
                # start_new_session=True), so docker children die too.
                os.killpg(
                    os.getpgid(self.process.pid),
                    signal.SIGTERM,
                )
                # Give process time to terminate gracefully
                try:
                    self.process.wait(timeout=15)
                    logging.info(
                        "Process terminated gracefully for action %s",
                        getattr(self, "action_record_id", "unknown"),
                    )
                except subprocess.TimeoutExpired:
                    logging.warning(
                        "Process didn't terminate gracefully, forcing kill for action %s",
                        getattr(self, "action_record_id", "unknown"),
                    )
                    try:
                        os.killpg(os.getpgid(self.process.pid), signal.SIGKILL)
                        self.process.wait(timeout=5)
                    except Exception as kill_err:
                        logging.error(
                            "Error force-killing process for action %s: %s",
                            getattr(self, "action_record_id", "unknown"),
                            str(kill_err),
                        )
        except Exception as proc_err:
            logging.error(
                "Error stopping process for action %s: %s",
                getattr(self, "action_record_id", "unknown"),
                str(proc_err),
            )

        # Step 3: Ensure final logs are sent
        self._ensure_final_logs_sent()

        # Step 4: Wait for log thread to complete
        if self.log_thread and self.log_thread.is_alive():
            logging.info(
                "Waiting for log thread to complete for action %s",
                getattr(self, "action_record_id", "unknown"),
            )
            try:
                self.log_thread.join(
                    timeout=30
                )  # Wait up to 30 seconds for logs to complete
                if self.log_thread.is_alive():
                    logging.warning(
                        "Log thread didn't complete within timeout for action %s",
                        getattr(self, "action_record_id", "unknown"),
                    )
                else:
                    logging.info(
                        "Log thread completed successfully for action %s",
                        getattr(self, "action_record_id", "unknown"),
                    )
            except Exception as thread_err:
                logging.error(
                    "Error waiting for log thread for action %s: %s",
                    getattr(self, "action_record_id", "unknown"),
                    str(thread_err),
                )
882
+
883
+ @log_errors(raise_exception=False)
884
+ def execute(self):
885
+ """Execute the task."""
886
+ self.task(self)
887
+
888
+
889
@log_errors(raise_exception=False)
def data_preparation_execute(
    self: ActionInstance,
):
    """Execute data preparation task.

    Resolves the largest host filesystem, registers host/container dataset
    path mappings on the action's jobParams, optionally pre-pulls the training
    image in the background, then launches the data_preparation container.
    """
    work_fs = get_max_file_system()
    action_details = self.get_action_details()
    if not action_details:
        return
    self.setup_action_requirements(action_details, work_fs, model_family="")
    action = {"jobParams": action_details["jobParams"]}
    # Datasets are keyed by id+version when mapping host <-> container paths.
    dataset_id_version = (
        action_details["jobParams"]["dataset_id"]
        + action_details["jobParams"]["dataset_version"]
    )
    action["jobParams"].update(
        {
            "dataset_host_path_map": {dataset_id_version: f"{work_fs}/workspace"},
            "dataset_local_path_map": {dataset_id_version: "/usr/src/app/workspace"},
            "host_file_system": work_fs,
        }
    )
    self.scaling.update_action(
        id=self.action_record_id,
        step_code="DCK_LNCH",
        action_type=action_details["action"],
        status=action_details["status"],
        sub_action=action_details["subAction"],
        status_description="Job is assigned to docker",
        service="bg-job-scheduler",
        job_params=action["jobParams"],
    )
    if action["jobParams"].get("model_train_docker"):
        # Fire-and-forget pull of the training image so the later training
        # step starts faster; the Popen handle is intentionally not awaited.
        logging.info("Pulling the docker image")
        pull_cmd = f"docker pull {action['jobParams']['model_train_docker']}"
        process = subprocess.Popen(
            pull_cmd,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        logging.info(
            "Started pulling Docker image with PID: %s",
            process.pid,
        )
    # The trailing `"` closes the quote opened inside get_base_docker_cmd().
    cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
    logging.info("cmd is: %s", cmd)
    self.start(cmd, "data_preparation_log")
937
+
938
+
939
+ @log_errors(raise_exception=False)
940
+ def data_processing_execute(self: ActionInstance):
941
+ """Execute data processing task."""
942
+ work_fs = get_max_file_system()
943
+ action_details = self.get_action_details()
944
+ if not action_details:
945
+ return
946
+ self.setup_action_requirements(action_details, work_fs, model_family="")
947
+ action = {"jobParams": action_details["jobParams"]}
948
+ action["jobParams"].update(
949
+ {
950
+ "dp_dv_host_paths": [f"{work_fs}/workspace"],
951
+ "dp_dv_local_paths": ["/usr/src/app/workspace"],
952
+ }
953
+ )
954
+ self.scaling.update_action(
955
+ id=self.action_record_id,
956
+ step_code="DCK_LNCH",
957
+ action_type=action_details["action"],
958
+ status="ACK",
959
+ status_description="Job is assigned to docker",
960
+ service="bg-job-scheduler",
961
+ job_params=action["jobParams"],
962
+ )
963
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
964
+ logging.info("cmd: %s", cmd)
965
+ self.start(cmd, "data_processing_log")
966
+
967
+
968
+ @log_errors(raise_exception=False)
969
+ def data_split_execute(self: ActionInstance):
970
+ """Execute data split task."""
971
+ work_fs = get_max_file_system()
972
+ action_details = self.get_action_details()
973
+ if not action_details:
974
+ return
975
+ self.setup_action_requirements(action_details, work_fs, model_family="")
976
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
977
+ logging.info("cmd: %s", cmd)
978
+ self.start(cmd, "data_split")
979
+
980
+
981
+ @log_errors(raise_exception=False)
982
+ def dataset_annotation_execute(
983
+ self: ActionInstance,
984
+ ):
985
+ """Execute dataset annotation task."""
986
+ work_fs = get_max_file_system()
987
+ action_details = self.get_action_details()
988
+ if not action_details:
989
+ return
990
+ self.setup_action_requirements(action_details, work_fs)
991
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
992
+ logging.info("cmd: %s", cmd)
993
+ self.start(cmd, "dataset_annotation")
994
+
995
+
996
+ @log_errors(raise_exception=False)
997
+ def dataset_augmentation_execute(
998
+ self: ActionInstance,
999
+ ):
1000
+ """Execute dataset augmentation task."""
1001
+ work_fs = get_max_file_system()
1002
+ action_details = self.get_action_details()
1003
+ if not action_details:
1004
+ return
1005
+ self.setup_action_requirements(action_details, work_fs)
1006
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
1007
+ logging.info("cmd: %s", cmd)
1008
+ self.start(cmd, "dataset_augmentation")
1009
+
1010
+
1011
+ @log_errors(raise_exception=False)
1012
+ def augmentation_server_creation_execute(
1013
+ self: ActionInstance,
1014
+ ):
1015
+ """Create Augmentation Server"""
1016
+ work_fs = get_max_file_system()
1017
+ action_details = self.get_action_details()
1018
+ external_port = self.scaling.get_open_port()
1019
+ if not action_details:
1020
+ return
1021
+ self.setup_action_requirements(action_details, work_fs)
1022
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
1023
+ logging.info("cmd: %s", cmd)
1024
+ self.start(cmd, "augmentation_setup")
1025
+
1026
+
1027
+ @log_errors(raise_exception=False)
1028
+ def database_setup_execute(self: ActionInstance):
1029
+ """
1030
+ Creates and setup the database for facial recognition server.
1031
+ """
1032
+ action_details = self.get_action_details()
1033
+ if not action_details:
1034
+ return
1035
+ image = action_details["actionDetails"].get("docker")
1036
+
1037
+ self.setup_action_requirements(action_details)
1038
+
1039
+ project_id = action_details["_idProject"]
1040
+
1041
+ # Run docker compose up
1042
+
1043
+ cmd = (
1044
+ f"docker run --pull=always -p 27020:27017 "
1045
+ f"--name mongodbdatabase "
1046
+ f"-e ACTION_RECORD_ID={self.action_record_id} "
1047
+ f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
1048
+ f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
1049
+ f"-e PROJECT_ID={project_id} "
1050
+ f"-e ENV=dev "
1051
+ f"{image} "
1052
+ )
1053
+ print("Docker command", cmd)
1054
+
1055
+ qdrant_cmd = (
1056
+ f"docker run --pull=always "
1057
+ f"--name qdrant "
1058
+ f"-p 6333:6333 "
1059
+ f"-p 6334:6334 "
1060
+ f"{'qdrant/qdrant:latest'} "
1061
+ )
1062
+
1063
+ # Docker Command run
1064
+ self.start(cmd, "database_setup")
1065
+
1066
+ # Docker for qdrant
1067
+ self.start(qdrant_cmd, 'qdrant_setup')
1068
+
1069
+ @log_errors(raise_exception=False)
1070
+ def facial_recognition_setup_execute(self: ActionInstance):
1071
+ """
1072
+ Creates and setup the database for facial recognition server.
1073
+ """
1074
+ action_details = self.get_action_details()
1075
+
1076
+ if not action_details:
1077
+ return
1078
+ image = action_details["actionDetails"].get("docker")
1079
+
1080
+ self.setup_action_requirements(action_details)
1081
+
1082
+ # Add worker container run command
1083
+ worker_cmd = (
1084
+ f"docker run -d --pull=always "
1085
+ f"--name worker "
1086
+ f"-p 8081:8081 "
1087
+ f'-e ENV="{os.environ.get("ENV", "prod")}" '
1088
+ f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1089
+ f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1090
+ f'-e ACTION_ID="{self.action_record_id}" '
1091
+ f"{image}"
1092
+ )
1093
+ print("Worker docker run command:", worker_cmd)
1094
+
1095
+ # Docker Command run
1096
+ self.start(worker_cmd, "facial_recognition_setup")
1097
+
1098
+ @log_errors(raise_exception=False)
1099
+ def inference_ws_server_execute(self: ActionInstance):
1100
+ """
1101
+ Creates and start inference pipline.
1102
+ """
1103
+ action_details = self.get_action_details()
1104
+
1105
+ if not action_details:
1106
+ return
1107
+ image = action_details["actionDetails"].get("docker")
1108
+
1109
+ self.setup_action_requirements(action_details)
1110
+
1111
+ # Add worker container run command
1112
+ worker_cmd = (
1113
+ f"docker run -d --pull=always "
1114
+ f"--name inference "
1115
+ f"-p 8102:8102 "
1116
+ f'-e ENV="{os.environ.get("ENV", "prod")}" '
1117
+ f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1118
+ f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1119
+ f"{image}"
1120
+ )
1121
+ print("inference docker run command:", worker_cmd)
1122
+
1123
+ # Docker Command run
1124
+ self.start(worker_cmd, "inference_ws_server")
1125
+
1126
+
1127
+ @log_errors(raise_exception=False)
1128
+ def fe_fs_streaming_execute(self: ActionInstance):
1129
+ """
1130
+ Creates and setup the frontend for fs streaming
1131
+ """
1132
+ action_details = self.get_action_details()
1133
+
1134
+ if not action_details:
1135
+ return
1136
+ image = action_details["actionDetails"].get("docker")
1137
+
1138
+ self.setup_action_requirements(action_details)
1139
+
1140
+ # Add worker container run command
1141
+ worker_cmd = (
1142
+ f"docker run -d --pull=always "
1143
+ f"--name fe_streaming "
1144
+ f"-p 3000:3000 "
1145
+ f'-e ENV="{os.environ.get("ENV", "prod")}" '
1146
+ f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1147
+ f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1148
+ f"{image}"
1149
+ )
1150
+ print("fe_fs_stremaing docker run command:", worker_cmd)
1151
+
1152
+ # Docker Command run
1153
+ self.start(worker_cmd, "fe_fs_streaming")
1154
+
1155
+
1156
+ @log_errors(raise_exception=False)
1157
+ def synthetic_dataset_generation_execute(self: ActionInstance):
1158
+ """Execute synthetic dataset generation task."""
1159
+ work_fs = get_max_file_system()
1160
+ action_details = self.get_action_details()
1161
+ if not action_details:
1162
+ return
1163
+ self.setup_action_requirements(action_details, work_fs)
1164
+ extra_env_vars = {}
1165
+ hf_token = self.get_hugging_face_token_for_data_generation()
1166
+ extra_env_vars["HUGGING_FACE_ACCESS_TOKEN"] = hf_token
1167
+ if hf_token:
1168
+ extra_env_vars["HUGGING_FACE_ACCESS_TOKEN"] = hf_token
1169
+ else:
1170
+ return
1171
+ use_gpu = self.get_gpu_config(action_details)
1172
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
1173
+ logging.info("cmd is: %s", cmd)
1174
+ self.start(cmd, "dataset_generation")
1175
+
1176
+
1177
+ @log_errors(raise_exception=False)
1178
+ def synthetic_data_setup_execute(self: ActionInstance):
1179
+ """Execute synthetic data setup task."""
1180
+ work_fs = get_max_file_system()
1181
+ action_details = self.get_action_details()
1182
+ external_port = self.scaling.get_open_port()
1183
+ if not action_details:
1184
+ return
1185
+ self.setup_action_requirements(action_details, work_fs)
1186
+ extra_env_vars = {}
1187
+ hf_token = self.get_hugging_face_token_for_data_generation()
1188
+ if hf_token:
1189
+ extra_env_vars["HUGGING_FACE_ACCESS_TOKEN"] = hf_token
1190
+ else:
1191
+ return
1192
+ use_gpu = self.get_gpu_config(action_details)
1193
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
1194
+ logging.info("cmd is: %s", cmd)
1195
+ self.start(cmd, "synthetic_data_setup")
1196
+
1197
+
1198
    @log_errors(raise_exception=False)
    def redis_setup_execute(self: ActionInstance):
        """
        Creates and starts a Redis container using Docker.

        Flow: create the Redis container first, then register the action
        requirements, and finally launch the companion service container on
        the Redis container's docker network with the connection credentials
        injected as environment variables.
        """
        # NOTE(review): external_port is acquired but never used below —
        # confirm whether get_open_port() has reservation side effects
        # before removing this call.
        external_port = self.scaling.get_open_port()
        work_fs = get_max_file_system()

        action_details = self.get_action_details()
        if not action_details:
            return
        action_id = action_details["_id"]

        # Fall back to a timestamped password when the job doesn't supply one.
        redis_password = action_details["jobParams"].get(
            "password", f"redis_pass_{int(time.time())}"
        )

        container_info, error, message = self.create_redis_container(
            action_details["actionDetails"].get("redis_image", "redis:latest"),
            redis_password=redis_password,
        )
        if error:
            logging.error(
                "Error creating Redis container: %s",
                message,
            )
            return
        logging.info("Redis container created successfully: %s", container_info)

        # Initialize redis container
        self.setup_action_requirements(
            action_details,
            work_fs,
            model_family="",
            action_id=action_id,
        )

        # Connection details the companion container needs to reach Redis.
        env_vars = {
            "REDIS_URL": f"{container_info['container_name']}:{container_info['external_port']}",
            "REDIS_PASSWORD": container_info["password"],
        }

        # Join the Redis container's network so the container name resolves.
        network_config = f" --network {container_info['network_name']} -p 8082:8082"

        # Make the docker file here
        cmd = (
            f"docker run "
            f"{network_config} "
            f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
            f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
            f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
            f"-e MATRICE_SECRET_ACCESS_KEY={shlex.quote(self.matrice_secret_access_key)} "
            f"-e ENV={shlex.quote(os.environ.get('ENV', 'prod'))} "
            f"-v /var/run/docker.sock:/var/run/docker.sock "
            f"--shm-size=30G --pull=always "
            f"{self.docker_container} "
            f"{self.action_record_id} "
        )

        logging.info("cmd is: %s", cmd)

        self.start(cmd, "redis_setup")
1260
+
1261
+
1262
+ @log_errors(raise_exception=False)
1263
+ def deploy_aggregator_execute(
1264
+ self: ActionInstance,
1265
+ ):
1266
+ """Execute deploy aggregator task."""
1267
+ work_fs = get_max_file_system()
1268
+ action_details = self.get_action_details()
1269
+ if not action_details:
1270
+ return
1271
+ self.setup_action_requirements(action_details, work_fs)
1272
+ cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1273
+ logging.info("cmd: %s", cmd)
1274
+ self.start(cmd, "deploy_aggregator")
1275
+
1276
+
1277
+ @log_errors(raise_exception=False)
1278
+ def model_deploy_execute(self: ActionInstance):
1279
+ """Execute model deployment task."""
1280
+ external_port = self.scaling.get_open_port()
1281
+ internal_port = self.scaling.get_open_port()
1282
+ work_fs = get_max_file_system()
1283
+ action_details = self.get_action_details()
1284
+ if not action_details:
1285
+ return
1286
+ action_id = action_details["_id"]
1287
+ model_family = action_details["actionDetails"]["modelFamily"]
1288
+ self.setup_action_requirements(
1289
+ action_details,
1290
+ work_fs,
1291
+ model_family=model_family,
1292
+ action_id=action_id,
1293
+ )
1294
+ use_gpu = self.get_gpu_config(action_details)
1295
+ extra_env_vars = {"INTERNAL_PORT": internal_port}
1296
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
1297
+ logging.info("cmd is: %s", cmd)
1298
+ self.start(cmd, "deploy_log")
1299
+
1300
+
1301
+ @log_errors(raise_exception=False)
1302
+ def model_train_execute(self: ActionInstance):
1303
+ """Execute model training task."""
1304
+ action_details = self.get_action_details()
1305
+ if not action_details:
1306
+ return
1307
+ action_id = action_details["_id"]
1308
+ use_gpu = self.get_gpu_config(action_details)
1309
+ work_fs = action_details["jobParams"]["host_file_system"]
1310
+ model_key = action_details["actionDetails"]["modelKey"]
1311
+ model_family = action_details["actionDetails"]["modelFamily"]
1312
+ self.setup_action_requirements(
1313
+ action_details,
1314
+ work_fs,
1315
+ model_family=model_family,
1316
+ action_id=action_id,
1317
+ )
1318
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
1319
+ logging.info("cmd is: %s", cmd)
1320
+ self.start(cmd, "train_log")
1321
+
1322
+
1323
+ @log_errors(raise_exception=False)
1324
+ def model_eval_execute(self: ActionInstance):
1325
+ """Execute model evaluation task."""
1326
+ action_details = self.get_action_details()
1327
+ if not action_details:
1328
+ return
1329
+ action_id = action_details["_id"]
1330
+ work_fs = action_details["jobParams"]["host_file_system"]
1331
+ model_family = action_details["actionDetails"]["modelFamily"]
1332
+ use_gpu = self.get_gpu_config(action_details)
1333
+ self.setup_action_requirements(
1334
+ action_details,
1335
+ work_fs,
1336
+ model_family=model_family,
1337
+ action_id=action_id,
1338
+ )
1339
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
1340
+ logging.info("cmd is: %s", cmd)
1341
+ self.start(cmd, "eval_log")
1342
+
1343
+
1344
+ @log_errors(raise_exception=False)
1345
+ def model_export_execute(self: ActionInstance):
1346
+ """Execute model export task."""
1347
+ work_fs = get_max_file_system()
1348
+ action_details = self.get_action_details()
1349
+ if not action_details:
1350
+ return
1351
+ action_id = action_details["_id"]
1352
+ if "host_file_system" in action_details["jobParams"]:
1353
+ work_fs = action_details["jobParams"]["host_file_system"]
1354
+ logging.info("host_file_system: %s", work_fs)
1355
+ use_gpu = self.get_gpu_config(action_details)
1356
+ model_family = action_details["actionDetails"]["modelFamily"]
1357
+ self.setup_action_requirements(
1358
+ action_details,
1359
+ work_fs,
1360
+ model_family=model_family,
1361
+ action_id=action_id,
1362
+ )
1363
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
1364
+ logging.info("cmd is: %s", cmd)
1365
+ self.start(cmd, "export_log")
1366
+
1367
+
1368
+ @log_errors(raise_exception=False)
1369
+ def image_build_execute(self: ActionInstance):
1370
+ """Execute image building task."""
1371
+ action_details = self.get_action_details()
1372
+ if not action_details:
1373
+ return
1374
+ self.setup_action_requirements(action_details)
1375
+ model_family_id = action_details["_idService"]
1376
+ action_id = action_details["_id"]
1377
+ internal_api_key = self.get_internal_api_key(action_id)
1378
+ extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
1379
+ cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
1380
+ logging.info("cmd is: %s", cmd)
1381
+ self.start(cmd, "image_build_log")
1382
+
1383
+
1384
+ @log_errors(raise_exception=False)
1385
+ def resource_clone_execute(self: ActionInstance):
1386
+ """Execute resource clone task."""
1387
+ action_details = self.get_action_details()
1388
+ if not action_details:
1389
+ return
1390
+ self.setup_action_requirements(action_details)
1391
+ cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
1392
+ logging.info("cmd is: %s", cmd)
1393
+ self.start(cmd, "resource_clone")
1394
+
1395
+
1396
+ @log_errors(raise_exception=False)
1397
+ def streaming_gateway_execute(self: ActionInstance):
1398
+ """Execute streaming gateway task."""
1399
+ action_details = self.get_action_details()
1400
+ if not action_details:
1401
+ return
1402
+ self.setup_action_requirements(action_details)
1403
+ if not self.docker_container:
1404
+ self.docker_container = (
1405
+ f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
1406
+ )
1407
+ cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
1408
+ logging.info("cmd is: %s", cmd)
1409
+ self.start(cmd, "streaming_gateway")
1410
+
1411
+
1412
+ @log_errors(raise_exception=False)
1413
+ def kafka_setup_execute(self: ActionInstance):
1414
+ """Execute kafka server task."""
1415
+ action_details = self.get_action_details()
1416
+ if not action_details:
1417
+ return
1418
+ host_port = self.scaling.get_open_port()
1419
+ host_ip = (
1420
+ urllib.request.urlopen("https://ident.me", timeout=10).read().decode("utf8")
1421
+ )
1422
+ container_port = 9092
1423
+ # Setup credentials
1424
+ self.setup_action_requirements(action_details)
1425
+
1426
+ # Get Docker disk usage to calculate log retention
1427
+ from matrice_compute.instance_utils import get_docker_disk_space_usage
1428
+
1429
+ docker_disk_usage = get_docker_disk_space_usage()
1430
+ log_retention_bytes = 0
1431
+ if docker_disk_usage:
1432
+ # Calculate 90% of total Docker disk space in bytes
1433
+ available_disk_gb = docker_disk_usage["available"]
1434
+ log_retention_bytes = int(
1435
+ available_disk_gb * 0.9 * 1024 * 1024 * 1024
1436
+ ) # Convert GB to bytes
1437
+ logging.info(
1438
+ "Kafka log retention set to %d bytes (90%% of %f GB Docker disk)",
1439
+ log_retention_bytes,
1440
+ available_disk_gb,
1441
+ )
1442
+ else:
1443
+ # Fallback if Docker disk usage cannot be determined
1444
+ log_retention_bytes = 500 * 1024 * 1024 * 1024 # 10GB default
1445
+ logging.warning(
1446
+ "Could not determine Docker disk usage, using default 10GB log retention"
1447
+ )
1448
+
1449
+ # Prepare environment variables for Kafka
1450
+ env = os.environ.get("ENV", "prod")
1451
+ env_vars = {
1452
+ "ENV": env,
1453
+ "MATRICE_SECRET_ACCESS_KEY": self.matrice_secret_access_key,
1454
+ "MATRICE_ACCESS_KEY_ID": self.matrice_access_key_id,
1455
+ "KAFKA_NODE_ID": 1,
1456
+ "KAFKA_PROCESS_ROLES": "broker,controller",
1457
+ "KAFKA_LISTENERS": "SASL_PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093",
1458
+ "KAFKA_ADVERTISED_LISTENERS": f"SASL_PLAINTEXT://{host_ip}:{host_port}",
1459
+ "KAFKA_LISTENER_SECURITY_PROTOCOL_MAP": "CONTROLLER:PLAINTEXT,SASL_PLAINTEXT:SASL_PLAINTEXT",
1460
+ "KAFKA_CONTROLLER_LISTENER_NAMES": "CONTROLLER",
1461
+ "KAFKA_CONTROLLER_QUORUM_VOTERS": "1@localhost:9093",
1462
+ "KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR": 1,
1463
+ "KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR": 1,
1464
+ "KAFKA_TRANSACTION_STATE_LOG_MIN_ISR": 1,
1465
+ "KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS": 0,
1466
+ "KAFKA_NUM_PARTITIONS": 5,
1467
+ "KAFKA_SASL_ENABLED_MECHANISMS": "SCRAM-SHA-256",
1468
+ "KAFKA_SASL_MECHANISM_INTER_BROKER_PROTOCOL": "SCRAM-SHA-256",
1469
+ "KAFKA_INTER_BROKER_LISTENER_NAME": "SASL_PLAINTEXT",
1470
+ "KAFKA_MESSAGE_MAX_BYTES": 25000000,
1471
+ "KAFKA_HEAP_OPTS": "-Xms2G -Xmx8G",
1472
+ "KAFKA_NUM_NETWORK_THREADS": 6,
1473
+ "KAFKA_NUM_IO_THREADS": 8,
1474
+ "KAFKA_REPLICA_FETCH_MAX_BYTES": 25000000,
1475
+ "KAFKA_FETCH_MESSAGE_MAX_BYTES": 25000000,
1476
+ "KAFKA_REPLICA_FETCH_RESPONSE_MAX_BYTES": 25000000,
1477
+ "KAFKA_REPLICA_FETCH_RESPONSE_MAX_BYTES": 25000000,
1478
+ # Log retention settings based on Docker disk space
1479
+ "KAFKA_LOG_RETENTION_BYTES": log_retention_bytes,
1480
+ "KAFKA_LOG_SEGMENT_BYTES": min(
1481
+ 1073741824, log_retention_bytes // 10
1482
+ ), # 1GB or 10% of retention, whichever is smaller
1483
+ }
1484
+
1485
+ # Build environment variable command parts
1486
+ env_args = " ".join(
1487
+ [f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()]
1488
+ )
1489
+
1490
+ # Build the docker command directly to match user's pattern
1491
+ pypi_index = f"https://{'test.' if env != 'prod' else ''}pypi.org/simple/"
1492
+
1493
+ cmd = (
1494
+ f"docker run -p {host_port}:{container_port} "
1495
+ f"{env_args} "
1496
+ f"--shm-size=30G --pull=always "
1497
+ f'aiforeveryone/matrice-kafka:latest /bin/bash -c "'
1498
+ f"cd /opt/kafka/bin && "
1499
+ f"source venv/bin/activate && "
1500
+ f"/opt/kafka/bin/startup.sh & "
1501
+ f"if [ -f requirements.txt ]; then venv/bin/python3 -m pip install -r requirements.txt; fi && "
1502
+ f"venv/bin/python3 -m pip install --upgrade --force-reinstall --index-url {pypi_index} matrice_common matrice && "
1503
+ f"sleep 20 && "
1504
+ f'venv/bin/python3 main.py {self.action_record_id} {host_port}"'
1505
+ )
1506
+
1507
+ logging.info("cmd is: %s", cmd)
1508
+ self.start(cmd, "kafka_setup")