matrice-compute 0.1.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2023 @@
1
+ """Module providing action_instance functionality."""
2
+
3
+ import logging
4
+ import os
5
+ import shlex
6
+ import subprocess
7
+ import threading
8
+ import time
9
+ import signal
10
+ import urllib.request
11
+ from matrice_compute.instance_utils import (
12
+ get_gpu_with_sufficient_memory_for_action,
13
+ get_gpu_config_for_deployment,
14
+ get_decrypted_access_key_pair,
15
+ get_max_file_system,
16
+ get_best_service_ip_and_network,
17
+ )
18
+ from matrice_compute.task_utils import (
19
+ setup_workspace_and_run_task,
20
+ )
21
+ from matrice_compute.scaling import (
22
+ Scaling,
23
+ )
24
+ from matrice_common.utils import log_errors
25
+
26
+
27
class ActionInstance:
    """Base class for tasks that run in Action containers."""

    # Class-level dictionary to track deployed services and their ports
    # Key: _idService, Value: {"triton_ports": "port1,port2,port3", "is_first": False}
    # Shared by all instances in this process (class attribute).
    # NOTE(review): mutated without a lock from get_or_create_triton_ports —
    # confirm single-threaded access.
    _deployed_services = {}
33
+
34
def __init__(self, scaling: Scaling, action_info: dict):
    """Initialize an action instance.

    Args:
        scaling (Scaling): Scaling service instance
        action_info (dict): Action information dictionary

    Raises:
        ValueError: If the action type has no registered executor.
    """
    self.scaling = scaling
    self.process: subprocess.Popen | None = None
    self.stop_thread = False
    self.log_thread: threading.Thread | None = None
    self.log_path: str | None = None
    self.cmd: str | None = None
    self.matrice_access_key_id: str | None = None
    self.matrice_secret_access_key: str | None = None
    self.action_info = action_info
    self.action_record_id = action_info["_id"]
    self.action_type = action_info["action"]
    self.action_details = action_info["actionDetails"]
    # Image resolution order: "docker", then "docker_container", then the
    # default data-processing image from the scaling service.
    default_image = self.action_details.get(
        "docker_container",
        self.scaling.get_data_processing_image(),
    )
    self.docker_container = self.action_details.get("docker", default_image)
    # Dispatch table mapping action type -> executor callable.
    self.actions_map = {
        "model_train": model_train_execute,
        "model_eval": model_eval_execute,
        "model_export": model_export_execute,
        "deploy_add": model_deploy_execute,
        "data_import": data_processing_execute,
        "data_add": data_processing_execute,
        "data_split": data_split_execute,
        "data_prep": data_preparation_execute,
        "dataset_annotation": dataset_annotation_execute,
        "dataset_augmentation": dataset_augmentation_execute,
        "augmentation_setup": augmentation_server_creation_execute,
        "dataset_generation": synthetic_dataset_generation_execute,
        "synthetic_data_setup": synthetic_data_setup_execute,  # start
        "image_build": image_build_execute,
        "resource_clone": resource_clone_execute,
        "database_setup": database_setup_execute,
        "kafka_setup": kafka_setup_execute,
        "inference_aggregator": deploy_aggregator_execute,
        "redis_setup": redis_setup_execute,
        "streaming_gateway": streaming_gateway_execute,
        "facial_recognition_setup": facial_recognition_setup_execute,
        "fe_fs_streaming": fe_fs_streaming_execute,
        "inference_ws_server": inference_ws_server_execute,
        "fe_analytics_service": fe_analytics_service_execute,
        "lpr_setup": lpr_setup_execute,
        "inference_tracker_server": inference_tracker_setup_execute
    }
    if self.action_type in self.actions_map:
        self.task = self.actions_map[self.action_type]
    else:
        raise ValueError(f"Unknown action type: {self.action_type}")
91
+
92
@classmethod
def is_first_deployment_for_service(cls, service_id):
    """Check if this is the first deployment for a given service.

    Args:
        service_id (str): Service ID (_idService)

    Returns:
        bool: True if this is the first deployment, False otherwise
        (a falsy service_id always yields False).
    """
    # A service counts as "first" only when it has a real id that we have
    # never recorded in the class-level registry.
    return bool(service_id) and service_id not in cls._deployed_services
105
+
106
@classmethod
def get_or_create_triton_ports(cls, service_id, scaling_instance):
    """Get existing TRITON_PORTS for a service or create new ones.

    Args:
        service_id (str): Service ID (_idService)
        scaling_instance: Scaling instance to get open ports

    Returns:
        str: Comma-separated string of 3 port numbers (e.g., "8001,8002,8003")
    """

    def _new_ports():
        # Ask the scaling service for three open ports, one at a time.
        return "{},{},{}".format(
            scaling_instance.get_open_port(),
            scaling_instance.get_open_port(),
            scaling_instance.get_open_port(),
        )

    # Without a service id there is nothing to cache against.
    if not service_id:
        return _new_ports()

    cached = cls._deployed_services.get(service_id)
    if cached is not None:
        ports = cached["triton_ports"]
        logging.info(
            "Reusing TRITON_PORTS for service %s: %s",
            service_id,
            ports
        )
        return ports

    # First deployment: allocate ports and remember them for later replicas.
    ports = _new_ports()
    cls._deployed_services[service_id] = {
        "triton_ports": ports,
        "is_first": False
    }
    logging.info(
        "First deployment for service %s - generated TRITON_PORTS: %s",
        service_id,
        ports
    )
    return ports
152
+
153
@log_errors(default_return={}, raise_exception=True, log_error=False)
def _init_credentials(self):
    """Initialize Matrice credentials from the scaling session.

    Returns:
        dict: Dictionary containing access key ID and secret access key

    Raises:
        ValueError: If either credential is missing/empty.
    """
    session = self.scaling.session
    self.matrice_access_key_id = session.access_key
    self.matrice_secret_access_key = session.secret_key
    # Both values must be present (non-empty) before any task can run.
    if not (self.matrice_access_key_id and self.matrice_secret_access_key):
        raise ValueError(
            "Matrice credentials not found - both access key ID and secret access key are required"
        )
    return {
        "matrice_access_key_id": self.matrice_access_key_id,
        "matrice_secret_access_key": self.matrice_secret_access_key,
    }
175
+
176
@log_errors(default_return="logs", raise_exception=False, log_error=False)
def get_log_path(self):
    """Get log directory path, creating it if needed.

    Returns:
        str: Path to log directory
    """
    log_dir = "logs"
    # Idempotent: succeeds whether or not the directory already exists.
    os.makedirs(log_dir, exist_ok=True)
    return log_dir
185
+
186
@log_errors(default_return=False, raise_exception=False, log_error=False)
def is_running(self) -> bool:
    """Check if task process is running.

    This method performs a thorough check to determine if the process is still running:
    1. Verifies that the process attribute exists and is not None
    2. Checks if the process has terminated using poll() method
    3. Additional safeguards against zombie processes
    4. Coordinates with log monitoring to ensure all logs are sent before cleanup

    Returns:
        bool: True if process exists and is still running, False if process
            does not exist or has terminated
    """
    # Basic check if process exists
    if not hasattr(self, "process") or self.process is None:
        return False

    try:
        # poll() returns None while the process is still running
        poll_result = self.process.poll()
        is_running = poll_result is None

        # If process has terminated, ensure we do proper cleanup
        if not is_running:
            action_id = getattr(self, "action_record_id", "unknown")
            logging.info(
                "Process for action %s has terminated with exit code: %s",
                action_id,
                poll_result,
            )

            # CRITICAL: Ensure all logs are sent before cleaning up process
            self._ensure_final_logs_sent()

            # Reap the child to avoid zombies; short timeout since it has
            # already exited according to poll().
            try:
                self.process.wait(timeout=1)
            except subprocess.TimeoutExpired:
                # Fix: use lazy %-style logging, consistent with the rest of
                # this file (was an eagerly-formatted f-string).
                logging.warning(
                    "Process for action %s failed to terminate properly",
                    action_id,
                )

            # Drop the handle only after logs are handled
            self.process = None

        return is_running

    except Exception as e:
        # Fix: lazy %-style logging instead of f-string, for consistency.
        logging.error("Error checking process status: %s", str(e))
        # Ensure logs are sent even in error cases
        self._ensure_final_logs_sent()
        # To be safe, assume process is not running when we can't check it
        self.process = None
        return False
247
+
248
def _ensure_final_logs_sent(self):
    """Ensure all remaining logs are sent when a process terminates.

    This method performs a final log flush to ensure no logs are lost
    when a container crashes or shuts down. It is best-effort: any
    failure is logged and swallowed.
    """
    # Nothing to flush if logging was never started or the file is gone.
    if (
        not hasattr(self, "log_path")
        or not self.log_path
        or not os.path.exists(self.log_path)
    ):
        return

    try:
        # Set flag to stop continuous logging thread
        self.stop_thread = True

        # Give log thread a moment to finish current operation
        time.sleep(1)

        # Perform final log flush
        logging.info(
            "Performing final log flush for action %s",
            getattr(self, "action_record_id", "unknown"),
        )

        # Read any remaining logs that haven't been sent
        with open(self.log_path, "rb") as log_file:
            # Resume from the last offset the streaming thread reached
            # (0 if the thread never ran), so nothing is sent twice.
            last_position = getattr(self, "_last_log_position", 0)
            log_file.seek(last_position)
            remaining_content = log_file.read()

            if remaining_content:
                # Bytes are decoded leniently: a truncated/garbled tail must
                # not prevent the final upload.
                try:
                    decoded_content = remaining_content.decode("utf-8")
                except UnicodeDecodeError:
                    decoded_content = remaining_content.decode(
                        "utf-8", errors="replace"
                    )

                # Send final logs and scan them for CUDA OOM markers
                self._send_logs_to_scaling(decoded_content)
                self._check_cuda(decoded_content)

                logging.info(
                    "Sent %d bytes of final logs for action %s",
                    len(remaining_content),
                    getattr(self, "action_record_id", "unknown"),
                )
            else:
                logging.debug(
                    "No additional logs to send for action %s",
                    getattr(self, "action_record_id", "unknown"),
                )

    except Exception as e:
        logging.error(
            "Error during final log flush for action %s: %s",
            getattr(self, "action_record_id", "unknown"),
            str(e),
        )
310
+
311
@log_errors(default_return=None, raise_exception=False, log_error=False)
def get_action_details(self):
    """Get action details from scaling service.

    Returns:
        dict: Action details if successful, None otherwise
    """
    details, error, _message = self.scaling.get_action_details(
        self.action_record_id
    )
    if not error:
        return details
    logging.error(
        "Error getting action details: %s",
        error,
    )
    return None
326
+
327
@log_errors(default_return="", raise_exception=False)
def get_gpu_config(self, action_details):
    """Get GPU configuration string based on available GPUs.

    Args:
        action_details (dict): Action details containing GPU requirements

    Returns:
        str: Docker GPU flag such as ``--gpus "device=0,1"``, or "" when the
            action runs on CPU or no suitable GPU is found.
    """
    action_id = action_details.get("_id", "unknown")

    # Fix: use .get() consistently (the required-memory lookup below already
    # did) so a payload missing "actionDetails" degrades to CPU instead of
    # raising KeyError.
    gpu_required = action_details.get("actionDetails", {}).get("gpuRequired", False)
    if not gpu_required:
        logging.info(
            "Action %s does not require GPU - will run on CPU",
            action_id
        )
        return ""

    # Get required GPU memory for logging
    required_memory = action_details.get("actionDetails", {}).get(
        "expectedResources", {}
    ).get("gpuMemory", 0)

    logging.info(
        "Action %s requires GPU with %d MB memory - selecting GPU(s) with most free memory",
        action_id,
        required_memory
    )

    try:
        # Get the GPU(s) with most free memory that have sufficient memory
        gpu_indices = get_gpu_with_sufficient_memory_for_action(
            action_details=action_details
        )

        if gpu_indices:
            gpu_str = ",".join(map(str, gpu_indices))
            logging.info(
                "Action %s: Selected GPU device(s): %s (required memory: %d MB)",
                action_id,
                gpu_str,
                required_memory
            )
            # Docker GPU configuration:
            # --gpus "device=0" or --gpus "device=0,1,2"
            return f'--gpus "device={gpu_str}"'

        logging.warning(
            "Action %s: No GPUs with sufficient memory found (required: %d MB)",
            action_id,
            required_memory
        )
        return ""

    except ValueError as e:
        logging.error(
            "Action %s: Error selecting GPU - %s",
            action_id,
            str(e)
        )
        return ""
    except Exception as e:
        logging.error(
            "Action %s: Unexpected error in GPU selection - %s",
            action_id,
            str(e)
        )
        return ""
399
+
400
@log_errors(default_return="", raise_exception=False)
def get_base_docker_cmd(
    self,
    work_fs: str = "",
    use_gpu: str = "",
    mount_docker_sock: bool = False,
    action_id: str = "",
    model_key: str = "",
    extra_env_vars: dict | None = None,
    port_mapping: dict | None = None,
    network_config: str = "",
    destination_workspace_path: str = "/usr/src/workspace",
    docker_workdir: str = "",
    extra_pkgs: list | None = None,
):
    """Build base Docker command with common options.

    Args:
        work_fs (str): Work filesystem path
        use_gpu (str): GPU configuration string
        mount_docker_sock (bool): Whether to mount Docker socket
        action_id (str): Action ID
        model_key (str): Model key
        extra_env_vars (dict | None): Additional environment variables
        port_mapping (dict | None): Port mappings {host_port: container_port}
        network_config (str): Explicit docker network flags; derived when ""
        destination_workspace_path (str): Container workspace path
        docker_workdir (str): Docker working directory
        extra_pkgs (list | None): List of extra packages to install
    Returns:
        str: Base Docker command
    """
    # Fix: the defaults were mutable ({} / []), which Python shares across
    # calls; use None sentinels instead (behavior otherwise unchanged).
    extra_env_vars = extra_env_vars or {}
    port_mapping = port_mapping or {}
    extra_pkgs = extra_pkgs or []

    env = os.environ.get("ENV", "prod")
    env_vars = {
        "ENV": env,
        "MATRICE_SECRET_ACCESS_KEY": self.matrice_secret_access_key,
        "MATRICE_ACCESS_KEY_ID": self.matrice_access_key_id,
    }
    # Fix: fetch the token once (previously called twice, doubling the
    # secrets-API round trips).
    hugging_face_token = self.get_hugging_face_token(model_key)
    if hugging_face_token:
        env_vars["HUGGING_FACE_ACCESS_TOKEN"] = hugging_face_token
    if extra_env_vars:
        env_vars.update(extra_env_vars)

    # Host networking unless explicit port mappings were requested.
    if network_config == "":
        network_config = (
            "--net=host"
            if not port_mapping
            else " ".join(
                f"-p {host}:{container}" for host, container in port_mapping.items()
            )
        )

    if not docker_workdir:
        if action_id:
            docker_workdir = f"/usr/src/{action_id}"
        else:
            docker_workdir = "."
    volumes = [
        (  # Mount workspace if work_fs is provided
            f"-v {work_fs}/workspace:{destination_workspace_path}"
            if work_fs and work_fs not in ["/"]
            else ""
        ),
        (  # Mount action directory if work_fs and action_id are provided
            f"-v {work_fs}/{action_id}:/usr/src/{action_id}"
            if work_fs and work_fs not in ["/"] and action_id
            else ""
        ),
        "-v /var/run/docker.sock:/var/run/docker.sock" if mount_docker_sock else "",
    ]
    # Non-prod environments install from test.pypi.org.
    pypi_index = f"https://{'test.' if env != 'prod' else ''}pypi.org/simple/"

    pkgs = ["matrice_common", "matrice"]
    pkgs.extend(extra_pkgs)
    if env == 'dev':
        pkgs = [pkg + ">=1.0.0" for pkg in pkgs]
        pip_install_matrice = f"pip install --pre --upgrade --force-reinstall --index-url {pypi_index} {' '.join(pkgs)}"
    else:
        pip_install_matrice = f"pip install --upgrade --force-reinstall --index-url {pypi_index} {' '.join(pkgs)}"
    pip_install_requirements = (
        "if [ -f requirements.txt ]; then pip install -r requirements.txt; fi "
    )

    # Create export statements for environment variables to ensure they're available in subshells
    env_exports = " && ".join(
        [
            f"export {key}={shlex.quote(str(value))}"
            for key, value in env_vars.items()
        ]
    )

    cmd_parts = [
        f"docker run {use_gpu} ",
        network_config,
        *[f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()],
        *volumes,
        # Container configuration and startup commands
        f"--cidfile ./{self.action_record_id}.cid ",
        f"--shm-size=30G --pull=always {shlex.quote(self.docker_container)}",
        f'/bin/bash -c "cd {docker_workdir} && '
        f"{env_exports} && "
        f"{pip_install_requirements} && "
        f"{pip_install_matrice} && ",
    ]

    # Join all non-empty parts with spaces
    return " ".join(filter(None, cmd_parts))
508
+
509
@log_errors(default_return="", raise_exception=False)
def get_hugging_face_token(self, model_key):
    """Get Hugging Face token for specific model keys.

    Args:
        model_key (str): Model key to check

    Returns:
        str: Hugging Face token if available, empty string otherwise
    """
    # Only microsoft/* and timm/* model keys need a token.
    if not model_key or not model_key.startswith(("microsoft", "timm")):
        return ""
    resp, error, message = self.scaling.get_model_secret_keys("hugging_face")
    if error is not None:
        logging.error(
            "Error getting Hugging Face token: %s",
            message,
        )
        return ""
    return resp["user_access_token"]
533
+
534
@log_errors(default_return="", raise_exception=False)
def get_hugging_face_token_for_data_generation(self):
    """Get the shared Hugging Face token used for data generation.

    Returns:
        str: Hugging Face token, or "" when it could not be retrieved.
    """
    # Fix: initialize before branching — previously the error path returned
    # an unbound local (NameError, silently masked by @log_errors).
    hugging_face_token = ""
    secret_name = "hugging_face"
    resp, error, message = self.scaling.get_model_secret_keys(secret_name)
    if error is not None:
        logging.error(
            "Error getting Hugging Face token: %s",
            message,
        )
    else:
        hugging_face_token = resp["user_access_token"]
    return hugging_face_token
546
+
547
@log_errors(default_return="", raise_exception=False)
def get_internal_api_key(self, action_id):
    """Get internal API key for action.

    Args:
        action_id (str): Action ID

    Returns:
        str: Internal API key if available, empty string otherwise
    """
    resp, error, message = self.scaling.get_internal_api_key(action_id)
    if error is None:
        return resp["internal_api_key"]
    logging.error(
        "Error getting internal api key: %s",
        message,
    )
    return ""
567
+
568
@log_errors(raise_exception=True)
def setup_action_requirements(
    self,
    action_details,
    work_fs="",
    model_family="",
    action_id="",
):
    """Setup action requirements.

    Performs three sequential steps: (1) optionally fetch and set up the
    model codebase workspace, (2) docker-login with Docker Hub credentials,
    (3) resolve and decrypt the acting user's Matrice key pair. Steps 2 and
    3 raise on failure (propagated by @log_errors); step 1 only warns.

    Args:
        action_details (dict): Action details
        work_fs (str): Work filesystem path
        model_family (str): Model family name
        action_id (str): Action ID

    Raises:
        Exception: If setup fails
    """
    # Get job parameters from action_details
    job_params = action_details.get("jobParams", {})

    # Setup model codebase if model_family is provided
    if model_family:
        # Try to get model codebase URLs from action_details first
        model_codebase_url = job_params.get("model_codebase_url")
        model_requirements_url = job_params.get("model_requirements_url")
        dockerId = job_params.get("_idDocker")

        # Fallback to API calls if not provided in action_details
        if not model_codebase_url:
            model_codebase_url, error, message = self.scaling.get_model_codebase(
                dockerId
            )
            if error:
                logging.warning(f"Failed to get model codebase URL: {message}")
                model_codebase_url = None

        # Handle requirements URL - use from job_params or get from API
        if model_requirements_url:
            model_codebase_requirements_url = model_requirements_url
        else:
            model_codebase_requirements_url, error, message = (
                self.scaling.get_model_codebase_requirements(dockerId)
            )
            if error:
                logging.warning(
                    f"Failed to get model codebase requirements URL: {message}"
                )
                model_codebase_requirements_url = None

        # Setup workspace only if the codebase URL resolved; a missing
        # requirements URL is tolerated (passed as None).
        if model_codebase_url:
            setup_workspace_and_run_task(
                work_fs,
                action_id,
                model_codebase_url,
                model_codebase_requirements_url,
            )

    # Setup Docker credentials
    try:
        # Try to get Docker credentials from action_details first
        docker_username = job_params.get("Username")
        docker_password = job_params.get("Password")
        if docker_username and docker_password:
            username = docker_username
            password = docker_password
            logging.info("Using Docker credentials from action_details")
        else:
            # Fallback to API call
            creds, error, message = self.scaling.get_docker_hub_credentials()
            if error:
                raise Exception(f"Failed to get Docker credentials: {message}")
            username = creds["username"]
            password = creds["password"]
            logging.info("Using Docker credentials from API call")

        if username and password:
            # Credentials are shell-quoted; shell=True is required for the
            # docker CLI invocation style used here.
            login_cmd = f"docker login -u {shlex.quote(username)} -p {shlex.quote(password)}"
            result = subprocess.run(login_cmd, shell=True, check=False, capture_output=True, text=True, timeout=30)
            if result.returncode != 0:
                raise Exception(f"Docker login failed with exit code {result.returncode}: {result.stderr}")
            logging.info("Docker login successful")
        else:
            logging.warning(
                "Docker credentials not available, skipping Docker login"
            )

    except subprocess.TimeoutExpired:
        logging.error("Docker login timed out after 30 seconds")
        raise Exception("Docker login timed out")
    except Exception as err:
        # Also catches the explicit raises above: log once, then propagate.
        logging.error(
            "Docker login failed: %s",
            str(err),
        )
        raise

    # Setup user access credentials
    try:
        # Try to get access key and secret key from job_params first
        access_key = job_params.get("access_key")
        secret_key = job_params.get("secret_key")

        if access_key and secret_key:
            logging.info("Using access key and secret key from job_params")
            (
                self.matrice_access_key_id,
                self.matrice_secret_access_key,
            ) = get_decrypted_access_key_pair(access_key, secret_key)
        else:
            # Fallback to API call
            logging.info(
                "Access key and secret key not found in job_params, falling back to API call"
            )
            (
                user_access_key_pair,
                error,
                message,
            ) = self.scaling.get_user_access_key_pair(action_details["_idUser"])
            if error:
                raise Exception(f"Failed to get user access key pair: {message}")
            access_key = user_access_key_pair["access_key"]
            secret_key = user_access_key_pair["secret_key"]
            (
                self.matrice_access_key_id,
                self.matrice_secret_access_key,
            ) = get_decrypted_access_key_pair(access_key, secret_key)

    except Exception as err:
        logging.error(
            "Failed to setup credentials: %s",
            str(err),
        )
        raise
704
+
705
+ # @log_errors(raise_exception=False)
706
+ # def create_redis_container(self, redis_image=None, redis_password=None):
707
+ # """Create and start a Redis container using Docker.
708
+
709
+ # Args:
710
+ # redis_image (str, optional): Redis Docker image to use. Defaults to 'redis:latest'
711
+
712
+ # Returns:
713
+ # tuple: (container_info, error, message)
714
+ # """
715
+ # if redis_image is None:
716
+ # redis_image = "redis:latest"
717
+
718
+ # network_name = f"redis_network_{int(time.time())}"
719
+ # subprocess.run(f"docker network create {network_name}", shell=True, check=True)
720
+
721
+ # try:
722
+ # # Get an available port for Redis
723
+ # external_port = "6379"
724
+
725
+ # # Generate a unique container name and password
726
+ # container_name = f"redis_container_{int(time.time())}"
727
+
728
+ # # Build the docker command to create Redis container with password
729
+ # cmd = (
730
+ # f"docker run -d "
731
+ # f"--network {network_name} "
732
+ # f"--name {container_name} "
733
+ # f"-p {external_port}:6379 "
734
+ # f"--restart unless-stopped "
735
+ # f"{redis_image} "
736
+ # f"redis-server --bind 0.0.0.0 --appendonly yes --requirepass {redis_password}"
737
+ # )
738
+
739
+ # logging.info("Creating Redis container with command: %s", cmd)
740
+
741
+ # # Execute the command
742
+ # result = subprocess.run(
743
+ # cmd, shell=True, capture_output=True, text=True, timeout=60
744
+ # )
745
+
746
+ # if result.returncode == 0:
747
+ # container_id = result.stdout.strip()
748
+ # container_info = {
749
+ # "container_id": container_id,
750
+ # "container_name": container_name,
751
+ # "network_name": network_name,
752
+ # "external_port": external_port,
753
+ # "internal_port": 6379,
754
+ # "password": redis_password,
755
+ # "image": redis_image,
756
+ # "status": "running",
757
+ # }
758
+
759
+ # logging.info("Redis container created successfully: %s", container_info)
760
+ # return container_info, None, "Redis container created successfully"
761
+ # else:
762
+ # error_message = f"Failed to create Redis container: {result.stderr}"
763
+ # logging.error(error_message)
764
+ # return None, "ContainerCreationError", error_message
765
+
766
+ # except subprocess.TimeoutExpired:
767
+ # error_message = "Timeout while creating Redis container"
768
+ # logging.error(error_message)
769
+ # return None, "TimeoutError", error_message
770
+ # except Exception as e:
771
+ # error_message = f"Unexpected error creating Redis container: {str(e)}"
772
+ # logging.error(error_message)
773
+ # return None, "UnexpectedError", error_message
774
+
775
@log_errors(raise_exception=False, log_error=False)
def send_logs_continuously(self):
    """Continuously read and send logs from the log file to the scaling service.

    Enhanced version that tracks log position and handles graceful shutdown.
    Runs in the background thread started by start_logger(); exits when
    stop_thread is set or the log file disappears, then does one last read.
    """
    last_position = 0
    self._last_log_position = 0  # Track position for final flush

    while not self.stop_thread and os.path.exists(self.log_path):
        try:
            with open(self.log_path, "rb") as log_file:
                # Resume where the previous iteration stopped so each byte
                # is uploaded exactly once.
                log_file.seek(last_position)
                new_content = log_file.read()
                if new_content:
                    try:
                        decoded_content = new_content.decode("utf-8")
                    except UnicodeDecodeError:
                        # Handle invalid UTF-8 bytes by replacing them
                        decoded_content = new_content.decode(
                            "utf-8",
                            errors="replace",
                        )
                    self._send_logs_to_scaling(decoded_content)
                    self._check_cuda(decoded_content)

                # Update tracked position (also mirrored to the instance so
                # _ensure_final_logs_sent can pick up from here).
                last_position = log_file.tell()
                self._last_log_position = last_position

        except Exception as e:
            logging.error(
                "Error reading logs for action %s: %s",
                getattr(self, "action_record_id", "unknown"),
                str(e),
            )

        # Use shorter sleep interval for more responsive log monitoring
        time.sleep(10)  # Reduced from 30 to 10 seconds for better responsiveness

    # Final attempt to send any remaining logs when thread is stopping
    logging.info(
        "Log monitoring thread stopping for action %s, performing final check",
        getattr(self, "action_record_id", "unknown"),
    )

    # One more final read attempt
    try:
        if os.path.exists(self.log_path):
            with open(self.log_path, "rb") as log_file:
                log_file.seek(last_position)
                final_content = log_file.read()
                if final_content:
                    try:
                        decoded_content = final_content.decode("utf-8")
                    except UnicodeDecodeError:
                        decoded_content = final_content.decode(
                            "utf-8", errors="replace"
                        )
                    self._send_logs_to_scaling(decoded_content)
                    self._check_cuda(decoded_content)
                    logging.info(
                        "Sent final %d bytes of logs for action %s",
                        len(final_content),
                        getattr(self, "action_record_id", "unknown"),
                    )
    except Exception as e:
        logging.error(
            "Error in final log read for action %s: %s",
            getattr(self, "action_record_id", "unknown"),
            str(e),
        )
847
+
848
@log_errors(raise_exception=False, log_error=False)
def _send_logs_to_scaling(self, log_content):
    """Forward a chunk of container log text to the scaling service.

    Args:
        log_content (str): Log content to send
    """
    _, error, _message = self.scaling.update_action_docker_logs(
        action_record_id=self.action_record_id,
        log_content=log_content,
    )
    if not error:
        return
    logging.error(
        "Error from update_action_docker_logs: %s",
        error,
    )
864
+
865
@log_errors(raise_exception=False, log_error=False)
def _check_cuda(self, log_content):
    """Check for CUDA out of memory errors in logs and update action status.

    Args:
        log_content (str): Log content to check
    """
    # Guard clauses: only act when the OOM marker is present and the
    # action's details can be fetched.
    if "CUDA error: out of memory" not in log_content:
        return
    action_details = self.get_action_details()
    if not action_details:
        return
    self.scaling.update_action(
        id=self.action_record_id,
        step_code="ERROR",
        action_type=action_details["action"],
        status="ERROR",
        status_description="CUDA error: out of memory",
        service="bg-job-scheduler",
        job_params=action_details["jobParams"],
    )
885
+
886
@log_errors(raise_exception=True)
def start_process(self, cmd, log_name):
    """Start the process and initialize logging.

    Launches cmd via Popen (stdout/stderr redirected to the per-action log
    file, own session so the whole group can be signalled on stop), then
    polls for the docker --cidfile to learn the container id and reports it
    to the scaling service.

    Args:
        cmd (str): Command to execute
        log_name (str): Name for log file

    Raises:
        Exception: If process fails to start
    """
    self.cmd = cmd
    self.log_path = f"{self.get_log_path()}/{log_name}_{self.action_record_id}.txt"

    with open(self.log_path, "wb") as out:
        self.process = subprocess.Popen(
            shlex.split(self.cmd),
            stdout=out,
            stderr=out,
            env={**os.environ},
            # New session => killable as a group via os.killpg in stop().
            start_new_session=True,
        )

    self.container_id = None

    # docker run was given --cidfile ./<action_id>.cid (see
    # get_base_docker_cmd); poll until docker writes it.
    cid_file_path = f"./{self.action_record_id}.cid"
    max_retries = 5
    retry_delay = 1  # seconds
    for attempt in range(max_retries):
        try:
            with open(cid_file_path, "r") as cid_file:
                container_id = cid_file.read().strip()
                self.container_id = container_id
                logging.info(
                    "Started process for action %s with container ID: %s",
                    self.action_record_id,
                    self.container_id,
                )
                break
        except FileNotFoundError:
            logging.warning(
                "CID file not found for action %s, attempt %d/%d",
                self.action_record_id,
                attempt + 1,
                max_retries,
            )
            time.sleep(retry_delay)
        except Exception as e:
            logging.error(
                "Error reading CID file for action %s: %s",
                self.action_record_id,
                str(e),
            )
            time.sleep(retry_delay)
    else:
        # for/else: reached only if no attempt hit `break`.
        logging.error(
            "Failed to read CID file for action %s after %d attempts",
            self.action_record_id,
            max_retries,
        )
        raise Exception("Failed to start process: CID file not found")

    # report container id to scaling service
    self.scaling.update_action_container_id(
        action_record_id=self.action_record_id,
        container_id=self.container_id,
    )
953
+
954
+
955
+ @log_errors(raise_exception=False)
956
+ def start_logger(self):
957
+ """Start the log monitoring thread."""
958
+ self.log_thread = threading.Thread(
959
+ target=self.send_logs_continuously,
960
+ daemon=False, # CRITICAL: Make thread non-daemon to ensure it completes
961
+ )
962
+ self.log_thread.start()
963
+
964
+ @log_errors(raise_exception=False)
965
+ def start(self, cmd: str = "", log_name: str = ""):
966
+ """Start the process and log monitoring thread.
967
+
968
+ Args:
969
+ cmd (str): Command to execute
970
+ log_name (str): Name for log file
971
+ """
972
+ self.start_process(cmd, log_name)
973
+ self.start_logger()
974
+ self.scaling.update_status(
975
+ self.action_record_id,
976
+ self.action_type,
977
+ "bg-job-scheduler",
978
+ "DKR_CMD",
979
+ "OK",
980
+ f"Start docker container with command: {cmd.replace(self.matrice_access_key_id, 'MATRICE_ACCESS_KEY_ID').replace(self.matrice_secret_access_key, 'MATRICE_SECRET_ACCESS_KEY')}",
981
+ )
982
+
983
+ @log_errors(raise_exception=False, log_error=False)
984
+ def stop(self):
985
+ """Stop the process and log monitoring thread.
986
+
987
+ Enhanced version that ensures proper cleanup sequencing and log completion.
988
+ """
989
+ logging.info("Stopping action %s", getattr(self, "action_record_id", "unknown"))
990
+
991
+ # Step 1: Signal log thread to stop
992
+ self.stop_thread = True
993
+
994
+ # Step 2: Stop the process
995
+ try:
996
+ if self.process:
997
+ logging.info(
998
+ "Terminating process for action %s",
999
+ getattr(self, "action_record_id", "unknown"),
1000
+ )
1001
+ os.killpg(
1002
+ os.getpgid(self.process.pid),
1003
+ signal.SIGTERM,
1004
+ )
1005
+ # Give process time to terminate gracefully
1006
+ try:
1007
+ self.process.wait(timeout=15)
1008
+ logging.info(
1009
+ "Process terminated gracefully for action %s",
1010
+ getattr(self, "action_record_id", "unknown"),
1011
+ )
1012
+ except subprocess.TimeoutExpired:
1013
+ logging.warning(
1014
+ "Process didn't terminate gracefully, forcing kill for action %s",
1015
+ getattr(self, "action_record_id", "unknown"),
1016
+ )
1017
+ try:
1018
+ os.killpg(os.getpgid(self.process.pid), signal.SIGKILL)
1019
+ self.process.wait(timeout=5)
1020
+ except Exception as kill_err:
1021
+ logging.error(
1022
+ "Error force-killing process for action %s: %s",
1023
+ getattr(self, "action_record_id", "unknown"),
1024
+ str(kill_err),
1025
+ )
1026
+ except Exception as proc_err:
1027
+ logging.error(
1028
+ "Error stopping process for action %s: %s",
1029
+ getattr(self, "action_record_id", "unknown"),
1030
+ str(proc_err),
1031
+ )
1032
+
1033
+ # Step 3: Ensure final logs are sent
1034
+ self._ensure_final_logs_sent()
1035
+
1036
+ # Step 4: Wait for log thread to complete
1037
+ if self.log_thread and self.log_thread.is_alive():
1038
+ logging.info(
1039
+ "Waiting for log thread to complete for action %s",
1040
+ getattr(self, "action_record_id", "unknown"),
1041
+ )
1042
+ try:
1043
+ self.log_thread.join(
1044
+ timeout=30
1045
+ ) # Wait up to 30 seconds for logs to complete
1046
+ if self.log_thread.is_alive():
1047
+ logging.warning(
1048
+ "Log thread didn't complete within timeout for action %s",
1049
+ getattr(self, "action_record_id", "unknown"),
1050
+ )
1051
+ else:
1052
+ logging.info(
1053
+ "Log thread completed successfully for action %s",
1054
+ getattr(self, "action_record_id", "unknown"),
1055
+ )
1056
+ except Exception as thread_err:
1057
+ logging.error(
1058
+ "Error waiting for log thread for action %s: %s",
1059
+ getattr(self, "action_record_id", "unknown"),
1060
+ str(thread_err),
1061
+ )
1062
+
1063
+ @log_errors(raise_exception=False)
1064
+ def execute(self):
1065
+ """Execute the task."""
1066
+ self.task(self)
1067
+
1068
+
1069
+ @log_errors(raise_exception=False)
1070
+ def data_preparation_execute(
1071
+ self: ActionInstance,
1072
+ ):
1073
+ """Execute data preparation task."""
1074
+ work_fs = get_max_file_system()
1075
+ action_details = self.get_action_details()
1076
+ if not action_details:
1077
+ return
1078
+ self.setup_action_requirements(action_details, work_fs, model_family="")
1079
+ action = {"jobParams": action_details["jobParams"]}
1080
+ dataset_id_version = (
1081
+ action_details["jobParams"]["dataset_id"]
1082
+ + action_details["jobParams"]["dataset_version"]
1083
+ )
1084
+ action["jobParams"].update(
1085
+ {
1086
+ "dataset_host_path_map": {dataset_id_version: f"{work_fs}/workspace"},
1087
+ "dataset_local_path_map": {dataset_id_version: "/usr/src/app/workspace"},
1088
+ "host_file_system": work_fs,
1089
+ }
1090
+ )
1091
+ self.scaling.update_action(
1092
+ id=self.action_record_id,
1093
+ step_code="DCK_LNCH",
1094
+ action_type=action_details["action"],
1095
+ status=action_details["status"],
1096
+ sub_action=action_details["subAction"],
1097
+ status_description="Job is assigned to docker",
1098
+ service="bg-job-scheduler",
1099
+ job_params=action["jobParams"],
1100
+ )
1101
+ if action["jobParams"].get("model_train_docker"):
1102
+ logging.info("Pulling the docker image")
1103
+ pull_cmd = f"docker pull {action['jobParams']['model_train_docker']}"
1104
+ process = subprocess.Popen(
1105
+ pull_cmd,
1106
+ shell=True,
1107
+ stdout=subprocess.PIPE,
1108
+ stderr=subprocess.PIPE,
1109
+ )
1110
+ logging.info(
1111
+ "Started pulling Docker image with PID: %s",
1112
+ process.pid,
1113
+ )
1114
+ cmd = f'{self.get_base_docker_cmd(work_fs, destination_workspace_path="/usr/src/app/workspace", docker_workdir="/usr/src/app/workspace", extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_preparation.py {self.action_record_id} "'
1115
+ logging.info("cmd is: %s", cmd)
1116
+ self.start(cmd, "data_preparation_log")
1117
+
1118
+
1119
+ @log_errors(raise_exception=False)
1120
+ def data_processing_execute(self: ActionInstance):
1121
+ """Execute data processing task."""
1122
+ work_fs = get_max_file_system()
1123
+ action_details = self.get_action_details()
1124
+ if not action_details:
1125
+ return
1126
+ self.setup_action_requirements(action_details, work_fs, model_family="")
1127
+ action = {"jobParams": action_details["jobParams"]}
1128
+ action["jobParams"].update(
1129
+ {
1130
+ "dp_dv_host_paths": [f"{work_fs}/workspace"],
1131
+ "dp_dv_local_paths": ["/usr/src/app/workspace"],
1132
+ }
1133
+ )
1134
+ self.scaling.update_action(
1135
+ id=self.action_record_id,
1136
+ step_code="DCK_LNCH",
1137
+ action_type=action_details["action"],
1138
+ status="ACK",
1139
+ status_description="Job is assigned to docker",
1140
+ service="bg-job-scheduler",
1141
+ job_params=action["jobParams"],
1142
+ )
1143
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/main.py {self.action_record_id} "'
1144
+ logging.info("cmd: %s", cmd)
1145
+ self.start(cmd, "data_processing_log")
1146
+
1147
+
1148
+ @log_errors(raise_exception=False)
1149
+ def data_split_execute(self: ActionInstance):
1150
+ """Execute data split task."""
1151
+ work_fs = get_max_file_system()
1152
+ action_details = self.get_action_details()
1153
+ if not action_details:
1154
+ return
1155
+ self.setup_action_requirements(action_details, work_fs, model_family="")
1156
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_split.py {self.action_record_id} "'
1157
+ logging.info("cmd: %s", cmd)
1158
+ self.start(cmd, "data_split")
1159
+
1160
+
1161
+ @log_errors(raise_exception=False)
1162
+ def dataset_annotation_execute(
1163
+ self: ActionInstance,
1164
+ ):
1165
+ """Execute dataset annotation task."""
1166
+ work_fs = get_max_file_system()
1167
+ action_details = self.get_action_details()
1168
+ if not action_details:
1169
+ return
1170
+ self.setup_action_requirements(action_details, work_fs)
1171
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/dataset_annotation.py {self.action_record_id} "'
1172
+ logging.info("cmd: %s", cmd)
1173
+ self.start(cmd, "dataset_annotation")
1174
+
1175
+
1176
+ @log_errors(raise_exception=False)
1177
+ def dataset_augmentation_execute(
1178
+ self: ActionInstance,
1179
+ ):
1180
+ """Execute dataset augmentation task."""
1181
+ work_fs = get_max_file_system()
1182
+ action_details = self.get_action_details()
1183
+ if not action_details:
1184
+ return
1185
+ self.setup_action_requirements(action_details, work_fs)
1186
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_augmentation.py {self.action_record_id} "'
1187
+ logging.info("cmd: %s", cmd)
1188
+ self.start(cmd, "dataset_augmentation")
1189
+
1190
+
1191
+ @log_errors(raise_exception=False)
1192
+ def augmentation_server_creation_execute(
1193
+ self: ActionInstance,
1194
+ ):
1195
+ """Create Augmentation Server"""
1196
+ work_fs = get_max_file_system()
1197
+ action_details = self.get_action_details()
1198
+ external_port = self.scaling.get_open_port()
1199
+ if not action_details:
1200
+ return
1201
+ self.setup_action_requirements(action_details, work_fs)
1202
+ cmd = f'{self.get_base_docker_cmd(work_fs, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/aug_server.py {self.action_record_id} {external_port} "'
1203
+ logging.info("cmd: %s", cmd)
1204
+ self.start(cmd, "augmentation_setup")
1205
+
1206
+
1207
    @log_errors(raise_exception=False)
    def database_setup_execute(self: ActionInstance):
        """
        Creates and setup the database for facial recognition server.
        MongoDB runs on port 27020:27017 (localhost only with --net=host).
        Qdrant runs on port 6334 (localhost only with --net=host).
        """
        action_details = self.get_action_details()
        if not action_details:
            # Nothing to do when the action can no longer be fetched.
            return
        # Docker image for the MongoDB side; Qdrant always uses the upstream
        # qdrant/qdrant:latest image below.
        image = action_details["actionDetails"].get("docker")

        self.setup_action_requirements(action_details)

        project_id = action_details["_idProject"]

        if action_details["actionDetails"].get("containerId"):
            # Re-use path: restart the previously created containers instead
            # of creating new ones.
            logging.info(
                "Using existing container ID for inference tracker: %s",
                action_details["actionDetails"]["containerId"],
            )
            self.docker_container = action_details["actionDetails"]["containerId"]
            cmd = "docker restart " + self.docker_container
            # NOTE(review): "docker restart" writes no --cidfile, yet
            # start_process() polls for ./<action_id>.cid — confirm this path
            # does not rely on a stale CID file from a previous run.
            self.start(cmd, "qdrant_setup")

            # qdrant restart
            # NOTE(review): this second self.start() overwrites self.process
            # and self.log_thread set by the call above, so only the qdrant
            # restart remains tracked afterwards — confirm intended.
            qdrant_cmd = "docker restart qdrant"
            self.start(qdrant_cmd, 'qdrant_setup')

            return

        # MongoDB container with --net=host (Port: 27020:27017)
        cmd = (
            f"docker run --pull=always --net=host "
            f"--name mongodbdatabase "
            f"-v matrice_myvol:/matrice_data "
            f"--cidfile ./{self.action_record_id}.cid "
            f"-e ACTION_RECORD_ID={self.action_record_id} "
            f"-e MATRICE_ACCESS_KEY_ID={self.matrice_access_key_id} "
            f"-e MATRICE_SECRET_ACCESS_KEY={self.matrice_secret_access_key} "
            f"-e PROJECT_ID={project_id} "
            f'-e ENV="{os.environ.get("ENV", "prod")}" '
            f"{image} "
        )
        logging.info("Starting MongoDB container (Port: 27020:27017): %s", cmd)

        # Qdrant container with --net=host (Port: 6334)
        qdrant_cmd = (
            f"docker run --pull=always --net=host "
            f"--name qdrant "
            f"-v matrice_myvol:/matrice_data "
            f"{'qdrant/qdrant:latest'} "
        )
        logging.info("Starting Qdrant container (Port: 6334): %s", qdrant_cmd)

        # Docker Command run
        self.start(cmd, "database_setup")

        # Docker for qdrant
        # NOTE(review): as above, this second self.start() replaces the
        # process/log-thread bookkeeping of the MongoDB start, and the qdrant
        # command has no --cidfile, so start_process() will re-read MongoDB's
        # CID file for the qdrant container — verify.
        self.start(qdrant_cmd, 'qdrant_setup')
1267
+
1268
+ @log_errors(raise_exception=False)
1269
+ def facial_recognition_setup_execute(self: ActionInstance):
1270
+ """
1271
+ Creates and setup the facial recognition worker server.
1272
+ Facial recognition worker runs on port 8081 (localhost only with --net=host).
1273
+ """
1274
+ action_details = self.get_action_details()
1275
+
1276
+ if not action_details:
1277
+ return
1278
+ image = action_details["actionDetails"].get("docker")
1279
+
1280
+ self.setup_action_requirements(action_details)
1281
+
1282
+ if action_details["actionDetails"].get("containerId"):
1283
+ logging.info(
1284
+ "Using existing container ID for facial recognition worker: %s",
1285
+ action_details["actionDetails"]["containerId"],
1286
+ )
1287
+ self.docker_container = action_details["actionDetails"]["containerId"]
1288
+ cmd = "docker restart " + self.docker_container
1289
+ self.start(cmd, "facial_recognition_setup")
1290
+ return
1291
+
1292
+ # Facial recognition worker container with --net=host (Port: 8081)
1293
+ worker_cmd = (
1294
+ f"docker run -d --pull=always --net=host "
1295
+ f"--name worker "
1296
+ f"--cidfile ./{self.action_record_id}.cid "
1297
+ f"-v matrice_myvol:/matrice_data "
1298
+ f"--cidfile ./{self.action_record_id}.cid "
1299
+ f'-e ENV="{os.environ.get("ENV", "prod")}" '
1300
+ f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1301
+ f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1302
+ f'-e ACTION_ID="{self.action_record_id}" '
1303
+ f"{image}"
1304
+ )
1305
+ logging.info("Starting facial recognition worker (Port: 8081): %s", worker_cmd)
1306
+
1307
+ # Docker Command run
1308
+ self.start(worker_cmd, "facial_recognition_setup")
1309
+
1310
+ @log_errors(raise_exception=False)
1311
+ def lpr_setup_execute(self: ActionInstance):
1312
+ """
1313
+ Creates and setup the license plate recognition server.
1314
+ LPR worker runs on port 8082 (localhost only with --net=host).
1315
+ """
1316
+ action_details = self.get_action_details()
1317
+
1318
+ if not action_details:
1319
+ return
1320
+ image = self.docker_container
1321
+
1322
+ self.setup_action_requirements(action_details)
1323
+
1324
+ if action_details["actionDetails"].get("containerId"):
1325
+ logging.info(
1326
+ "Using existing container ID for LPR worker: %s",
1327
+ action_details["actionDetails"]["containerId"],
1328
+ )
1329
+ self.docker_container = action_details["actionDetails"]["containerId"]
1330
+ cmd = "docker restart " + self.docker_container
1331
+ self.start(cmd, "lpr_setup")
1332
+ return
1333
+
1334
+ # LPR worker container with --net=host (Port: 8082)
1335
+ worker_cmd = (
1336
+ f"docker run -d --net=host --pull=always "
1337
+ f"--name lpr-worker "
1338
+ f"--cidfile ./{self.action_record_id}.cid "
1339
+ f"-v matrice_myvol:/matrice_data "
1340
+ f'-e ENV="{os.environ.get("ENV", "prod")}" '
1341
+ f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1342
+ f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1343
+ f'-e ACTION_ID="{self.action_record_id}" '
1344
+ f'-e PORT=8082 '
1345
+ f"{image}"
1346
+ )
1347
+ logging.info("Starting LPR worker (Port: 8082): %s", worker_cmd)
1348
+
1349
+ # Docker Command run
1350
+ self.start(worker_cmd, "lpr_setup")
1351
+
1352
+ @log_errors(raise_exception=False)
1353
+ def inference_ws_server_execute(self: ActionInstance):
1354
+ """
1355
+ Creates and start inference pipeline.
1356
+ Inference WebSocket server runs on port 8102 (localhost only with --net=host).
1357
+ """
1358
+ action_details = self.get_action_details()
1359
+
1360
+ if not action_details:
1361
+ return
1362
+ image = action_details["actionDetails"].get("docker")
1363
+
1364
+ self.setup_action_requirements(action_details)
1365
+
1366
+ # Get the best IP and network configuration for port 8102
1367
+ ws_host, use_host_network = get_best_service_ip_and_network(8102)
1368
+
1369
+ # Store ws_host in environment variable for use by other actions (e.g., fe_fs_streaming)
1370
+ if not os.environ.get("INFERENCE_WS_HOST"):
1371
+ os.environ["INFERENCE_WS_HOST"] = ws_host
1372
+
1373
+ logging.info(f"Inference WebSocket server will use IP: {ws_host} on port 8102 (use_host_network={use_host_network})")
1374
+
1375
+ if action_details["actionDetails"].get("containerId"):
1376
+ logging.info(
1377
+ "Using existing container ID for inference WebSocket server: %s",
1378
+ action_details["actionDetails"]["containerId"],
1379
+ )
1380
+ self.docker_container = action_details["actionDetails"]["containerId"]
1381
+ cmd = "docker restart " + self.docker_container
1382
+ self.start(cmd, "inference_ws_server")
1383
+ return
1384
+
1385
+ # Inference WebSocket server with --net=host (Port: 8102)
1386
+ worker_cmd = (
1387
+ f"docker run -d --pull=always --net=host "
1388
+ f"--name inference "
1389
+ f"--cidfile ./{self.action_record_id}.cid "
1390
+ f'-e ENV="{os.environ.get("ENV", "prod")}" '
1391
+ f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1392
+ f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1393
+ f"{image} "
1394
+ f"./app "
1395
+ f"{self.action_record_id} "
1396
+ )
1397
+ logging.info("Starting inference WebSocket server (Port: 8102): %s", worker_cmd)
1398
+
1399
+ # Docker Command run
1400
+ self.start(worker_cmd, "inference_ws_server")
1401
+
1402
+
1403
+ @log_errors(raise_exception=False)
1404
+ def fe_fs_streaming_execute(self: ActionInstance):
1405
+ """
1406
+ Creates and setup the frontend for fs streaming.
1407
+ Frontend streaming runs on port 3000 (localhost only with --net=host).
1408
+ """
1409
+ action_details = self.get_action_details()
1410
+
1411
+ if not action_details:
1412
+ return
1413
+ image = action_details["actionDetails"].get("docker")
1414
+
1415
+ self.setup_action_requirements(action_details)
1416
+
1417
+ # Get the ws_host from environment variable set by inference_ws_server_execute
1418
+ ws_host = os.environ.get("INFERENCE_WS_HOST", "localhost")
1419
+ ws_url = f"{ws_host}:8102"
1420
+
1421
+ logging.info(f"Frontend streaming will connect to WebSocket at: {ws_url}")
1422
+
1423
+ if action_details["actionDetails"].get("containerId"):
1424
+ logging.info(
1425
+ "Using existing container ID for frontend streaming: %s",
1426
+ action_details["actionDetails"]["containerId"],
1427
+ )
1428
+ self.docker_container = action_details["actionDetails"]["containerId"]
1429
+ cmd = "docker restart " + self.docker_container
1430
+ self.start(cmd, "fe_fs_streaming")
1431
+ return
1432
+
1433
+ # Frontend streaming with --net=host (Port: 3000)
1434
+ worker_cmd = (
1435
+ f"docker run -d --pull=always --net=host "
1436
+ f"--name fe_streaming "
1437
+ f"--cidfile ./{self.action_record_id}.cid "
1438
+ f"-v matrice_myvol:/matrice_data "
1439
+ f'-e ENV="{os.environ.get("ENV", "prod")}" '
1440
+ f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1441
+ f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1442
+ f"-e PORT=3000 "
1443
+ f'-e WS_HOST="{ws_url}" '
1444
+ f"{image}"
1445
+ )
1446
+ logging.info("Starting frontend streaming (Port: 3000) with WS_HOST=%s: %s", ws_url, worker_cmd)
1447
+
1448
+ # Docker Command run
1449
+ self.start(worker_cmd, "fe_fs_streaming")
1450
+
1451
+
1452
+ @log_errors(raise_exception=False)
1453
+ def fe_analytics_service_execute(self: ActionInstance):
1454
+ """
1455
+ Creates and setup the frontend analytics service.
1456
+ Frontend analytics service runs on port 3001 (localhost only with --net=host).
1457
+ """
1458
+ action_details = self.get_action_details()
1459
+
1460
+ if not action_details:
1461
+ return
1462
+ image = action_details["actionDetails"].get("docker")
1463
+
1464
+ self.setup_action_requirements(action_details)
1465
+
1466
+ project_id = action_details["_idProject"]
1467
+
1468
+ if action_details["actionDetails"].get("containerId"):
1469
+ logging.info(
1470
+ "Using existing container ID for frontend analytics service: %s",
1471
+ action_details["actionDetails"]["containerId"],
1472
+ )
1473
+ self.docker_container = action_details["actionDetails"]["containerId"]
1474
+ cmd = "docker restart " + self.docker_container
1475
+ self.start(cmd, "fe_analytics_service")
1476
+ return
1477
+
1478
+ # Frontend analytics service with --net=host (Port: 3001)
1479
+ worker_cmd = (
1480
+ f"docker run -d --pull=always --net=host "
1481
+ f"--name fe-analytics "
1482
+ f"--cidfile ./{self.action_record_id}.cid "
1483
+ f'-e NEXT_PUBLIC_DEPLOYMENT_ENV="{os.environ.get("ENV", "prod")}" '
1484
+ f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
1485
+ f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
1486
+ f'-e ACTION_ID="{self.action_record_id}" '
1487
+ f"-e PORT=3001 "
1488
+ f'-e PROJECT_ID="{project_id}" '
1489
+ f"{image}"
1490
+ )
1491
+ logging.info("Starting frontend analytics service (Port: 3001): %s", worker_cmd)
1492
+
1493
+ # Docker Command run
1494
+ self.start(worker_cmd, "fe_analytics_service")
1495
+
1496
+
1497
+ @log_errors(raise_exception=False)
1498
+ def synthetic_dataset_generation_execute(self: ActionInstance):
1499
+ """Execute synthetic dataset generation task."""
1500
+ work_fs = get_max_file_system()
1501
+ action_details = self.get_action_details()
1502
+ if not action_details:
1503
+ return
1504
+ self.setup_action_requirements(action_details, work_fs)
1505
+ extra_env_vars = {}
1506
+ hf_token = self.get_hugging_face_token_for_data_generation()
1507
+ extra_env_vars["HUGGING_FACE_ACCESS_TOKEN"] = hf_token
1508
+ if hf_token:
1509
+ extra_env_vars["HUGGING_FACE_ACCESS_TOKEN"] = hf_token
1510
+ else:
1511
+ return
1512
+ use_gpu = self.get_gpu_config(action_details)
1513
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/synthetic_dataset_generation.py {self.action_record_id} "'
1514
+ logging.info("cmd is: %s", cmd)
1515
+ self.start(cmd, "dataset_generation")
1516
+
1517
+
1518
+ @log_errors(raise_exception=False)
1519
+ def synthetic_data_setup_execute(self: ActionInstance):
1520
+ """Execute synthetic data setup task."""
1521
+ work_fs = get_max_file_system()
1522
+ action_details = self.get_action_details()
1523
+ external_port = self.scaling.get_open_port()
1524
+ if not action_details:
1525
+ return
1526
+ self.setup_action_requirements(action_details, work_fs)
1527
+ extra_env_vars = {}
1528
+ hf_token = self.get_hugging_face_token_for_data_generation()
1529
+ if hf_token:
1530
+ extra_env_vars["HUGGING_FACE_ACCESS_TOKEN"] = hf_token
1531
+ else:
1532
+ return
1533
+ use_gpu = self.get_gpu_config(action_details)
1534
+ cmd = f'{self.get_base_docker_cmd(work_fs=work_fs, use_gpu=use_gpu, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_dataset"])} python3 /usr/src/app/data_generation.py {self.action_record_id} {external_port} "'
1535
+ logging.info("cmd is: %s", cmd)
1536
+ self.start(cmd, "synthetic_data_setup")
1537
+
1538
+
1539
+ @log_errors(raise_exception=False)
1540
+ def redis_setup_execute(self: ActionInstance):
1541
+ """
1542
+ Creates and starts a Redis container using Docker.
1543
+ Redis runs on port 6379 (localhost only with --net=host).
1544
+ """
1545
+ work_fs = get_max_file_system()
1546
+
1547
+ action_details = self.get_action_details()
1548
+ if not action_details:
1549
+ return
1550
+ action_id = action_details["_id"]
1551
+
1552
+ redis_password = action_details["jobParams"].get(
1553
+ "password", f"redis_pass_{int(time.time())}"
1554
+ )
1555
+
1556
+ # Initialize redis container
1557
+ self.setup_action_requirements(
1558
+ action_details,
1559
+ work_fs,
1560
+ model_family="",
1561
+ action_id=action_id,
1562
+ )
1563
+
1564
+ # Get the best IP for Redis (port 6379)
1565
+ redis_host, _ = get_best_service_ip_and_network(6379)
1566
+
1567
+ logging.info(f"Redis will use IP: {redis_host} on port 6379")
1568
+
1569
+ redis_image = action_details["actionDetails"].get("redis_image", "redis:latest")
1570
+
1571
+
1572
+ if action_details["actionDetails"].get("containerId"):
1573
+ logging.info(
1574
+ "Using existing container ID for redis management: %s",
1575
+ action_details["actionDetails"]["containerId"],
1576
+ )
1577
+ self.docker_container = action_details["actionDetails"]["containerId"]
1578
+ cmd = "docker restart " + self.docker_container
1579
+ self.start(cmd, "redis_setup")
1580
+
1581
+ # Redis container restart
1582
+ redis_restart_cmd = "docker restart redis_container"
1583
+ self.start(redis_restart_cmd, "redis")
1584
+
1585
+ return
1586
+
1587
+ # Redis container with --net=host (Port: 6379)
1588
+ redis_cmd = (
1589
+ f"docker run -d --net=host "
1590
+ f"--name redis_container "
1591
+ f"--restart unless-stopped "
1592
+ f"{redis_image} "
1593
+ f"redis-server --bind 0.0.0.0 "
1594
+ f"--appendonly no "
1595
+ f'--save "" '
1596
+ f"--maxmemory 30gb "
1597
+ f"--maxmemory-policy allkeys-lru "
1598
+ f"--io-threads 4 "
1599
+ f"--io-threads-do-reads yes "
1600
+ f"--stream-node-max-bytes 8192 "
1601
+ f"--stream-node-max-entries 1000 "
1602
+ f"--hz 100 "
1603
+ f"--tcp-backlog 2048 "
1604
+ f"--timeout 0 "
1605
+ f"--lazyfree-lazy-eviction yes "
1606
+ f"--lazyfree-lazy-expire yes "
1607
+ f"--lazyfree-lazy-server-del yes "
1608
+ f"--activedefrag yes "
1609
+ f"--requirepass {redis_password}"
1610
+ )
1611
+ logging.info("Starting Redis container on %s:6379: %s", redis_host, redis_cmd)
1612
+
1613
+ # Start Redis container first
1614
+ redis_process = subprocess.Popen(
1615
+ redis_cmd,
1616
+ shell=True,
1617
+ stdout=subprocess.PIPE,
1618
+ stderr=subprocess.PIPE,
1619
+ )
1620
+ logging.info("Redis container started successfully on %s:6379", redis_host)
1621
+
1622
+ # Wait for Redis to be ready
1623
+ time.sleep(5)
1624
+
1625
+ env_vars = {
1626
+ "REDIS_URL": f"{redis_host}:6379",
1627
+ "REDIS_PASSWORD": redis_password,
1628
+ }
1629
+
1630
+ # bg-redis management container with --net=host (Port: 8082)
1631
+ cmd = (
1632
+ f"docker run --net=host "
1633
+ f"--cidfile ./{self.action_record_id}.cid "
1634
+ f"-e REDIS_URL={shlex.quote(env_vars['REDIS_URL'])} "
1635
+ f"-e REDIS_PASSWORD={shlex.quote(env_vars['REDIS_PASSWORD'])} "
1636
+ f"-e MATRICE_ACCESS_KEY_ID={shlex.quote(self.matrice_access_key_id)} "
1637
+ f"-e MATRICE_SECRET_ACCESS_KEY={shlex.quote(self.matrice_secret_access_key)} "
1638
+ f"-e ENV={shlex.quote(os.environ.get('ENV', 'prod'))} "
1639
+ f"-v /var/run/docker.sock:/var/run/docker.sock "
1640
+ f"--shm-size=30G --pull=always "
1641
+ f"{self.docker_container} "
1642
+ f"{self.action_record_id} "
1643
+ )
1644
+
1645
+ logging.info("Starting bg-redis management (Port: 8082) with REDIS_URL=%s: %s", env_vars['REDIS_URL'], cmd)
1646
+
1647
+ self.start(cmd, "redis_setup")
1648
+
1649
+
1650
+ @log_errors(raise_exception=False)
1651
+ def deploy_aggregator_execute(
1652
+ self: ActionInstance,
1653
+ ):
1654
+ """Execute deploy aggregator task."""
1655
+ work_fs = get_max_file_system()
1656
+ action_details = self.get_action_details()
1657
+ if not action_details:
1658
+ return
1659
+ self.setup_action_requirements(action_details, work_fs)
1660
+ cmd = f'{self.get_base_docker_cmd(work_fs)} python3 /usr/src/app/deploy_aggregator.py {self.action_record_id} "'
1661
+ logging.info("cmd: %s", cmd)
1662
+ self.start(cmd, "deploy_aggregator")
1663
+
1664
+
1665
+ @log_errors(raise_exception=False)
1666
+ def model_deploy_execute(self: ActionInstance):
1667
+ """Execute model deployment task."""
1668
+ external_port = self.scaling.get_open_port()
1669
+ internal_port = self.scaling.get_open_port()
1670
+ work_fs = get_max_file_system()
1671
+ action_details = self.get_action_details()
1672
+ if not action_details:
1673
+ return
1674
+ action_id = action_details["_id"]
1675
+ model_family = action_details["actionDetails"]["modelFamily"]
1676
+
1677
+ # Get the service ID to track deployments
1678
+ service_id = action_details.get("_idService")
1679
+
1680
+ self.setup_action_requirements(
1681
+ action_details,
1682
+ work_fs,
1683
+ model_family=model_family,
1684
+ action_id=action_id,
1685
+ )
1686
+
1687
+ # Check if this is the first deployment for this service
1688
+ is_first_deployment = ActionInstance.is_first_deployment_for_service(service_id)
1689
+
1690
+ # Get GPU configuration (uses utility function with fail-safe fallback)
1691
+ use_gpu = get_gpu_config_for_deployment(action_details, is_first_deployment)
1692
+
1693
+ logging.info(
1694
+ "Action %s: Model deployment GPU config: %s (first_deployment=%s)",
1695
+ action_id,
1696
+ use_gpu if use_gpu else "CPU-only",
1697
+ is_first_deployment
1698
+ )
1699
+
1700
+ # Get or create TRITON_PORTS (uses utility method)
1701
+ triton_ports = ActionInstance.get_or_create_triton_ports(service_id, self.scaling)
1702
+
1703
+ extra_env_vars = {
1704
+ "INTERNAL_PORT": internal_port,
1705
+ "TRITON_PORTS": triton_ports
1706
+ }
1707
+
1708
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, mount_docker_sock=True, action_id=action_id, extra_env_vars=extra_env_vars, extra_pkgs=["matrice_inference", "matrice_analytics"])} python3 deploy.py {self.action_record_id} {external_port}"'
1709
+ logging.info("cmd is: %s", cmd)
1710
+ self.start(cmd, "deploy_log")
1711
+
1712
+
1713
+ @log_errors(raise_exception=False)
1714
+ def model_train_execute(self: ActionInstance):
1715
+ """Execute model training task."""
1716
+ action_details = self.get_action_details()
1717
+ if not action_details:
1718
+ return
1719
+ action_id = action_details["_id"]
1720
+ use_gpu = self.get_gpu_config(action_details)
1721
+ work_fs = action_details["jobParams"]["host_file_system"]
1722
+ model_key = action_details["actionDetails"]["modelKey"]
1723
+ model_family = action_details["actionDetails"]["modelFamily"]
1724
+ self.setup_action_requirements(
1725
+ action_details,
1726
+ work_fs,
1727
+ model_family=model_family,
1728
+ action_id=action_id,
1729
+ )
1730
+
1731
+ if action_details["actionDetails"].get("containerId"):
1732
+ logging.info(
1733
+ "Using existing container ID for training: %s",
1734
+ action_details["actionDetails"]["containerId"],
1735
+ )
1736
+ self.docker_container = action_details["actionDetails"]["containerId"]
1737
+ cmd = "docker restart " + self.docker_container
1738
+ self.start(cmd, "train_log")
1739
+ return
1740
+
1741
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id, model_key=model_key)} python3 train.py {self.action_record_id} "'
1742
+ logging.info("cmd is: %s", cmd)
1743
+ self.start(cmd, "train_log")
1744
+
1745
+
1746
+ @log_errors(raise_exception=False)
1747
+ def model_eval_execute(self: ActionInstance):
1748
+ """Execute model evaluation task."""
1749
+ action_details = self.get_action_details()
1750
+ if not action_details:
1751
+ return
1752
+ action_id = action_details["_id"]
1753
+ work_fs = action_details["jobParams"]["host_file_system"]
1754
+ model_family = action_details["actionDetails"]["modelFamily"]
1755
+ use_gpu = self.get_gpu_config(action_details)
1756
+ self.setup_action_requirements(
1757
+ action_details,
1758
+ work_fs,
1759
+ model_family=model_family,
1760
+ action_id=action_id,
1761
+ )
1762
+ if action_details["actionDetails"].get("containerId"):
1763
+ logging.info(
1764
+ "Using existing container ID for training: %s",
1765
+ action_details["actionDetails"]["containerId"],
1766
+ )
1767
+ self.docker_container = action_details["actionDetails"]["containerId"]
1768
+ cmd = "docker restart " + self.docker_container
1769
+ self.start(cmd, "eval_log")
1770
+ return
1771
+
1772
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 eval.py {self.action_record_id} "'
1773
+ logging.info("cmd is: %s", cmd)
1774
+ self.start(cmd, "eval_log")
1775
+
1776
+
1777
+ @log_errors(raise_exception=False)
1778
+ def model_export_execute(self: ActionInstance):
1779
+ """Execute model export task."""
1780
+ work_fs = get_max_file_system()
1781
+ action_details = self.get_action_details()
1782
+ if not action_details:
1783
+ return
1784
+ action_id = action_details["_id"]
1785
+ if "host_file_system" in action_details["jobParams"]:
1786
+ work_fs = action_details["jobParams"]["host_file_system"]
1787
+ logging.info("host_file_system: %s", work_fs)
1788
+ use_gpu = self.get_gpu_config(action_details)
1789
+ model_family = action_details["actionDetails"]["modelFamily"]
1790
+ self.setup_action_requirements(
1791
+ action_details,
1792
+ work_fs,
1793
+ model_family=model_family,
1794
+ action_id=action_id,
1795
+ )
1796
+ if action_details["actionDetails"].get("containerId"):
1797
+ logging.info(
1798
+ "Using existing container ID for training: %s",
1799
+ action_details["actionDetails"]["containerId"],
1800
+ )
1801
+ self.docker_container = action_details["actionDetails"]["containerId"]
1802
+ cmd = "docker restart " + self.docker_container
1803
+ self.start(cmd, "export_log")
1804
+ return
1805
+
1806
+ cmd = f'{self.get_base_docker_cmd(work_fs, use_gpu, action_id=action_id)} python3 export.py {self.action_record_id} "'
1807
+ logging.info("cmd is: %s", cmd)
1808
+ self.start(cmd, "export_log")
1809
+
1810
+
1811
+ @log_errors(raise_exception=False)
1812
+ def image_build_execute(self: ActionInstance):
1813
+ """Execute image building task."""
1814
+ action_details = self.get_action_details()
1815
+ if not action_details:
1816
+ return
1817
+ self.setup_action_requirements(action_details)
1818
+ model_family_id = action_details["_idService"]
1819
+ action_id = action_details["_id"]
1820
+ internal_api_key = self.get_internal_api_key(action_id)
1821
+ extra_env_vars = {"MATRICE_INTERNAL_API_KEY": internal_api_key}
1822
+ cmd = f'{self.get_base_docker_cmd(mount_docker_sock=True, extra_env_vars=extra_env_vars)} python3 main.py {model_family_id} {action_id}"'
1823
+ logging.info("cmd is: %s", cmd)
1824
+ self.start(cmd, "image_build_log")
1825
+
1826
+
1827
+ @log_errors(raise_exception=False)
1828
+ def resource_clone_execute(self: ActionInstance):
1829
+ """Execute resource clone task."""
1830
+ action_details = self.get_action_details()
1831
+ if not action_details:
1832
+ return
1833
+ self.setup_action_requirements(action_details)
1834
+ cmd = f'{self.get_base_docker_cmd()} python3 main.py {self.action_record_id} "'
1835
+ logging.info("cmd is: %s", cmd)
1836
+ self.start(cmd, "resource_clone")
1837
+
1838
+
1839
+ @log_errors(raise_exception=False)
1840
+ def streaming_gateway_execute(self: ActionInstance):
1841
+ """Execute streaming gateway task."""
1842
+ action_details = self.get_action_details()
1843
+ if not action_details:
1844
+ return
1845
+ self.setup_action_requirements(action_details)
1846
+ if not self.docker_container:
1847
+ self.docker_container = (
1848
+ f"aiforeveryone/streaming-gateway:{os.environ.get('ENV', 'prod')}"
1849
+ )
1850
+ if action_details["actionDetails"].get("containerId"):
1851
+ logging.info(
1852
+ "Using existing container ID for training: %s",
1853
+ action_details["actionDetails"]["containerId"],
1854
+ )
1855
+ self.docker_container = action_details["actionDetails"]["containerId"]
1856
+ cmd = "docker restart " + self.docker_container
1857
+ self.start(cmd, "streaming_gateway")
1858
+ return
1859
+
1860
+ cmd = f'{self.get_base_docker_cmd(extra_pkgs=["matrice_streaming"])} python3 /usr/src/app/streaming_gateway.py {self.action_record_id} "'
1861
+ logging.info("cmd is: %s", cmd)
1862
+ self.start(cmd, "streaming_gateway")
1863
+
1864
+
1865
+ @log_errors(raise_exception=False)
1866
+ def kafka_setup_execute(self: ActionInstance):
1867
+ """
1868
+ Execute kafka server task.
1869
+ Kafka runs on port 9092 (SASL_PLAINTEXT) and 9093 (CONTROLLER) - localhost only with --net=host.
1870
+ """
1871
+ action_details = self.get_action_details()
1872
+ if not action_details:
1873
+ return
1874
+ host_port = self.scaling.get_open_port()
1875
+ host_ip = (
1876
+ urllib.request.urlopen("https://ident.me", timeout=10).read().decode("utf8")
1877
+ )
1878
+ # Setup credentials
1879
+ self.setup_action_requirements(action_details)
1880
+
1881
+ # Get Docker disk usage to calculate log retention
1882
+ from matrice_compute.instance_utils import get_docker_disk_space_usage
1883
+
1884
+ docker_disk_usage = get_docker_disk_space_usage()
1885
+ log_retention_bytes = 0
1886
+ if docker_disk_usage:
1887
+ # Calculate 90% of total Docker disk space in bytes
1888
+ available_disk_gb = docker_disk_usage["available"]
1889
+ log_retention_bytes = int(
1890
+ available_disk_gb * 0.9 * 1024 * 1024 * 1024
1891
+ ) # Convert GB to bytes
1892
+ logging.info(
1893
+ "Kafka log retention set to %d bytes (90%% of %f GB Docker disk)",
1894
+ log_retention_bytes,
1895
+ available_disk_gb,
1896
+ )
1897
+ else:
1898
+ # Fallback if Docker disk usage cannot be determined
1899
+ log_retention_bytes = 500 * 1024 * 1024 * 1024 # 10GB default
1900
+ logging.warning(
1901
+ "Could not determine Docker disk usage, using default 10GB log retention"
1902
+ )
1903
+
1904
+ # Prepare environment variables for Kafka
1905
+ env = os.environ.get("ENV", "prod")
1906
+ env_vars = {
1907
+ "ENV": env,
1908
+ "MATRICE_SECRET_ACCESS_KEY": self.matrice_secret_access_key,
1909
+ "MATRICE_ACCESS_KEY_ID": self.matrice_access_key_id,
1910
+ "KAFKA_NODE_ID": 1,
1911
+ "KAFKA_PROCESS_ROLES": "broker,controller",
1912
+ "KAFKA_LISTENERS": "SASL_PLAINTEXT://0.0.0.0:9092,CONTROLLER://0.0.0.0:9093",
1913
+ "KAFKA_ADVERTISED_LISTENERS": f"SASL_PLAINTEXT://{host_ip}:{host_port}",
1914
+ "KAFKA_LISTENER_SECURITY_PROTOCOL_MAP": "CONTROLLER:PLAINTEXT,SASL_PLAINTEXT:SASL_PLAINTEXT",
1915
+ "KAFKA_CONTROLLER_LISTENER_NAMES": "CONTROLLER",
1916
+ "KAFKA_CONTROLLER_QUORUM_VOTERS": "1@localhost:9093",
1917
+ "KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR": 1,
1918
+ "KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR": 1,
1919
+ "KAFKA_TRANSACTION_STATE_LOG_MIN_ISR": 1,
1920
+ "KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS": 0,
1921
+ "KAFKA_NUM_PARTITIONS": 5,
1922
+ "KAFKA_SASL_ENABLED_MECHANISMS": "SCRAM-SHA-256",
1923
+ "KAFKA_SASL_MECHANISM_INTER_BROKER_PROTOCOL": "SCRAM-SHA-256",
1924
+ "KAFKA_INTER_BROKER_LISTENER_NAME": "SASL_PLAINTEXT",
1925
+ "KAFKA_MESSAGE_MAX_BYTES": 25000000,
1926
+ "KAFKA_HEAP_OPTS": "-Xms2G -Xmx8G",
1927
+ "KAFKA_NUM_NETWORK_THREADS": 6,
1928
+ "KAFKA_NUM_IO_THREADS": 8,
1929
+ "KAFKA_REPLICA_FETCH_MAX_BYTES": 25000000,
1930
+ "KAFKA_FETCH_MESSAGE_MAX_BYTES": 25000000,
1931
+ "KAFKA_REPLICA_FETCH_RESPONSE_MAX_BYTES": 25000000,
1932
+ "KAFKA_REPLICA_FETCH_RESPONSE_MAX_BYTES": 25000000,
1933
+ # Log retention settings based on Docker disk space
1934
+ "KAFKA_LOG_RETENTION_BYTES": log_retention_bytes,
1935
+ "KAFKA_LOG_SEGMENT_BYTES": min(
1936
+ 1073741824, log_retention_bytes // 10
1937
+ ), # 1GB or 10% of retention, whichever is smaller
1938
+ }
1939
+
1940
+ # Build environment variable command parts
1941
+ env_args = " ".join(
1942
+ [f"-e {key}={shlex.quote(str(value))}" for key, value in env_vars.items()]
1943
+ )
1944
+
1945
+ # Build the docker command with --net=host
1946
+ pypi_index = f"https://{'test.' if env != 'prod' else ''}pypi.org/simple/"
1947
+
1948
+ if env == 'dev':
1949
+ pypi_index = f"https://test.pypi.org/simple/ --pre"
1950
+ pkgs = f"matrice_common>=1.0.0 matrice>=1.0.0"
1951
+ else:
1952
+ pkgs = f"matrice_common matrice"
1953
+
1954
+ if action_details["actionDetails"].get("containerId"):
1955
+ logging.info(
1956
+ "Using existing container ID for training: %s",
1957
+ action_details["actionDetails"]["containerId"],
1958
+ )
1959
+ self.docker_container = action_details["actionDetails"]["containerId"]
1960
+ cmd = "docker restart " + self.docker_container
1961
+ self.start(cmd, "kafka_setup")
1962
+ return
1963
+
1964
+
1965
+ # Kafka container with --net=host (Ports: 9092, 9093)
1966
+ cmd = (
1967
+ f"docker run --net=host "
1968
+ f"{env_args} "
1969
+ f"--shm-size=30G --pull=always "
1970
+ f'aiforeveryone/matrice-kafka:latest /bin/bash -c "'
1971
+ f"cd /opt/kafka/bin && "
1972
+ f"source venv/bin/activate && "
1973
+ f"/opt/kafka/bin/startup.sh & "
1974
+ f"if [ -f requirements.txt ]; then venv/bin/python3 -m pip install -r requirements.txt; fi && "
1975
+ f"venv/bin/python3 -m pip install --upgrade --force-reinstall --index-url {pypi_index} {pkgs} && "
1976
+ f"sleep 20 && "
1977
+ f'venv/bin/python3 main.py {self.action_record_id} {host_port}"'
1978
+ )
1979
+
1980
+ logging.info("Starting Kafka container (Ports: 9092, 9093): %s", cmd)
1981
+ self.start(cmd, "kafka_setup")
1982
+
1983
+
1984
    @log_errors(raise_exception=False)
    def inference_tracker_setup_execute(self: ActionInstance):

        """
        Create and start the inference tracker worker.

        Inference tracker runs on port 8110 (localhost only with --net=host).
        Restarts the existing container if the action record already carries
        a containerId; otherwise runs the tracker image detached with the
        action credentials in its environment.
        """

        action_details = self.get_action_details()
        if not action_details:
            return

        # Capture the image reference NOW: the containerId branch below may
        # overwrite self.docker_container with a container id.
        image = self.docker_container

        self.setup_action_requirements(action_details)

        if action_details["actionDetails"].get("containerId"):
            # Resume the previously created container instead of running a new one.
            logging.info(
                "Using existing container ID for inference tracker: %s",
                action_details["actionDetails"]["containerId"],
            )
            self.docker_container = action_details["actionDetails"]["containerId"]
            cmd = "docker restart " + self.docker_container
            self.start(cmd, "inference_tracker_setup")
            return

        # This is the existing Docker run command
        # Detached run; --cidfile records the new container id for later reuse.
        worker_cmd = (
            f"docker run -d --pull=always --net=host "
            f"--cidfile ./{self.action_record_id}.cid "
            f"--name inference-tracker-worker "
            f"-v matrice_myvol:/matrice_data "
            f'-e ENV="{os.environ.get("ENV", "prod")}" '
            f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
            f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
            f'-e ACTION_ID="{self.action_record_id}" '
            f"{image}"
        )

        self.start(worker_cmd, "inference_tracker_setup")