ddeutil-workflow 0.0.77__py3-none-any.whl → 0.0.79__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,908 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Copyright (c) 2022 Korawich Anuttra. All rights reserved.
3
+ # Licensed under the MIT License. See LICENSE in the project root for
4
+ # license information.
5
+ # ------------------------------------------------------------------------------
6
+ """AWS Batch Provider Module.
7
+
8
+ This module provides AWS Batch integration for workflow job execution.
9
+ It handles compute environment creation, job queue management, job submission,
10
+ task execution, and result retrieval.
11
+
12
+ The AWS Batch provider runs workflow jobs on AWS Batch compute
13
+ environments, providing scalable, managed execution for complex
14
+ workflow processing.
15
+
16
+ Key Features:
17
+ - Automatic compute environment creation and management
18
+ - Job queue management and job submission
19
+ - Result file upload/download via S3
20
+ - Error handling and status monitoring
21
+ - Resource cleanup and management
22
+ - Optimized file operations and caching
23
+
24
+ Classes:
25
+ AWSBatchProvider: Main provider for AWS Batch operations
26
+ BatchComputeEnvironmentConfig: Configuration for AWS Batch compute environments
27
+ BatchJobQueueConfig: Configuration for AWS Batch job queues
28
+ BatchJobConfig: Configuration for AWS Batch jobs
29
+ BatchTaskConfig: Configuration for AWS Batch tasks
30
+
31
+ References:
32
+ - https://docs.aws.amazon.com/batch/latest/userguide/what-is-batch.html
33
+ - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/batch.html
34
+
35
+ Config Example:
36
+
37
+ ```dotenv
38
+ export AWS_ACCESS_KEY_ID="your-access-key"
39
+ export AWS_SECRET_ACCESS_KEY="your-secret-key"
40
+ export AWS_DEFAULT_REGION="us-east-1"
41
+ export AWS_BATCH_JOB_QUEUE_ARN="arn:aws:batch:region:account:job-queue/queue-name"
42
+ export AWS_S3_BUCKET="your-s3-bucket"
43
+ ```
44
+
45
+ ```yaml
46
+ jobs:
47
+ my-job:
48
+ runs-on:
49
+ type: "aws_batch"
50
+ with:
51
+ job_queue_arn: "${AWS_BATCH_JOB_QUEUE_ARN}"
52
+ s3_bucket: "${AWS_S3_BUCKET}"
53
+ compute_environment_type: "EC2"
54
+ instance_types: ["c5.large", "c5.xlarge"]
55
+ stages:
56
+ - name: "process"
57
+ type: "py"
58
+ run: |
59
+ # Your processing logic here
60
+ result.context.update({"output": "processed"})
61
+ ```
62
+
63
+ """
64
+ from __future__ import annotations
65
+
66
+ import json
67
+ import os
68
+ import tempfile
69
+ import time
70
+ from contextlib import contextmanager
71
+ from typing import Any, Optional
72
+
73
+ try:
74
+ import boto3
75
+ from botocore.config import Config
76
+ from botocore.exceptions import ClientError
77
+
78
+ AWS_AVAILABLE = True
79
+ except ImportError:
80
+ AWS_AVAILABLE = False
81
+
82
+ from pydantic import BaseModel, Field
83
+
84
+ from ...__types import DictData
85
+ from ...job import Job
86
+ from ...result import FAILED, SUCCESS, Result
87
+ from ...traces import get_trace
88
+ from ...utils import gen_id
89
+
90
+
91
+ class BatchComputeEnvironmentConfig(BaseModel):
92
+ """AWS Batch compute environment configuration."""
93
+
94
+ compute_environment_name: str = Field(
95
+ description="Unique compute environment name"
96
+ )
97
+ compute_environment_type: str = Field(
98
+ default="EC2", description="Compute environment type (EC2/SPOT)"
99
+ )
100
+ instance_types: list[str] = Field(
101
+ default=["c5.large"], description="EC2 instance types"
102
+ )
103
+ min_vcpus: int = Field(default=0, description="Minimum vCPUs")
104
+ max_vcpus: int = Field(default=256, description="Maximum vCPUs")
105
+ desired_vcpus: int = Field(default=0, description="Desired vCPUs")
106
+ subnets: list[str] = Field(description="Subnet IDs for compute resources")
107
+ security_group_ids: list[str] = Field(description="Security group IDs")
108
+ instance_role: str = Field(description="IAM instance profile ARN")
109
+ service_role: str = Field(description="IAM service role ARN")
110
+ enable_managed_compute: bool = Field(
111
+ default=True, description="Enable managed compute"
112
+ )
113
+ spot_iam_fleet_role: Optional[str] = Field(
114
+ default=None, description="Spot IAM fleet role ARN"
115
+ )
116
+ bid_percentage: Optional[int] = Field(
117
+ default=None, description="Spot bid percentage"
118
+ )
119
+
120
+
121
+ class BatchJobQueueConfig(BaseModel):
122
+ """AWS Batch job queue configuration."""
123
+
124
+ job_queue_name: str = Field(description="Unique job queue name")
125
+ state: str = Field(default="ENABLED", description="Job queue state")
126
+ priority: int = Field(default=1, description="Job queue priority")
127
+ compute_environment_order: list[dict[str, str]] = Field(
128
+ description="Compute environment order"
129
+ )
130
+ scheduling_policy_arn: Optional[str] = Field(
131
+ default=None, description="Scheduling policy ARN"
132
+ )
133
+
134
+
135
+ class BatchJobConfig(BaseModel):
136
+ """AWS Batch job configuration."""
137
+
138
+ job_name: str = Field(description="Unique job name")
139
+ job_queue_arn: str = Field(description="Job queue ARN")
140
+ job_definition_arn: str = Field(description="Job definition ARN")
141
+ parameters: Optional[dict[str, str]] = Field(
142
+ default=None, description="Job parameters"
143
+ )
144
+ timeout: Optional[dict[str, int]] = Field(
145
+ default=None, description="Job timeout"
146
+ )
147
+ retry_strategy: Optional[dict[str, Any]] = Field(
148
+ default=None, description="Retry strategy"
149
+ )
150
+ depends_on: Optional[list[dict[str, str]]] = Field(
151
+ default=None, description="Job dependencies"
152
+ )
153
+
154
+
155
+ class BatchTaskConfig(BaseModel):
156
+ """AWS Batch task configuration."""
157
+
158
+ task_name: str = Field(description="Unique task name")
159
+ command: list[str] = Field(description="Command to execute")
160
+ vcpus: int = Field(default=1, description="Number of vCPUs")
161
+ memory: int = Field(default=1024, description="Memory in MiB")
162
+ job_role_arn: Optional[str] = Field(
163
+ default=None, description="IAM job role ARN"
164
+ )
165
+ timeout: Optional[dict[str, int]] = Field(
166
+ default=None, description="Task timeout"
167
+ )
168
+ environment_variables: Optional[dict[str, str]] = Field(
169
+ default=None, description="Environment variables"
170
+ )
171
+ mount_points: Optional[list[dict[str, str]]] = Field(
172
+ default=None, description="Mount points"
173
+ )
174
+ volumes: Optional[list[dict[str, Any]]] = Field(
175
+ default=None, description="Volumes"
176
+ )
177
+
178
+
179
+ class AWSBatchProvider:
180
+ """AWS Batch provider for workflow job execution.
181
+
182
+ This provider handles the complete lifecycle of AWS Batch operations
183
+ including compute environment creation, job queue management, job submission,
184
+ task execution, and result retrieval. It integrates with S3 for file management
185
+ and provides comprehensive error handling and monitoring.
186
+
187
+ Attributes:
188
+ batch_client: AWS Batch client
189
+ s3_client: AWS S3 client
190
+ ec2_client: AWS EC2 client
191
+ iam_client: AWS IAM client
192
+ s3_bucket: S3 bucket name for files
193
+ compute_env_config: Compute environment configuration
194
+ job_queue_config: Job queue configuration
195
+ job_config: Job configuration
196
+ task_config: Task configuration
197
+
198
+ Example:
199
+ ```python
200
+ provider = AWSBatchProvider(
201
+ job_queue_arn="arn:aws:batch:region:account:job-queue/queue-name",
202
+ s3_bucket="my-workflow-bucket",
203
+ region_name="us-east-1"
204
+ )
205
+
206
+ result = provider.execute_job(job, params, run_id="job-123")
207
+ ```
208
+ """
209
+
210
+ def __init__(
211
+ self,
212
+ job_queue_arn: str,
213
+ s3_bucket: str,
214
+ region_name: str = "us-east-1",
215
+ compute_env_config: Optional[BatchComputeEnvironmentConfig] = None,
216
+ job_queue_config: Optional[BatchJobQueueConfig] = None,
217
+ job_config: Optional[BatchJobConfig] = None,
218
+ task_config: Optional[BatchTaskConfig] = None,
219
+ aws_access_key_id: Optional[str] = None,
220
+ aws_secret_access_key: Optional[str] = None,
221
+ aws_session_token: Optional[str] = None,
222
+ ):
223
+ """Initialize AWS Batch provider.
224
+
225
+ Args:
226
+ job_queue_arn: AWS Batch job queue ARN
227
+ s3_bucket: S3 bucket name for files
228
+ region_name: AWS region name
229
+ compute_env_config: Compute environment configuration
230
+ job_queue_config: Job queue configuration
231
+ job_config: Job configuration
232
+ task_config: Task configuration
233
+ aws_access_key_id: AWS access key ID
234
+ aws_secret_access_key: AWS secret access key
235
+ aws_session_token: AWS session token
236
+ """
237
+ if not AWS_AVAILABLE:
238
+ raise ImportError(
239
+ "AWS dependencies not available. "
240
+ "Install with: pip install boto3"
241
+ )
242
+
243
+ self.job_queue_arn = job_queue_arn
244
+ self.s3_bucket = s3_bucket
245
+ self.region_name = region_name
246
+
247
+ # Initialize AWS clients with optimized configuration
248
+ session = boto3.Session(
249
+ aws_access_key_id=aws_access_key_id,
250
+ aws_secret_access_key=aws_secret_access_key,
251
+ aws_session_token=aws_session_token,
252
+ region_name=region_name,
253
+ )
254
+
255
+ # Configure clients with retry and timeout settings
256
+ config = Config(
257
+ retries={"max_attempts": 3, "mode": "adaptive"},
258
+ connect_timeout=30,
259
+ read_timeout=300,
260
+ )
261
+
262
+ self.batch_client = session.client("batch", config=config)
263
+ self.s3_client = session.client("s3", config=config)
264
+ self.ec2_client = session.client("ec2", config=config)
265
+ self.iam_client = session.client("iam", config=config)
266
+
267
+ # Set configurations
268
+ self.compute_env_config = compute_env_config
269
+ self.job_queue_config = job_queue_config
270
+ self.job_config = job_config
271
+ self.task_config = task_config
272
+
273
+ # Cache for bucket operations
274
+ self._bucket_exists: Optional[bool] = None
275
+
276
+ @contextmanager
277
+ def _temp_file_context(self, suffix: str = ".tmp"):
278
+ """Context manager for temporary file operations."""
279
+ temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
+ temp_file.close()  # Close the open handle; callers reopen the path as needed
280
+ try:
281
+ yield temp_file.name
282
+ finally:
283
+ try:
284
+ os.unlink(temp_file.name)
285
+ except OSError:
286
+ pass
287
+
288
+ def _ensure_s3_bucket(self) -> None:
289
+ """Ensure S3 bucket exists with optimized settings."""
290
+ if self._bucket_exists is None:
291
+ try:
292
+ self.s3_client.head_bucket(Bucket=self.s3_bucket)
293
+ self._bucket_exists = True
294
+ except ClientError as e:
295
+ error_code = e.response["Error"]["Code"]
296
+ if error_code == "404":
297
+ # Create bucket with optimized settings
298
+ create_kwargs = {"Bucket": self.s3_bucket}
299
+ # us-east-1 is the default region; S3 rejects an explicit
300
+ # LocationConstraint for it, so only set one for other regions.
301
+ if self.region_name != "us-east-1":
302
+ create_kwargs["CreateBucketConfiguration"] = {
303
+ "LocationConstraint": self.region_name
+ }
304
+
305
+ # Add versioning for better data protection
306
+ self.s3_client.create_bucket(**create_kwargs)
307
+ self.s3_client.put_bucket_versioning(
308
+ Bucket=self.s3_bucket,
309
+ VersioningConfiguration={"Status": "Enabled"},
310
+ )
311
+
312
+ # Add lifecycle policy for cost optimization
313
+ lifecycle_config = {
314
+ "Rules": [
315
+ {
316
+ "ID": "workflow-cleanup",
317
+ "Status": "Enabled",
318
+ "Filter": {"Prefix": "jobs/"},
319
+ "Expiration": {
320
+ "Days": 7 # Keep workflow files for 7 days
321
+ },
322
+ }
323
+ ]
324
+ }
325
+
326
+ try:
327
+ self.s3_client.put_bucket_lifecycle_configuration(
328
+ Bucket=self.s3_bucket,
329
+ LifecycleConfiguration=lifecycle_config,
330
+ )
331
+ except ClientError:
332
+ # Lifecycle configuration might not be supported
333
+ pass
334
+
335
+ self._bucket_exists = True
336
+ else:
337
+ raise
338
+
339
+ def _upload_file_to_s3(self, file_path: str, s3_key: str) -> str:
340
+ """Upload file to S3 with optimized settings.
341
+
342
+ Args:
343
+ file_path: Local file path
344
+ s3_key: S3 object key
345
+
346
+ Returns:
347
+ str: S3 object URL
348
+ """
349
+ self._ensure_s3_bucket()
350
+
351
+ # Set optimized metadata for workflow files
352
+ metadata = {
353
+ "workflow_provider": "aws_batch",
354
+ "upload_time": str(time.time()),
355
+ "content_type": "application/octet-stream",
356
+ }
357
+
358
+ with open(file_path, "rb") as data:
359
+ self.s3_client.upload_fileobj(
360
+ data,
361
+ self.s3_bucket,
362
+ s3_key,
363
+ ExtraArgs={
364
+ "Metadata": metadata,
365
+ "StorageClass": "STANDARD_IA", # Use IA for cost optimization
366
+ },
367
+ )
368
+
369
+ return f"s3://{self.s3_bucket}/{s3_key}"
370
+
371
+ def _download_file_from_s3(self, s3_key: str, local_path: str) -> None:
372
+ """Download file from S3 with optimized settings.
373
+
374
+ Args:
375
+ s3_key: S3 object key
376
+ local_path: Local file path
377
+ """
378
+ self.s3_client.download_file(
379
+ self.s3_bucket,
380
+ s3_key,
381
+ local_path,
382
+ ExtraArgs={
383
+ "RequestPayer": "requester"
384
+ },  # Support requester-pays buckets
385
+ )
386
+
387
+ def _create_job_definition_if_not_exists(self, job_def_name: str) -> str:
388
+ """Create AWS Batch job definition if it doesn't exist with optimized settings.
389
+
390
+ Args:
391
+ job_def_name: Job definition name
392
+
393
+ Returns:
394
+ str: Job definition ARN
395
+ """
396
+ try:
397
+ response = self.batch_client.describe_job_definitions(
398
+ jobDefinitionName=job_def_name, status="ACTIVE"
399
+ )
400
+ if response["jobDefinitions"]:
401
+ return response["jobDefinitions"][0]["jobDefinitionArn"]
402
+ except ClientError:
403
+ pass
404
+
405
+ # Create optimized job definition
406
+ job_def_config = self.task_config or BatchTaskConfig(
407
+ task_name=job_def_name, command=["python3", "task_script.py"]
408
+ )
409
+
410
+ # Build environment variables
411
+ environment = []
412
+ if job_def_config.environment_variables:
413
+ for key, value in job_def_config.environment_variables.items():
414
+ environment.append({"name": key, "value": value})
415
+
416
+ # Add optimized environment variables
417
+ environment.extend(
418
+ [
419
+ {"name": "PYTHONUNBUFFERED", "value": "1"},
420
+ {"name": "PYTHONDONTWRITEBYTECODE", "value": "1"},
421
+ {"name": "AWS_DEFAULT_REGION", "value": self.region_name},
422
+ ]
423
+ )
424
+
425
+ # Build container properties
426
+ container_props = {
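+ # Default to a slim Python base image; the generated task script installs
+ # ddeutil-workflow and the AWS CLI into it at runtime.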
427
+ "image": "python:3.11-slim",
428
+ "vcpus": job_def_config.vcpus,
429
+ "memory": job_def_config.memory,
430
+ "command": job_def_config.command,
431
+ "environment": environment,
432
+ "resourceRequirements": [
433
+ {"type": "VCPU", "value": str(job_def_config.vcpus)},
434
+ {"type": "MEMORY", "value": str(job_def_config.memory)},
435
+ ],
436
+ }
437
+
438
+ # Add optional configurations
439
+ if job_def_config.job_role_arn:
440
+ container_props["jobRoleArn"] = job_def_config.job_role_arn
441
+ container_props["executionRoleArn"] = job_def_config.job_role_arn
442
+
443
+ if job_def_config.mount_points and job_def_config.volumes:
444
+ container_props["mountPoints"] = job_def_config.mount_points
445
+ container_props["volumes"] = job_def_config.volumes
446
+
447
+ response = self.batch_client.register_job_definition(
448
+ jobDefinitionName=job_def_name,
449
+ type="container",
450
+ containerProperties=container_props,
451
+ platformCapabilities=["EC2"], # Specify platform capabilities
452
+ )
453
+
454
+ return response["jobDefinitionArn"]
455
+
456
+ def _create_job(
457
+ self, job_name: str, job_def_arn: str, parameters: dict[str, str]
458
+ ) -> str:
459
+ """Create AWS Batch job with optimized settings.
460
+
461
+ Args:
462
+ job_name: Job name
463
+ job_def_arn: Job definition ARN
464
+ parameters: Job parameters
465
+
466
+ Returns:
467
+ str: Job ARN
468
+ """
469
+ job_config = self.job_config or BatchJobConfig(
470
+ job_name=job_name,
471
+ job_queue_arn=self.job_queue_arn,
472
+ job_definition_arn=job_def_arn,
473
+ )
474
+
475
+ # Build job parameters
476
+ job_params = {
477
+ "jobName": job_name,
478
+ "jobQueue": self.job_queue_arn,
479
+ "jobDefinition": job_def_arn,
480
+ "parameters": parameters or {},
481
+ }
482
+
483
+ # Add optional configurations
484
+ if job_config.timeout:
485
+ job_params["timeout"] = job_config.timeout
486
+
487
+ if job_config.retry_strategy:
488
+ job_params["retryStrategy"] = job_config.retry_strategy
489
+
490
+ if job_config.depends_on:
491
+ job_params["dependsOn"] = job_config.depends_on
492
+
493
+ response = self.batch_client.submit_job(**job_params)
494
+ return response["jobArn"]
495
+
496
+ def _wait_for_job_completion(
497
+ self, job_arn: str, run_id: str, timeout: int = 3600
498
+ ) -> dict[str, Any]:
499
+ """Wait for job completion with optimized polling.
500
+
501
+ Args:
502
+ job_arn: Job ARN
503
+ run_id: Workflow run ID used as the S3 prefix for result files
+ timeout: Timeout in seconds
504
+
505
+ Returns:
506
+ Dict[str, Any]: Job results
507
+ """
508
+ start_time = time.time()
509
+ poll_interval = 10 # Start with 10 second intervals
510
+
511
+ while time.time() - start_time < timeout:
512
+ try:
513
+ response = self.batch_client.describe_jobs(jobs=[job_arn])
514
+ job = response["jobs"][0]
515
+
516
+ if job["status"] == "SUCCEEDED":
517
+ return self._process_successful_job(job, run_id)
518
+
519
+ elif job["status"] == "FAILED":
520
+ return self._process_failed_job(job)
521
+
522
+ elif job["status"] in ["RUNNING", "SUBMITTED", "PENDING"]:
523
+ # Adaptive polling: increase interval for long-running jobs
524
+ if time.time() - start_time > 300: # After 5 minutes
525
+ poll_interval = min(
526
+ poll_interval * 1.5, 60
527
+ ) # Max 60 seconds
528
+
529
+ time.sleep(poll_interval)
530
+ else:
531
+ # For other states, use shorter polling
532
+ time.sleep(5)
533
+
534
+ except ClientError as e:
535
+ if e.response["Error"]["Code"] == "JobNotFoundException":
536
+ # Job might be deleted, wait a bit and retry
537
+ time.sleep(poll_interval)
538
+ else:
539
+ # Continue polling on error with exponential backoff
540
+ poll_interval = min(poll_interval * 2, 60)
541
+ time.sleep(poll_interval)
542
+ except Exception:
543
+ # Continue polling on error with exponential backoff
544
+ poll_interval = min(poll_interval * 2, 60)
545
+ time.sleep(poll_interval)
546
+
547
+ return {"status": "timeout", "exit_code": 1}
548
+
549
+ def _process_successful_job(
550
+ self, job: dict[str, Any], run_id: str
551
+ ) -> dict[str, Any]:
552
+ """Process successful job and download results.
553
+
554
+ Args:
555
+ job: Job object
556
+ run_id: Workflow run ID used as the S3 prefix for result files
557
+
558
+ Returns:
559
+ Dict[str, Any]: Job results with files
560
+ """
561
+ result_files = {}
562
+ try:
563
+ # List objects under this run's S3 prefix; the task script uploads
564
+ # its results to jobs/<run_id>/.
565
+ prefix = f"jobs/{run_id}/"
566
+
567
+ response = self.s3_client.list_objects_v2(
568
+ Bucket=self.s3_bucket, Prefix=prefix
569
+ )
570
+
571
+ # Download result files efficiently
572
+ for obj in response.get("Contents", []):
573
+ if obj["Key"].endswith((".json", ".txt", ".log")):
574
+ with self._temp_file_context() as tmp_file:
575
+ self.s3_client.download_file(
576
+ self.s3_bucket, obj["Key"], tmp_file
577
+ )
578
+ with open(tmp_file) as f:
579
+ result_files[obj["Key"]] = f.read()
580
+ except Exception:
581
+ # File download failed, continue with empty results
582
+ pass
583
+
584
+ return {"status": "completed", "exit_code": 0, "files": result_files}
585
+
586
+ def _process_failed_job(self, job: dict[str, Any]) -> dict[str, Any]:
587
+ """Process failed job and extract error information.
588
+
589
+ Args:
590
+ job: Job object
591
+
592
+ Returns:
593
+ Dict[str, Any]: Failure information
594
+ """
595
+ failure_reason = "Job failed"
596
+
597
+ # Try to extract more detailed error information
598
+ if "attempts" in job and job["attempts"]:
599
+ last_attempt = job["attempts"][-1]
600
+ if "reason" in last_attempt:
601
+ failure_reason = last_attempt["reason"]
602
+
603
+ return {
604
+ "status": "failed",
605
+ "exit_code": 1,
606
+ "failure_reason": failure_reason,
607
+ }
608
+
609
+ def _create_optimized_task_script(
610
+ self, job: Job, params: DictData, run_id: str
611
+ ) -> str:
612
+ """Create optimized Python script for task execution.
613
+
614
+ Args:
615
+ job: Job to execute
616
+ params: Job parameters
617
+ run_id: Execution run ID
618
+
619
+ Returns:
620
+ str: Path to created script
621
+ """
622
+ script_content = f'''#!/usr/bin/env python3
623
+ import json
624
+ import sys
625
+ import os
626
+ import subprocess
627
+ import time
628
+ from pathlib import Path
629
+
630
+ def install_package(package):
631
+ """Install package with retry logic."""
632
+ for attempt in range(3):
633
+ try:
634
+ subprocess.run([sys.executable, '-m', 'pip', 'install', package],
635
+ check=True, capture_output=True, timeout=300)
636
+ return True
637
+ except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
638
+ if attempt == 2:
639
+ raise
640
+ time.sleep(2 ** attempt)
641
+
642
+ def download_file(s3_url, local_path):
643
+ """Download file with retry logic."""
644
+ for attempt in range(3):
645
+ try:
646
+ subprocess.run(['aws', 's3', 'cp', s3_url, local_path],
647
+ check=True, capture_output=True, timeout=300)
648
+ return True
649
+ except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
650
+ if attempt == 2:
651
+ raise
652
+ time.sleep(2 ** attempt)
653
+
654
+ # Install ddeutil-workflow with retry
655
+ install_package('ddeutil-workflow')
+ # The slim Python base image ships without the AWS CLI, which the S3
+ # copy helpers below rely on, so install it here as well.
+ install_package('awscli')
656
+
657
+ # Download files with retry
658
+ download_file(os.environ['JOB_CONFIG_S3_URL'], 'job_config.json')
659
+ download_file(os.environ['PARAMS_S3_URL'], 'params.json')
660
+ download_file(os.environ['SCRIPT_S3_URL'], 'task_script.py')
661
+
662
+ # Add current directory to Python path
663
+ sys.path.insert(0, os.getcwd())
664
+
665
+ from ddeutil.workflow.job import local_execute
666
+ from ddeutil.workflow import Job
667
+
668
+ # Load job configuration
669
+ with open('job_config.json', 'r') as f:
670
+ job_data = json.load(f)
671
+
672
+ # Load parameters
673
+ with open('params.json', 'r') as f:
674
+ params = json.load(f)
675
+
676
+ # Create job instance
677
+ job = Job(**job_data)
678
+
679
+ # Execute job
680
+ result = local_execute(job, params, run_id='{run_id}')
681
+
682
+ # Save result
683
+ with open('result.json', 'w') as f:
684
+ json.dump(result.model_dump(), f, indent=2)
685
+
686
+ # Upload result to S3 with retry
687
+ job_id = '{run_id}'
688
+ bucket = '{self.s3_bucket}'
689
+
690
+ # S3 has no real directories, so no mkdir step is needed before uploading.
691
+
692
+ # Upload result file with retry (download_file wraps `aws s3 cp`, which
693
+ # copies in either direction)
694
+ download_file('result.json', f's3://{{bucket}}/jobs/{{job_id}}/result.json')
696
+
697
+ sys.exit(0 if result.status == 'success' else 1)
698
+ '''
699
+
700
+ with self._temp_file_context(suffix=".py") as script_path:
701
+ with open(script_path, "w") as f:
702
+ f.write(script_content)
703
+ return script_path
704
+
705
+ def execute_job(
706
+ self,
707
+ job: Job,
708
+ params: DictData,
709
+ *,
710
+ run_id: Optional[str] = None,
711
+ event: Optional[Any] = None,
712
+ ) -> Result:
713
+ """Execute job on AWS Batch with optimized performance.
714
+
715
+ Args:
716
+ job: Job to execute
717
+ params: Job parameters
718
+ run_id: Execution run ID
719
+ event: Event for cancellation
720
+
721
+ Returns:
722
+ Result: Execution result
723
+ """
724
+ if event and event.is_set():
725
+ return Result(
726
+ status=FAILED,
727
+ context={
728
+ "errors": {"message": "Execution was canceled before start"}
729
+ },
730
+ run_id=run_id or gen_id("aws-batch"),
731
+ extras={},
732
+ )
733
+
734
+ # Generate run ID if not provided
735
+ if not run_id:
736
+ run_id = gen_id(job.id or "aws-batch", unique=True)
737
+
738
+ trace = get_trace(run_id, extras=job.extras)
739
+ trace.info(f"[AWS_BATCH]: Starting job execution: {job.id}")
740
+
741
+ try:
742
+ # Create job definition
743
+ job_def_name = f"workflow-job-def-{run_id}"
744
+ trace.info(f"[AWS_BATCH]: Creating job definition: {job_def_name}")
745
+ job_def_arn = self._create_job_definition_if_not_exists(
746
+ job_def_name
747
+ )
748
+
749
+ # Create optimized task script
750
+ script_path = self._create_optimized_task_script(
751
+ job, params, run_id
752
+ )
753
+
754
+ # S3 keys for the job artifacts
755
+ job_config_s3_key = f"jobs/{run_id}/job_config.json"
756
+ params_s3_key = f"jobs/{run_id}/params.json"
757
+ script_s3_key = f"jobs/{run_id}/task_script.py"
758
+
759
+ # Upload files efficiently
760
+ trace.info("[AWS_BATCH]: Uploading files to S3")
761
+
762
+ with self._temp_file_context(suffix=".json") as job_config_path:
763
+ with open(job_config_path, "w") as f:
764
+ json.dump(job.model_dump(), f)
765
+ self._upload_file_to_s3(job_config_path, job_config_s3_key)
766
+
767
+ with self._temp_file_context(suffix=".json") as params_path:
768
+ with open(params_path, "w") as f:
769
+ json.dump(params, f)
770
+ self._upload_file_to_s3(params_path, params_s3_key)
771
+
772
+ self._upload_file_to_s3(script_path, script_s3_key)
773
+
774
+ # Create job
775
+ job_name = f"workflow-job-{run_id}"
776
+ job_parameters = {
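+ # These keys are passed as Batch job parameters; the generated task
+ # script expects the matching JOB_CONFIG_S3_URL, PARAMS_S3_URL, and
+ # SCRIPT_S3_URL environment variables at runtime.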
777
+ "job_config_s3_url": f"s3://{self.s3_bucket}/{job_config_s3_key}",
778
+ "params_s3_url": f"s3://{self.s3_bucket}/{params_s3_key}",
779
+ "script_s3_url": f"s3://{self.s3_bucket}/{script_s3_key}",
780
+ }
781
+
782
+ trace.info(f"[AWS_BATCH]: Creating job: {job_name}")
783
+ job_arn = self._create_job(job_name, job_def_arn, job_parameters)
784
+
785
+ # Wait for job completion
786
+ trace.info("[AWS_BATCH]: Waiting for job completion")
787
+ job_result = self._wait_for_job_completion(job_arn, run_id)
788
+
789
+ # Process results
790
+ if job_result["status"] == "completed":
791
+ result_data = {}
792
+ result_file_key = f"jobs/{run_id}/result.json"
793
+
794
+ if result_file_key in job_result.get("files", {}):
795
+ try:
796
+ result_data = json.loads(
797
+ job_result["files"][result_file_key]
798
+ )
799
+ except (json.JSONDecodeError, KeyError):
800
+ result_data = {"status": SUCCESS}
801
+
802
+ trace.info("[AWS_BATCH]: Job completed successfully")
803
+ return Result(
804
+ status=SUCCESS,
805
+ context=result_data,
806
+ run_id=run_id,
807
+ extras=job.extras or {},
808
+ )
809
+ else:
810
+ error_msg = f"Job failed: {job_result.get('status', 'unknown')}"
811
+ if job_result.get("failure_reason"):
812
+ error_msg += f" - {job_result['failure_reason']}"
813
+
814
+ trace.error(f"[AWS_BATCH]: {error_msg}")
815
+ return Result(
816
+ status=FAILED,
817
+ context={"errors": {"message": error_msg}},
818
+ run_id=run_id,
819
+ extras=job.extras or {},
820
+ )
821
+
822
+ except Exception as e:
823
+ trace.error(f"[AWS_BATCH]: Execution failed: {str(e)}")
824
+ return Result(
825
+ status=FAILED,
826
+ context={"errors": {"message": str(e)}},
827
+ run_id=run_id,
828
+ extras=job.extras or {},
829
+ )
830
+
831
+ def cleanup(self, job_id: Optional[str] = None) -> None:
832
+ """Clean up AWS Batch resources efficiently.
833
+
834
+ Args:
835
+ job_id: Job ID to clean up (if None, cleans up all workflow jobs)
836
+ """
837
+ try:
838
+ prefix = f"jobs/{job_id}/" if job_id else "jobs/"
839
+
840
+ # List objects with pagination for large datasets
841
+ paginator = self.s3_client.get_paginator("list_objects_v2")
842
+ pages = paginator.paginate(Bucket=self.s3_bucket, Prefix=prefix)
843
+
844
+ # Delete objects in batches for better performance
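+ # (delete_objects accepts at most 1,000 keys per request)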
845
+ batch_size = 1000
846
+ objects_to_delete = []
847
+
848
+ for page in pages:
849
+ for obj in page.get("Contents", []):
850
+ objects_to_delete.append({"Key": obj["Key"]})
851
+
852
+ if len(objects_to_delete) >= batch_size:
853
+ self.s3_client.delete_objects(
854
+ Bucket=self.s3_bucket,
855
+ Delete={"Objects": objects_to_delete},
856
+ )
857
+ objects_to_delete = []
858
+
859
+ # Delete remaining objects
860
+ if objects_to_delete:
861
+ self.s3_client.delete_objects(
862
+ Bucket=self.s3_bucket, Delete={"Objects": objects_to_delete}
863
+ )
864
+
865
+ except Exception:
866
+ pass
867
+
868
+
869
+ def aws_batch_execute(
870
+ job: Job,
871
+ params: DictData,
872
+ *,
873
+ run_id: Optional[str] = None,
874
+ event: Optional[Any] = None,
875
+ ) -> Result:
876
+ """AWS Batch job execution function with optimized performance.
877
+
878
+ This function creates an AWS Batch provider and executes the job
879
+ on AWS Batch compute environments. It handles the complete lifecycle
880
+ including job definition creation, job submission, and result retrieval.
881
+
882
+ Args:
883
+ job: Job to execute
884
+ params: Job parameters
885
+ run_id: Execution run ID
886
+ event: Event for cancellation
887
+
888
+ Returns:
889
+ Result: Execution result
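+
+ Example:
+ Illustrative sketch only; it assumes ``job.runs_on.args`` already
+ carries the job queue ARN, S3 bucket, and credentials shown in the
+ module-level configuration example.
+
+ ```python
+ result = aws_batch_execute(job, {"mode": "daily"}, run_id="run-001")
+ if result.status == SUCCESS:
+ print(result.context)
+ ```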
890
+ """
891
+ # Extract AWS Batch configuration from job
892
+ batch_args = job.runs_on.args
893
+
894
+ provider = AWSBatchProvider(
895
+ job_queue_arn=batch_args.job_queue_arn,
896
+ s3_bucket=batch_args.s3_bucket,
897
+ region_name=batch_args.region_name,
898
+ aws_access_key_id=batch_args.aws_access_key_id,
899
+ aws_secret_access_key=batch_args.aws_secret_access_key,
900
+ aws_session_token=batch_args.aws_session_token,
901
+ )
902
+
903
+ try:
904
+ return provider.execute_job(job, params, run_id=run_id, event=event)
905
+ finally:
906
+ # Clean up resources
907
+ if run_id:
908
+ provider.cleanup(run_id)