ddeutil-workflow 0.0.78__py3-none-any.whl → 0.0.79__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddeutil/workflow/__about__.py +1 -1
- ddeutil/workflow/__init__.py +1 -5
- ddeutil/workflow/api/routes/job.py +2 -2
- ddeutil/workflow/audits.py +554 -112
- ddeutil/workflow/cli.py +19 -1
- ddeutil/workflow/conf.py +9 -21
- ddeutil/workflow/event.py +15 -6
- ddeutil/workflow/job.py +147 -73
- ddeutil/workflow/params.py +172 -58
- ddeutil/workflow/plugins/__init__.py +0 -0
- ddeutil/workflow/plugins/providers/__init__.py +0 -0
- ddeutil/workflow/plugins/providers/aws.py +908 -0
- ddeutil/workflow/plugins/providers/az.py +1003 -0
- ddeutil/workflow/plugins/providers/container.py +703 -0
- ddeutil/workflow/plugins/providers/gcs.py +826 -0
- ddeutil/workflow/result.py +6 -4
- ddeutil/workflow/reusables.py +151 -95
- ddeutil/workflow/stages.py +28 -28
- ddeutil/workflow/traces.py +1678 -540
- ddeutil/workflow/utils.py +109 -67
- ddeutil/workflow/workflow.py +20 -11
- {ddeutil_workflow-0.0.78.dist-info → ddeutil_workflow-0.0.79.dist-info}/METADATA +52 -19
- ddeutil_workflow-0.0.79.dist-info/RECORD +36 -0
- ddeutil_workflow-0.0.78.dist-info/RECORD +0 -30
- {ddeutil_workflow-0.0.78.dist-info → ddeutil_workflow-0.0.79.dist-info}/WHEEL +0 -0
- {ddeutil_workflow-0.0.78.dist-info → ddeutil_workflow-0.0.79.dist-info}/entry_points.txt +0 -0
- {ddeutil_workflow-0.0.78.dist-info → ddeutil_workflow-0.0.79.dist-info}/licenses/LICENSE +0 -0
- {ddeutil_workflow-0.0.78.dist-info → ddeutil_workflow-0.0.79.dist-info}/top_level.txt +0 -0
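
The headline change in 0.0.79 is the new `plugins/providers` package, which adds AWS Batch, Azure Batch, container, and GCS execution backends. As a quick orientation before the full `aws.py` diff below, here is a minimal, hedged sketch of probing for the AWS provider from Python; the import path follows the file layout above, `AWS_AVAILABLE` and the constructor arguments come from the module source shown later, and the ARN and bucket values are placeholders, not values from this release:

```python
# Sketch only: availability check for the new AWS Batch provider.
# aws.py sets AWS_AVAILABLE at import time (boto3 is an optional dependency);
# only the AWSBatchProvider constructor raises ImportError when boto3 is missing.
from ddeutil.workflow.plugins.providers.aws import AWS_AVAILABLE, AWSBatchProvider

if AWS_AVAILABLE:
    provider = AWSBatchProvider(
        job_queue_arn="arn:aws:batch:us-east-1:111111111111:job-queue/workflow-queue",  # placeholder
        s3_bucket="my-workflow-bucket",  # placeholder
        region_name="us-east-1",
    )
else:
    print("boto3 not installed; run: pip install boto3")
```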
ddeutil/workflow/plugins/providers/aws.py (new file)
@@ -0,0 +1,908 @@
# ------------------------------------------------------------------------------
# Copyright (c) 2022 Korawich Anuttra. All rights reserved.
# Licensed under the MIT License. See LICENSE in the project root for
# license information.
# ------------------------------------------------------------------------------
"""AWS Batch Provider Module.

This module provides AWS Batch integration for workflow job execution.
It handles compute environment creation, job queue management, job submission,
task execution, and result retrieval.

The AWS Batch provider enables running workflow jobs on AWS Batch compute
environments, providing scalable and managed execution environments for complex
workflow processing.

Key Features:
    - Automatic compute environment creation and management
    - Job queue management and job submission
    - Result file upload/download via S3
    - Error handling and status monitoring
    - Resource cleanup and management
    - Optimized file operations and caching

Classes:
    AWSBatchProvider: Main provider for AWS Batch operations
    BatchComputeEnvironmentConfig: Configuration for AWS Batch compute environments
    BatchJobQueueConfig: Configuration for AWS Batch job queues
    BatchJobConfig: Configuration for AWS Batch jobs
    BatchTaskConfig: Configuration for AWS Batch tasks

References:
    - https://docs.aws.amazon.com/batch/latest/userguide/what-is-batch.html
    - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/batch.html

Config Example:

    ```dotenv
    export AWS_ACCESS_KEY_ID="your-access-key"
    export AWS_SECRET_ACCESS_KEY="your-secret-key"
    export AWS_DEFAULT_REGION="us-east-1"
    export AWS_BATCH_JOB_QUEUE_ARN="arn:aws:batch:region:account:job-queue/queue-name"
    export AWS_S3_BUCKET="your-s3-bucket"
    ```

    ```yaml
    jobs:
      my-job:
        runs-on:
          type: "aws_batch"
          with:
            job_queue_arn: "${AWS_BATCH_JOB_QUEUE_ARN}"
            s3_bucket: "${AWS_S3_BUCKET}"
            compute_environment_type: "EC2"
            instance_types: ["c5.large", "c5.xlarge"]
        stages:
          - name: "process"
            type: "py"
            run: |
              # Your processing logic here
              result.context.update({"output": "processed"})
    ```

"""
from __future__ import annotations

import json
import os
import tempfile
import time
from contextlib import contextmanager
from typing import Any, Optional

try:
    import boto3
    from botocore.config import Config
    from botocore.exceptions import ClientError

    AWS_AVAILABLE = True
except ImportError:
    AWS_AVAILABLE = False

from pydantic import BaseModel, Field

from ...__types import DictData
from ...job import Job
from ...result import FAILED, SUCCESS, Result
from ...traces import get_trace
from ...utils import gen_id


class BatchComputeEnvironmentConfig(BaseModel):
    """AWS Batch compute environment configuration."""

    compute_environment_name: str = Field(
        description="Unique compute environment name"
    )
    compute_environment_type: str = Field(
        default="EC2", description="Compute environment type (EC2/SPOT)"
    )
    instance_types: list[str] = Field(
        default=["c5.large"], description="EC2 instance types"
    )
    min_vcpus: int = Field(default=0, description="Minimum vCPUs")
    max_vcpus: int = Field(default=256, description="Maximum vCPUs")
    desired_vcpus: int = Field(default=0, description="Desired vCPUs")
    subnets: list[str] = Field(description="Subnet IDs for compute resources")
    security_group_ids: list[str] = Field(description="Security group IDs")
    instance_role: str = Field(description="IAM instance profile ARN")
    service_role: str = Field(description="IAM service role ARN")
    enable_managed_compute: bool = Field(
        default=True, description="Enable managed compute"
    )
    spot_iam_fleet_role: Optional[str] = Field(
        default=None, description="Spot IAM fleet role ARN"
    )
    bid_percentage: Optional[int] = Field(
        default=None, description="Spot bid percentage"
    )


class BatchJobQueueConfig(BaseModel):
    """AWS Batch job queue configuration."""

    job_queue_name: str = Field(description="Unique job queue name")
    state: str = Field(default="ENABLED", description="Job queue state")
    priority: int = Field(default=1, description="Job queue priority")
    compute_environment_order: list[dict[str, str]] = Field(
        description="Compute environment order"
    )
    scheduling_policy_arn: Optional[str] = Field(
        default=None, description="Scheduling policy ARN"
    )


class BatchJobConfig(BaseModel):
    """AWS Batch job configuration."""

    job_name: str = Field(description="Unique job name")
    job_queue_arn: str = Field(description="Job queue ARN")
    job_definition_arn: str = Field(description="Job definition ARN")
    parameters: Optional[dict[str, str]] = Field(
        default=None, description="Job parameters"
    )
    timeout: Optional[dict[str, int]] = Field(
        default=None, description="Job timeout"
    )
    retry_strategy: Optional[dict[str, Any]] = Field(
        default=None, description="Retry strategy"
    )
    depends_on: Optional[list[dict[str, str]]] = Field(
        default=None, description="Job dependencies"
    )


class BatchTaskConfig(BaseModel):
    """AWS Batch task configuration."""

    task_name: str = Field(description="Unique task name")
    command: list[str] = Field(description="Command to execute")
    vcpus: int = Field(default=1, description="Number of vCPUs")
    memory: int = Field(default=1024, description="Memory in MiB")
    job_role_arn: Optional[str] = Field(
        default=None, description="IAM job role ARN"
    )
    timeout: Optional[dict[str, int]] = Field(
        default=None, description="Task timeout"
    )
    environment_variables: Optional[dict[str, str]] = Field(
        default=None, description="Environment variables"
    )
    mount_points: Optional[list[dict[str, str]]] = Field(
        default=None, description="Mount points"
    )
    volumes: Optional[list[dict[str, Any]]] = Field(
        default=None, description="Volumes"
    )


class AWSBatchProvider:
    """AWS Batch provider for workflow job execution.

    This provider handles the complete lifecycle of AWS Batch operations
    including compute environment creation, job queue management, job submission,
    task execution, and result retrieval. It integrates with S3 for file management
    and provides comprehensive error handling and monitoring.

    Attributes:
        batch_client: AWS Batch client
        s3_client: AWS S3 client
        ec2_client: AWS EC2 client
        iam_client: AWS IAM client
        s3_bucket: S3 bucket name for files
        compute_env_config: Compute environment configuration
        job_queue_config: Job queue configuration
        job_config: Job configuration
        task_config: Task configuration

    Example:
        ```python
        provider = AWSBatchProvider(
            job_queue_arn="arn:aws:batch:region:account:job-queue/queue-name",
            s3_bucket="my-workflow-bucket",
            region_name="us-east-1"
        )

        result = provider.execute_job(job, params, run_id="job-123")
        ```
    """

    def __init__(
        self,
        job_queue_arn: str,
        s3_bucket: str,
        region_name: str = "us-east-1",
        compute_env_config: Optional[BatchComputeEnvironmentConfig] = None,
        job_queue_config: Optional[BatchJobQueueConfig] = None,
        job_config: Optional[BatchJobConfig] = None,
        task_config: Optional[BatchTaskConfig] = None,
        aws_access_key_id: Optional[str] = None,
        aws_secret_access_key: Optional[str] = None,
        aws_session_token: Optional[str] = None,
    ):
        """Initialize AWS Batch provider.

        Args:
            job_queue_arn: AWS Batch job queue ARN
            s3_bucket: S3 bucket name for files
            region_name: AWS region name
            compute_env_config: Compute environment configuration
            job_queue_config: Job queue configuration
            job_config: Job configuration
            task_config: Task configuration
            aws_access_key_id: AWS access key ID
            aws_secret_access_key: AWS secret access key
            aws_session_token: AWS session token
        """
        if not AWS_AVAILABLE:
            raise ImportError(
                "AWS dependencies not available. "
                "Install with: pip install boto3"
            )

        self.job_queue_arn = job_queue_arn
        self.s3_bucket = s3_bucket
        self.region_name = region_name

        # Initialize AWS clients with optimized configuration
        session = boto3.Session(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            aws_session_token=aws_session_token,
            region_name=region_name,
        )

        # Configure clients with retry and timeout settings
        config = Config(
            retries={"max_attempts": 3, "mode": "adaptive"},
            connect_timeout=30,
            read_timeout=300,
        )

        self.batch_client = session.client("batch", config=config)
        self.s3_client = session.client("s3", config=config)
        self.ec2_client = session.client("ec2", config=config)
        self.iam_client = session.client("iam", config=config)

        # Set configurations
        self.compute_env_config = compute_env_config
        self.job_queue_config = job_queue_config
        self.job_config = job_config
        self.task_config = task_config

        # Cache for bucket operations
        self._bucket_exists: Optional[bool] = None

    @contextmanager
    def _temp_file_context(self, suffix: str = ".tmp"):
        """Context manager for temporary file operations."""
        temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
        try:
            yield temp_file.name
        finally:
            try:
                os.unlink(temp_file.name)
            except OSError:
                pass

    def _ensure_s3_bucket(self) -> None:
        """Ensure S3 bucket exists with optimized settings."""
        if self._bucket_exists is None:
            try:
                self.s3_client.head_bucket(Bucket=self.s3_bucket)
                self._bucket_exists = True
            except ClientError as e:
                error_code = e.response["Error"]["Code"]
                if error_code == "404":
                    # Create bucket with optimized settings
                    create_kwargs = {
                        "Bucket": self.s3_bucket,
                        "CreateBucketConfiguration": {
                            "LocationConstraint": self.region_name
                        },
                    }

                    # Add versioning for better data protection
                    self.s3_client.create_bucket(**create_kwargs)
                    self.s3_client.put_bucket_versioning(
                        Bucket=self.s3_bucket,
                        VersioningConfiguration={"Status": "Enabled"},
                    )

                    # Add lifecycle policy for cost optimization
                    lifecycle_config = {
                        "Rules": [
                            {
                                "ID": "workflow-cleanup",
                                "Status": "Enabled",
                                "Filter": {"Prefix": "jobs/"},
                                "Expiration": {
                                    "Days": 7  # Keep workflow files for 7 days
                                },
                            }
                        ]
                    }

                    try:
                        self.s3_client.put_bucket_lifecycle_configuration(
                            Bucket=self.s3_bucket,
                            LifecycleConfiguration=lifecycle_config,
                        )
                    except ClientError:
                        # Lifecycle configuration might not be supported
                        pass

                    self._bucket_exists = True
                else:
                    raise

    def _upload_file_to_s3(self, file_path: str, s3_key: str) -> str:
        """Upload file to S3 with optimized settings.

        Args:
            file_path: Local file path
            s3_key: S3 object key

        Returns:
            str: S3 object URL
        """
        self._ensure_s3_bucket()

        # Set optimized metadata for workflow files
        metadata = {
            "workflow_provider": "aws_batch",
            "upload_time": str(time.time()),
            "content_type": "application/octet-stream",
        }

        with open(file_path, "rb") as data:
            self.s3_client.upload_fileobj(
                data,
                self.s3_bucket,
                s3_key,
                ExtraArgs={
                    "Metadata": metadata,
                    "StorageClass": "STANDARD_IA",  # Use IA for cost optimization
                },
            )

        return f"s3://{self.s3_bucket}/{s3_key}"

    def _download_file_from_s3(self, s3_key: str, local_path: str) -> None:
        """Download file from S3 with optimized settings.

        Args:
            s3_key: S3 object key
            local_path: Local file path
        """
        self.s3_client.download_file(
            self.s3_bucket,
            s3_key,
            local_path,
            ExtraArgs={
                "RequestPayer": "requester"
            },  # Handle cross-account access
        )

    def _create_job_definition_if_not_exists(self, job_def_name: str) -> str:
        """Create AWS Batch job definition if it doesn't exist with optimized settings.

        Args:
            job_def_name: Job definition name

        Returns:
            str: Job definition ARN
        """
        try:
            response = self.batch_client.describe_job_definitions(
                jobDefinitionName=job_def_name, status="ACTIVE"
            )
            if response["jobDefinitions"]:
                return response["jobDefinitions"][0]["jobDefinitionArn"]
        except ClientError:
            pass

        # Create optimized job definition
        job_def_config = self.task_config or BatchTaskConfig(
            task_name=job_def_name, command=["python3", "task_script.py"]
        )

        # Build environment variables
        environment = []
        if job_def_config.environment_variables:
            for key, value in job_def_config.environment_variables.items():
                environment.append({"name": key, "value": value})

        # Add optimized environment variables
        environment.extend(
            [
                {"name": "PYTHONUNBUFFERED", "value": "1"},
                {"name": "PYTHONDONTWRITEBYTECODE", "value": "1"},
                {"name": "AWS_DEFAULT_REGION", "value": self.region_name},
            ]
        )

        # Build container properties
        container_props = {
            "image": "python:3.11-slim",
            "vcpus": job_def_config.vcpus,
            "memory": job_def_config.memory,
            "command": job_def_config.command,
            "environment": environment,
            "resourceRequirements": [
                {"type": "VCPU", "value": str(job_def_config.vcpus)},
                {"type": "MEMORY", "value": str(job_def_config.memory)},
            ],
        }

        # Add optional configurations
        if job_def_config.job_role_arn:
            container_props["jobRoleArn"] = job_def_config.job_role_arn
            container_props["executionRoleArn"] = job_def_config.job_role_arn

        if job_def_config.mount_points and job_def_config.volumes:
            container_props["mountPoints"] = job_def_config.mount_points
            container_props["volumes"] = job_def_config.volumes

        response = self.batch_client.register_job_definition(
            jobDefinitionName=job_def_name,
            type="container",
            containerProperties=container_props,
            platformCapabilities=["EC2"],  # Specify platform capabilities
        )

        return response["jobDefinitionArn"]

    def _create_job(
        self, job_name: str, job_def_arn: str, parameters: dict[str, str]
    ) -> str:
        """Create AWS Batch job with optimized settings.

        Args:
            job_name: Job name
            job_def_arn: Job definition ARN
            parameters: Job parameters

        Returns:
            str: Job ARN
        """
        job_config = self.job_config or BatchJobConfig(
            job_name=job_name,
            job_queue_arn=self.job_queue_arn,
            job_definition_arn=job_def_arn,
        )

        # Build job parameters
        job_params = {
            "jobName": job_name,
            "jobQueue": self.job_queue_arn,
            "jobDefinition": job_def_arn,
            "parameters": parameters or {},
        }

        # Add optional configurations
        if job_config.timeout:
            job_params["timeout"] = job_config.timeout

        if job_config.retry_strategy:
            job_params["retryStrategy"] = job_config.retry_strategy

        if job_config.depends_on:
            job_params["dependsOn"] = job_config.depends_on

        response = self.batch_client.submit_job(**job_params)
        return response["jobArn"]

    def _wait_for_job_completion(
        self, job_arn: str, timeout: int = 3600
    ) -> dict[str, Any]:
        """Wait for job completion with optimized polling.

        Args:
            job_arn: Job ARN
            timeout: Timeout in seconds

        Returns:
            Dict[str, Any]: Job results
        """
        start_time = time.time()
        poll_interval = 10  # Start with 10 second intervals

        while time.time() - start_time < timeout:
            try:
                response = self.batch_client.describe_jobs(jobs=[job_arn])
                job = response["jobs"][0]

                if job["status"] == "SUCCEEDED":
                    return self._process_successful_job(job, job_arn)

                elif job["status"] == "FAILED":
                    return self._process_failed_job(job)

                elif job["status"] in ["RUNNING", "SUBMITTED", "PENDING"]:
                    # Adaptive polling: increase interval for long-running jobs
                    if time.time() - start_time > 300:  # After 5 minutes
                        poll_interval = min(
                            poll_interval * 1.5, 60
                        )  # Max 60 seconds

                    time.sleep(poll_interval)
                else:
                    # For other states, use shorter polling
                    time.sleep(5)

            except ClientError as e:
                if e.response["Error"]["Code"] == "JobNotFoundException":
                    # Job might be deleted, wait a bit and retry
                    time.sleep(poll_interval)
                else:
                    # Continue polling on error with exponential backoff
                    poll_interval = min(poll_interval * 2, 60)
                    time.sleep(poll_interval)
            except Exception:
                # Continue polling on error with exponential backoff
                poll_interval = min(poll_interval * 2, 60)
                time.sleep(poll_interval)

        return {"status": "timeout", "exit_code": 1}

    def _process_successful_job(
        self, job: dict[str, Any], job_arn: str
    ) -> dict[str, Any]:
        """Process successful job and download results.

        Args:
            job: Job object
            job_arn: Job ARN

        Returns:
            Dict[str, Any]: Job results with files
        """
        result_files = {}
        try:
            # List objects in job's S3 prefix
            job_id = job_arn.split("/")[-1]
            prefix = f"jobs/{job_id}/"

            response = self.s3_client.list_objects_v2(
                Bucket=self.s3_bucket, Prefix=prefix
            )

            # Download result files efficiently
            for obj in response.get("Contents", []):
                if obj["Key"].endswith((".json", ".txt", ".log")):
                    with self._temp_file_context() as tmp_file:
                        self.s3_client.download_file(
                            self.s3_bucket, obj["Key"], tmp_file
                        )
                        with open(tmp_file) as f:
                            result_files[obj["Key"]] = f.read()
        except Exception:
            # File download failed, continue with empty results
            pass

        return {"status": "completed", "exit_code": 0, "files": result_files}

    def _process_failed_job(self, job: dict[str, Any]) -> dict[str, Any]:
        """Process failed job and extract error information.

        Args:
            job: Job object

        Returns:
            Dict[str, Any]: Failure information
        """
        failure_reason = "Job failed"

        # Try to extract more detailed error information
        if "attempts" in job and job["attempts"]:
            last_attempt = job["attempts"][-1]
            if "reason" in last_attempt:
                failure_reason = last_attempt["reason"]

        return {
            "status": "failed",
            "exit_code": 1,
            "failure_reason": failure_reason,
        }

    def _create_optimized_task_script(
        self, job: Job, params: DictData, run_id: str
    ) -> str:
        """Create optimized Python script for task execution.

        Args:
            job: Job to execute
            params: Job parameters
            run_id: Execution run ID

        Returns:
            str: Path to created script
        """
        script_content = f'''#!/usr/bin/env python3
import json
import sys
import os
import subprocess
import time
from pathlib import Path

def install_package(package):
    """Install package with retry logic."""
    for attempt in range(3):
        try:
            subprocess.run([sys.executable, '-m', 'pip', 'install', package],
                           check=True, capture_output=True, timeout=300)
            return True
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
            if attempt == 2:
                raise
            time.sleep(2 ** attempt)

def download_file(s3_url, local_path):
    """Download file with retry logic."""
    for attempt in range(3):
        try:
            subprocess.run(['aws', 's3', 'cp', s3_url, local_path],
                           check=True, capture_output=True, timeout=300)
            return True
        except subprocess.CalledProcessError:
            if attempt == 2:
                raise
            time.sleep(2 ** attempt)

# Install ddeutil-workflow with retry
install_package('ddeutil-workflow')

# Download files with retry
download_file(os.environ['JOB_CONFIG_S3_URL'], 'job_config.json')
download_file(os.environ['PARAMS_S3_URL'], 'params.json')
download_file(os.environ['SCRIPT_S3_URL'], 'task_script.py')

# Add current directory to Python path
sys.path.insert(0, os.getcwd())

from ddeutil.workflow.job import local_execute
from ddeutil.workflow import Job

# Load job configuration
with open('job_config.json', 'r') as f:
    job_data = json.load(f)

# Load parameters
with open('params.json', 'r') as f:
    params = json.load(f)

# Create job instance
job = Job(**job_data)

# Execute job
result = local_execute(job, params, run_id='{run_id}')

# Save result
with open('result.json', 'w') as f:
    json.dump(result.model_dump(), f, indent=2)

# Upload result to S3 with retry
job_id = '{run_id}'
bucket = '{self.s3_bucket}'

# Create directory structure
subprocess.run(['aws', 's3', 'mkdir', 's3://{{bucket}}/jobs/{{job_id}}'],
               check=True, capture_output=True)

# Upload result file with retry
download_file('result.json', f's3://{{bucket}}/jobs/{{job_id}}/result.json')

sys.exit(0 if result.status == 'success' else 1)
'''

        with self._temp_file_context(suffix=".py") as script_path:
            with open(script_path, "w") as f:
                f.write(script_content)
            return script_path

    def execute_job(
        self,
        job: Job,
        params: DictData,
        *,
        run_id: Optional[str] = None,
        event: Optional[Any] = None,
    ) -> Result:
        """Execute job on AWS Batch with optimized performance.

        Args:
            job: Job to execute
            params: Job parameters
            run_id: Execution run ID
            event: Event for cancellation

        Returns:
            Result: Execution result
        """
        if event and event.is_set():
            return Result(
                status=FAILED,
                context={
                    "errors": {"message": "Execution was canceled before start"}
                },
                run_id=run_id or gen_id("aws-batch"),
                extras={},
            )

        # Generate run ID if not provided
        if not run_id:
            run_id = gen_id(job.id or "aws-batch", unique=True)

        trace = get_trace(run_id, extras=job.extras)
        trace.info(f"[AWS_BATCH]: Starting job execution: {job.id}")

        try:
            # Create job definition
            job_def_name = f"workflow-job-def-{run_id}"
            trace.info(f"[AWS_BATCH]: Creating job definition: {job_def_name}")
            job_def_arn = self._create_job_definition_if_not_exists(
                job_def_name
            )

            # Create optimized task script
            script_path = self._create_optimized_task_script(
                job, params, run_id
            )

            # Upload files efficiently
            job_config_s3_key = f"jobs/{run_id}/job_config.json"
            params_s3_key = f"jobs/{run_id}/params.json"
            script_s3_key = f"jobs/{run_id}/task_script.py"

            # Upload files efficiently
            trace.info("[AWS_BATCH]: Uploading files to S3")

            with self._temp_file_context(suffix=".json") as job_config_path:
                with open(job_config_path, "w") as f:
                    json.dump(job.model_dump(), f)
                self._upload_file_to_s3(job_config_path, job_config_s3_key)

            with self._temp_file_context(suffix=".json") as params_path:
                with open(params_path, "w") as f:
                    json.dump(params, f)
                self._upload_file_to_s3(params_path, params_s3_key)

            self._upload_file_to_s3(script_path, script_s3_key)

            # Create job
            job_name = f"workflow-job-{run_id}"
            job_parameters = {
                "job_config_s3_url": f"s3://{self.s3_bucket}/{job_config_s3_key}",
                "params_s3_url": f"s3://{self.s3_bucket}/{params_s3_key}",
                "script_s3_url": f"s3://{self.s3_bucket}/{script_s3_key}",
            }

            trace.info(f"[AWS_BATCH]: Creating job: {job_name}")
            job_arn = self._create_job(job_name, job_def_arn, job_parameters)

            # Wait for job completion
            trace.info("[AWS_BATCH]: Waiting for job completion")
            job_result = self._wait_for_job_completion(job_arn)

            # Process results
            if job_result["status"] == "completed":
                result_data = {}
                result_file_key = f"jobs/{run_id}/result.json"

                if result_file_key in job_result.get("files", {}):
                    try:
                        result_data = json.loads(
                            job_result["files"][result_file_key]
                        )
                    except (json.JSONDecodeError, KeyError):
                        result_data = {"status": SUCCESS}

                trace.info("[AWS_BATCH]: Job completed successfully")
                return Result(
                    status=SUCCESS,
                    context=result_data,
                    run_id=run_id,
                    extras=job.extras or {},
                )
            else:
                error_msg = f"Job failed: {job_result.get('status', 'unknown')}"
                if job_result.get("failure_reason"):
                    error_msg += f" - {job_result['failure_reason']}"

                trace.error(f"[AWS_BATCH]: {error_msg}")
                return Result(
                    status=FAILED,
                    context={"errors": {"message": error_msg}},
                    run_id=run_id,
                    extras=job.extras or {},
                )

        except Exception as e:
            trace.error(f"[AWS_BATCH]: Execution failed: {str(e)}")
            return Result(
                status=FAILED,
                context={"errors": {"message": str(e)}},
                run_id=run_id,
                extras=job.extras or {},
            )

    def cleanup(self, job_id: Optional[str] = None) -> None:
        """Clean up AWS Batch resources efficiently.

        Args:
            job_id: Job ID to clean up (if None, cleans up all workflow jobs)
        """
        try:
            prefix = f"jobs/{job_id}/" if job_id else "jobs/"

            # List objects with pagination for large datasets
            paginator = self.s3_client.get_paginator("list_objects_v2")
            pages = paginator.paginate(Bucket=self.s3_bucket, Prefix=prefix)

            # Delete objects in batches for better performance
            batch_size = 1000
            objects_to_delete = []

            for page in pages:
                for obj in page.get("Contents", []):
                    objects_to_delete.append({"Key": obj["Key"]})

                    if len(objects_to_delete) >= batch_size:
                        self.s3_client.delete_objects(
                            Bucket=self.s3_bucket,
                            Delete={"Objects": objects_to_delete},
                        )
                        objects_to_delete = []

            # Delete remaining objects
            if objects_to_delete:
                self.s3_client.delete_objects(
                    Bucket=self.s3_bucket, Delete={"Objects": objects_to_delete}
                )

        except Exception:
            pass


def aws_batch_execute(
    job: Job,
    params: DictData,
    *,
    run_id: Optional[str] = None,
    event: Optional[Any] = None,
) -> Result:
    """AWS Batch job execution function with optimized performance.

    This function creates an AWS Batch provider and executes the job
    on AWS Batch compute environments. It handles the complete lifecycle
    including job definition creation, job submission, and result retrieval.

    Args:
        job: Job to execute
        params: Job parameters
        run_id: Execution run ID
        event: Event for cancellation

    Returns:
        Result: Execution result
    """
    # Extract AWS Batch configuration from job
    batch_args = job.runs_on.args

    provider = AWSBatchProvider(
        job_queue_arn=batch_args.job_queue_arn,
        s3_bucket=batch_args.s3_bucket,
        region_name=batch_args.region_name,
        aws_access_key_id=batch_args.aws_access_key_id,
        aws_secret_access_key=batch_args.aws_secret_access_key,
        aws_session_token=batch_args.aws_session_token,
    )

    try:
        return provider.execute_job(job, params, run_id=run_id, event=event)
    finally:
        # Clean up resources
        if run_id:
            provider.cleanup(run_id)
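
For reference, a minimal sketch of driving the new provider directly, mirroring what `aws_batch_execute` does above. Here `job` and `params` stand for an already-loaded ddeutil-workflow `Job` and its parameter dict, and the ARN and bucket values are placeholders rather than anything shipped in this release:

```python
# Sketch under stated assumptions: same lifecycle as aws_batch_execute(),
# but the provider is constructed explicitly instead of from job.runs_on.args.
from ddeutil.workflow.plugins.providers.aws import AWSBatchProvider

provider = AWSBatchProvider(
    job_queue_arn="arn:aws:batch:us-east-1:111111111111:job-queue/workflow-queue",  # placeholder
    s3_bucket="my-workflow-bucket",  # placeholder
)
run_id = "job-123"
try:
    # Uploads the job config, params, and generated task script to
    # s3://<bucket>/jobs/<run_id>/, submits the Batch job, and polls until done.
    result = provider.execute_job(job, params, run_id=run_id)
    print(result.status)
finally:
    # Removes the jobs/<run_id>/ objects the run created in S3.
    provider.cleanup(run_id)
```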