ddeutil_workflow-0.0.77-py3-none-any.whl → ddeutil_workflow-0.0.79-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddeutil/workflow/__about__.py +1 -1
- ddeutil/workflow/__init__.py +1 -5
- ddeutil/workflow/api/routes/job.py +2 -2
- ddeutil/workflow/audits.py +554 -112
- ddeutil/workflow/cli.py +25 -3
- ddeutil/workflow/conf.py +16 -28
- ddeutil/workflow/errors.py +13 -15
- ddeutil/workflow/event.py +37 -41
- ddeutil/workflow/job.py +161 -92
- ddeutil/workflow/params.py +172 -58
- ddeutil/workflow/plugins/__init__.py +0 -0
- ddeutil/workflow/plugins/providers/__init__.py +0 -0
- ddeutil/workflow/plugins/providers/aws.py +908 -0
- ddeutil/workflow/plugins/providers/az.py +1003 -0
- ddeutil/workflow/plugins/providers/container.py +703 -0
- ddeutil/workflow/plugins/providers/gcs.py +826 -0
- ddeutil/workflow/result.py +35 -37
- ddeutil/workflow/reusables.py +153 -96
- ddeutil/workflow/stages.py +84 -60
- ddeutil/workflow/traces.py +1660 -521
- ddeutil/workflow/utils.py +111 -69
- ddeutil/workflow/workflow.py +74 -47
- {ddeutil_workflow-0.0.77.dist-info → ddeutil_workflow-0.0.79.dist-info}/METADATA +52 -20
- ddeutil_workflow-0.0.79.dist-info/RECORD +36 -0
- ddeutil_workflow-0.0.77.dist-info/RECORD +0 -30
- {ddeutil_workflow-0.0.77.dist-info → ddeutil_workflow-0.0.79.dist-info}/WHEEL +0 -0
- {ddeutil_workflow-0.0.77.dist-info → ddeutil_workflow-0.0.79.dist-info}/entry_points.txt +0 -0
- {ddeutil_workflow-0.0.77.dist-info → ddeutil_workflow-0.0.79.dist-info}/licenses/LICENSE +0 -0
- {ddeutil_workflow-0.0.77.dist-info → ddeutil_workflow-0.0.79.dist-info}/top_level.txt +0 -0
ddeutil/workflow/plugins/providers/gcs.py (added)

@@ -0,0 +1,826 @@

````python
# ------------------------------------------------------------------------------
# Copyright (c) 2022 Korawich Anuttra. All rights reserved.
# Licensed under the MIT License. See LICENSE in the project root for
# license information.
# ------------------------------------------------------------------------------
"""Google Cloud Batch Provider Module.

This module provides Google Cloud Batch integration for workflow job execution.
It handles job creation, task execution, and result retrieval using Google Cloud
Batch service and Google Cloud Storage.

The Google Cloud Batch provider enables running workflow jobs on Google Cloud
Batch compute resources, providing scalable and managed execution environments
for complex workflow processing.

Key Features:
- Automatic job creation and management
- Task execution on Google Cloud compute resources
- Result file upload/download via Google Cloud Storage
- Error handling and status monitoring
- Resource cleanup and management
- Optimized file operations and caching

Classes:
    GoogleCloudBatchProvider: Main provider for Google Cloud Batch operations
    BatchJobConfig: Configuration for Google Cloud Batch jobs
    BatchTaskConfig: Configuration for Google Cloud Batch tasks
    BatchResourceConfig: Configuration for compute resources

References:
    - https://cloud.google.com/batch/docs
    - https://googleapis.dev/python/batch/latest/index.html

Config Example:

    ```dotenv
    export GOOGLE_APPLICATION_CREDENTIALS="/path/to/service-account.json"
    export GOOGLE_CLOUD_PROJECT="your-project-id"
    export GOOGLE_CLOUD_REGION="us-central1"
    export GCS_BUCKET="your-gcs-bucket"
    ```

    ```yaml
    jobs:
      my-job:
        runs-on:
          type: "gcp_batch"
          with:
            project_id: "${GOOGLE_CLOUD_PROJECT}"
            region: "${GOOGLE_CLOUD_REGION}"
            gcs_bucket: "${GCS_BUCKET}"
            machine_type: "e2-standard-4"
            max_parallel_tasks: 10
        stages:
          - name: "process"
            type: "py"
            run: |
              # Your processing logic here
              result.context.update({"output": "processed"})
    ```

"""
from __future__ import annotations

import json
import os
import tempfile
import time
from contextlib import contextmanager
from typing import Any, Optional

try:
    from google.api_core import exceptions as google_exceptions
    from google.api_core import retry
    from google.cloud import batch_v1, storage

    GCP_AVAILABLE = True
except ImportError:
    GCP_AVAILABLE = False

from pydantic import BaseModel, Field

from ...__types import DictData
from ...job import Job
from ...result import FAILED, SUCCESS, Result
from ...traces import get_trace
from ...utils import gen_id


class BatchResourceConfig(BaseModel):
    """Google Cloud Batch resource configuration."""

    machine_type: str = Field(
        default="e2-standard-4", description="Machine type"
    )
    cpu_count: int = Field(default=4, description="Number of CPUs")
    memory_mb: int = Field(default=16384, description="Memory in MB")
    boot_disk_size_gb: int = Field(
        default=50, description="Boot disk size in GB"
    )
    max_parallel_tasks: int = Field(
        default=1, description="Maximum parallel tasks"
    )
    gpu_count: int = Field(default=0, description="Number of GPUs")
    gpu_type: Optional[str] = Field(default=None, description="GPU type")


class BatchJobConfig(BaseModel):
    """Google Cloud Batch job configuration."""

    job_name: str = Field(description="Unique job name")
    project_id: str = Field(description="Google Cloud project ID")
    region: str = Field(description="Google Cloud region")
    gcs_bucket: str = Field(description="Google Cloud Storage bucket")
    resource_config: Optional[BatchResourceConfig] = Field(
        default=None, description="Resource configuration"
    )
    timeout_seconds: int = Field(
        default=3600, description="Job timeout in seconds"
    )
    retry_count: int = Field(default=2, description="Number of retries")
    preemptible: bool = Field(
        default=False, description="Use preemptible instances"
    )


class BatchTaskConfig(BaseModel):
    """Google Cloud Batch task configuration."""

    task_name: str = Field(description="Unique task name")
    command: list[str] = Field(description="Command to execute")
    image: str = Field(
        default="python:3.11-slim", description="Container image"
    )
    timeout_seconds: int = Field(
        default=3600, description="Task timeout in seconds"
    )
    environment_variables: Optional[dict[str, str]] = Field(
        default=None, description="Environment variables"
    )


class GoogleCloudBatchProvider:
    """Google Cloud Batch provider for workflow job execution.

    This provider handles the complete lifecycle of Google Cloud Batch operations
    including job creation, task execution, and result retrieval. It integrates
    with Google Cloud Storage for file management and provides comprehensive
    error handling and monitoring.

    Attributes:
        batch_client: Google Cloud Batch client
        storage_client: Google Cloud Storage client
        project_id: Google Cloud project ID
        region: Google Cloud region
        gcs_bucket: Google Cloud Storage bucket name
        job_config: Job configuration
        task_config: Task configuration

    Example:
        ```python
        provider = GoogleCloudBatchProvider(
            project_id="my-project",
            region="us-central1",
            gcs_bucket="my-workflow-bucket"
        )

        result = provider.execute_job(job, params, run_id="job-123")
        ```
    """

    def __init__(
        self,
        project_id: str,
        region: str,
        gcs_bucket: str,
        job_config: Optional[BatchJobConfig] = None,
        task_config: Optional[BatchTaskConfig] = None,
        credentials_path: Optional[str] = None,
    ):
        """Initialize Google Cloud Batch provider.

        Args:
            project_id: Google Cloud project ID
            region: Google Cloud region
            gcs_bucket: Google Cloud Storage bucket name
            job_config: Job configuration
            task_config: Task configuration
            credentials_path: Path to service account credentials file
        """
        if not GCP_AVAILABLE:
            raise ImportError(
                "Google Cloud dependencies not available. "
                "Install with: pip install google-cloud-batch google-cloud-storage"
            )

        self.project_id = project_id
        self.region = region
        self.gcs_bucket = gcs_bucket

        # Set credentials if provided
        if credentials_path:
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

        # Initialize Google Cloud clients with retry configuration
        self.batch_client = batch_v1.BatchServiceClient()
        self.storage_client = storage.Client(project=project_id)

        # Set configurations
        self.job_config = job_config
        self.task_config = task_config

        # Cache for bucket and blob operations
        self._bucket_cache: Optional[storage.Bucket] = None

    @property
    def bucket(self) -> storage.Bucket:
        """Get or create cached bucket instance."""
        if self._bucket_cache is None:
            self._bucket_cache = self.storage_client.bucket(self.gcs_bucket)
        return self._bucket_cache

    def _ensure_gcs_bucket(self) -> None:
        """Ensure Google Cloud Storage bucket exists."""
        try:
            self.bucket.reload()
        except google_exceptions.NotFound:
            # Create bucket with optimized settings
            bucket = self.storage_client.create_bucket(
                self.gcs_bucket,
                location=self.region,
                storage_class=storage.StorageClass.STANDARD,
            )
            self._bucket_cache = bucket

    @contextmanager
    def _temp_file_context(self, suffix: str = ".tmp"):
        """Context manager for temporary file operations."""
        temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
        try:
            yield temp_file.name
        finally:
            try:
                os.unlink(temp_file.name)
            except OSError:
                pass

    def _upload_file_to_gcs(self, file_path: str, gcs_blob_name: str) -> str:
        """Upload file to Google Cloud Storage with optimized settings.

        Args:
            file_path: Local file path
            gcs_blob_name: GCS blob name

        Returns:
            str: GCS blob URL
        """
        self._ensure_gcs_bucket()

        blob = self.bucket.blob(gcs_blob_name)

        # Set optimized metadata for workflow files
        blob.metadata = {
            "workflow_provider": "gcp_batch",
            "upload_time": str(time.time()),
        }

        # Use optimized upload settings
        with open(file_path, "rb") as data:
            blob.upload_from_file(
                data,
                content_type="application/octet-stream",
                timeout=300,  # 5 minute timeout
            )

        return f"gs://{self.gcs_bucket}/{gcs_blob_name}"

    def _download_file_from_gcs(
        self, gcs_blob_name: str, local_path: str
    ) -> None:
        """Download file from Google Cloud Storage with optimized settings.

        Args:
            gcs_blob_name: GCS blob name
            local_path: Local file path
        """
        blob = self.bucket.blob(gcs_blob_name)

        with open(local_path, "wb") as data:
            blob.download_to_file(data, timeout=300)

    def _create_job_definition(
        self,
        job_name: str,
        task_script_gcs_url: str,
        job_config_gcs_url: str,
        params_gcs_url: str,
    ) -> batch_v1.Job:
        """Create optimized job definition.

        Args:
            job_name: Job name
            task_script_gcs_url: GCS URL of task script
            job_config_gcs_url: GCS URL of job configuration
            params_gcs_url: GCS URL of parameters

        Returns:
            batch_v1.Job: Job definition
        """
        job_config = self.job_config or BatchJobConfig(
            job_name=job_name,
            project_id=self.project_id,
            region=self.region,
            gcs_bucket=self.gcs_bucket,
        )

        resource_config = job_config.resource_config or BatchResourceConfig()

        # Create optimized runnable
        runnable = batch_v1.Runnable()
        runnable.container = batch_v1.Runnable.Container()
        runnable.container.image_uri = "python:3.11-slim"
        runnable.container.commands = ["python3", "task_script.py"]

        # Add environment variables with optimized settings
        env_vars = {
            "TASK_SCRIPT_URL": task_script_gcs_url,
            "JOB_CONFIG_URL": job_config_gcs_url,
            "PARAMS_URL": params_gcs_url,
            "PYTHONUNBUFFERED": "1",  # Ensure immediate output
            "PYTHONDONTWRITEBYTECODE": "1",  # Don't create .pyc files
        }

        if self.task_config and self.task_config.environment_variables:
            env_vars.update(self.task_config.environment_variables)

        runnable.container.environment = batch_v1.Environment()
        runnable.container.environment.variables = env_vars

        # Create optimized task specification
        task = batch_v1.TaskSpec()
        task.runnables = [runnable]
        task.max_retry_count = job_config.retry_count
        task.max_run_duration = f"{job_config.timeout_seconds}s"

        # Configure compute resources
        resources = batch_v1.ComputeResource()
        resources.cpu_milli = resource_config.cpu_count * 1000
        resources.memory_mib = resource_config.memory_mb

        # Add GPU configuration if specified
        if resource_config.gpu_count > 0 and resource_config.gpu_type:
            resources.gpu_count = resource_config.gpu_count
            resources.gpu_type = resource_config.gpu_type

        task.compute_resource = resources

        # Create job with optimized allocation policy
        job = batch_v1.Job()
        job.name = job_name
        job.task_groups = [
            batch_v1.TaskGroup(
                task_spec=task,
                task_count=1,
                parallelism=resource_config.max_parallel_tasks,
            )
        ]

        # Configure allocation policy
        job.allocation_policy = batch_v1.AllocationPolicy()

        # Set provisioning model based on configuration
        provisioning_model = (
            batch_v1.AllocationPolicy.ProvisioningModel.PREEMPTIBLE
            if job_config.preemptible
            else batch_v1.AllocationPolicy.ProvisioningModel.STANDARD
        )

        job.allocation_policy.instances = [
            batch_v1.AllocationPolicy.InstancePolicyOrTemplate(
                install_gpu_drivers=resource_config.gpu_count > 0,
                machine_type=resource_config.machine_type,
                provisioning_model=provisioning_model,
            )
        ]

        return job

    def _create_job(
        self,
        job_name: str,
        task_script_gcs_url: str,
        job_config_gcs_url: str,
        params_gcs_url: str,
    ) -> str:
        """Create Google Cloud Batch job with optimized settings.

        Args:
            job_name: Job name
            task_script_gcs_url: GCS URL of task script
            job_config_gcs_url: GCS URL of job configuration
            params_gcs_url: GCS URL of parameters

        Returns:
            str: Job name
        """
        job = self._create_job_definition(
            job_name, task_script_gcs_url, job_config_gcs_url, params_gcs_url
        )

        # Create the job with retry logic
        parent = f"projects/{self.project_id}/locations/{self.region}"

        request = batch_v1.CreateJobRequest(
            parent=parent, job_id=job_name, job=job
        )

        # Use retry decorator for better reliability
        @retry.Retry(
            predicate=retry.if_exception_type(
                google_exceptions.ServiceUnavailable
            )
        )
        def create_job_with_retry():
            operation = self.batch_client.create_job(request=request)
            return operation.result()

        result = create_job_with_retry()
        return result.name

    def _wait_for_job_completion(
        self, job_name: str, timeout: int = 3600
    ) -> dict[str, Any]:
        """Wait for job completion with optimized polling.

        Args:
            job_name: Job name
            timeout: Timeout in seconds

        Returns:
            Dict[str, Any]: Job results
        """
        start_time = time.time()
        poll_interval = 10  # Start with 10 second intervals

        while time.time() - start_time < timeout:
            try:
                request = batch_v1.GetJobRequest(name=job_name)
                job = self.batch_client.get_job(request=request)

                if job.status.state == batch_v1.JobStatus.State.SUCCEEDED:
                    return self._process_successful_job(job, job_name)

                elif job.status.state == batch_v1.JobStatus.State.FAILED:
                    return self._process_failed_job(job)

                elif job.status.state in [
                    batch_v1.JobStatus.State.RUNNING,
                    batch_v1.JobStatus.State.SCHEDULED,
                    batch_v1.JobStatus.State.QUEUED,
                ]:
                    # Adaptive polling: increase interval for long-running jobs
                    if time.time() - start_time > 300:  # After 5 minutes
                        poll_interval = min(
                            poll_interval * 1.5, 60
                        )  # Max 60 seconds

                    time.sleep(poll_interval)
                else:
                    # For other states, use shorter polling
                    time.sleep(5)

            except google_exceptions.NotFound:
                # Job might be deleted, wait a bit and retry
                time.sleep(poll_interval)
            except Exception:
                # Continue polling on error with exponential backoff
                poll_interval = min(poll_interval * 2, 60)
                time.sleep(poll_interval)

        return {"status": "timeout", "exit_code": 1}

    def _process_successful_job(
        self, job: batch_v1.Job, job_name: str
    ) -> dict[str, Any]:
        """Process successful job and download results.

        Args:
            job: Job object
            job_name: Job name

        Returns:
            Dict[str, Any]: Job results with files
        """
        result_files = {}
        try:
            # List objects in job's GCS prefix
            job_id = job_name.split("/")[-1]
            prefix = f"jobs/{job_id}/"

            blobs = self.bucket.list_blobs(prefix=prefix)

            # Download result files in parallel (simplified)
            for blob in blobs:
                if blob.name.endswith((".json", ".txt", ".log")):
                    with self._temp_file_context() as tmp_file:
                        blob.download_to_filename(tmp_file)
                        with open(tmp_file) as f:
                            result_files[blob.name] = f.read()
        except Exception:
            # File download failed, continue with empty results
            pass

        return {"status": "completed", "exit_code": 0, "files": result_files}

    def _process_failed_job(self, job: batch_v1.Job) -> dict[str, Any]:
        """Process failed job and extract error information.

        Args:
            job: Job object

        Returns:
            Dict[str, Any]: Failure information
        """
        failure_reason = "Job failed"

        # Try to extract more detailed error information
        if hasattr(job, "status") and hasattr(job.status, "status_events"):
            for event in job.status.status_events:
                if event.type_ == batch_v1.JobStatus.StatusEvent.Type.FAILED:
                    failure_reason = event.description or failure_reason
                    break

        return {
            "status": "failed",
            "exit_code": 1,
            "failure_reason": failure_reason,
        }

    def _create_optimized_task_script(
        self, job: Job, params: DictData, run_id: str
    ) -> str:
        """Create optimized Python script for task execution.

        Args:
            job: Job to execute
            params: Job parameters
            run_id: Execution run ID

        Returns:
            str: Path to created script
        """
        script_content = f'''#!/usr/bin/env python3
import json
import sys
import os
import subprocess
import time
from pathlib import Path

def install_package(package):
    """Install package with retry logic."""
    for attempt in range(3):
        try:
            subprocess.run([sys.executable, '-m', 'pip', 'install', package],
                           check=True, capture_output=True, timeout=300)
            return True
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
            if attempt == 2:
                raise
            time.sleep(2 ** attempt)

def download_file(url, local_path):
    """Download file with retry logic."""
    for attempt in range(3):
        try:
            subprocess.run(['gsutil', 'cp', url, local_path],
                           check=True, capture_output=True, timeout=300)
            return True
        except subprocess.CalledProcessError:
            if attempt == 2:
                raise
            time.sleep(2 ** attempt)

# Install ddeutil-workflow with retry
install_package('ddeutil-workflow')

# Download files with retry
download_file(os.environ['TASK_SCRIPT_URL'], 'task_script.py')
download_file(os.environ['JOB_CONFIG_URL'], 'job_config.json')
download_file(os.environ['PARAMS_URL'], 'params.json')

# Add current directory to Python path
sys.path.insert(0, os.getcwd())

from ddeutil.workflow.job import local_execute
from ddeutil.workflow import Job

# Load job configuration
with open('job_config.json', 'r') as f:
    job_data = json.load(f)

# Load parameters
with open('params.json', 'r') as f:
    params = json.load(f)

# Create job instance
job = Job(**job_data)

# Execute job
result = local_execute(job, params, run_id='{run_id}')

# Save result
with open('result.json', 'w') as f:
    json.dump(result.model_dump(), f, indent=2)

# Upload result to GCS with retry
job_id = '{run_id}'
bucket = '{self.gcs_bucket}'

# Create directory structure
subprocess.run(['gsutil', 'mkdir', '-p', f'gs://{{bucket}}/jobs/{{job_id}}'],
               check=True, capture_output=True)

# Upload result file with retry
download_file('result.json', f'gs://{{bucket}}/jobs/{{job_id}}/result.json')

sys.exit(0 if result.status == 'success' else 1)
'''

        with self._temp_file_context(suffix=".py") as script_path:
            with open(script_path, "w") as f:
                f.write(script_content)
            return script_path

    def execute_job(
        self,
        job: Job,
        params: DictData,
        *,
        run_id: Optional[str] = None,
        event: Optional[Any] = None,
    ) -> Result:
        """Execute job on Google Cloud Batch with optimized performance.

        Args:
            job: Job to execute
            params: Job parameters
            run_id: Execution run ID
            event: Event for cancellation

        Returns:
            Result: Execution result
        """
        if event and event.is_set():
            return Result(
                status=FAILED,
                context={
                    "errors": {"message": "Execution was canceled before start"}
                },
                run_id=run_id or gen_id("gcp-batch"),
                extras={},
            )

        # Generate run ID if not provided
        if not run_id:
            run_id = gen_id(job.id or "gcp-batch", unique=True)

        trace = get_trace(run_id, extras=job.extras)
        trace.info(f"[GCP_BATCH]: Starting job execution: {job.id}")

        try:
            # Create optimized task script
            script_path = self._create_optimized_task_script(
                job, params, run_id
            )

            # Prepare file paths
            job_config_gcs_blob = f"jobs/{run_id}/job_config.json"
            params_gcs_blob = f"jobs/{run_id}/params.json"
            script_gcs_blob = f"jobs/{run_id}/task_script.py"

            # Upload files efficiently
            trace.info("[GCP_BATCH]: Uploading files to GCS")

            with self._temp_file_context(suffix=".json") as job_config_path:
                with open(job_config_path, "w") as f:
                    json.dump(job.model_dump(), f)
                job_config_gcs_url = self._upload_file_to_gcs(
                    job_config_path, job_config_gcs_blob
                )

            with self._temp_file_context(suffix=".json") as params_path:
                with open(params_path, "w") as f:
                    json.dump(params, f)
                params_gcs_url = self._upload_file_to_gcs(
                    params_path, params_gcs_blob
                )

            task_script_gcs_url = self._upload_file_to_gcs(
                script_path, script_gcs_blob
            )

            # Create job
            job_name = f"workflow-job-{run_id}"

            trace.info(f"[GCP_BATCH]: Creating job: {job_name}")
            job_full_name = self._create_job(
                job_name,
                task_script_gcs_url,
                job_config_gcs_url,
                params_gcs_url,
            )

            # Wait for job completion
            trace.info("[GCP_BATCH]: Waiting for job completion")
            job_result = self._wait_for_job_completion(job_full_name)

            # Process results
            if job_result["status"] == "completed":
                result_data = {}
                result_file_key = f"jobs/{run_id}/result.json"

                if result_file_key in job_result.get("files", {}):
                    try:
                        result_data = json.loads(
                            job_result["files"][result_file_key]
                        )
                    except (json.JSONDecodeError, KeyError):
                        result_data = {"status": SUCCESS}

                trace.info("[GCP_BATCH]: Job completed successfully")
                return Result(
                    status=SUCCESS,
                    context=result_data,
                    run_id=run_id,
                    extras=job.extras or {},
                )
            else:
                error_msg = f"Job failed: {job_result.get('status', 'unknown')}"
                if job_result.get("failure_reason"):
                    error_msg += f" - {job_result['failure_reason']}"

                trace.error(f"[GCP_BATCH]: {error_msg}")
                return Result(
                    status=FAILED,
                    context={"errors": {"message": error_msg}},
                    run_id=run_id,
                    extras=job.extras or {},
                )

        except Exception as e:
            trace.error(f"[GCP_BATCH]: Execution failed: {str(e)}")
            return Result(
                status=FAILED,
                context={"errors": {"message": str(e)}},
                run_id=run_id,
                extras=job.extras or {},
            )

    def cleanup(self, job_id: Optional[str] = None) -> None:
        """Clean up Google Cloud Batch resources efficiently.

        Args:
            job_id: Job ID to clean up (if None, cleans up all workflow jobs)
        """
        try:
            prefix = f"jobs/{job_id}/" if job_id else "jobs/"
            blobs = self.bucket.list_blobs(prefix=prefix)

            # Delete blobs in batches for better performance
            batch_size = 100
            blob_batch = []

            for blob in blobs:
                blob_batch.append(blob)
                if len(blob_batch) >= batch_size:
                    self.bucket.delete_blobs(blob_batch)
                    blob_batch = []

            # Delete remaining blobs
            if blob_batch:
                self.bucket.delete_blobs(blob_batch)

        except Exception:
            pass


def gcp_batch_execute(
    job: Job,
    params: DictData,
    *,
    run_id: Optional[str] = None,
    event: Optional[Any] = None,
) -> Result:
    """Google Cloud Batch job execution function with optimized performance.

    This function creates a Google Cloud Batch provider and executes the job
    on Google Cloud Batch compute resources. It handles the complete lifecycle
    including job creation, task submission, and result retrieval.

    Args:
        job: Job to execute
        params: Job parameters
        run_id: Execution run ID
        event: Event for cancellation

    Returns:
        Result: Execution result
    """
    # Extract Google Cloud Batch configuration from job
    batch_args = job.runs_on.args

    provider = GoogleCloudBatchProvider(
        project_id=batch_args.project_id,
        region=batch_args.region,
        gcs_bucket=batch_args.gcs_bucket,
        credentials_path=batch_args.credentials_path,
    )

    try:
        return provider.execute_job(job, params, run_id=run_id, event=event)
    finally:
        # Clean up resources
        if run_id:
            provider.cleanup(run_id)
````