ddeutil-workflow 0.0.78__py3-none-any.whl → 0.0.79__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,826 @@
+ # ------------------------------------------------------------------------------
+ # Copyright (c) 2022 Korawich Anuttra. All rights reserved.
+ # Licensed under the MIT License. See LICENSE in the project root for
+ # license information.
+ # ------------------------------------------------------------------------------
+ """Google Cloud Batch Provider Module.
+
+ This module provides Google Cloud Batch integration for workflow job execution.
+ It handles job creation, task execution, and result retrieval using Google Cloud
+ Batch service and Google Cloud Storage.
+
+ The Google Cloud Batch provider enables running workflow jobs on Google Cloud
+ Batch compute resources, providing scalable and managed execution environments
+ for complex workflow processing.
+
+ Key Features:
+     - Automatic job creation and management
+     - Task execution on Google Cloud compute resources
+     - Result file upload/download via Google Cloud Storage
+     - Error handling and status monitoring
+     - Resource cleanup and management
+     - Optimized file operations and caching
+
+ Classes:
+     GoogleCloudBatchProvider: Main provider for Google Cloud Batch operations
+     BatchJobConfig: Configuration for Google Cloud Batch jobs
+     BatchTaskConfig: Configuration for Google Cloud Batch tasks
+     BatchResourceConfig: Configuration for compute resources
+
+ References:
+     - https://cloud.google.com/batch/docs
+     - https://googleapis.dev/python/batch/latest/index.html
+
+ Config Example:
+
+     ```dotenv
+     export GOOGLE_APPLICATION_CREDENTIALS="/path/to/service-account.json"
+     export GOOGLE_CLOUD_PROJECT="your-project-id"
+     export GOOGLE_CLOUD_REGION="us-central1"
+     export GCS_BUCKET="your-gcs-bucket"
+     ```
+
+     ```yaml
+     jobs:
+       my-job:
+         runs-on:
+           type: "gcp_batch"
+           with:
+             project_id: "${GOOGLE_CLOUD_PROJECT}"
+             region: "${GOOGLE_CLOUD_REGION}"
+             gcs_bucket: "${GCS_BUCKET}"
+             machine_type: "e2-standard-4"
+             max_parallel_tasks: 10
+         stages:
+           - name: "process"
+             type: "py"
+             run: |
+               # Your processing logic here
+               result.context.update({"output": "processed"})
+     ```
+
+ """
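The YAML above is the configuration-driven path; for direct programmatic use, the same environment variables feed the provider constructor defined below. A minimal sketch, assuming the `job` object and its parameters come from the surrounding workflow (they are placeholders here) and that the import path for the provider depends on where the package exposes this module:

```python
import os

# GoogleCloudBatchProvider is the class defined later in this module; adjust
# the import to wherever the package exposes it.
provider = GoogleCloudBatchProvider(
    project_id=os.environ["GOOGLE_CLOUD_PROJECT"],
    region=os.environ["GOOGLE_CLOUD_REGION"],
    gcs_bucket=os.environ["GCS_BUCKET"],
    credentials_path=os.environ.get("GOOGLE_APPLICATION_CREDENTIALS"),
)

# `job` is a ddeutil-workflow Job instance and `params` its parameter dict,
# both prepared by the caller (placeholders in this sketch).
params = {"output": "processed"}
result = provider.execute_job(job, params, run_id="job-123")
```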
+ from __future__ import annotations
+
+ import json
+ import os
+ import tempfile
+ import time
+ from contextlib import contextmanager
+ from typing import Any, Optional
+
+ try:
+     from google.api_core import exceptions as google_exceptions
+     from google.api_core import retry
+     from google.cloud import batch_v1, storage
+
+     GCP_AVAILABLE = True
+ except ImportError:
+     GCP_AVAILABLE = False
+
+ from pydantic import BaseModel, Field
+
+ from ...__types import DictData
+ from ...job import Job
+ from ...result import FAILED, SUCCESS, Result
+ from ...traces import get_trace
+ from ...utils import gen_id
+
+
+ class BatchResourceConfig(BaseModel):
+     """Google Cloud Batch resource configuration."""
+
+     machine_type: str = Field(
+         default="e2-standard-4", description="Machine type"
+     )
+     cpu_count: int = Field(default=4, description="Number of CPUs")
+     memory_mb: int = Field(default=16384, description="Memory in MB")
+     boot_disk_size_gb: int = Field(
+         default=50, description="Boot disk size in GB"
+     )
+     max_parallel_tasks: int = Field(
+         default=1, description="Maximum parallel tasks"
+     )
+     gpu_count: int = Field(default=0, description="Number of GPUs")
+     gpu_type: Optional[str] = Field(default=None, description="GPU type")
+
+
+ class BatchJobConfig(BaseModel):
+     """Google Cloud Batch job configuration."""
+
+     job_name: str = Field(description="Unique job name")
+     project_id: str = Field(description="Google Cloud project ID")
+     region: str = Field(description="Google Cloud region")
+     gcs_bucket: str = Field(description="Google Cloud Storage bucket")
+     resource_config: Optional[BatchResourceConfig] = Field(
+         default=None, description="Resource configuration"
+     )
+     timeout_seconds: int = Field(
+         default=3600, description="Job timeout in seconds"
+     )
+     retry_count: int = Field(default=2, description="Number of retries")
+     preemptible: bool = Field(
+         default=False, description="Use preemptible instances"
+     )
+
+
+ class BatchTaskConfig(BaseModel):
+     """Google Cloud Batch task configuration."""
+
+     task_name: str = Field(description="Unique task name")
+     command: list[str] = Field(description="Command to execute")
+     image: str = Field(
+         default="python:3.11-slim", description="Container image"
+     )
+     timeout_seconds: int = Field(
+         default=3600, description="Task timeout in seconds"
+     )
+     environment_variables: Optional[dict[str, str]] = Field(
+         default=None, description="Environment variables"
+     )
+
+
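Since these configuration models are ordinary Pydantic models, they can be constructed ahead of time and handed to the provider defined below. An illustrative sketch, with placeholder project, region, bucket, and job names:

```python
resource = BatchResourceConfig(
    machine_type="e2-standard-8",
    cpu_count=8,
    memory_mb=32768,
    max_parallel_tasks=4,
)

job_config = BatchJobConfig(
    job_name="nightly-etl",
    project_id="my-project",
    region="us-central1",
    gcs_bucket="my-workflow-bucket",
    resource_config=resource,
    timeout_seconds=7200,
    preemptible=True,
)

task_config = BatchTaskConfig(
    task_name="nightly-etl-task",
    command=["python3", "task_script.py"],
    environment_variables={"LOG_LEVEL": "INFO"},
)

# GoogleCloudBatchProvider is defined below in this module.
provider = GoogleCloudBatchProvider(
    project_id=job_config.project_id,
    region=job_config.region,
    gcs_bucket=job_config.gcs_bucket,
    job_config=job_config,
    task_config=task_config,
)
```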
+ class GoogleCloudBatchProvider:
+     """Google Cloud Batch provider for workflow job execution.
+
+     This provider handles the complete lifecycle of Google Cloud Batch operations
+     including job creation, task execution, and result retrieval. It integrates
+     with Google Cloud Storage for file management and provides comprehensive
+     error handling and monitoring.
+
+     Attributes:
+         batch_client: Google Cloud Batch client
+         storage_client: Google Cloud Storage client
+         project_id: Google Cloud project ID
+         region: Google Cloud region
+         gcs_bucket: Google Cloud Storage bucket name
+         job_config: Job configuration
+         task_config: Task configuration
+
+     Example:
+         ```python
+         provider = GoogleCloudBatchProvider(
+             project_id="my-project",
+             region="us-central1",
+             gcs_bucket="my-workflow-bucket",
+         )
+
+         result = provider.execute_job(job, params, run_id="job-123")
+         ```
+     """
+
+     def __init__(
+         self,
+         project_id: str,
+         region: str,
+         gcs_bucket: str,
+         job_config: Optional[BatchJobConfig] = None,
+         task_config: Optional[BatchTaskConfig] = None,
+         credentials_path: Optional[str] = None,
+     ):
+         """Initialize Google Cloud Batch provider.
+
+         Args:
+             project_id: Google Cloud project ID
+             region: Google Cloud region
+             gcs_bucket: Google Cloud Storage bucket name
+             job_config: Job configuration
+             task_config: Task configuration
+             credentials_path: Path to service account credentials file
+         """
+         if not GCP_AVAILABLE:
+             raise ImportError(
+                 "Google Cloud dependencies not available. "
+                 "Install with: pip install google-cloud-batch google-cloud-storage"
+             )
+
+         self.project_id = project_id
+         self.region = region
+         self.gcs_bucket = gcs_bucket
+
+         # Set credentials if provided
+         if credentials_path:
+             os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path
+
+         # Initialize Google Cloud clients
+         self.batch_client = batch_v1.BatchServiceClient()
+         self.storage_client = storage.Client(project=project_id)
+
+         # Set configurations
+         self.job_config = job_config
+         self.task_config = task_config
+
+         # Cache for bucket and blob operations
+         self._bucket_cache: Optional[storage.Bucket] = None
+
+     @property
+     def bucket(self) -> storage.Bucket:
+         """Get or create cached bucket instance."""
+         if self._bucket_cache is None:
+             self._bucket_cache = self.storage_client.bucket(self.gcs_bucket)
+         return self._bucket_cache
+
+     def _ensure_gcs_bucket(self) -> None:
+         """Ensure Google Cloud Storage bucket exists."""
+         try:
+             self.bucket.reload()
+         except google_exceptions.NotFound:
+             # Create the bucket in the provider's region with the STANDARD
+             # storage class.
+             bucket = self.bucket
+             bucket.storage_class = "STANDARD"
+             self._bucket_cache = self.storage_client.create_bucket(
+                 bucket, location=self.region
+             )
+
+     @contextmanager
+     def _temp_file_context(self, suffix: str = ".tmp"):
+         """Context manager for temporary file operations."""
+         temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
+         # Close the handle so the path can be reopened for reading or writing.
+         temp_file.close()
+         try:
+             yield temp_file.name
+         finally:
+             try:
+                 os.unlink(temp_file.name)
+             except OSError:
+                 pass
+
+     def _upload_file_to_gcs(self, file_path: str, gcs_blob_name: str) -> str:
+         """Upload a file to Google Cloud Storage.
+
+         Args:
+             file_path: Local file path
+             gcs_blob_name: GCS blob name
+
+         Returns:
+             str: GCS blob URL
+         """
+         self._ensure_gcs_bucket()
+
+         blob = self.bucket.blob(gcs_blob_name)
+
+         # Attach metadata for workflow files
+         blob.metadata = {
+             "workflow_provider": "gcp_batch",
+             "upload_time": str(time.time()),
+         }
+
+         with open(file_path, "rb") as data:
+             blob.upload_from_file(
+                 data,
+                 content_type="application/octet-stream",
+                 timeout=300,  # 5 minute timeout
+             )
+
+         return f"gs://{self.gcs_bucket}/{gcs_blob_name}"
+
+     def _download_file_from_gcs(
+         self, gcs_blob_name: str, local_path: str
+     ) -> None:
+         """Download a file from Google Cloud Storage.
+
+         Args:
+             gcs_blob_name: GCS blob name
+             local_path: Local file path
+         """
+         blob = self.bucket.blob(gcs_blob_name)
+
+         with open(local_path, "wb") as data:
+             blob.download_to_file(data, timeout=300)
+
+     def _create_job_definition(
+         self,
+         job_name: str,
+         task_script_gcs_url: str,
+         job_config_gcs_url: str,
+         params_gcs_url: str,
+     ) -> batch_v1.Job:
+         """Create the Batch job definition.
+
+         Args:
+             job_name: Job name
+             task_script_gcs_url: GCS URL of task script
+             job_config_gcs_url: GCS URL of job configuration
+             params_gcs_url: GCS URL of parameters
+
+         Returns:
+             batch_v1.Job: Job definition
+         """
+         job_config = self.job_config or BatchJobConfig(
+             job_name=job_name,
+             project_id=self.project_id,
+             region=self.region,
+             gcs_bucket=self.gcs_bucket,
+         )
+
+         resource_config = job_config.resource_config or BatchResourceConfig()
+
+         # Create the container runnable. The container bootstraps by copying
+         # the generated task script from GCS and running it, so the image has
+         # to ship both python3 and gsutil; the Cloud SDK image is used here.
+         runnable = batch_v1.Runnable()
+         runnable.container = batch_v1.Runnable.Container()
+         runnable.container.image_uri = "google/cloud-sdk:slim"
+         runnable.container.entrypoint = "/bin/bash"
+         runnable.container.commands = [
+             "-c",
+             'cd /tmp && gsutil cp "${TASK_SCRIPT_URL}" task_script.py '
+             "&& python3 task_script.py",
+         ]
+
+         # Environment variables consumed by the task script
+         env_vars = {
+             "TASK_SCRIPT_URL": task_script_gcs_url,
+             "JOB_CONFIG_URL": job_config_gcs_url,
+             "PARAMS_URL": params_gcs_url,
+             "PYTHONUNBUFFERED": "1",  # Ensure immediate output
+             "PYTHONDONTWRITEBYTECODE": "1",  # Don't create .pyc files
+         }
+
+         if self.task_config and self.task_config.environment_variables:
+             env_vars.update(self.task_config.environment_variables)
+
+         # Environment variables attach to the runnable, not the container
+         runnable.environment = batch_v1.Environment()
+         runnable.environment.variables = env_vars
+
+         # Create the task specification
+         task = batch_v1.TaskSpec()
+         task.runnables = [runnable]
+         task.max_retry_count = job_config.retry_count
+         task.max_run_duration = f"{job_config.timeout_seconds}s"
+
+         # Configure compute resources
+         resources = batch_v1.ComputeResource()
+         resources.cpu_milli = resource_config.cpu_count * 1000
+         resources.memory_mib = resource_config.memory_mb
+         task.compute_resource = resources
+
+         # Create the job with a single task group. The job resource name is
+         # assigned from the job_id in the CreateJobRequest.
+         job = batch_v1.Job()
+         job.task_groups = [
+             batch_v1.TaskGroup(
+                 task_spec=task,
+                 task_count=1,
+                 parallelism=resource_config.max_parallel_tasks,
+             )
+         ]
+
+         # Configure allocation policy
+         job.allocation_policy = batch_v1.AllocationPolicy()
+
+         # Set provisioning model based on configuration
+         provisioning_model = (
+             batch_v1.AllocationPolicy.ProvisioningModel.PREEMPTIBLE
+             if job_config.preemptible
+             else batch_v1.AllocationPolicy.ProvisioningModel.STANDARD
+         )
+
+         # Machine type, provisioning model, and any GPU accelerators belong
+         # to the instance policy.
+         instance_policy = batch_v1.AllocationPolicy.InstancePolicy(
+             machine_type=resource_config.machine_type,
+             provisioning_model=provisioning_model,
+         )
+         if resource_config.gpu_count > 0 and resource_config.gpu_type:
+             instance_policy.accelerators = [
+                 batch_v1.AllocationPolicy.Accelerator(
+                     type_=resource_config.gpu_type,
+                     count=resource_config.gpu_count,
+                 )
+             ]
+
+         job.allocation_policy.instances = [
+             batch_v1.AllocationPolicy.InstancePolicyOrTemplate(
+                 policy=instance_policy,
+                 install_gpu_drivers=resource_config.gpu_count > 0,
+             )
+         ]
+
+         return job
+
+     def _create_job(
+         self,
+         job_name: str,
+         task_script_gcs_url: str,
+         job_config_gcs_url: str,
+         params_gcs_url: str,
+     ) -> str:
+         """Create the Google Cloud Batch job.
+
+         Args:
+             job_name: Job name
+             task_script_gcs_url: GCS URL of task script
+             job_config_gcs_url: GCS URL of job configuration
+             params_gcs_url: GCS URL of parameters
+
+         Returns:
+             str: Fully qualified job resource name
+         """
+         job = self._create_job_definition(
+             job_name, task_script_gcs_url, job_config_gcs_url, params_gcs_url
+         )
+
+         parent = f"projects/{self.project_id}/locations/{self.region}"
+
+         request = batch_v1.CreateJobRequest(
+             parent=parent, job_id=job_name, job=job
+         )
+
+         # Retry transient service errors for better reliability
+         @retry.Retry(
+             predicate=retry.if_exception_type(
+                 google_exceptions.ServiceUnavailable
+             )
+         )
+         def create_job_with_retry() -> batch_v1.Job:
+             # create_job returns the created Job directly (it is not a
+             # long-running operation).
+             return self.batch_client.create_job(request=request)
+
+         created_job = create_job_with_retry()
+         return created_job.name
+
+     def _wait_for_job_completion(
+         self, job_name: str, timeout: int = 3600
+     ) -> dict[str, Any]:
+         """Wait for job completion with optimized polling.
+
+         Args:
+             job_name: Job name
+             timeout: Timeout in seconds
+
+         Returns:
+             Dict[str, Any]: Job results
+         """
+         start_time = time.time()
+         poll_interval = 10  # Start with 10 second intervals
+
+         while time.time() - start_time < timeout:
+             try:
+                 request = batch_v1.GetJobRequest(name=job_name)
+                 job = self.batch_client.get_job(request=request)
+
+                 if job.status.state == batch_v1.JobStatus.State.SUCCEEDED:
+                     return self._process_successful_job(job, job_name)
+
+                 elif job.status.state == batch_v1.JobStatus.State.FAILED:
+                     return self._process_failed_job(job)
+
+                 elif job.status.state in [
+                     batch_v1.JobStatus.State.RUNNING,
+                     batch_v1.JobStatus.State.SCHEDULED,
+                     batch_v1.JobStatus.State.QUEUED,
+                 ]:
+                     # Adaptive polling: increase interval for long-running jobs
+                     if time.time() - start_time > 300:  # After 5 minutes
+                         poll_interval = min(
+                             poll_interval * 1.5, 60
+                         )  # Max 60 seconds
+
+                     time.sleep(poll_interval)
+                 else:
+                     # For other states, use shorter polling
+                     time.sleep(5)
+
+             except google_exceptions.NotFound:
+                 # Job might be deleted, wait a bit and retry
+                 time.sleep(poll_interval)
+             except Exception:
+                 # Continue polling on error with exponential backoff
+                 poll_interval = min(poll_interval * 2, 60)
+                 time.sleep(poll_interval)
+
+         return {"status": "timeout", "exit_code": 1}
+
+     def _process_successful_job(
+         self, job: batch_v1.Job, job_name: str
+     ) -> dict[str, Any]:
+         """Process successful job and download results.
+
+         Args:
+             job: Job object
+             job_name: Job name
+
+         Returns:
+             Dict[str, Any]: Job results with files
+         """
+         result_files = {}
+         try:
+             # List objects under the run's GCS prefix. The Batch job is named
+             # "workflow-job-<run_id>", while the task script uploads results
+             # under "jobs/<run_id>/".
+             job_id = job_name.split("/")[-1]
+             run_part = job_id.removeprefix("workflow-job-")
+             prefix = f"jobs/{run_part}/"
+
+             blobs = self.bucket.list_blobs(prefix=prefix)
+
+             # Download result files (kept simple and sequential)
+             for blob in blobs:
+                 if blob.name.endswith((".json", ".txt", ".log")):
+                     with self._temp_file_context() as tmp_file:
+                         blob.download_to_filename(tmp_file)
+                         with open(tmp_file) as f:
+                             result_files[blob.name] = f.read()
+         except Exception:
+             # File download failed, continue with empty results
+             pass
+
+         return {"status": "completed", "exit_code": 0, "files": result_files}
+
+     def _process_failed_job(self, job: batch_v1.Job) -> dict[str, Any]:
+         """Process failed job and extract error information.
+
+         Args:
+             job: Job object
+
+         Returns:
+             Dict[str, Any]: Failure information
+         """
+         failure_reason = "Job failed"
+
+         # Try to extract more detailed error information from the job's
+         # status events, keeping the most recent non-empty description.
+         if hasattr(job, "status") and hasattr(job.status, "status_events"):
+             for event in job.status.status_events:
+                 if event.description:
+                     failure_reason = event.description
+
+         return {
+             "status": "failed",
+             "exit_code": 1,
+             "failure_reason": failure_reason,
+         }
+
+     def _create_optimized_task_script(
+         self, job: Job, params: DictData, run_id: str
+     ) -> str:
+         """Create the Python script executed inside the Batch container.
+
+         Args:
+             job: Job to execute
+             params: Job parameters
+             run_id: Execution run ID
+
+         Returns:
+             str: Path to created script
+         """
+         script_content = f'''#!/usr/bin/env python3
+ import json
+ import os
+ import subprocess
+ import sys
+ import time
+
+
+ def install_package(package):
+     """Install package with retry logic."""
+     for attempt in range(3):
+         try:
+             subprocess.run(
+                 [sys.executable, '-m', 'pip', 'install', package],
+                 check=True, capture_output=True, timeout=300,
+             )
+             return True
+         except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
+             if attempt == 2:
+                 raise
+             time.sleep(2 ** attempt)
+
+
+ def gsutil_copy(src, dst):
+     """Copy a file to or from GCS with retry logic."""
+     for attempt in range(3):
+         try:
+             subprocess.run(
+                 ['gsutil', 'cp', src, dst],
+                 check=True, capture_output=True, timeout=300,
+             )
+             return True
+         except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
+             if attempt == 2:
+                 raise
+             time.sleep(2 ** attempt)
+
+
+ # Install ddeutil-workflow with retry
+ install_package('ddeutil-workflow')
+
+ # Download the job configuration and parameters with retry
+ gsutil_copy(os.environ['JOB_CONFIG_URL'], 'job_config.json')
+ gsutil_copy(os.environ['PARAMS_URL'], 'params.json')
+
+ # Add current directory to Python path
+ sys.path.insert(0, os.getcwd())
+
+ from ddeutil.workflow import Job
+ from ddeutil.workflow.job import local_execute
+
+ # Load job configuration
+ with open('job_config.json', 'r') as f:
+     job_data = json.load(f)
+
+ # Load parameters
+ with open('params.json', 'r') as f:
+     params = json.load(f)
+
+ # Create job instance and execute it
+ job = Job(**job_data)
+ result = local_execute(job, params, run_id='{run_id}')
+
+ # Save the result, then upload it to GCS (objects are created directly;
+ # GCS has no directories to pre-create)
+ with open('result.json', 'w') as f:
+     json.dump(result.model_dump(), f, indent=2)
+
+ gsutil_copy('result.json', 'gs://{self.gcs_bucket}/jobs/{run_id}/result.json')
+
+ sys.exit(0 if result.status == 'success' else 1)
+ '''
+
+         # Persist the script in a temporary file that survives until the
+         # caller has uploaded it (an auto-deleting context would remove it
+         # before the upload happens).
+         script_file = tempfile.NamedTemporaryFile(
+             mode="w", suffix=".py", delete=False
+         )
+         with script_file:
+             script_file.write(script_content)
+         return script_file.name
+
+     def execute_job(
+         self,
+         job: Job,
+         params: DictData,
+         *,
+         run_id: Optional[str] = None,
+         event: Optional[Any] = None,
+     ) -> Result:
+         """Execute job on Google Cloud Batch.
+
+         Args:
+             job: Job to execute
+             params: Job parameters
+             run_id: Execution run ID
+             event: Event for cancellation
+
+         Returns:
+             Result: Execution result
+         """
+         if event and event.is_set():
+             return Result(
+                 status=FAILED,
+                 context={
+                     "errors": {"message": "Execution was canceled before start"}
+                 },
+                 run_id=run_id or gen_id("gcp-batch"),
+                 extras={},
+             )
+
+         # Generate run ID if not provided
+         if not run_id:
+             run_id = gen_id(job.id or "gcp-batch", unique=True)
+
+         trace = get_trace(run_id, extras=job.extras)
+         trace.info(f"[GCP_BATCH]: Starting job execution: {job.id}")
+
+         try:
+             # Create task script
+             script_path = self._create_optimized_task_script(
+                 job, params, run_id
+             )
+
+             # Prepare file paths
+             job_config_gcs_blob = f"jobs/{run_id}/job_config.json"
+             params_gcs_blob = f"jobs/{run_id}/params.json"
+             script_gcs_blob = f"jobs/{run_id}/task_script.py"
+
+             # Upload files
+             trace.info("[GCP_BATCH]: Uploading files to GCS")
+
+             with self._temp_file_context(suffix=".json") as job_config_path:
+                 with open(job_config_path, "w") as f:
+                     json.dump(job.model_dump(), f)
+                 job_config_gcs_url = self._upload_file_to_gcs(
+                     job_config_path, job_config_gcs_blob
+                 )
+
+             with self._temp_file_context(suffix=".json") as params_path:
+                 with open(params_path, "w") as f:
+                     json.dump(params, f)
+                 params_gcs_url = self._upload_file_to_gcs(
+                     params_path, params_gcs_blob
+                 )
+
+             task_script_gcs_url = self._upload_file_to_gcs(
+                 script_path, script_gcs_blob
+             )
+
+             # Create job
+             job_name = f"workflow-job-{run_id}"
+
+             trace.info(f"[GCP_BATCH]: Creating job: {job_name}")
+             job_full_name = self._create_job(
+                 job_name,
+                 task_script_gcs_url,
+                 job_config_gcs_url,
+                 params_gcs_url,
+             )
+
+             # Wait for job completion
+             trace.info("[GCP_BATCH]: Waiting for job completion")
+             job_result = self._wait_for_job_completion(job_full_name)
+
+             # Process results
+             if job_result["status"] == "completed":
+                 result_data = {}
+                 result_file_key = f"jobs/{run_id}/result.json"
+
+                 if result_file_key in job_result.get("files", {}):
+                     try:
+                         result_data = json.loads(
+                             job_result["files"][result_file_key]
+                         )
+                     except (json.JSONDecodeError, KeyError):
+                         result_data = {"status": SUCCESS}
+
+                 trace.info("[GCP_BATCH]: Job completed successfully")
+                 return Result(
+                     status=SUCCESS,
+                     context=result_data,
+                     run_id=run_id,
+                     extras=job.extras or {},
+                 )
+             else:
+                 error_msg = f"Job failed: {job_result.get('status', 'unknown')}"
+                 if job_result.get("failure_reason"):
+                     error_msg += f" - {job_result['failure_reason']}"
+
+                 trace.error(f"[GCP_BATCH]: {error_msg}")
+                 return Result(
+                     status=FAILED,
+                     context={"errors": {"message": error_msg}},
+                     run_id=run_id,
+                     extras=job.extras or {},
+                 )
+
+         except Exception as e:
+             trace.error(f"[GCP_BATCH]: Execution failed: {str(e)}")
+             return Result(
+                 status=FAILED,
+                 context={"errors": {"message": str(e)}},
+                 run_id=run_id,
+                 extras=job.extras or {},
+             )
+
+     def cleanup(self, job_id: Optional[str] = None) -> None:
+         """Clean up Google Cloud Batch resources efficiently.
+
+         Args:
+             job_id: Job ID to clean up (if None, cleans up all workflow jobs)
+         """
+         try:
+             prefix = f"jobs/{job_id}/" if job_id else "jobs/"
+             blobs = self.bucket.list_blobs(prefix=prefix)
+
+             # Delete blobs in batches for better performance
+             batch_size = 100
+             blob_batch = []
+
+             for blob in blobs:
+                 blob_batch.append(blob)
+                 if len(blob_batch) >= batch_size:
+                     self.bucket.delete_blobs(blob_batch)
+                     blob_batch = []
+
+             # Delete remaining blobs
+             if blob_batch:
+                 self.bucket.delete_blobs(blob_batch)
+
+         except Exception:
+             pass
+
+
+ def gcp_batch_execute(
+     job: Job,
+     params: DictData,
+     *,
+     run_id: Optional[str] = None,
+     event: Optional[Any] = None,
+ ) -> Result:
+     """Google Cloud Batch job execution function.
+
+     This function creates a Google Cloud Batch provider and executes the job
+     on Google Cloud Batch compute resources. It handles the complete lifecycle
+     including job creation, task submission, and result retrieval.
+
+     Args:
+         job: Job to execute
+         params: Job parameters
+         run_id: Execution run ID
+         event: Event for cancellation
+
+     Returns:
+         Result: Execution result
+     """
+     # Extract Google Cloud Batch configuration from job
+     batch_args = job.runs_on.args
+
+     provider = GoogleCloudBatchProvider(
+         project_id=batch_args.project_id,
+         region=batch_args.region,
+         gcs_bucket=batch_args.gcs_bucket,
+         credentials_path=batch_args.credentials_path,
+     )
+
+     try:
+         return provider.execute_job(job, params, run_id=run_id, event=event)
+     finally:
+         # Clean up resources
+         if run_id:
+             provider.cleanup(run_id)
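A usage sketch for the entry point above. It assumes `job` is a ddeutil-workflow `Job` whose `runs-on` type is `gcp_batch`, so that `job.runs_on.args` exposes the `project_id`, `region`, `gcs_bucket`, and `credentials_path` fields read by `gcp_batch_execute`; the parameter values are placeholders:

```python
import threading

# Optional cancellation handle; the provider checks it before starting work.
cancel_event = threading.Event()

result = gcp_batch_execute(
    job,  # a Job configured with runs-on type "gcp_batch" (placeholder)
    {"source": "gs://my-bucket/input.csv"},
    run_id="run-2024-01-01",
    event=cancel_event,
)

# SUCCESS is the status constant imported at the top of this module.
if result.status == SUCCESS:
    print(result.context)
```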