ddeutil-workflow 0.0.78__py3-none-any.whl → 0.0.80__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. ddeutil/workflow/__about__.py +1 -1
  2. ddeutil/workflow/__init__.py +2 -6
  3. ddeutil/workflow/api/routes/job.py +2 -2
  4. ddeutil/workflow/api/routes/logs.py +5 -5
  5. ddeutil/workflow/api/routes/workflows.py +3 -3
  6. ddeutil/workflow/audits.py +547 -176
  7. ddeutil/workflow/cli.py +19 -1
  8. ddeutil/workflow/conf.py +10 -20
  9. ddeutil/workflow/event.py +15 -6
  10. ddeutil/workflow/job.py +147 -74
  11. ddeutil/workflow/params.py +172 -58
  12. ddeutil/workflow/plugins/__init__.py +0 -0
  13. ddeutil/workflow/plugins/providers/__init__.py +0 -0
  14. ddeutil/workflow/plugins/providers/aws.py +908 -0
  15. ddeutil/workflow/plugins/providers/az.py +1003 -0
  16. ddeutil/workflow/plugins/providers/container.py +703 -0
  17. ddeutil/workflow/plugins/providers/gcs.py +826 -0
  18. ddeutil/workflow/result.py +6 -4
  19. ddeutil/workflow/reusables.py +151 -95
  20. ddeutil/workflow/stages.py +28 -28
  21. ddeutil/workflow/traces.py +1697 -541
  22. ddeutil/workflow/utils.py +109 -67
  23. ddeutil/workflow/workflow.py +42 -30
  24. {ddeutil_workflow-0.0.78.dist-info → ddeutil_workflow-0.0.80.dist-info}/METADATA +39 -19
  25. ddeutil_workflow-0.0.80.dist-info/RECORD +36 -0
  26. ddeutil_workflow-0.0.78.dist-info/RECORD +0 -30
  27. {ddeutil_workflow-0.0.78.dist-info → ddeutil_workflow-0.0.80.dist-info}/WHEEL +0 -0
  28. {ddeutil_workflow-0.0.78.dist-info → ddeutil_workflow-0.0.80.dist-info}/entry_points.txt +0 -0
  29. {ddeutil_workflow-0.0.78.dist-info → ddeutil_workflow-0.0.80.dist-info}/licenses/LICENSE +0 -0
  30. {ddeutil_workflow-0.0.78.dist-info → ddeutil_workflow-0.0.80.dist-info}/top_level.txt +0 -0
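The bulk of this release is the new `ddeutil/workflow/plugins/providers/` package, which adds execution providers for AWS Batch, Azure Batch, containers, and Google Cloud Storage; the Azure Batch provider (`az.py`) is shown in full below. Its Azure SDK imports are optional, so a minimal sketch of constructing the provider looks like the following (all credential values are placeholders; the install hint is the one the provider raises when the SDKs are missing):

```python
# Sketch only: credential values below are placeholders, not real configuration.
# Without the optional Azure SDKs installed, constructing the provider raises
# ImportError with the hint "Install with: pip install ddeutil-workflow[azure]".
from ddeutil.workflow.plugins.providers.az import AzureBatchProvider

provider = AzureBatchProvider(
    batch_account_name="mybatchaccount",
    batch_account_key="mykey",
    batch_account_url="https://mybatchaccount.region.batch.azure.com",
    storage_account_name="mystorageaccount",
    storage_account_key="mystoragekey",
)
```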
ddeutil/workflow/plugins/providers/az.py (new file)
@@ -0,0 +1,1003 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Copyright (c) 2022 Korawich Anuttra. All rights reserved.
3
+ # Licensed under the MIT License. See LICENSE in the project root for
4
+ # license information.
5
+ # ------------------------------------------------------------------------------
6
+ """Azure Batch Provider Module.
7
+
8
+ This module provides Azure Batch integration for workflow job execution.
9
+ It handles pool creation, job submission, task execution, and result retrieval.
10
+
11
+ The Azure Batch provider enables running workflow jobs on Azure Batch compute
12
+ nodes, providing scalable and managed execution environments for complex
13
+ workflow processing.
14
+
15
+ Key Features:
16
+ - Automatic pool creation and management
17
+ - Job and task submission to Azure Batch
18
+ - Result file upload/download via Azure Storage
19
+ - Error handling and status monitoring
20
+ - Resource cleanup and management
21
+ - Optimized file operations and caching
22
+
23
+ Classes:
24
+ AzureBatchProvider: Main provider for Azure Batch operations
25
+ BatchPoolConfig: Configuration for Azure Batch pools
26
+ BatchJobConfig: Configuration for Azure Batch jobs
27
+ BatchTaskConfig: Configuration for Azure Batch tasks
28
+
29
+ References:
30
+ - https://docs.microsoft.com/en-us/azure/batch/batch-python-tutorial
31
+ - https://docs.microsoft.com/en-us/azure/batch/batch-api-basics
32
+
33
+ Config Example:
34
+
35
+ ```dotenv
36
+ export AZURE_BATCH_ACCOUNT_NAME="your-batch-account"
37
+ export AZURE_BATCH_ACCOUNT_KEY="your-batch-key"
38
+ export AZURE_BATCH_ACCOUNT_URL="https://your-batch-account.region.batch.azure.com"
39
+ export AZURE_STORAGE_ACCOUNT_NAME="your-storage-account"
40
+ export AZURE_STORAGE_ACCOUNT_KEY="your-storage-key"
41
+ ```
42
+
43
+ ```yaml
44
+ jobs:
45
+ my-job:
46
+ runs-on:
47
+ type: "azure_batch"
48
+ with:
49
+ batch_account_name: "${AZURE_BATCH_ACCOUNT_NAME}"
50
+ batch_account_key: "${AZURE_BATCH_ACCOUNT_KEY}"
51
+ batch_account_url: "${AZURE_BATCH_ACCOUNT_URL}"
52
+ storage_account_name: "${AZURE_STORAGE_ACCOUNT_NAME}"
53
+ storage_account_key: "${AZURE_STORAGE_ACCOUNT_KEY}"
54
+ stages:
55
+ - name: "process"
56
+ type: "py"
57
+ run: |
58
+ # Your processing logic here
59
+ result.context.update({"output": "processed"})
60
+ ```
61
+
62
+ """
63
+ from __future__ import annotations
64
+
65
+ import json
66
+ import os
67
+ import tempfile
68
+ import time
69
+ from contextlib import contextmanager
70
+ from typing import Any, Optional
71
+
72
+ try:
73
+ from azure.batch import BatchServiceClient
74
+ from azure.batch.batch_auth import SharedKeyCredentials
75
+ from azure.batch.models import (
76
+ AutoUserSpecification,
77
+ BatchErrorException,
78
+ CloudServiceConfiguration,
79
+ JobAddParameter,
80
+ NetworkConfiguration,
81
+ PoolAddParameter,
82
+ PoolInformation,
83
+ ResourceFile,
84
+ StartTask,
85
+ TaskAddParameter,
86
+ TaskState,
87
+ UserIdentity,
88
+ )
89
+ from azure.core.exceptions import AzureError
90
+ from azure.storage.blob import BlobServiceClient
91
+
92
+ AZURE_AVAILABLE = True
93
+ except ImportError:
94
+ AZURE_AVAILABLE = False
95
+
96
+ from pydantic import BaseModel, Field
97
+
98
+ from ...__types import DictData
99
+ from ...job import Job
100
+ from ...result import FAILED, SUCCESS, Result
101
+ from ...traces import get_trace
102
+ from ...utils import gen_id
103
+
104
+
105
+ class BatchPoolConfig(BaseModel):
106
+ """Azure Batch pool configuration."""
107
+
108
+ pool_id: str = Field(description="Unique pool identifier")
109
+ vm_size: str = Field(
110
+ default="Standard_D2s_v3", description="VM size for compute nodes"
111
+ )
112
+ node_count: int = Field(default=1, description="Number of compute nodes")
113
+ max_tasks_per_node: int = Field(
114
+ default=4, description="Maximum tasks per node"
115
+ )
116
+ enable_auto_scale: bool = Field(
117
+ default=False, description="Enable auto-scaling"
118
+ )
119
+ auto_scale_formula: Optional[str] = Field(
120
+ default=None, description="Auto-scale formula"
121
+ )
122
+ os_family: str = Field(
123
+ default="5", description="OS family (5=Ubuntu 20.04)"
124
+ )
125
+ os_version: str = Field(default="latest", description="OS version")
126
+ enable_inter_node_communication: bool = Field(
127
+ default=False, description="Enable inter-node communication"
128
+ )
129
+ network_configuration: Optional[dict[str, Any]] = Field(
130
+ default=None, description="Network configuration"
131
+ )
132
+
133
+
134
+ class BatchJobConfig(BaseModel):
135
+ """Azure Batch job configuration."""
136
+
137
+ job_id: str = Field(description="Unique job identifier")
138
+ pool_id: str = Field(description="Pool ID to run the job on")
139
+ display_name: Optional[str] = Field(
140
+ default=None, description="Job display name"
141
+ )
142
+ priority: int = Field(default=0, description="Job priority")
143
+ uses_task_dependencies: bool = Field(
144
+ default=False, description="Use task dependencies"
145
+ )
146
+ on_all_tasks_complete: str = Field(
147
+ default="noaction", description="Action when all tasks complete"
148
+ )
149
+ on_task_failure: str = Field(
150
+ default="noaction", description="Action when task fails"
151
+ )
152
+ metadata: Optional[list[dict[str, str]]] = Field(
153
+ default=None, description="Job metadata"
154
+ )
155
+
156
+
157
+ class BatchTaskConfig(BaseModel):
158
+ """Azure Batch task configuration."""
159
+
160
+ task_id: str = Field(description="Unique task identifier")
161
+ command_line: str = Field(description="Command line to execute")
162
+ resource_files: Optional[list[ResourceFile]] = Field(
163
+ default=None, description="Resource files"
164
+ )
165
+ environment_settings: Optional[dict[str, str]] = Field(
166
+ default=None, description="Environment variables"
167
+ )
168
+ max_wall_clock_time: Optional[str] = Field(
169
+ default="PT1H", description="Maximum wall clock time"
170
+ )
171
+ retention_time: Optional[str] = Field(
172
+ default="PT1H", description="Task retention time"
173
+ )
174
+ user_identity: Optional[dict[str, Any]] = Field(
175
+ default=None, description="User identity"
176
+ )
177
+ constraints: Optional[dict[str, Any]] = Field(
178
+ default=None, description="Task constraints"
179
+ )
180
+
181
+
182
+ class AzureBatchProvider:
183
+ """Azure Batch provider for workflow job execution.
184
+
185
+ This provider handles the complete lifecycle of Azure Batch operations
186
+ including pool creation, job submission, task execution, and result
187
+ retrieval. It integrates with Azure Storage for file management and
188
+ provides comprehensive error handling and monitoring.
189
+
190
+ Attributes:
191
+ batch_client: Azure Batch service client
192
+ blob_client: Azure Blob storage client
193
+ storage_container: Storage container name for files
194
+ pool_config: Pool configuration
195
+ job_config: Job configuration
196
+ task_config: Task configuration
197
+
198
+ Example:
199
+ ```python
200
+ provider = AzureBatchProvider(
201
+ batch_account_name="mybatchaccount",
202
+ batch_account_key="mykey",
203
+ batch_account_url="https://mybatchaccount.region.batch.azure.com",
204
+ storage_account_name="mystorageaccount",
205
+ storage_account_key="mystoragekey"
206
+ )
207
+
208
+ result = provider.execute_job(job, params, run_id="job-123")
209
+ ```
210
+ """
211
+
212
+ def __init__(
213
+ self,
214
+ batch_account_name: str,
215
+ batch_account_key: str,
216
+ batch_account_url: str,
217
+ storage_account_name: str,
218
+ storage_account_key: str,
219
+ storage_container: str = "workflow-files",
220
+ pool_config: Optional[BatchPoolConfig] = None,
221
+ job_config: Optional[BatchJobConfig] = None,
222
+ task_config: Optional[BatchTaskConfig] = None,
223
+ ):
224
+ """Initialize Azure Batch provider.
225
+
226
+ Args:
227
+ batch_account_name: Azure Batch account name
228
+ batch_account_key: Azure Batch account key
229
+ batch_account_url: Azure Batch account URL
230
+ storage_account_name: Azure Storage account name
231
+ storage_account_key: Azure Storage account key
232
+ storage_container: Storage container name for files
233
+ pool_config: Pool configuration
234
+ job_config: Job configuration
235
+ task_config: Task configuration
236
+ """
237
+ if not AZURE_AVAILABLE:
238
+ raise ImportError(
239
+ "Azure Batch dependencies not available. "
240
+ "Install with: pip install ddeutil-workflow[azure]"
241
+ )
242
+
243
+ self.batch_account_name = batch_account_name
244
+ self.batch_account_key = batch_account_key
245
+ self.batch_account_url = batch_account_url
246
+ self.storage_account_name = storage_account_name
247
+ self.storage_account_key = storage_account_key
248
+ self.storage_container = storage_container
249
+
250
+ # Initialize clients with optimized configuration
251
+ self.batch_client = self._create_batch_client()
252
+ self.blob_client = self._create_blob_client()
253
+
254
+ # Set configurations
255
+ self.pool_config = pool_config or BatchPoolConfig(
256
+ pool_id=f"workflow-pool-{gen_id('pool')}"
257
+ )
258
+ self.job_config = job_config
259
+ self.task_config = task_config
260
+
261
+ # Cache for container operations
262
+ self._container_exists: Optional[bool] = None
263
+
264
+ def _create_batch_client(self) -> BatchServiceClient:
265
+ """Create Azure Batch service client with optimized configuration."""
266
+ credentials = SharedKeyCredentials(
267
+ self.batch_account_name, self.batch_account_key
268
+ )
269
+ return BatchServiceClient(credentials, self.batch_account_url)
270
+
271
+ def _create_blob_client(self) -> BlobServiceClient:
272
+ """Create Azure Blob storage client with optimized configuration."""
273
+ connection_string = (
274
+ f"DefaultEndpointsProtocol=https;"
275
+ f"AccountName={self.storage_account_name};"
276
+ f"AccountKey={self.storage_account_key};"
277
+ f"EndpointSuffix=core.windows.net"
278
+ )
279
+ return BlobServiceClient.from_connection_string(connection_string)
280
+
281
+ @contextmanager
282
+ def _temp_file_context(self, suffix: str = ".tmp"):
283
+ """Context manager for temporary file operations."""
284
+ temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
285
+ try:
286
+ yield temp_file.name
287
+ finally:
288
+ try:
289
+ os.unlink(temp_file.name)
290
+ except OSError:
291
+ pass
292
+
293
+ def _ensure_storage_container(self) -> None:
294
+ """Ensure storage container exists with optimized settings."""
295
+ if self._container_exists is None:
296
+ container_client = self.blob_client.get_container_client(
297
+ self.storage_container
298
+ )
299
+ try:
300
+ container_client.get_container_properties()
301
+ self._container_exists = True
302
+ except AzureError:
303
+ # Create container with optimized settings
304
+ container_client.create_container(
305
+ metadata={
306
+ "workflow_provider": "azure_batch",
307
+ "created_time": str(time.time()),
308
+ }
309
+ )
310
+ self._container_exists = True
311
+
312
+ def _upload_file_to_storage(self, file_path: str, blob_name: str) -> str:
313
+ """Upload file to Azure Storage with optimized settings.
314
+
315
+ Args:
316
+ file_path: Local file path
317
+ blob_name: Blob name in storage
318
+
319
+ Returns:
320
+ str: Blob URL
321
+ """
322
+ self._ensure_storage_container()
323
+ container_client = self.blob_client.get_container_client(
324
+ self.storage_container
325
+ )
326
+ blob_client = container_client.get_blob_client(blob_name)
327
+
328
+ # Set optimized metadata
329
+ metadata = {
330
+ "workflow_provider": "azure_batch",
331
+ "upload_time": str(time.time()),
332
+ "content_type": "application/octet-stream",
333
+ }
334
+
335
+ with open(file_path, "rb") as data:
336
+ blob_client.upload_blob(
337
+ data,
338
+ overwrite=True,
339
+ metadata=metadata,
340
+ content_settings=None, # Let Azure determine content type
341
+ )
342
+
343
+ return blob_client.url
344
+
345
+ def _download_file_from_storage(
346
+ self, blob_name: str, local_path: str
347
+ ) -> None:
348
+ """Download file from Azure Storage with optimized settings.
349
+
350
+ Args:
351
+ blob_name: Blob name in storage
352
+ local_path: Local file path
353
+ """
354
+ container_client = self.blob_client.get_container_client(
355
+ self.storage_container
356
+ )
357
+ blob_client = container_client.get_blob_client(blob_name)
358
+
359
+ with open(local_path, "wb") as data:
360
+ blob_client.download_blob().readinto(data)
361
+
362
+ def _create_optimized_pool(self, pool_id: str) -> None:
363
+ """Create Azure Batch pool with optimized settings.
364
+
365
+ Args:
366
+ pool_id: Pool identifier
367
+ """
368
+ try:
369
+ self.batch_client.pool.get(pool_id)
370
+ return
371
+ except BatchErrorException as e:
372
+ if e.response.status_code != 404:
373
+ raise
374
+
375
+ pool_config = self.pool_config
376
+
377
+ # Create optimized start task for pool initialization
378
+ start_task = StartTask(
379
+ command_line=(
380
+ "apt-get update && "
381
+ "apt-get install -y python3 python3-pip curl && "
382
+ "pip3 install --no-cache-dir ddeutil-workflow && "
383
+ "echo 'Pool initialization completed'"
384
+ ),
385
+ wait_for_success=True,
386
+ user_identity=UserIdentity(
387
+ auto_user=AutoUserSpecification(
388
+ scope="pool", elevation_level="admin"
389
+ )
390
+ ),
391
+ max_task_retry_count=2,
392
+ )
393
+
394
+ # Build pool configuration
395
+ pool_params = {
396
+ "id": pool_id,
397
+ "vm_size": pool_config.vm_size,
398
+ "target_dedicated_nodes": pool_config.node_count,
399
+ "task_slots_per_node": pool_config.max_tasks_per_node,
400
+ "enable_auto_scale": pool_config.enable_auto_scale,
401
+ "start_task": start_task,
402
+ "enable_inter_node_communication": pool_config.enable_inter_node_communication,
403
+ }
404
+
405
+ # Add auto-scale formula if enabled
406
+ if pool_config.enable_auto_scale and pool_config.auto_scale_formula:
407
+ pool_params["auto_scale_formula"] = pool_config.auto_scale_formula
408
+
409
+ # Add network configuration if specified
410
+ if pool_config.network_configuration:
411
+ pool_params["network_configuration"] = NetworkConfiguration(
412
+ **pool_config.network_configuration
413
+ )
414
+
415
+ # Use Cloud Service configuration for better compatibility
416
+ pool_params["cloud_service_configuration"] = CloudServiceConfiguration(
417
+ os_family=pool_config.os_family, os_version=pool_config.os_version
418
+ )
419
+
420
+ new_pool = PoolAddParameter(**pool_params)
421
+ self.batch_client.pool.add(new_pool)
422
+
423
+ # Wait for pool to be ready with optimized polling
424
+ self._wait_for_pool_ready(pool_id)
425
+
426
+ def _wait_for_pool_ready(self, pool_id: str, timeout: int = 1800) -> None:
427
+ """Wait for pool to be ready with optimized polling.
428
+
429
+ Args:
430
+ pool_id: Pool identifier
431
+ timeout: Timeout in seconds
432
+ """
433
+ start_time = time.time()
434
+ poll_interval = 10
435
+
436
+ while time.time() - start_time < timeout:
437
+ try:
438
+ pool = self.batch_client.pool.get(pool_id)
439
+
440
+ if (
441
+ pool.state.value == "active"
442
+ and pool.allocation_state.value == "steady"
443
+ ):
444
+ return
445
+ elif pool.state.value in ["deleting", "upgrading"]:
446
+ raise Exception(
447
+ f"Pool {pool_id} is in invalid state: {pool.state.value}"
448
+ )
449
+
450
+ # Adaptive polling
451
+ if time.time() - start_time > 300: # After 5 minutes
452
+ poll_interval = min(poll_interval * 1.5, 60)
453
+
454
+ time.sleep(poll_interval)
455
+
456
+ except BatchErrorException as e:
457
+ if e.response.status_code == 404:
458
+ # Pool might be deleted, wait and retry
459
+ time.sleep(poll_interval)
460
+ else:
461
+ raise
462
+
463
+ raise Exception(
464
+ f"Pool {pool_id} did not become ready within {timeout} seconds"
465
+ )
466
+
467
+ def _create_job(self, job_id: str, pool_id: str) -> None:
468
+ """Create Azure Batch job with optimized settings.
469
+
470
+ Args:
471
+ job_id: Job identifier
472
+ pool_id: Pool identifier
473
+ """
474
+ job_config = self.job_config or BatchJobConfig(
475
+ job_id=job_id, pool_id=pool_id
476
+ )
477
+
478
+ # Build job parameters
479
+ job_params = {
480
+ "id": job_id,
481
+ "pool_info": PoolInformation(pool_id=pool_id),
482
+ "priority": job_config.priority,
483
+ "uses_task_dependencies": job_config.uses_task_dependencies,
484
+ "on_all_tasks_complete": job_config.on_all_tasks_complete,
485
+ "on_task_failure": job_config.on_task_failure,
486
+ }
487
+
488
+ # Add optional configurations
489
+ if job_config.display_name:
490
+ job_params["display_name"] = job_config.display_name
491
+
492
+ if job_config.metadata:
493
+ job_params["metadata"] = job_config.metadata
494
+
495
+ job = JobAddParameter(**job_params)
496
+ self.batch_client.job.add(job)
497
+
498
+ def _create_task(
499
+ self,
500
+ job_id: str,
501
+ task_id: str,
502
+ command_line: str,
503
+ resource_files: Optional[list[ResourceFile]] = None,
504
+ environment_settings: Optional[dict[str, str]] = None,
505
+ ) -> None:
506
+ """Create Azure Batch task with optimized settings.
507
+
508
+ Args:
509
+ job_id: Job identifier
510
+ task_id: Task identifier
511
+ command_line: Command line to execute
512
+ resource_files: Resource files for the task
513
+ environment_settings: Environment variables
514
+ """
515
+ task_config = self.task_config or BatchTaskConfig(
516
+ task_id=task_id, command_line=command_line
517
+ )
518
+
519
+ # Convert environment settings to Azure Batch format
520
+ env_settings = None
521
+ if environment_settings:
522
+ env_settings = [
523
+ {"name": k, "value": v} for k, v in environment_settings.items()
524
+ ]
525
+
526
+ # Add optimized environment variables
527
+ if env_settings is None:
528
+ env_settings = []
529
+
530
+ env_settings.extend(
531
+ [
532
+ {"name": "PYTHONUNBUFFERED", "value": "1"},
533
+ {"name": "PYTHONDONTWRITEBYTECODE", "value": "1"},
534
+ ]
535
+ )
536
+
537
+ # Build task parameters
538
+ task_params = {
539
+ "id": task_id,
540
+ "command_line": command_line,
541
+ "resource_files": resource_files or task_config.resource_files,
542
+ "environment_settings": env_settings,
543
+ "max_wall_clock_time": task_config.max_wall_clock_time,
544
+ "retention_time": task_config.retention_time,
545
+ }
546
+
547
+ # Add optional configurations
548
+ if task_config.user_identity:
549
+ task_params["user_identity"] = UserIdentity(
550
+ **task_config.user_identity
551
+ )
552
+
553
+ if task_config.constraints:
554
+ task_params["constraints"] = task_config.constraints
555
+
556
+ task = TaskAddParameter(**task_params)
557
+ self.batch_client.task.add(job_id, task)
558
+
559
+ def _wait_for_task_completion(
560
+ self, job_id: str, task_id: str, timeout: int = 3600
561
+ ) -> dict[str, Any]:
562
+ """Wait for task completion with optimized polling.
563
+
564
+ Args:
565
+ job_id: Job identifier
566
+ task_id: Task identifier
567
+ timeout: Timeout in seconds
568
+
569
+ Returns:
570
+ Dict[str, Any]: Task results
571
+ """
572
+ start_time = time.time()
573
+ poll_interval = 10
574
+
575
+ while time.time() - start_time < timeout:
576
+ try:
577
+ task = self.batch_client.task.get(job_id, task_id)
578
+
579
+ if task.state == TaskState.completed:
580
+ return self._process_successful_task(job_id, task_id, task)
581
+
582
+ elif task.state == TaskState.failed:
583
+ return self._process_failed_task(task)
584
+
585
+ elif task.state in [
586
+ TaskState.running,
587
+ TaskState.active,
588
+ TaskState.preparing,
589
+ ]:
590
+ # Adaptive polling: increase interval for long-running tasks
591
+ if time.time() - start_time > 300: # After 5 minutes
592
+ poll_interval = min(
593
+ poll_interval * 1.5, 60
594
+ ) # Max 60 seconds
595
+
596
+ time.sleep(poll_interval)
597
+ else:
598
+ # For other states, use shorter polling
599
+ time.sleep(5)
600
+
601
+ except BatchErrorException as e:
602
+ if e.response.status_code == 404:
603
+ # Task might be deleted, wait a bit and retry
604
+ time.sleep(poll_interval)
605
+ else:
606
+ # Continue polling on error with exponential backoff
607
+ poll_interval = min(poll_interval * 2, 60)
608
+ time.sleep(poll_interval)
609
+ except Exception:
610
+ # Continue polling on error with exponential backoff
611
+ poll_interval = min(poll_interval * 2, 60)
612
+ time.sleep(poll_interval)
613
+
614
+ return {"status": "timeout", "exit_code": 1}
615
+
616
+ def _process_successful_task(
617
+ self, job_id: str, task_id: str, task: Any
618
+ ) -> dict[str, Any]:
619
+ """Process successful task and download results.
620
+
621
+ Args:
622
+ job_id: Job identifier
623
+ task_id: Task identifier
624
+ task: Task object
625
+
626
+ Returns:
627
+ Dict[str, Any]: Task results with files
628
+ """
629
+ result_files = {}
630
+ try:
631
+ # Get task files
632
+ files = self.batch_client.file.list_from_task(job_id, task_id)
633
+ for file in files:
634
+ if file.name in ["stdout.txt", "stderr.txt", "result.json"]:
635
+ with self._temp_file_context() as tmp_file:
636
+ self.batch_client.file.get_from_task(
637
+ job_id, task_id, file.name, tmp_file
638
+ )
639
+ with open(tmp_file) as f:
640
+ result_files[file.name] = f.read()
641
+ except Exception:
642
+ # File download failed, continue with empty results
643
+ pass
644
+
645
+ return {
646
+ "status": "completed",
647
+ "exit_code": task.execution_info.exit_code,
648
+ "files": result_files,
649
+ }
650
+
651
+ def _process_failed_task(self, task: Any) -> dict[str, Any]:
652
+ """Process failed task and extract error information.
653
+
654
+ Args:
655
+ task: Task object
656
+
657
+ Returns:
658
+ Dict[str, Any]: Failure information
659
+ """
660
+ failure_reason = "Task failed"
661
+
662
+ # Try to extract more detailed error information
663
+ if hasattr(task, "execution_info") and task.execution_info:
664
+ if (
665
+ hasattr(task.execution_info, "failure_info")
666
+ and task.execution_info.failure_info
667
+ ):
668
+ failure_reason = str(task.execution_info.failure_info)
669
+
670
+ return {
671
+ "status": "failed",
672
+ "exit_code": (
673
+ task.execution_info.exit_code if task.execution_info else 1
674
+ ),
675
+ "failure_reason": failure_reason,
676
+ }
677
+
678
+ def _create_optimized_task_script(
679
+ self, job: Job, params: DictData, run_id: str
680
+ ) -> str:
681
+ """Create optimized Python script for task execution.
682
+
683
+ Args:
684
+ job: Job to execute
685
+ params: Job parameters
686
+ run_id: Execution run ID
687
+
688
+ Returns:
689
+ str: Path to created script
690
+ """
691
+ script_content = f'''#!/usr/bin/env python3
692
+ import json
693
+ import sys
694
+ import os
695
+ import subprocess
696
+ import time
697
+ from pathlib import Path
698
+
699
+ def install_package(package):
700
+ """Install package with retry logic."""
701
+ for attempt in range(3):
702
+ try:
703
+ subprocess.run([sys.executable, '-m', 'pip', 'install', package],
704
+ check=True, capture_output=True, timeout=300)
705
+ return True
706
+ except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
707
+ if attempt == 2:
708
+ raise
709
+ time.sleep(2 ** attempt)
710
+
711
+ def download_file(blob_url, local_path):
712
+ """Download file with retry logic."""
713
+ for attempt in range(3):
714
+ try:
715
+ subprocess.run(['az', 'storage', 'blob', 'download',
716
+ '--account-name', os.environ['STORAGE_ACCOUNT_NAME'],
717
+ '--account-key', os.environ['STORAGE_ACCOUNT_KEY'],
718
+ '--container-name', os.environ['STORAGE_CONTAINER'],
719
+ '--name', blob_url, '--file', local_path],
720
+ check=True, capture_output=True, timeout=300)
721
+ return True
722
+ except subprocess.CalledProcessError:
723
+ if attempt == 2:
724
+ raise
725
+ time.sleep(2 ** attempt)
726
+
727
+ # Install ddeutil-workflow with retry
728
+ install_package('ddeutil-workflow')
729
+
730
+ # Download files with retry
731
+ download_file(os.environ['JOB_CONFIG_BLOB'], 'job_config.json')
732
+ download_file(os.environ['PARAMS_BLOB'], 'params.json')
733
+ download_file(os.environ['SCRIPT_BLOB'], 'task_script.py')
734
+
735
+ # Add current directory to Python path
736
+ sys.path.insert(0, os.getcwd())
737
+
738
+ from ddeutil.workflow.job import local_execute
739
+ from ddeutil.workflow import Job
740
+
741
+ # Load job configuration
742
+ with open('job_config.json', 'r') as f:
743
+ job_data = json.load(f)
744
+
745
+ # Load parameters
746
+ with open('params.json', 'r') as f:
747
+ params = json.load(f)
748
+
749
+ # Create job instance
750
+ job = Job(**job_data)
751
+
752
+ # Execute job
753
+ result = local_execute(job, params, run_id='{run_id}')
754
+
755
+ # Save result
756
+ with open('result.json', 'w') as f:
757
+ json.dump(result.model_dump(), f, indent=2)
758
+
759
+ # Upload result to Azure Storage with retry
760
+ job_id = '{run_id}'
761
+ container = os.environ['STORAGE_CONTAINER']
762
+
763
+ # Upload the result file back to Azure Storage
764
+ subprocess.run(['az', 'storage', 'blob', 'upload', '--account-name', os.environ['STORAGE_ACCOUNT_NAME'], '--account-key', os.environ['STORAGE_ACCOUNT_KEY'], '--container-name', container, '--name', f'jobs/{{job_id}}/result.json', '--file', 'result.json'], check=True, capture_output=True, timeout=300)
765
+
766
+ sys.exit(0 if result.status == 'success' else 1)
767
+ '''
768
+
769
+ script_file = tempfile.NamedTemporaryFile(suffix=".py", delete=False)  # keep the file; execute_job uploads it later
770
+ with open(script_file.name, "w") as f:
771
+ f.write(script_content)
772
+ return script_file.name
773
+
774
+ def execute_job(
775
+ self,
776
+ job: Job,
777
+ params: DictData,
778
+ *,
779
+ run_id: Optional[str] = None,
780
+ event: Optional[Any] = None,
781
+ ) -> Result:
782
+ """Execute job on Azure Batch with optimized performance.
783
+
784
+ Args:
785
+ job: Job to execute
786
+ params: Job parameters
787
+ run_id: Execution run ID
788
+ event: Event for cancellation
789
+
790
+ Returns:
791
+ Result: Execution result
792
+ """
793
+ if event and event.is_set():
794
+ return Result(
795
+ status=FAILED,
796
+ context={
797
+ "errors": {"message": "Execution was canceled before start"}
798
+ },
799
+ run_id=run_id or gen_id("azure-batch"),
800
+ extras={},
801
+ )
802
+
803
+ # Generate run ID if not provided
804
+ if not run_id:
805
+ run_id = gen_id(job.id or "azure-batch", unique=True)
806
+
807
+ trace = get_trace(run_id, extras=job.extras)
808
+ trace.info(f"[AZURE_BATCH]: Starting job execution: {job.id}")
809
+
810
+ try:
811
+ # Create pool if not exists
812
+ pool_id = self.pool_config.pool_id
813
+ trace.info(f"[AZURE_BATCH]: Ensuring pool exists: {pool_id}")
814
+ self._create_optimized_pool(pool_id)
815
+
816
+ # Create job
817
+ job_id = f"workflow-job-{run_id}"
818
+ trace.info(f"[AZURE_BATCH]: Creating job: {job_id}")
819
+ self._create_job(job_id, pool_id)
820
+
821
+ # Create optimized task script
822
+ script_path = self._create_optimized_task_script(
823
+ job, params, run_id
824
+ )
825
+
826
+ # Blob names for the uploaded artifacts
827
+ job_config_blob = f"{run_id}/job_config.json"
828
+ params_blob = f"{run_id}/params.json"
829
+ script_blob = f"{run_id}/task_script.py"
830
+
831
+ # Upload files efficiently
832
+ trace.info("[AZURE_BATCH]: Uploading files to storage")
833
+
834
+ with self._temp_file_context(suffix=".json") as job_config_path:
835
+ with open(job_config_path, "w") as f:
836
+ json.dump(job.model_dump(), f)
837
+ job_config_url = self._upload_file_to_storage(job_config_path, job_config_blob)
838
+
839
+ with self._temp_file_context(suffix=".json") as params_path:
840
+ with open(params_path, "w") as f:
841
+ json.dump(params, f)
842
+ params_url = self._upload_file_to_storage(params_path, params_blob)
843
+
844
+ script_url = self._upload_file_to_storage(script_path, script_blob)
845
+
846
+ # Create resource files from the uploaded blob URLs
847
+ resource_files = [
848
+ ResourceFile(
849
+ file_path="job_config.json",
850
+ blob_source=job_config_url,
853
+ ),
854
+ ResourceFile(
855
+ file_path="params.json",
856
+ blob_source=params_url,
859
+ ),
860
+ ResourceFile(
861
+ file_path="task_script.py",
862
+ blob_source=script_url,
865
+ ),
866
+ ]
867
+
868
+ # Create task with optimized settings
869
+ task_id = f"workflow-task-{run_id}"
870
+ command_line = "python3 task_script.py"
871
+
872
+ # Set environment variables for the task
873
+ environment_settings = {
874
+ "STORAGE_ACCOUNT_NAME": self.storage_account_name,
875
+ "STORAGE_ACCOUNT_KEY": self.storage_account_key,
876
+ "STORAGE_CONTAINER": self.storage_container,
877
+ "JOB_CONFIG_BLOB": job_config_blob,
878
+ "PARAMS_BLOB": params_blob,
879
+ "SCRIPT_BLOB": script_blob,
880
+ }
881
+
882
+ trace.info(f"[AZURE_BATCH]: Creating task: {task_id}")
883
+ self._create_task(
884
+ job_id=job_id,
885
+ task_id=task_id,
886
+ command_line=command_line,
887
+ resource_files=resource_files,
888
+ environment_settings=environment_settings,
889
+ )
890
+
891
+ # Wait for task completion
892
+ trace.info("[AZURE_BATCH]: Waiting for task completion")
893
+ task_result = self._wait_for_task_completion(job_id, task_id)
894
+
895
+ # Process results
896
+ if task_result["status"] == "completed":
897
+ result_data = {}
898
+ if "result.json" in task_result.get("files", {}):
899
+ try:
900
+ result_data = json.loads(
901
+ task_result["files"]["result.json"]
902
+ )
903
+ except (json.JSONDecodeError, KeyError):
904
+ result_data = {"status": SUCCESS}
905
+
906
+ trace.info("[AZURE_BATCH]: Task completed successfully")
907
+ return Result(
908
+ status=SUCCESS,
909
+ context=result_data,
910
+ run_id=run_id,
911
+ extras=job.extras or {},
912
+ )
913
+ else:
914
+ error_msg = (
915
+ f"Task failed: {task_result.get('status', 'unknown')}"
916
+ )
917
+ if task_result.get("failure_reason"):
918
+ error_msg += f" - {task_result['failure_reason']}"
919
+
920
+ trace.error(f"[AZURE_BATCH]: {error_msg}")
921
+ return Result(
922
+ status=FAILED,
923
+ context={"errors": {"message": error_msg}},
924
+ run_id=run_id,
925
+ extras=job.extras or {},
926
+ )
927
+
928
+ except Exception as e:
929
+ trace.error(f"[AZURE_BATCH]: Execution failed: {str(e)}")
930
+ return Result(
931
+ status=FAILED,
932
+ context={"errors": {"message": str(e)}},
933
+ run_id=run_id,
934
+ extras=job.extras or {},
935
+ )
936
+
937
+ def cleanup(self, job_id: Optional[str] = None) -> None:
938
+ """Clean up Azure Batch resources efficiently.
939
+
940
+ Args:
941
+ job_id: Job ID to clean up (if None, cleans up all workflow jobs)
942
+ """
943
+ try:
944
+ if job_id:
945
+ # Delete specific job
946
+ self.batch_client.job.delete(job_id)
947
+ else:
948
+ # Delete all workflow jobs efficiently
949
+ jobs = self.batch_client.job.list()
950
+ workflow_jobs = [
951
+ job for job in jobs if job.id.startswith("workflow-job-")
952
+ ]
953
+
954
+ # Delete jobs in parallel (simplified approach)
955
+ for job in workflow_jobs:
956
+ try:
957
+ self.batch_client.job.delete(job.id)
958
+ except BatchErrorException:
959
+ # Job might already be deleted
960
+ pass
961
+ except Exception:
962
+ pass
963
+
964
+
965
+ def azure_batch_execute(
966
+ job: Job,
967
+ params: DictData,
968
+ *,
969
+ run_id: Optional[str] = None,
970
+ event: Optional[Any] = None,
971
+ ) -> Result:
972
+ """Azure Batch job execution function with optimized performance.
973
+
974
+ This function creates an Azure Batch provider and executes the job
975
+ on Azure Batch compute nodes. It handles the complete lifecycle
976
+ including pool creation, job submission, and result retrieval.
977
+
978
+ Args:
979
+ job: Job to execute
980
+ params: Job parameters
981
+ run_id: Execution run ID
982
+ event: Event for cancellation
983
+
984
+ Returns:
985
+ Result: Execution result
986
+ """
987
+ # Extract Azure Batch configuration from job
988
+ batch_args = job.runs_on.args
989
+
990
+ provider = AzureBatchProvider(
991
+ batch_account_name=batch_args.batch_account_name,
992
+ batch_account_key=batch_args.batch_account_key.get_secret_value(),
993
+ batch_account_url=batch_args.batch_account_url,
994
+ storage_account_name=batch_args.storage_account_name,
995
+ storage_account_key=batch_args.storage_account_key.get_secret_value(),
996
+ )
997
+
998
+ try:
999
+ return provider.execute_job(job, params, run_id=run_id, event=event)
1000
+ finally:
1001
+ # Clean up resources
1002
+ if run_id:
1003
+ provider.cleanup(f"workflow-job-{run_id}")
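For orientation, here is a minimal sketch of driving the module-level entrypoint directly rather than through the engine's `runs-on` dispatch; `my_job` is an assumed `Job` instance carrying the `azure_batch` settings from the module docstring:

```python
# Sketch only: the workflow engine normally routes to azure_batch_execute from the
# job's runs-on type. `my_job` is an assumed Job configured for "azure_batch".
from ddeutil.workflow import Job
from ddeutil.workflow.plugins.providers.az import azure_batch_execute

my_job: Job = ...  # assumed: built from the YAML shown in the module docstring

result = azure_batch_execute(my_job, {"name": "demo"}, run_id="job-123")
print(result.status)
```

Because a `run_id` is supplied, the wrapper's `finally` block deletes the corresponding `workflow-job-job-123` Batch job after execution, so repeated runs do not accumulate jobs in the account.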