ddeutil-workflow 0.0.78__py3-none-any.whl → 0.0.79__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddeutil/workflow/__about__.py +1 -1
- ddeutil/workflow/__init__.py +1 -5
- ddeutil/workflow/api/routes/job.py +2 -2
- ddeutil/workflow/audits.py +554 -112
- ddeutil/workflow/cli.py +19 -1
- ddeutil/workflow/conf.py +9 -21
- ddeutil/workflow/event.py +15 -6
- ddeutil/workflow/job.py +147 -73
- ddeutil/workflow/params.py +172 -58
- ddeutil/workflow/plugins/__init__.py +0 -0
- ddeutil/workflow/plugins/providers/__init__.py +0 -0
- ddeutil/workflow/plugins/providers/aws.py +908 -0
- ddeutil/workflow/plugins/providers/az.py +1003 -0
- ddeutil/workflow/plugins/providers/container.py +703 -0
- ddeutil/workflow/plugins/providers/gcs.py +826 -0
- ddeutil/workflow/result.py +6 -4
- ddeutil/workflow/reusables.py +151 -95
- ddeutil/workflow/stages.py +28 -28
- ddeutil/workflow/traces.py +1678 -540
- ddeutil/workflow/utils.py +109 -67
- ddeutil/workflow/workflow.py +20 -11
- {ddeutil_workflow-0.0.78.dist-info → ddeutil_workflow-0.0.79.dist-info}/METADATA +52 -19
- ddeutil_workflow-0.0.79.dist-info/RECORD +36 -0
- ddeutil_workflow-0.0.78.dist-info/RECORD +0 -30
- {ddeutil_workflow-0.0.78.dist-info → ddeutil_workflow-0.0.79.dist-info}/WHEEL +0 -0
- {ddeutil_workflow-0.0.78.dist-info → ddeutil_workflow-0.0.79.dist-info}/entry_points.txt +0 -0
- {ddeutil_workflow-0.0.78.dist-info → ddeutil_workflow-0.0.79.dist-info}/licenses/LICENSE +0 -0
- {ddeutil_workflow-0.0.78.dist-info → ddeutil_workflow-0.0.79.dist-info}/top_level.txt +0 -0
ddeutil/workflow/plugins/providers/az.py (new file)
@@ -0,0 +1,1003 @@
# ------------------------------------------------------------------------------
# Copyright (c) 2022 Korawich Anuttra. All rights reserved.
# Licensed under the MIT License. See LICENSE in the project root for
# license information.
# ------------------------------------------------------------------------------
"""Azure Batch Provider Module.

This module provides Azure Batch integration for workflow job execution.
It handles pool creation, job submission, task execution, and result retrieval.

The Azure Batch provider enables running workflow jobs on Azure Batch compute
nodes, providing scalable and managed execution environments for complex
workflow processing.

Key Features:
    - Automatic pool creation and management
    - Job and task submission to Azure Batch
    - Result file upload/download via Azure Storage
    - Error handling and status monitoring
    - Resource cleanup and management
    - Optimized file operations and caching

Classes:
    AzureBatchProvider: Main provider for Azure Batch operations
    BatchPoolConfig: Configuration for Azure Batch pools
    BatchJobConfig: Configuration for Azure Batch jobs
    BatchTaskConfig: Configuration for Azure Batch tasks

References:
    - https://docs.microsoft.com/en-us/azure/batch/batch-python-tutorial
    - https://docs.microsoft.com/en-us/azure/batch/batch-api-basics

Config Example:

    ```dotenv
    export AZURE_BATCH_ACCOUNT_NAME="your-batch-account"
    export AZURE_BATCH_ACCOUNT_KEY="your-batch-key"
    export AZURE_BATCH_ACCOUNT_URL="https://your-batch-account.region.batch.azure.com"
    export AZURE_STORAGE_ACCOUNT_NAME="your-storage-account"
    export AZURE_STORAGE_ACCOUNT_KEY="your-storage-key"
    ```

    ```yaml
    jobs:
      my-job:
        runs-on:
          type: "azure_batch"
          with:
            batch_account_name: "${AZURE_BATCH_ACCOUNT_NAME}"
            batch_account_key: "${AZURE_BATCH_ACCOUNT_KEY}"
            batch_account_url: "${AZURE_BATCH_ACCOUNT_URL}"
            storage_account_name: "${AZURE_STORAGE_ACCOUNT_NAME}"
            storage_account_key: "${AZURE_STORAGE_ACCOUNT_KEY}"
        stages:
          - name: "process"
            type: "py"
            run: |
              # Your processing logic here
              result.context.update({"output": "processed"})
    ```

"""
from __future__ import annotations

import json
import os
import tempfile
import time
from contextlib import contextmanager
from typing import Any, Optional

try:
    from azure.batch import BatchServiceClient
    from azure.batch.batch_auth import SharedKeyCredentials
    from azure.batch.models import (
        AutoUserSpecification,
        BatchErrorException,
        CloudServiceConfiguration,
        JobAddParameter,
        NetworkConfiguration,
        PoolAddParameter,
        PoolInformation,
        ResourceFile,
        StartTask,
        TaskAddParameter,
        TaskState,
        UserIdentity,
    )
    from azure.core.exceptions import AzureError
    from azure.storage.blob import BlobServiceClient

    AZURE_AVAILABLE = True
except ImportError:
    AZURE_AVAILABLE = False

from pydantic import BaseModel, Field

from ...__types import DictData
from ...job import Job
from ...result import FAILED, SUCCESS, Result
from ...traces import get_trace
from ...utils import gen_id

class BatchPoolConfig(BaseModel):
    """Azure Batch pool configuration."""

    pool_id: str = Field(description="Unique pool identifier")
    vm_size: str = Field(
        default="Standard_D2s_v3", description="VM size for compute nodes"
    )
    node_count: int = Field(default=1, description="Number of compute nodes")
    max_tasks_per_node: int = Field(
        default=4, description="Maximum tasks per node"
    )
    enable_auto_scale: bool = Field(
        default=False, description="Enable auto-scaling"
    )
    auto_scale_formula: Optional[str] = Field(
        default=None, description="Auto-scale formula"
    )
    os_family: str = Field(
        default="5", description="OS family (5=Ubuntu 20.04)"
    )
    os_version: str = Field(default="latest", description="OS version")
    enable_inter_node_communication: bool = Field(
        default=False, description="Enable inter-node communication"
    )
    network_configuration: Optional[dict[str, Any]] = Field(
        default=None, description="Network configuration"
    )


class BatchJobConfig(BaseModel):
    """Azure Batch job configuration."""

    job_id: str = Field(description="Unique job identifier")
    pool_id: str = Field(description="Pool ID to run the job on")
    display_name: Optional[str] = Field(
        default=None, description="Job display name"
    )
    priority: int = Field(default=0, description="Job priority")
    uses_task_dependencies: bool = Field(
        default=False, description="Use task dependencies"
    )
    on_all_tasks_complete: str = Field(
        default="noaction", description="Action when all tasks complete"
    )
    on_task_failure: str = Field(
        default="noaction", description="Action when task fails"
    )
    metadata: Optional[list[dict[str, str]]] = Field(
        default=None, description="Job metadata"
    )


class BatchTaskConfig(BaseModel):
    """Azure Batch task configuration."""

    task_id: str = Field(description="Unique task identifier")
    command_line: str = Field(description="Command line to execute")
    resource_files: Optional[list[ResourceFile]] = Field(
        default=None, description="Resource files"
    )
    environment_settings: Optional[dict[str, str]] = Field(
        default=None, description="Environment variables"
    )
    max_wall_clock_time: Optional[str] = Field(
        default="PT1H", description="Maximum wall clock time"
    )
    retention_time: Optional[str] = Field(
        default="PT1H", description="Task retention time"
    )
    user_identity: Optional[dict[str, Any]] = Field(
        default=None, description="User identity"
    )
    constraints: Optional[dict[str, Any]] = Field(
        default=None, description="Task constraints"
    )

class AzureBatchProvider:
    """Azure Batch provider for workflow job execution.

    This provider handles the complete lifecycle of Azure Batch operations
    including pool creation, job submission, task execution, and result
    retrieval. It integrates with Azure Storage for file management and
    provides comprehensive error handling and monitoring.

    Attributes:
        batch_client: Azure Batch service client
        blob_client: Azure Blob storage client
        storage_container: Storage container name for files
        pool_config: Pool configuration
        job_config: Job configuration
        task_config: Task configuration

    Example:
        ```python
        provider = AzureBatchProvider(
            batch_account_name="mybatchaccount",
            batch_account_key="mykey",
            batch_account_url="https://mybatchaccount.region.batch.azure.com",
            storage_account_name="mystorageaccount",
            storage_account_key="mystoragekey"
        )

        result = provider.execute_job(job, params, run_id="job-123")
        ```
    """

    def __init__(
        self,
        batch_account_name: str,
        batch_account_key: str,
        batch_account_url: str,
        storage_account_name: str,
        storage_account_key: str,
        storage_container: str = "workflow-files",
        pool_config: Optional[BatchPoolConfig] = None,
        job_config: Optional[BatchJobConfig] = None,
        task_config: Optional[BatchTaskConfig] = None,
    ):
        """Initialize Azure Batch provider.

        Args:
            batch_account_name: Azure Batch account name
            batch_account_key: Azure Batch account key
            batch_account_url: Azure Batch account URL
            storage_account_name: Azure Storage account name
            storage_account_key: Azure Storage account key
            storage_container: Storage container name for files
            pool_config: Pool configuration
            job_config: Job configuration
            task_config: Task configuration
        """
        if not AZURE_AVAILABLE:
            raise ImportError(
                "Azure Batch dependencies not available. "
                "Install with: pip install ddeutil-workflow[azure]"
            )

        self.batch_account_name = batch_account_name
        self.batch_account_key = batch_account_key
        self.batch_account_url = batch_account_url
        self.storage_account_name = storage_account_name
        self.storage_account_key = storage_account_key
        self.storage_container = storage_container

        # Initialize clients with optimized configuration
        self.batch_client = self._create_batch_client()
        self.blob_client = self._create_blob_client()

        # Set configurations
        self.pool_config = pool_config or BatchPoolConfig(
            pool_id=f"workflow-pool-{gen_id('pool')}"
        )
        self.job_config = job_config
        self.task_config = task_config

        # Cache for container operations
        self._container_exists: Optional[bool] = None

    def _create_batch_client(self) -> BatchServiceClient:
        """Create Azure Batch service client with optimized configuration."""
        credentials = SharedKeyCredentials(
            self.batch_account_name, self.batch_account_key
        )
        return BatchServiceClient(credentials, self.batch_account_url)

    def _create_blob_client(self) -> BlobServiceClient:
        """Create Azure Blob storage client with optimized configuration."""
        connection_string = (
            f"DefaultEndpointsProtocol=https;"
            f"AccountName={self.storage_account_name};"
            f"AccountKey={self.storage_account_key};"
            f"EndpointSuffix=core.windows.net"
        )
        return BlobServiceClient.from_connection_string(connection_string)

    @contextmanager
    def _temp_file_context(self, suffix: str = ".tmp"):
        """Context manager for temporary file operations."""
        temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
        try:
            yield temp_file.name
        finally:
            try:
                os.unlink(temp_file.name)
            except OSError:
                pass

    def _ensure_storage_container(self) -> None:
        """Ensure storage container exists with optimized settings."""
        if self._container_exists is None:
            container_client = self.blob_client.get_container_client(
                self.storage_container
            )
            try:
                container_client.get_container_properties()
                self._container_exists = True
            except AzureError:
                # Create container with optimized settings
                container_client.create_container(
                    metadata={
                        "workflow_provider": "azure_batch",
                        "created_time": str(time.time()),
                    }
                )
                self._container_exists = True

    def _upload_file_to_storage(self, file_path: str, blob_name: str) -> str:
        """Upload file to Azure Storage with optimized settings.

        Args:
            file_path: Local file path
            blob_name: Blob name in storage

        Returns:
            str: Blob URL
        """
        self._ensure_storage_container()
        container_client = self.blob_client.get_container_client(
            self.storage_container
        )
        blob_client = container_client.get_blob_client(blob_name)

        # Set optimized metadata
        metadata = {
            "workflow_provider": "azure_batch",
            "upload_time": str(time.time()),
            "content_type": "application/octet-stream",
        }

        with open(file_path, "rb") as data:
            blob_client.upload_blob(
                data,
                overwrite=True,
                metadata=metadata,
                content_settings=None,  # Let Azure determine content type
            )

        return blob_client.url

    def _download_file_from_storage(
        self, blob_name: str, local_path: str
    ) -> None:
        """Download file from Azure Storage with optimized settings.

        Args:
            blob_name: Blob name in storage
            local_path: Local file path
        """
        container_client = self.blob_client.get_container_client(
            self.storage_container
        )
        blob_client = container_client.get_blob_client(blob_name)

        with open(local_path, "wb") as data:
            blob_client.download_blob().readinto(data)

    def _create_optimized_pool(self, pool_id: str) -> None:
        """Create Azure Batch pool with optimized settings.

        Args:
            pool_id: Pool identifier
        """
        try:
            self.batch_client.pool.get(pool_id)
            return
        except BatchErrorException as e:
            if e.response.status_code != 404:
                raise

        pool_config = self.pool_config

        # Create optimized start task for pool initialization
        start_task = StartTask(
            command_line=(
                "apt-get update && "
                "apt-get install -y python3 python3-pip curl && "
                "pip3 install --no-cache-dir ddeutil-workflow && "
                "echo 'Pool initialization completed'"
            ),
            wait_for_success=True,
            user_identity=UserIdentity(
                auto_user=AutoUserSpecification(
                    scope="pool", elevation_level="admin"
                )
            ),
            max_task_retry_count=2,
        )

        # Build pool configuration
        pool_params = {
            "id": pool_id,
            "vm_size": pool_config.vm_size,
            "target_dedicated_nodes": pool_config.node_count,
            "task_slots_per_node": pool_config.max_tasks_per_node,
            "enable_auto_scale": pool_config.enable_auto_scale,
            "start_task": start_task,
            "enable_inter_node_communication": pool_config.enable_inter_node_communication,
        }

        # Add auto-scale formula if enabled
        if pool_config.enable_auto_scale and pool_config.auto_scale_formula:
            pool_params["auto_scale_formula"] = pool_config.auto_scale_formula

        # Add network configuration if specified
        if pool_config.network_configuration:
            pool_params["network_configuration"] = NetworkConfiguration(
                **pool_config.network_configuration
            )

        # Use Cloud Service configuration for better compatibility
        pool_params["cloud_service_configuration"] = CloudServiceConfiguration(
            os_family=pool_config.os_family, os_version=pool_config.os_version
        )

        new_pool = PoolAddParameter(**pool_params)
        self.batch_client.pool.add(new_pool)

        # Wait for pool to be ready with optimized polling
        self._wait_for_pool_ready(pool_id)

    def _wait_for_pool_ready(self, pool_id: str, timeout: int = 1800) -> None:
        """Wait for pool to be ready with optimized polling.

        Args:
            pool_id: Pool identifier
            timeout: Timeout in seconds
        """
        start_time = time.time()
        poll_interval = 10

        while time.time() - start_time < timeout:
            try:
                pool = self.batch_client.pool.get(pool_id)

                if (
                    pool.state.value == "active"
                    and pool.allocation_state.value == "steady"
                ):
                    return
                elif pool.state.value in ["deleting", "upgrading"]:
                    raise Exception(
                        f"Pool {pool_id} is in invalid state: {pool.state.value}"
                    )

                # Adaptive polling
                if time.time() - start_time > 300:  # After 5 minutes
                    poll_interval = min(poll_interval * 1.5, 60)

                time.sleep(poll_interval)

            except BatchErrorException as e:
                if e.response.status_code == 404:
                    # Pool might be deleted, wait and retry
                    time.sleep(poll_interval)
                else:
                    raise

        raise Exception(
            f"Pool {pool_id} did not become ready within {timeout} seconds"
        )

    def _create_job(self, job_id: str, pool_id: str) -> None:
        """Create Azure Batch job with optimized settings.

        Args:
            job_id: Job identifier
            pool_id: Pool identifier
        """
        job_config = self.job_config or BatchJobConfig(
            job_id=job_id, pool_id=pool_id
        )

        # Build job parameters
        job_params = {
            "id": job_id,
            "pool_info": PoolInformation(pool_id=pool_id),
            "priority": job_config.priority,
            "uses_task_dependencies": job_config.uses_task_dependencies,
            "on_all_tasks_complete": job_config.on_all_tasks_complete,
            "on_task_failure": job_config.on_task_failure,
        }

        # Add optional configurations
        if job_config.display_name:
            job_params["display_name"] = job_config.display_name

        if job_config.metadata:
            job_params["metadata"] = job_config.metadata

        job = JobAddParameter(**job_params)
        self.batch_client.job.add(job)

    def _create_task(
        self,
        job_id: str,
        task_id: str,
        command_line: str,
        resource_files: Optional[list[ResourceFile]] = None,
        environment_settings: Optional[dict[str, str]] = None,
    ) -> None:
        """Create Azure Batch task with optimized settings.

        Args:
            job_id: Job identifier
            task_id: Task identifier
            command_line: Command line to execute
            resource_files: Resource files for the task
            environment_settings: Environment variables
        """
        task_config = self.task_config or BatchTaskConfig(
            task_id=task_id, command_line=command_line
        )

        # Convert environment settings to Azure Batch format
        env_settings = None
        if environment_settings:
            env_settings = [
                {"name": k, "value": v} for k, v in environment_settings.items()
            ]

        # Add optimized environment variables
        if env_settings is None:
            env_settings = []

        env_settings.extend(
            [
                {"name": "PYTHONUNBUFFERED", "value": "1"},
                {"name": "PYTHONDONTWRITEBYTECODE", "value": "1"},
            ]
        )

        # Build task parameters
        task_params = {
            "id": task_id,
            "command_line": command_line,
            "resource_files": resource_files or task_config.resource_files,
            "environment_settings": env_settings,
            "max_wall_clock_time": task_config.max_wall_clock_time,
            "retention_time": task_config.retention_time,
        }

        # Add optional configurations
        if task_config.user_identity:
            task_params["user_identity"] = UserIdentity(
                **task_config.user_identity
            )

        if task_config.constraints:
            task_params["constraints"] = task_config.constraints

        task = TaskAddParameter(**task_params)
        self.batch_client.task.add(job_id, task)

    def _wait_for_task_completion(
        self, job_id: str, task_id: str, timeout: int = 3600
    ) -> dict[str, Any]:
        """Wait for task completion with optimized polling.

        Args:
            job_id: Job identifier
            task_id: Task identifier
            timeout: Timeout in seconds

        Returns:
            Dict[str, Any]: Task results
        """
        start_time = time.time()
        poll_interval = 10

        while time.time() - start_time < timeout:
            try:
                task = self.batch_client.task.get(job_id, task_id)

                if task.state == TaskState.completed:
                    return self._process_successful_task(job_id, task_id, task)

                elif task.state == TaskState.failed:
                    return self._process_failed_task(task)

                elif task.state in [
                    TaskState.running,
                    TaskState.active,
                    TaskState.preparing,
                ]:
                    # Adaptive polling: increase interval for long-running tasks
                    if time.time() - start_time > 300:  # After 5 minutes
                        poll_interval = min(
                            poll_interval * 1.5, 60
                        )  # Max 60 seconds

                    time.sleep(poll_interval)
                else:
                    # For other states, use shorter polling
                    time.sleep(5)

            except BatchErrorException as e:
                if e.response.status_code == 404:
                    # Task might be deleted, wait a bit and retry
                    time.sleep(poll_interval)
                else:
                    # Continue polling on error with exponential backoff
                    poll_interval = min(poll_interval * 2, 60)
                    time.sleep(poll_interval)
            except Exception:
                # Continue polling on error with exponential backoff
                poll_interval = min(poll_interval * 2, 60)
                time.sleep(poll_interval)

        return {"status": "timeout", "exit_code": 1}

    def _process_successful_task(
        self, job_id: str, task_id: str, task: Any
    ) -> dict[str, Any]:
        """Process successful task and download results.

        Args:
            job_id: Job identifier
            task_id: Task identifier
            task: Task object

        Returns:
            Dict[str, Any]: Task results with files
        """
        result_files = {}
        try:
            # Get task files
            files = self.batch_client.file.list_from_task(job_id, task_id)
            for file in files:
                if file.name in ["stdout.txt", "stderr.txt", "result.json"]:
                    with self._temp_file_context() as tmp_file:
                        self.batch_client.file.get_from_task(
                            job_id, task_id, file.name, tmp_file
                        )
                        with open(tmp_file) as f:
                            result_files[file.name] = f.read()
        except Exception:
            # File download failed, continue with empty results
            pass

        return {
            "status": "completed",
            "exit_code": task.execution_info.exit_code,
            "files": result_files,
        }

    def _process_failed_task(self, task: Any) -> dict[str, Any]:
        """Process failed task and extract error information.

        Args:
            task: Task object

        Returns:
            Dict[str, Any]: Failure information
        """
        failure_reason = "Task failed"

        # Try to extract more detailed error information
        if hasattr(task, "execution_info") and task.execution_info:
            if (
                hasattr(task.execution_info, "failure_info")
                and task.execution_info.failure_info
            ):
                failure_reason = str(task.execution_info.failure_info)

        return {
            "status": "failed",
            "exit_code": (
                task.execution_info.exit_code if task.execution_info else 1
            ),
            "failure_reason": failure_reason,
        }

    def _create_optimized_task_script(
        self, job: Job, params: DictData, run_id: str
    ) -> str:
        """Create optimized Python script for task execution.

        Args:
            job: Job to execute
            params: Job parameters
            run_id: Execution run ID

        Returns:
            str: Path to created script
        """
        script_content = f'''#!/usr/bin/env python3
import json
import sys
import os
import subprocess
import time
from pathlib import Path

def install_package(package):
    """Install package with retry logic."""
    for attempt in range(3):
        try:
            subprocess.run([sys.executable, '-m', 'pip', 'install', package],
                           check=True, capture_output=True, timeout=300)
            return True
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
            if attempt == 2:
                raise
            time.sleep(2 ** attempt)

def download_file(blob_url, local_path):
    """Download file with retry logic."""
    for attempt in range(3):
        try:
            subprocess.run(['az', 'storage', 'blob', 'download',
                            '--account-name', os.environ['STORAGE_ACCOUNT_NAME'],
                            '--account-key', os.environ['STORAGE_ACCOUNT_KEY'],
                            '--container-name', os.environ['STORAGE_CONTAINER'],
                            '--name', blob_url, '--file', local_path],
                           check=True, capture_output=True, timeout=300)
            return True
        except subprocess.CalledProcessError:
            if attempt == 2:
                raise
            time.sleep(2 ** attempt)

# Install ddeutil-workflow with retry
install_package('ddeutil-workflow')

# Download files with retry
download_file(os.environ['JOB_CONFIG_BLOB'], 'job_config.json')
download_file(os.environ['PARAMS_BLOB'], 'params.json')
download_file(os.environ['SCRIPT_BLOB'], 'task_script.py')

# Add current directory to Python path
sys.path.insert(0, os.getcwd())

from ddeutil.workflow.job import local_execute
from ddeutil.workflow import Job

# Load job configuration
with open('job_config.json', 'r') as f:
    job_data = json.load(f)

# Load parameters
with open('params.json', 'r') as f:
    params = json.load(f)

# Create job instance
job = Job(**job_data)

# Execute job
result = local_execute(job, params, run_id='{run_id}')

# Save result
with open('result.json', 'w') as f:
    json.dump(result.model_dump(), f, indent=2)

# Upload result to Azure Storage with retry
job_id = '{run_id}'
container = os.environ['STORAGE_CONTAINER']

# Upload result file with retry
download_file('result.json', f'jobs/{{job_id}}/result.json')

sys.exit(0 if result.status == 'success' else 1)
'''

        with self._temp_file_context(suffix=".py") as script_path:
            with open(script_path, "w") as f:
                f.write(script_content)
            return script_path

    def execute_job(
        self,
        job: Job,
        params: DictData,
        *,
        run_id: Optional[str] = None,
        event: Optional[Any] = None,
    ) -> Result:
        """Execute job on Azure Batch with optimized performance.

        Args:
            job: Job to execute
            params: Job parameters
            run_id: Execution run ID
            event: Event for cancellation

        Returns:
            Result: Execution result
        """
        if event and event.is_set():
            return Result(
                status=FAILED,
                context={
                    "errors": {"message": "Execution was canceled before start"}
                },
                run_id=run_id or gen_id("azure-batch"),
                extras={},
            )

        # Generate run ID if not provided
        if not run_id:
            run_id = gen_id(job.id or "azure-batch", unique=True)

        trace = get_trace(run_id, extras=job.extras)
        trace.info(f"[AZURE_BATCH]: Starting job execution: {job.id}")

        try:
            # Create pool if not exists
            pool_id = self.pool_config.pool_id
            trace.info(f"[AZURE_BATCH]: Ensuring pool exists: {pool_id}")
            self._create_optimized_pool(pool_id)

            # Create job
            job_id = f"workflow-job-{run_id}"
            trace.info(f"[AZURE_BATCH]: Creating job: {job_id}")
            self._create_job(job_id, pool_id)

            # Create optimized task script
            script_path = self._create_optimized_task_script(
                job, params, run_id
            )

            # Upload files efficiently
            job_config_blob = f"{run_id}/job_config.json"
            params_blob = f"{run_id}/params.json"
            script_blob = f"{run_id}/task_script.py"

            # Upload files efficiently
            trace.info("[AZURE_BATCH]: Uploading files to storage")

            with self._temp_file_context(suffix=".json") as job_config_path:
                with open(job_config_path, "w") as f:
                    json.dump(job.model_dump(), f)
                self._upload_file_to_storage(job_config_path, job_config_blob)

            with self._temp_file_context(suffix=".json") as params_path:
                with open(params_path, "w") as f:
                    json.dump(params, f)
                self._upload_file_to_storage(params_path, params_blob)

            self._upload_file_to_storage(script_path, script_blob)

            # Create resource files
            resource_files = [
                ResourceFile(
                    file_path="job_config.json",
                    blob_source=self._upload_file_to_storage(
                        job_config_path, job_config_blob
                    ),
                ),
                ResourceFile(
                    file_path="params.json",
                    blob_source=self._upload_file_to_storage(
                        params_path, params_blob
                    ),
                ),
                ResourceFile(
                    file_path="task_script.py",
                    blob_source=self._upload_file_to_storage(
                        script_path, script_blob
                    ),
                ),
            ]

            # Create task with optimized settings
            task_id = f"workflow-task-{run_id}"
            command_line = "python3 task_script.py"

            # Set environment variables for the task
            environment_settings = {
                "STORAGE_ACCOUNT_NAME": self.storage_account_name,
                "STORAGE_ACCOUNT_KEY": self.storage_account_key,
                "STORAGE_CONTAINER": self.storage_container,
                "JOB_CONFIG_BLOB": job_config_blob,
                "PARAMS_BLOB": params_blob,
                "SCRIPT_BLOB": script_blob,
            }

            trace.info(f"[AZURE_BATCH]: Creating task: {task_id}")
            self._create_task(
                job_id=job_id,
                task_id=task_id,
                command_line=command_line,
                resource_files=resource_files,
                environment_settings=environment_settings,
            )

            # Wait for task completion
            trace.info("[AZURE_BATCH]: Waiting for task completion")
            task_result = self._wait_for_task_completion(job_id, task_id)

            # Process results
            if task_result["status"] == "completed":
                result_data = {}
                if "result.json" in task_result.get("files", {}):
                    try:
                        result_data = json.loads(
                            task_result["files"]["result.json"]
                        )
                    except (json.JSONDecodeError, KeyError):
                        result_data = {"status": SUCCESS}

                trace.info("[AZURE_BATCH]: Task completed successfully")
                return Result(
                    status=SUCCESS,
                    context=result_data,
                    run_id=run_id,
                    extras=job.extras or {},
                )
            else:
                error_msg = (
                    f"Task failed: {task_result.get('status', 'unknown')}"
                )
                if task_result.get("failure_reason"):
                    error_msg += f" - {task_result['failure_reason']}"

                trace.error(f"[AZURE_BATCH]: {error_msg}")
                return Result(
                    status=FAILED,
                    context={"errors": {"message": error_msg}},
                    run_id=run_id,
                    extras=job.extras or {},
                )

        except Exception as e:
            trace.error(f"[AZURE_BATCH]: Execution failed: {str(e)}")
            return Result(
                status=FAILED,
                context={"errors": {"message": str(e)}},
                run_id=run_id,
                extras=job.extras or {},
            )

    def cleanup(self, job_id: Optional[str] = None) -> None:
        """Clean up Azure Batch resources efficiently.

        Args:
            job_id: Job ID to clean up (if None, cleans up all workflow jobs)
        """
        try:
            if job_id:
                # Delete specific job
                self.batch_client.job.delete(job_id)
            else:
                # Delete all workflow jobs efficiently
                jobs = self.batch_client.job.list()
                workflow_jobs = [
                    job for job in jobs if job.id.startswith("workflow-job-")
                ]

                # Delete jobs in parallel (simplified approach)
                for job in workflow_jobs:
                    try:
                        self.batch_client.job.delete(job.id)
                    except BatchErrorException:
                        # Job might already be deleted
                        pass
        except Exception:
            pass


def azure_batch_execute(
    job: Job,
    params: DictData,
    *,
    run_id: Optional[str] = None,
    event: Optional[Any] = None,
) -> Result:
    """Azure Batch job execution function with optimized performance.

    This function creates an Azure Batch provider and executes the job
    on Azure Batch compute nodes. It handles the complete lifecycle
    including pool creation, job submission, and result retrieval.

    Args:
        job: Job to execute
        params: Job parameters
        run_id: Execution run ID
        event: Event for cancellation

    Returns:
        Result: Execution result
    """
    # Extract Azure Batch configuration from job
    batch_args = job.runs_on.args

    provider = AzureBatchProvider(
        batch_account_name=batch_args.batch_account_name,
        batch_account_key=batch_args.batch_account_key.get_secret_value(),
        batch_account_url=batch_args.batch_account_url,
        storage_account_name=batch_args.storage_account_name,
        storage_account_key=batch_args.storage_account_key.get_secret_value(),
    )

    try:
        return provider.execute_job(job, params, run_id=run_id, event=event)
    finally:
        # Clean up resources
        if run_id:
            provider.cleanup(f"workflow-job-{run_id}")
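For orientation, a minimal usage sketch of the provider added by this release follows. It is illustrative only and not part of the diff: it assumes the optional Azure extras are installed (`pip install ddeutil-workflow[azure]`, per the module's own ImportError message), that `job` and `params` are already loaded from an existing workflow configuration, and that all account names, URLs, keys, and the demo run ID are placeholders.

# Illustrative sketch only -- not part of the 0.0.79 diff.
# Assumes: pip install ddeutil-workflow[azure]; placeholder credentials;
# `job` and `params` already loaded from an existing workflow configuration.
from ddeutil.workflow.plugins.providers.az import (
    AzureBatchProvider,
    BatchPoolConfig,
)

provider = AzureBatchProvider(
    batch_account_name="my-batch-account",
    batch_account_key="<batch-account-key>",
    batch_account_url="https://my-batch-account.eastus.batch.azure.com",
    storage_account_name="my-storage-account",
    storage_account_key="<storage-account-key>",
    # Optional: override the default pool (Standard_D2s_v3, 1 node).
    pool_config=BatchPoolConfig(pool_id="workflow-pool-demo", node_count=2),
)

# job, params = ...  # a Job parsed from your workflow YAML and its parameters
result = provider.execute_job(job, params, run_id="demo-001")
print(result.status)  # SUCCESS or FAILED
provider.cleanup("workflow-job-demo-001")  # job IDs are prefixed "workflow-job-"

In workflow YAML, the same provider is selected with `runs-on: type: "azure_batch"` as shown in the module docstring above, in which case `azure_batch_execute` handles provider construction and cleanup automatically.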