py2ls 0.1.10.12__py3-none-any.whl → 0.2.7.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of py2ls might be problematic.
- py2ls/.DS_Store +0 -0
- py2ls/.git/.DS_Store +0 -0
- py2ls/.git/index +0 -0
- py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
- py2ls/.git/objects/.DS_Store +0 -0
- py2ls/.git/refs/.DS_Store +0 -0
- py2ls/ImageLoader.py +621 -0
- py2ls/__init__.py +7 -5
- py2ls/apptainer2ls.py +3940 -0
- py2ls/batman.py +164 -42
- py2ls/bio.py +2595 -0
- py2ls/cell_image_clf.py +1632 -0
- py2ls/container2ls.py +4635 -0
- py2ls/corr.py +475 -0
- py2ls/data/.DS_Store +0 -0
- py2ls/data/email/email_html_template.html +88 -0
- py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
- py2ls/data/hyper_param_tabrepo_2024.py +1753 -0
- py2ls/data/mygenes_fields_241022.txt +355 -0
- py2ls/data/re_common_pattern.json +173 -0
- py2ls/data/sns_info.json +74 -0
- py2ls/data/styles/.DS_Store +0 -0
- py2ls/data/styles/example/.DS_Store +0 -0
- py2ls/data/styles/stylelib/.DS_Store +0 -0
- py2ls/data/styles/stylelib/grid.mplstyle +15 -0
- py2ls/data/styles/stylelib/high-contrast.mplstyle +6 -0
- py2ls/data/styles/stylelib/high-vis.mplstyle +4 -0
- py2ls/data/styles/stylelib/ieee.mplstyle +15 -0
- py2ls/data/styles/stylelib/light.mplstyl +6 -0
- py2ls/data/styles/stylelib/muted.mplstyle +6 -0
- py2ls/data/styles/stylelib/nature-reviews-latex.mplstyle +616 -0
- py2ls/data/styles/stylelib/nature-reviews.mplstyle +616 -0
- py2ls/data/styles/stylelib/nature.mplstyle +31 -0
- py2ls/data/styles/stylelib/no-latex.mplstyle +10 -0
- py2ls/data/styles/stylelib/notebook.mplstyle +36 -0
- py2ls/data/styles/stylelib/paper.mplstyle +290 -0
- py2ls/data/styles/stylelib/paper2.mplstyle +305 -0
- py2ls/data/styles/stylelib/retro.mplstyle +4 -0
- py2ls/data/styles/stylelib/sans.mplstyle +10 -0
- py2ls/data/styles/stylelib/scatter.mplstyle +7 -0
- py2ls/data/styles/stylelib/science.mplstyle +48 -0
- py2ls/data/styles/stylelib/std-colors.mplstyle +4 -0
- py2ls/data/styles/stylelib/vibrant.mplstyle +6 -0
- py2ls/data/tiles.csv +146 -0
- py2ls/data/usages_pd.json +1417 -0
- py2ls/data/usages_sns.json +31 -0
- py2ls/docker2ls.py +5446 -0
- py2ls/ec2ls.py +61 -0
- py2ls/fetch_update.py +145 -0
- py2ls/ich2ls.py +1955 -296
- py2ls/im2.py +8242 -0
- py2ls/image_ml2ls.py +2100 -0
- py2ls/ips.py +33909 -3418
- py2ls/ml2ls.py +7700 -0
- py2ls/mol.py +289 -0
- py2ls/mount2ls.py +1307 -0
- py2ls/netfinder.py +873 -351
- py2ls/nl2ls.py +283 -0
- py2ls/ocr.py +1581 -458
- py2ls/plot.py +10394 -314
- py2ls/rna2ls.py +311 -0
- py2ls/ssh2ls.md +456 -0
- py2ls/ssh2ls.py +5933 -0
- py2ls/ssh2ls_v01.py +2204 -0
- py2ls/stats.py +66 -172
- py2ls/temp20251124.py +509 -0
- py2ls/translator.py +2 -0
- py2ls/utils/decorators.py +3564 -0
- py2ls/utils_bio.py +3453 -0
- {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/METADATA +113 -224
- {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/RECORD +72 -16
- {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/WHEEL +0 -0
py2ls/container2ls.py
ADDED
@@ -0,0 +1,4635 @@
"""
Universal Container Scheduler
=============================================

An enterprise-grade, production-ready command execution framework supporting local
execution, containerized execution (Docker/Apptainer), and HPC cluster scheduling
(SLURM/PBS/Torque). Provides complete job management, monitoring, retry, and
resource management.

Core features:
    Multi-backend support: Local, Docker, Apptainer, SLURM, PBS
    Full job management: submit, execute, monitor, cancel
    Smart retry: exponential backoff, conditional retries
    Resource management: CPU, memory, GPU requests and limits
    Priority queue: priority-based job scheduling
    Timeout monitoring: automatic detection and cancellation of timed-out jobs
    Result caching: avoid re-running identical commands
    Persistent storage: jobs and results stored in SQLite
    Plugin system: extensible plugin architecture
    Health checks: comprehensive system health monitoring
    Metrics collection: performance metrics and statistics
    Command-line interface: full CLI support
Design principles:
---------
- **Unified interface**: every backend uses the same API
- **Configuration driven**: complex parameters are managed through config classes
- **Type safety**: complete type annotations
- **Observability**: built-in monitoring and logging
- **Extensibility**: plugin architecture, easy to add new features

Quick start:
---------
```python
from universal_scheduler import ContainerScheduler, Backend, ResourceRequest

# Create a scheduler instance
scheduler = ContainerScheduler()

# Run a simple command
result = scheduler.run("echo 'Hello, World!'", backend=Backend.LOCAL)

# Use a Docker container
result = scheduler.run(
    cmd="python -c 'import numpy; print(numpy.__version__)'",
    backend=Backend.DOCKER,
    image="python:3.9-slim",
    mounts={"/data": "/data"}
)

# Submit a SLURM job
result = scheduler.run(
    cmd="python train_model.py",
    backend=Backend.SLURM,
    resource=ResourceRequest(cpus=8, memory_gb=32, gpus=1),
    job_name="model_training"
)

# Suggested file layout:
universal_scheduler/
├── __init__.py        # main module
├── scheduler.py       # main scheduler class
├── models.py          # data models (ResourceRequest, JobResult, ...)
├── plugins.py         # plugin system
├── storage.py         # storage and caching
├── monitors.py        # monitoring and metrics
├── backends.py        # backend implementations
├── cli.py             # command-line interface
├── config/
│   └── default.yaml   # default configuration
└── examples/          # example code

"""

import subprocess
import shlex
import time
import logging
import os
import signal
import sys
import tempfile
import json
import hashlib
import pickle
import sqlite3
import uuid
import socket
import threading
import inspect
import asyncio
import heapq
import random
import multiprocessing as mp
from pathlib import Path
from typing import Dict, List, Optional, Union, Callable, Any, Tuple, Set, Type
from dataclasses import dataclass, field, asdict
from enum import Enum, auto
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, Future, ProcessPoolExecutor
from abc import ABC, abstractmethod
from functools import wraps, lru_cache
from contextlib import contextmanager
import warnings

try:
    import yaml
    YAML_AVAILABLE = True
except ImportError:
    YAML_AVAILABLE = False
    warnings.warn("PyYAML not installed, YAML config support disabled")

try:
    import psutil
    PSUTIL_AVAILABLE = True
except ImportError:
    PSUTIL_AVAILABLE = False
    warnings.warn("psutil not installed, resource monitoring limited")

# ============================================================================
# Core enums and data types
# ============================================================================

class Backend(Enum):
    """Supported execution backends"""
    LOCAL = "local"
    DOCKER = "docker"
    APPTAINER = "apptainer"
    SLURM = "slurm"
    PBS = "pbs"
    KUBERNETES = "kubernetes"
    AWS_BATCH = "aws_batch"
    AZURE_BATCH = "azure_batch"

class JobPriority(Enum):
    """Job priority levels"""
    LOWEST = 0
    LOW = 1
    NORMAL = 2
    HIGH = 3
    HIGHEST = 4
    CRITICAL = 5

class JobStatus(Enum):
    """Job status"""
    CREATED = "created"
    PENDING = "pending"
    QUEUED = "queued"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
    TIMEOUT = "timeout"
    SUSPENDED = "suspended"
    UNKNOWN = "unknown"

class ResourceType(Enum):
    """Resource types"""
    CPU = "cpu"
    GPU = "gpu"
    MEMORY = "memory"
    DISK = "disk"
    NETWORK = "network"
    TIME = "time"

# ============================================================================
# Core data classes
# ============================================================================

@dataclass
class ResourceRequest:
    """
    Resource request configuration.
    Defines the compute resources a job needs (CPU, memory, GPU, etc.)
    and supports mapping them onto the different backends.
    """
    cpus: int = 1
    memory_gb: float = 1.0
    memory_mb: Optional[int] = None
    gpus: int = 0
    gpu_type: Optional[str] = None
    time_minutes: Optional[int] = None
    time_hours: Optional[int] = None
    partition: Optional[str] = None
    queue: Optional[str] = None
    nodes: int = 1
    tasks_per_node: int = 1
    account: Optional[str] = None
    reservation: Optional[str] = None
    qos: Optional[str] = None
    walltime: Optional[str] = None
    exclusive: bool = False
    constraints: Optional[str] = None
    features: Optional[str] = None

    def __post_init__(self):
        """Post-initialization: keep memory units consistent"""
        if self.memory_mb is None and self.memory_gb is not None:
            self.memory_mb = int(self.memory_gb * 1024)

        # Keep time units consistent
        if self.time_hours is not None and self.time_minutes is None:
            self.time_minutes = self.time_hours * 60
        elif self.time_minutes is not None and self.time_hours is None:
            self.time_hours = self.time_minutes / 60

    @property
    def total_cpus(self) -> int:
        """Total number of CPU cores"""
        return self.cpus * self.nodes * self.tasks_per_node

    def to_slurm_directives(self) -> Dict[str, str]:
        """Convert to SLURM directives"""
        directives = {}
        if self.cpus > 1:
            directives["--cpus-per-task"] = str(self.cpus)
        if self.memory_mb:
            directives["--mem"] = f"{self.memory_mb}M"
        if self.gpus > 0:
            gres = f"gpu:{self.gpus}"
            if self.gpu_type:
                gres = f"gpu:{self.gpu_type}:{self.gpus}"
            directives["--gres"] = gres
        if self.time_minutes:
            directives["--time"] = str(self.time_minutes)
        if self.partition:
            directives["--partition"] = self.partition
        if self.account:
            directives["--account"] = self.account
        if self.qos:
            directives["--qos"] = self.qos
        if self.nodes > 1:
            directives["--nodes"] = str(self.nodes)
        if self.tasks_per_node > 1:
            directives["--ntasks-per-node"] = str(self.tasks_per_node)
        if self.exclusive:
            directives["--exclusive"] = ""
        if self.constraints:
            directives["--constraint"] = self.constraints
        return directives

    def to_pbs_directives(self) -> Dict[str, str]:
        """Convert to PBS directives"""
        directives = {}
        directives["-l nodes"] = f"{self.nodes}:ppn={self.cpus}"
        if self.memory_mb:
            directives["-l mem"] = f"{self.memory_mb}mb"
        if self.time_hours:
            directives["-l walltime"] = f"{self.time_hours}:00:00"
        if self.queue:
            directives["-q"] = self.queue
        if self.gpus > 0:
            directives["-l gpus"] = str(self.gpus)
        if self.gpu_type:
            directives["-l gputype"] = self.gpu_type
        return directives

@dataclass
class RetryConfig:
    """
    Retry policy configuration.
    Defines how a failed job is retried, supporting exponential backoff,
    conditional retries, and similar strategies.
    """
    max_attempts: int = 1
    delay_seconds: float = 1.0
    backoff_factor: float = 2.0
    max_delay_seconds: float = 300.0
    jitter_seconds: float = 0.0
    retry_on_exit_codes: List[int] = field(default_factory=list)
    retry_on_timeout: bool = True
    retry_on_signal: bool = False
    retry_on_memory_error: bool = True
    retry_on_disk_full: bool = True
    retry_on_network_error: bool = True
    retry_condition: Optional[Callable[['JobResult'], bool]] = None

    def get_delay(self, attempt: int) -> float:
        """Compute the delay before the given retry attempt"""
        delay = self.delay_seconds * (self.backoff_factor ** (attempt - 1))
        delay = min(delay, self.max_delay_seconds)

        # Add jitter
        if self.jitter_seconds > 0:
            delay += random.uniform(-self.jitter_seconds, self.jitter_seconds)
            delay = max(0, delay)

        return delay

    def should_retry(self, result: 'JobResult', attempt: int) -> bool:
        """Decide whether the job should be retried"""
        if attempt >= self.max_attempts:
            return False

        # Check the exit code
        if result.exit_code is not None:
            if self.retry_on_exit_codes:
                if result.exit_code in self.retry_on_exit_codes:
                    return True
            elif result.exit_code != 0:
                return True

        # Check for timeout
        if result.status == JobStatus.TIMEOUT and self.retry_on_timeout:
            return True

        # Check for signals
        if result.exit_code is not None and result.exit_code < 0 and self.retry_on_signal:
            return True

        # Check for keywords in the error messages
        error_msg = (result.error_message or "").lower()
        stderr = (result.stderr or "").lower()

        if self.retry_on_memory_error and any(
            keyword in error_msg or keyword in stderr
            for keyword in ["memory", "oom", "out of memory"]
        ):
            return True

        if self.retry_on_disk_full and any(
            keyword in error_msg or keyword in stderr
            for keyword in ["disk full", "no space", "quota exceeded"]
        ):
            return True

        if self.retry_on_network_error and any(
            keyword in error_msg or keyword in stderr
            for keyword in ["network", "connection", "timeout", "refused"]
        ):
            return True

        # Custom condition
        if self.retry_condition and self.retry_condition(result):
            return True

        return False

@dataclass
class ExecutionConfig:
    """
    Execution configuration.
    Detailed settings for running a command: working directory, environment
    variables, mount points, and so on.
    """
    workdir: Optional[Path] = None
    env: Dict[str, str] = field(default_factory=dict)
    mounts: Dict[Path, Path] = field(default_factory=dict)
    shell: str = "/bin/bash"
    clean_temp: bool = True
    capture_output: bool = True
    stdout: Optional[Path] = None
    stderr: Optional[Path] = None
    stdin: Optional[str] = None
    timeout: Optional[int] = None
    check: bool = False
    silent: bool = False
    user: Optional[str] = None
    group: Optional[str] = None
    network_mode: Optional[str] = None
    security_opts: Optional[List[str]] = None
    ulimits: Optional[Dict[str, Tuple[int, int]]] = None
    tmpfs: Optional[Dict[str, str]] = None
    read_only: bool = False
    detach: bool = False

    def __post_init__(self):
        """Make sure paths are absolute"""
        if self.workdir is not None:
            self.workdir = Path(self.workdir).resolve()
        if self.stdout is not None:
            self.stdout = Path(self.stdout).resolve()
        if self.stderr is not None:
            self.stderr = Path(self.stderr).resolve()

        # Normalize mount paths
        mounts = {}
        for host_path, container_path in self.mounts.items():
            mounts[Path(host_path).resolve()] = Path(container_path)
        self.mounts = mounts

@dataclass
class JobResult:
    """
    Job execution result.
    Wraps a job's outcome: status, exit code, output, timing statistics, and more.
    """
    job_id: str
    status: JobStatus
    exit_code: Optional[int] = None
    stdout: Optional[str] = None
    stderr: Optional[str] = None
    start_time: Optional[datetime] = None
    end_time: Optional[datetime] = None
    duration: Optional[float] = None
    attempts: int = 0
    error_message: Optional[str] = None
    backend: Optional[str] = None
    command: Optional[str] = None
    resource_usage: Optional[Dict[str, Any]] = None
    metrics: Dict[str, Any] = field(default_factory=dict)
    tags: Dict[str, str] = field(default_factory=dict)
    parent_job_id: Optional[str] = None

    def success(self) -> bool:
        """Whether the job completed successfully"""
        return self.status == JobStatus.COMPLETED and (self.exit_code == 0 or self.exit_code is None)

    def failed(self) -> bool:
        """Whether the job failed"""
        return self.status in {JobStatus.FAILED, JobStatus.TIMEOUT}

    def running(self) -> bool:
        """Whether the job is still running"""
        return self.status == JobStatus.RUNNING

    def cancelled(self) -> bool:
        """Whether the job was cancelled"""
        return self.status == JobStatus.CANCELLED

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dict for serialization"""
        data = asdict(self)
        data['status'] = self.status.value
        return data

    def to_json(self) -> str:
        """Convert to a JSON string"""
        return json.dumps(self.to_dict(), default=str)

    @classmethod
    def from_json(cls, json_str: str) -> 'JobResult':
        """Create a JobResult from a JSON string"""
        data = json.loads(json_str)
        data['status'] = JobStatus(data['status'])
        if data['start_time']:
            data['start_time'] = datetime.fromisoformat(data['start_time'].replace('Z', '+00:00'))
        if data['end_time']:
            data['end_time'] = datetime.fromisoformat(data['end_time'].replace('Z', '+00:00'))
        return cls(**data)

@dataclass
class JobDependency:
    """
    Job dependency.
    Describes dependencies between jobs, enabling complex workflows.
    """
    job_id: str
    condition: Optional[Callable[[JobResult], bool]] = None
    timeout: Optional[float] = None
    propagate_status: bool = True

|
+
@dataclass
|
|
449
|
+
class JobDefinition:
|
|
450
|
+
"""
|
|
451
|
+
作业定义
|
|
452
|
+
完整定义作业的所有属性和配置。
|
|
453
|
+
"""
|
|
454
|
+
cmd: str
|
|
455
|
+
backend: Backend = Backend.LOCAL
|
|
456
|
+
image: Optional[str] = None
|
|
457
|
+
config: ExecutionConfig = field(default_factory=ExecutionConfig)
|
|
458
|
+
resource: ResourceRequest = field(default_factory=ResourceRequest)
|
|
459
|
+
retry: RetryConfig = field(default_factory=RetryConfig)
|
|
460
|
+
job_id: Optional[str] = None
|
|
461
|
+
name: Optional[str] = None
|
|
462
|
+
description: Optional[str] = None
|
|
463
|
+
priority: JobPriority = JobPriority.NORMAL
|
|
464
|
+
dependencies: List[JobDependency] = field(default_factory=list)
|
|
465
|
+
tags: Dict[str, str] = field(default_factory=dict)
|
|
466
|
+
metadata: Dict[str, Any] = field(default_factory=dict)
|
|
467
|
+
callback: Optional[Callable[[JobResult], Any]] = None
|
|
468
|
+
result_handler: Optional[Callable[[JobResult], JobResult]] = None
|
|
469
|
+
|
|
470
|
+
def __post_init__(self):
|
|
471
|
+
if self.job_id is None:
|
|
472
|
+
self.job_id = f"job_{uuid.uuid4().hex[:8]}"
|
|
473
|
+
if self.name is None:
|
|
474
|
+
self.name = self.job_id
|
|
475
|
+
|
|
476
|
+
# ============================================================================
|
|
477
|
+
# 配置管理
|
|
478
|
+
# ============================================================================
|
|
479
|
+
|
|
480
|
+
@dataclass
|
|
481
|
+
class SchedulerConfig:
|
|
482
|
+
"""
|
|
483
|
+
调度器配置
|
|
484
|
+
完整的调度器配置,支持从YAML文件加载和保存。
|
|
485
|
+
"""
|
|
486
|
+
default_backend: Backend = Backend.LOCAL
|
|
487
|
+
default_image: Optional[str] = None
|
|
488
|
+
max_concurrent: int = 4
|
|
489
|
+
database_url: Optional[str] = None
|
|
490
|
+
cache_dir: Optional[Path] = None
|
|
491
|
+
log_level: str = "INFO"
|
|
492
|
+
log_file: Optional[Path] = None
|
|
493
|
+
enable_priority_queue: bool = False
|
|
494
|
+
enable_timeout_monitor: bool = True
|
|
495
|
+
health_check_interval: int = 30
|
|
496
|
+
cleanup_interval: int = 300
|
|
497
|
+
web_monitor_enabled: bool = False
|
|
498
|
+
web_monitor_port: int = 8080
|
|
499
|
+
|
|
500
|
+
# 资源限制
|
|
501
|
+
max_cpu_percent: float = 90.0
|
|
502
|
+
max_memory_percent: float = 90.0
|
|
503
|
+
max_disk_percent: float = 85.0
|
|
504
|
+
|
|
505
|
+
# 缓存配置
|
|
506
|
+
cache_max_size_mb: int = 1024
|
|
507
|
+
cache_max_age_days: int = 30
|
|
508
|
+
|
|
509
|
+
# 数据库配置
|
|
510
|
+
db_cleanup_days: int = 90
|
|
511
|
+
db_backup_days: int = 7
|
|
512
|
+
|
|
513
|
+
def __post_init__(self):
|
|
514
|
+
if self.cache_dir is not None:
|
|
515
|
+
self.cache_dir = Path(self.cache_dir)
|
|
516
|
+
if self.log_file is not None:
|
|
517
|
+
self.log_file = Path(self.log_file)
|
|
518
|
+
|
|
519
|
+
@classmethod
|
|
520
|
+
def load(cls, config_path: Optional[Path] = None) -> 'SchedulerConfig':
|
|
521
|
+
"""从文件加载配置"""
|
|
522
|
+
if not YAML_AVAILABLE:
|
|
523
|
+
return cls()
|
|
524
|
+
|
|
525
|
+
if config_path is None:
|
|
526
|
+
# 查找默认配置文件位置
|
|
527
|
+
possible_paths = [
|
|
528
|
+
Path("config/scheduler.yaml"),
|
|
529
|
+
Path("scheduler.yaml"),
|
|
530
|
+
Path("~/.universal-scheduler/config.yaml").expanduser(),
|
|
531
|
+
Path("/etc/universal-scheduler/config.yaml")
|
|
532
|
+
]
|
|
533
|
+
|
|
534
|
+
for path in possible_paths:
|
|
535
|
+
if path.exists():
|
|
536
|
+
config_path = path
|
|
537
|
+
break
|
|
538
|
+
|
|
539
|
+
if config_path and config_path.exists():
|
|
540
|
+
with open(config_path) as f:
|
|
541
|
+
data = yaml.safe_load(f)
|
|
542
|
+
|
|
543
|
+
# 转换字符串为枚举
|
|
544
|
+
if "default_backend" in data and isinstance(data["default_backend"], str):
|
|
545
|
+
data["default_backend"] = Backend(data["default_backend"])
|
|
546
|
+
|
|
547
|
+
return cls(**data)
|
|
548
|
+
|
|
549
|
+
return cls() # 返回默认配置
|
|
550
|
+
|
|
551
|
+
def save(self, config_path: Path):
|
|
552
|
+
"""保存配置到文件"""
|
|
553
|
+
if not YAML_AVAILABLE:
|
|
554
|
+
raise ImportError("PyYAML is required to save configuration")
|
|
555
|
+
|
|
556
|
+
config_path.parent.mkdir(parents=True, exist_ok=True)
|
|
557
|
+
with open(config_path, 'w') as f:
|
|
558
|
+
data = self.to_dict()
|
|
559
|
+
# 转换枚举为字符串以便YAML序列化
|
|
560
|
+
if "default_backend" in data:
|
|
561
|
+
data["default_backend"] = data["default_backend"].value
|
|
562
|
+
yaml.dump(data, f, default_flow_style=False)
|
|
563
|
+
|
|
564
|
+
def to_dict(self) -> Dict[str, Any]:
|
|
565
|
+
"""转换为字典"""
|
|
566
|
+
data = asdict(self)
|
|
567
|
+
# 转换枚举
|
|
568
|
+
if isinstance(data["default_backend"], Backend):
|
|
569
|
+
data["default_backend"] = data["default_backend"].value
|
|
570
|
+
# 转换路径
|
|
571
|
+
for key in ["cache_dir", "log_file"]:
|
|
572
|
+
if data[key] is not None:
|
|
573
|
+
data[key] = str(data[key])
|
|
574
|
+
return data
|
|
575
|
+
|
|
576
|
+
def validate(self) -> List[str]:
|
|
577
|
+
"""验证配置,返回错误列表"""
|
|
578
|
+
errors = []
|
|
579
|
+
|
|
580
|
+
if self.max_concurrent <= 0:
|
|
581
|
+
errors.append("max_concurrent must be positive")
|
|
582
|
+
|
|
583
|
+
if self.max_cpu_percent <= 0 or self.max_cpu_percent > 100:
|
|
584
|
+
errors.append("max_cpu_percent must be between 0 and 100")
|
|
585
|
+
|
|
586
|
+
if self.cache_dir and not self.cache_dir.parent.exists():
|
|
587
|
+
errors.append(f"Cache directory parent does not exist: {self.cache_dir.parent}")
|
|
588
|
+
|
|
589
|
+
return errors
|
|
590
|
+
|
|
# ============================================================================
# Plugin system
# ============================================================================

class Plugin(ABC):
    """
    Plugin base class.
    Interface for extending the scheduler, e.g. with monitoring, logging, or caching hooks.
    """

    @abstractmethod
    def on_job_submit(self, job: JobDefinition) -> None:
        """Called when a job is submitted"""
        pass

    @abstractmethod
    def on_job_start(self, job_id: str) -> None:
        """Called when a job starts"""
        pass

    @abstractmethod
    def on_job_complete(self, result: JobResult) -> None:
        """Called when a job completes"""
        pass

    @abstractmethod
    def on_error(self, error: Exception) -> None:
        """Called when an error occurs"""
        pass

class NotificationPlugin(Plugin):
    """Plugin that sends notifications"""

    def __init__(self, webhook_url: Optional[str] = None, email: Optional[str] = None):
        self.webhook_url = webhook_url
        self.email = email
        self.notification_count = 0

    def on_job_submit(self, job: JobDefinition) -> None:
        self._send_notification(f"Job submitted: {job.name} ({job.job_id})")

    def on_job_start(self, job_id: str) -> None:
        self._send_notification(f"Job started: {job_id}")

    def on_job_complete(self, result: JobResult) -> None:
        status = "SUCCESS" if result.success() else "FAILED"
        self._send_notification(f"Job completed: {result.job_id} - {status}")

    def on_error(self, error: Exception) -> None:
        self._send_notification(f"Error: {str(error)}", level="ERROR")

    def _send_notification(self, message: str, level: str = "INFO"):
        """Send a notification"""
        self.notification_count += 1
        timestamp = datetime.now().isoformat()
        full_message = f"[{timestamp}] [{level}] {message}"

        # Console output
        print(full_message)

        # Webhook notification
        if self.webhook_url:
            try:
                import requests
                requests.post(self.webhook_url, json={"message": full_message}, timeout=5)
            except ImportError:
                print("requests module not installed, webhook disabled")
            except Exception as e:
                print(f"Webhook failed: {e}")

        # Email notification (simplified example)
        if self.email and level == "ERROR":
            print(f"Would send email to {self.email}: {full_message}")

class ResourceLogger(Plugin):
    """Plugin that logs resource usage"""

    def __init__(self, log_file: Optional[Path] = None):
        self.log_file = log_file
        self.resource_logs = []

    def on_job_submit(self, job: JobDefinition) -> None:
        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "event": "submit",
            "job_id": job.job_id,
            "name": job.name,
            "resource": {
                "cpus": job.resource.cpus,
                "memory_gb": job.resource.memory_gb,
                "gpus": job.resource.gpus
            }
        }
        self.resource_logs.append(log_entry)
        print(f"[RESOURCE] Job {job.job_id} submitted with resource: {job.resource}")

    def on_job_start(self, job_id: str) -> None:
        print(f"[RESOURCE] Job {job_id} started")

    def on_job_complete(self, result: JobResult) -> None:
        if result.resource_usage:
            log_entry = {
                "timestamp": datetime.now().isoformat(),
                "event": "complete",
                "job_id": result.job_id,
                "resource_usage": result.resource_usage,
                "duration": result.duration
            }
            self.resource_logs.append(log_entry)
            print(f"[RESOURCE] Job {result.job_id} used: {result.resource_usage}")

    def on_error(self, error: Exception) -> None:
        print(f"[RESOURCE] Error: {error}")

    def save_logs(self):
        """Save the resource logs to a file"""
        if self.log_file:
            with open(self.log_file, 'w') as f:
                json.dump(self.resource_logs, f, indent=2, default=str)

# ============================================================================
# Monitoring and metrics collection
# ============================================================================

class MetricsCollector:
    """
    Metrics collector.
    Collects and reports metrics about job execution.
    """

    def __init__(self):
        self._metrics = {
            "jobs_total": 0,
            "jobs_completed": 0,
            "jobs_failed": 0,
            "jobs_running": 0,
            "total_duration": 0.0,
            "total_cpu_hours": 0.0,
            "total_memory_gb_hours": 0.0,
            "retries_total": 0,
            "backend_stats": {},
            "resource_stats": {},
            "timestamps": []
        }
        self._lock = threading.Lock()

    def record_job_start(self, job: JobDefinition):
        """Record that a job has started"""
        with self._lock:
            self._metrics["jobs_total"] += 1
            self._metrics["jobs_running"] += 1

            # Per-backend statistics
            backend = job.backend.value
            self._metrics["backend_stats"].setdefault(backend, 0)
            self._metrics["backend_stats"][backend] += 1

    def record_job_complete(self, result: JobResult, job: JobDefinition):
        """Record that a job has completed"""
        with self._lock:
            self._metrics["jobs_running"] -= 1

            if result.success():
                self._metrics["jobs_completed"] += 1
            else:
                self._metrics["jobs_failed"] += 1

            # Record duration
            if result.duration:
                self._metrics["total_duration"] += result.duration

                # Accumulate resource usage
                if job.resource:
                    cpu_hours = job.resource.total_cpus * result.duration / 3600
                    self._metrics["total_cpu_hours"] += cpu_hours

                    if job.resource.memory_gb:
                        mem_hours = job.resource.memory_gb * result.duration / 3600
                        self._metrics["total_memory_gb_hours"] += mem_hours

            # Record retries
            if result.attempts > 1:
                self._metrics["retries_total"] += (result.attempts - 1)

            # Record timestamps
            self._metrics["timestamps"].append({
                "job_id": result.job_id,
                "start_time": result.start_time.isoformat() if result.start_time else None,
                "end_time": result.end_time.isoformat() if result.end_time else None,
                "status": result.status.value,
                "backend": job.backend.value
            })

    def get_metrics(self) -> Dict[str, Any]:
        """Return all metrics"""
        with self._lock:
            metrics = self._metrics.copy()

            # Success rate
            if metrics["jobs_total"] > 0:
                metrics["success_rate"] = metrics["jobs_completed"] / metrics["jobs_total"]
            else:
                metrics["success_rate"] = 0.0

            # Average duration
            completed = metrics["jobs_completed"] + metrics["jobs_failed"]
            if completed > 0:
                metrics["avg_duration"] = metrics["total_duration"] / completed
            else:
                metrics["avg_duration"] = 0.0

            return metrics

    def reset(self):
        """Reset all metrics"""
        with self._lock:
            self._metrics = {
                "jobs_total": 0,
                "jobs_completed": 0,
                "jobs_failed": 0,
                "jobs_running": 0,
                "total_duration": 0.0,
                "total_cpu_hours": 0.0,
                "total_memory_gb_hours": 0.0,
                "retries_total": 0,
                "backend_stats": {},
                "resource_stats": {},
                "timestamps": []
            }

class ResourceMonitor:
    """Resource usage monitor"""
    def __init__(self):
        self.start_time = datetime.now()
        self.resource_usage = {}
        self._lock = threading.Lock()

    def record_usage(self, job_id: str, usage: Dict[str, Any]):
        """Record resource usage for a job"""
        with self._lock:
            self.resource_usage[job_id] = {
                "timestamp": datetime.now(),
                "usage": usage
            }

    def get_system_usage(self) -> Dict[str, Any]:
        """Return system-wide resource usage"""
        if not PSUTIL_AVAILABLE:
            return {"error": "psutil not installed"}

        try:
            cpu_percent = psutil.cpu_percent(interval=0.1)
            memory = psutil.virtual_memory()
            disk = psutil.disk_usage('/')

            return {
                "cpu_percent": cpu_percent,
                "memory_total_gb": memory.total / (1024**3),
                "memory_used_gb": memory.used / (1024**3),
                "memory_percent": memory.percent,
                "disk_total_gb": disk.total / (1024**3),
                "disk_used_gb": disk.used / (1024**3),
                "disk_percent": disk.percent,
                "uptime_seconds": (datetime.now() - self.start_time).total_seconds()
            }
        except Exception as e:
            return {"error": str(e)}

    def get_job_usage(self, job_id: str) -> Optional[Dict[str, Any]]:
        """Return the recorded resource usage for a job"""
        with self._lock:
            return self.resource_usage.get(job_id)

# ============================================================================
# Storage and caching
# ============================================================================

class JobStore:
    """
    Job store.
    Persists job definitions and results, with support for queries and history.
    """

    def __init__(self, db_path: Union[str, Path] = "jobs.db"):
        self.db_path = Path(db_path)
        self._init_db()

    def _init_db(self):
        """Initialize the database"""
        with sqlite3.connect(self.db_path) as conn:
            # Jobs table
            conn.execute("""
                CREATE TABLE IF NOT EXISTS jobs (
                    job_id TEXT PRIMARY KEY,
                    name TEXT,
                    description TEXT,
                    cmd TEXT,
                    backend TEXT,
                    image TEXT,
                    config_json TEXT,
                    resource_json TEXT,
                    retry_json TEXT,
                    priority INTEGER,
                    tags_json TEXT,
                    metadata_json TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)

            # Results table
            conn.execute("""
                CREATE TABLE IF NOT EXISTS job_results (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    job_id TEXT,
                    attempt INTEGER,
                    status TEXT,
                    exit_code INTEGER,
                    stdout TEXT,
                    stderr TEXT,
                    error_message TEXT,
                    start_time TIMESTAMP,
                    end_time TIMESTAMP,
                    duration REAL,
                    resource_usage_json TEXT,
                    metrics_json TEXT,
                    tags_json TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    FOREIGN KEY (job_id) REFERENCES jobs (job_id)
                )
            """)

            # Indexes
            conn.execute("CREATE INDEX IF NOT EXISTS idx_job_results_job_id ON job_results (job_id)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_job_results_status ON job_results (status)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_job_results_start_time ON job_results (start_time)")

            conn.commit()

    def save_job(self, job: JobDefinition):
        """Save a job definition"""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                INSERT OR REPLACE INTO jobs
                (job_id, name, description, cmd, backend, image, config_json,
                 resource_json, retry_json, priority, tags_json, metadata_json)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                job.job_id,
                job.name,
                job.description,
                job.cmd,
                job.backend.value,
                job.image,
                json.dumps(asdict(job.config)),
                json.dumps(asdict(job.resource)),
                json.dumps(asdict(job.retry)),
                job.priority.value,
                json.dumps(job.tags),
                json.dumps(job.metadata)
            ))
            conn.commit()

    def save_result(self, result: JobResult):
        """Save a job result"""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                INSERT INTO job_results
                (job_id, attempt, status, exit_code, stdout, stderr, error_message,
                 start_time, end_time, duration, resource_usage_json, metrics_json, tags_json)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                result.job_id,
                result.attempts,
                result.status.value,
                result.exit_code,
                result.stdout,
                result.stderr,
                result.error_message,
                result.start_time.isoformat() if result.start_time else None,
                result.end_time.isoformat() if result.end_time else None,
                result.duration,
                json.dumps(result.resource_usage) if result.resource_usage else None,
                json.dumps(result.metrics),
                json.dumps(result.tags)
            ))
            conn.commit()

    def get_job(self, job_id: str) -> Optional[JobDefinition]:
        """Load a job definition"""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute("""
                SELECT * FROM jobs WHERE job_id = ?
            """, (job_id,))
            row = cursor.fetchone()

            if not row:
                return None

            # Rebuild the job definition
            try:
                config_data = json.loads(row[6])
                resource_data = json.loads(row[7])
                retry_data = json.loads(row[8])
                tags = json.loads(row[10])
                metadata = json.loads(row[11])
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON for job {job_id}: {e}")
                return None

            job = JobDefinition(
                job_id=row[0],
                name=row[1],
                description=row[2],
                cmd=row[3],
                backend=Backend(row[4]),
                image=row[5],
                config=ExecutionConfig(**config_data),
                resource=ResourceRequest(**resource_data),
                retry=RetryConfig(**retry_data),
                priority=JobPriority(row[9]),
                tags=tags,
                metadata=metadata
            )

            return job

    def get_job_history(self, job_id: str) -> List[JobResult]:
        """Load the result history for a job"""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute("""
                SELECT * FROM job_results
                WHERE job_id = ?
                ORDER BY attempt
            """, (job_id,))

            results = []
            for row in cursor.fetchall():
                try:
                    result = JobResult(
                        job_id=row[1],
                        status=JobStatus(row[3]),
                        exit_code=row[4],
                        stdout=row[5],
                        stderr=row[6],
                        error_message=row[7],
                        start_time=datetime.fromisoformat(row[8].replace('Z', '+00:00')) if row[8] else None,
                        end_time=datetime.fromisoformat(row[9].replace('Z', '+00:00')) if row[9] else None,
                        duration=row[10],
                        attempts=row[2],
                        resource_usage=json.loads(row[11]) if row[11] else None,
                        metrics=json.loads(row[12]) if row[12] else {},
                        tags=json.loads(row[13]) if row[13] else {}
                    )
                    results.append(result)
                except Exception as e:
                    print(f"Error loading result for job {job_id}: {e}")

            return results

    def search_jobs(
        self,
        status: Optional[JobStatus] = None,
        backend: Optional[Backend] = None,
        tags: Optional[Dict[str, str]] = None,
        limit: int = 100,
        offset: int = 0
    ) -> List[JobDefinition]:
        """Search for jobs"""
        query = "SELECT job_id FROM jobs WHERE 1=1"
        params = []

        if status:
            # Requires a lookup against the results table
            query += " AND job_id IN (SELECT DISTINCT job_id FROM job_results WHERE status = ?)"
            params.append(status.value)

        if backend:
            query += " AND backend = ?"
            params.append(backend.value)

        if tags:
            for key, value in tags.items():
                query += f" AND json_extract(tags_json, '$.{key}') = ?"
                params.append(value)

        query += " ORDER BY created_at DESC LIMIT ? OFFSET ?"
        params.extend([limit, offset])

        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(query, params)
            job_ids = [row[0] for row in cursor.fetchall()]

        jobs = []
        for job_id in job_ids:
            job = self.get_job(job_id)
            if job:
                jobs.append(job)

        return jobs

    def cleanup_old_jobs(self, days: int = 30):
        """Delete old jobs"""
        cutoff = datetime.now() - timedelta(days=days)

        with sqlite3.connect(self.db_path) as conn:
            # Delete old results
            conn.execute("""
                DELETE FROM job_results
                WHERE job_id IN (
                    SELECT job_id FROM jobs
                    WHERE created_at < ?
                )
            """, (cutoff.isoformat(),))

            # Delete old jobs
            conn.execute("""
                DELETE FROM jobs WHERE created_at < ?
            """, (cutoff.isoformat(),))

            conn.commit()

class ResultCache:
    """
    Result cache.
    Caches job results so they can be reused and support incremental computation.
    """

    def __init__(self, cache_dir: Union[str, Path] = ".job_cache"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self._index_file = self.cache_dir / "index.json"
        self._index = self._load_index()
        self._lock = threading.Lock()

    def _load_index(self) -> Dict[str, Dict[str, Any]]:
        """Load the cache index"""
        if self._index_file.exists():
            try:
                with open(self._index_file) as f:
                    return json.load(f)
            except json.JSONDecodeError:
                return {}
        return {}

    def _save_index(self):
        """Save the cache index"""
        with open(self._index_file, 'w') as f:
            json.dump(self._index, f, indent=2)

    def compute_key(
        self,
        cmd: str,
        backend: Backend = Backend.LOCAL,
        image: Optional[str] = None,
        mounts: Optional[Dict[Path, Path]] = None,
        workdir: Optional[Path] = None,
        env: Optional[Dict[str, str]] = None,
        resource: Optional[ResourceRequest] = None
    ) -> str:
        """
        Compute the cache key.
        Generates a unique key from the command and its configuration;
        identical configurations produce identical keys.
        """
        import hashlib

        components = {
            "cmd": cmd,
            "backend": backend.value,
            "image": image or "",
            "mounts": json.dumps(sorted((str(k), str(v)) for k, v in (mounts or {}).items())),
            "workdir": str(workdir) if workdir else "",
            "env": json.dumps(sorted((k, v) for k, v in (env or {}).items())),
            "resource": json.dumps(asdict(resource)) if resource else ""
        }

        content = json.dumps(components, sort_keys=True)
        return hashlib.sha256(content.encode()).hexdigest()

    def set(self, key: str, result: JobResult, ttl_seconds: int = 86400):
        """
        Store a result in the cache.
        """
        with self._lock:
            # Save the result
            cache_file = self.cache_dir / f"{key}.pkl"
            try:
                with open(cache_file, 'wb') as f:
                    pickle.dump(result, f)

                # Update the index
                self._index[key] = {
                    "created": datetime.now().isoformat(),
                    "expires": (datetime.now() + timedelta(seconds=ttl_seconds)).isoformat(),
                    "size": cache_file.stat().st_size,
                    "job_id": result.job_id,
                    "status": result.status.value
                }
                self._save_index()
            except Exception as e:
                print(f"Error saving cache: {e}")

    def get(self, key: str) -> Optional[JobResult]:
        """Fetch a cached result"""
        with self._lock:
            if not self.has(key):
                return None

            cache_file = self.cache_dir / f"{key}.pkl"
            try:
                with open(cache_file, 'rb') as f:
                    return pickle.load(f)
            except Exception:
                # Cache entry is corrupted, delete it
                self.delete(key)
                return None

    def has(self, key: str) -> bool:
        """Check whether a cache entry exists and is still valid"""
        with self._lock:
            if key not in self._index:
                return False

            entry = self._index[key]
            expires = datetime.fromisoformat(entry["expires"])

            if datetime.now() > expires:
                self.delete(key)
                return False

            cache_file = self.cache_dir / f"{key}.pkl"
            return cache_file.exists()

    def delete(self, key: str):
        """Delete a cache entry"""
        with self._lock:
            if key in self._index:
                cache_file = self.cache_dir / f"{key}.pkl"
                if cache_file.exists():
                    cache_file.unlink()
                del self._index[key]
                self._save_index()

    def cleanup(self, max_size_mb: int = 1024, max_age_days: int = 30):
        """Clean up the cache"""
        with self._lock:
            now = datetime.now()
            total_size_mb = 0

            # Collect the keys that need to be deleted
            to_delete = []

            for key, entry in list(self._index.items()):
                cache_file = self.cache_dir / f"{key}.pkl"

                # Check expiry
                expires = datetime.fromisoformat(entry["expires"])
                created = datetime.fromisoformat(entry["created"])

                if now > expires:
                    to_delete.append(key)
                elif (now - created).days > max_age_days:
                    to_delete.append(key)
                elif cache_file.exists():
                    total_size_mb += entry["size"] / (1024 * 1024)
                    if total_size_mb > max_size_mb:
                        to_delete.append(key)

            # Delete them
            for key in to_delete:
                self.delete(key)

            # Sort by creation time and drop the oldest entries
            if total_size_mb > max_size_mb:
                sorted_keys = sorted(
                    self._index.keys(),
                    key=lambda k: datetime.fromisoformat(self._index[k]["created"])
                )
                while total_size_mb > max_size_mb and sorted_keys:
                    key = sorted_keys.pop(0)
                    if key in self._index:
                        total_size_mb -= self._index[key]["size"] / (1024 * 1024)
                        self.delete(key)

    def get_stats(self) -> Dict[str, Any]:
        """Return cache statistics"""
        with self._lock:
            total_size = 0
            count = 0
            status_counts = {}

            for entry in self._index.values():
                total_size += entry["size"]
                count += 1
                status = entry["status"]
                status_counts[status] = status_counts.get(status, 0) + 1

            return {
                "count": count,
                "total_size_bytes": total_size,
                "total_size_mb": total_size / (1024 * 1024),
                "status_counts": status_counts
            }

|
1295
|
+
# ============================================================================
|
|
1296
|
+
# 队列和监控
|
|
1297
|
+
# ============================================================================
|
|
1298
|
+
|
|
1299
|
+
class PriorityJobQueue:
|
|
1300
|
+
"""优先作业队列"""
|
|
1301
|
+
def __init__(self):
|
|
1302
|
+
self._queue = []
|
|
1303
|
+
self._lock = threading.Lock()
|
|
1304
|
+
self._stats = {
|
|
1305
|
+
"jobs_pushed": 0,
|
|
1306
|
+
"jobs_popped": 0,
|
|
1307
|
+
"max_size": 0
|
|
1308
|
+
}
|
|
1309
|
+
|
|
1310
|
+
def push(self, job: JobDefinition):
|
|
1311
|
+
with self._lock:
|
|
1312
|
+
heapq.heappush(self._queue, (-job.priority.value, time.time(), job.job_id, job))
|
|
1313
|
+
self._stats["jobs_pushed"] += 1
|
|
1314
|
+
self._stats["max_size"] = max(self._stats["max_size"], len(self._queue))
|
|
1315
|
+
|
|
1316
|
+
def pop(self) -> Optional[JobDefinition]:
|
|
1317
|
+
with self._lock:
|
|
1318
|
+
if self._queue:
|
|
1319
|
+
_, _, job_id, job = heapq.heappop(self._queue)
|
|
1320
|
+
self._stats["jobs_popped"] += 1
|
|
1321
|
+
return job
|
|
1322
|
+
return None
|
|
1323
|
+
|
|
1324
|
+
def peek(self) -> Optional[JobDefinition]:
|
|
1325
|
+
"""查看队列中的下一个作业但不移除"""
|
|
1326
|
+
with self._lock:
|
|
1327
|
+
if self._queue:
|
|
1328
|
+
_, _, _, job = self._queue[0]
|
|
1329
|
+
return job
|
|
1330
|
+
return None
|
|
1331
|
+
|
|
1332
|
+
def get_stats(self) -> Dict[str, Any]:
|
|
1333
|
+
"""获取队列统计信息"""
|
|
1334
|
+
with self._lock:
|
|
1335
|
+
stats = self._stats.copy()
|
|
1336
|
+
stats["current_size"] = len(self._queue)
|
|
1337
|
+
stats["avg_wait_time"] = self._calculate_avg_wait_time()
|
|
1338
|
+
return stats
|
|
1339
|
+
|
|
1340
|
+
def _calculate_avg_wait_time(self) -> float:
|
|
1341
|
+
"""计算平均等待时间"""
|
|
1342
|
+
if not self._queue:
|
|
1343
|
+
return 0.0
|
|
1344
|
+
|
|
1345
|
+
now = time.time()
|
|
1346
|
+
total_wait = 0.0
|
|
1347
|
+
count = 0
|
|
1348
|
+
|
|
1349
|
+
for _, enqueue_time, _, _ in self._queue:
|
|
1350
|
+
total_wait += (now - enqueue_time)
|
|
1351
|
+
count += 1
|
|
1352
|
+
|
|
1353
|
+
return total_wait / count if count > 0 else 0.0
|
|
1354
|
+
|
|
1355
|
+
def get_queue_snapshot(self, limit: int = 10) -> List[Dict[str, Any]]:
|
|
1356
|
+
"""获取队列快照"""
|
|
1357
|
+
with self._lock:
|
|
1358
|
+
snapshot = []
|
|
1359
|
+
# 复制并排序队列
|
|
1360
|
+
sorted_queue = sorted(self._queue, key=lambda x: (-x[0], x[1]))
|
|
1361
|
+
for priority_neg, enqueue_time, job_id, job in sorted_queue[:limit]:
|
|
1362
|
+
snapshot.append({
|
|
1363
|
+
"job_id": job_id,
|
|
1364
|
+
"name": job.name,
|
|
1365
|
+
"priority": JobPriority(-priority_neg).name,
|
|
1366
|
+
"enqueued_at": datetime.fromtimestamp(enqueue_time).isoformat(),
|
|
1367
|
+
"wait_time_seconds": time.time() - enqueue_time,
|
|
1368
|
+
"backend": job.backend.value,
|
|
1369
|
+
"cmd_preview": job.cmd[:100] + ("..." if len(job.cmd) > 100 else "")
|
|
1370
|
+
})
|
|
1371
|
+
return snapshot
|
|
1372
|
+
|
|
class TimeoutMonitor:
    """Job timeout monitor"""
    def __init__(self, scheduler):
        self.scheduler = scheduler
        self._monitored_jobs = {}
        self._lock = threading.Lock()
        self._stats = {
            "jobs_timed_out": 0,
            "preemptive_cancellations": 0
        }
        self._thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self._thread.start()

    def add_job(self, job_id: str, timeout_seconds: Optional[float] = None):
        """Add a job to be monitored"""
        if timeout_seconds is None or timeout_seconds <= 0:
            return

        with self._lock:
            self._monitored_jobs[job_id] = {
                "start_time": time.time(),
                "timeout": timeout_seconds,
                "warned": False
            }

    def remove_job(self, job_id: str):
        """Stop monitoring a job"""
        with self._lock:
            if job_id in self._monitored_jobs:
                job_info = self._monitored_jobs[job_id]
                if job_info.get("warned", False):
                    self._stats["preemptive_cancellations"] += 1
                del self._monitored_jobs[job_id]

    def _monitor_loop(self):
        """Monitoring loop"""
        while True:
            try:
                now = time.time()
                jobs_to_cancel = []

                with self._lock:
                    for job_id, job_info in list(self._monitored_jobs.items()):
                        elapsed = now - job_info["start_time"]
                        timeout = job_info["timeout"]

                        # Early warning (80% of the timeout)
                        if not job_info["warned"] and elapsed > timeout * 0.8:
                            self._warn_about_timeout(job_id, elapsed, timeout)
                            job_info["warned"] = True

                        # Cancel on timeout
                        if elapsed > timeout:
                            jobs_to_cancel.append(job_id)
                            self._stats["jobs_timed_out"] += 1

                # Cancel jobs outside the lock
                for job_id in jobs_to_cancel:
                    self.scheduler.logger.warning(f"[{job_id}] Job timeout, cancelling")
                    self.scheduler.cancel(job_id)
                    self.remove_job(job_id)

                time.sleep(1)

            except Exception as e:
                self.scheduler.logger.error(f"Timeout monitor error: {e}")
                time.sleep(5)

    def _warn_about_timeout(self, job_id: str, elapsed: float, timeout: float):
        """Warn about an approaching timeout"""
        self.scheduler.logger.warning(
            f"[{job_id}] Job is approaching timeout: "
            f"{elapsed:.1f}/{timeout:.1f} seconds ({elapsed/timeout:.1%})"
        )

    def get_stats(self) -> Dict[str, Any]:
        """Return monitoring statistics"""
        with self._lock:
            stats = self._stats.copy()
            stats["currently_monitored"] = len(self._monitored_jobs)
            return stats

1455
|
+
# ============================================================================
|
|
1456
|
+
# 主调度器类
|
|
1457
|
+
# ============================================================================
|
|
1458
|
+
|
|
1459
|
+
class ContainerScheduler:
|
|
1460
|
+
"""
|
|
1461
|
+
通用容器调度器
|
|
1462
|
+
企业级的命令执行框架,支持多种后端、资源管理、重试机制、监控等。
|
|
1463
|
+
"""
|
|
1464
|
+
|
|
1465
|
+
def __init__(
|
|
1466
|
+
self,
|
|
1467
|
+
default_backend: Backend = Backend.LOCAL,
|
|
1468
|
+
default_image: Optional[str] = None,
|
|
1469
|
+
logger: Optional[logging.Logger] = None,
|
|
1470
|
+
max_concurrent: int = 4,
|
|
1471
|
+
metrics_collector: Optional[MetricsCollector] = None,
|
|
1472
|
+
job_store: Optional[JobStore] = None,
|
|
1473
|
+
result_cache: Optional[ResultCache] = None,
|
|
1474
|
+
plugins: Optional[List[Plugin]] = None,
|
|
1475
|
+
enable_web_monitor: bool = False,
|
|
1476
|
+
web_port: int = 8080,
|
|
1477
|
+
config: Optional[SchedulerConfig] = None,
|
|
1478
|
+
enable_priority_queue: bool = False,
|
|
1479
|
+
enable_timeout_monitor: bool = True,
|
|
1480
|
+
):
|
|
1481
|
+
"""
|
|
1482
|
+
Initialize the scheduler.
|
|
1483
|
+
|
|
1484
|
+
Args:
|
|
1485
|
+
default_backend: 默认执行后端
|
|
1486
|
+
default_image: 默认容器镜像
|
|
1487
|
+
logger: 日志记录器
|
|
1488
|
+
max_concurrent: 最大并发作业数
|
|
1489
|
+
metrics_collector: 指标收集器
|
|
1490
|
+
job_store: 作业存储
|
|
1491
|
+
result_cache: 结果缓存
|
|
1492
|
+
plugins: 插件列表
|
|
1493
|
+
enable_web_monitor: 是否启用Web监控
|
|
1494
|
+
web_port: Web监控端口
|
|
1495
|
+
config: 调度器配置
|
|
1496
|
+
enable_priority_queue: 是否启用优先级队列
|
|
1497
|
+
enable_timeout_monitor: 是否启用超时监控
|
|
1498
|
+
"""
|
|
1499
|
+
# 配置
|
|
1500
|
+
self.config = config or SchedulerConfig(
|
|
1501
|
+
default_backend=default_backend,
|
|
1502
|
+
max_concurrent=max_concurrent
|
|
1503
|
+
)
|
|
1504
|
+
self.default_backend = self.config.default_backend
|
|
1505
|
+
self.default_image = default_image
|
|
1506
|
+
self.max_concurrent = self.config.max_concurrent
|
|
1507
|
+
|
|
1508
|
+
# 日志
|
|
1509
|
+
if logger is None:
|
|
1510
|
+
logger = logging.getLogger(__name__)
|
|
1511
|
+
if not logger.handlers:
|
|
1512
|
+
handler = logging.StreamHandler()
|
|
1513
|
+
formatter = logging.Formatter(
|
|
1514
|
+
'%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
|
1515
|
+
)
|
|
1516
|
+
handler.setFormatter(formatter)
|
|
1517
|
+
logger.addHandler(handler)
|
|
1518
|
+
logger.setLevel(getattr(logging, self.config.log_level))
|
|
1519
|
+
self.logger = logger
|
|
1520
|
+
|
|
1521
|
+
# 核心组件
|
|
1522
|
+
self.metrics_collector = metrics_collector or MetricsCollector()
|
|
1523
|
+
self.job_store = job_store
|
|
1524
|
+
self.result_cache = result_cache
|
|
1525
|
+
self.plugins = plugins or []
|
|
1526
|
+
self.resource_monitor = ResourceMonitor()
|
|
1527
|
+
|
|
1528
|
+
# 队列系统
|
|
1529
|
+
self._priority_queue = None
|
|
1530
|
+
if enable_priority_queue:
|
|
1531
|
+
self._priority_queue = PriorityJobQueue()
|
|
1532
|
+
self._queue_thread = threading.Thread(
|
|
1533
|
+
target=self._process_queue_loop,
|
|
1534
|
+
daemon=True
|
|
1535
|
+
)
|
|
1536
|
+
self._queue_thread.start()
|
|
1537
|
+
|
|
1538
|
+
# 超时监控
|
|
1539
|
+
self._timeout_monitor = None
|
|
1540
|
+
if enable_timeout_monitor:
|
|
1541
|
+
self._timeout_monitor = TimeoutMonitor(self)
|
|
1542
|
+
|
|
1543
|
+
# 执行器和状态管理
|
|
1544
|
+
self._executor = ThreadPoolExecutor(max_workers=max_concurrent)
|
|
1545
|
+
self._futures: Dict[str, Future] = {}
|
|
1546
|
+
self._results: Dict[str, JobResult] = {}
|
|
1547
|
+
self._job_definitions: Dict[str, JobDefinition] = {}
|
|
1548
|
+
self._lock = threading.RLock()
|
|
1549
|
+
|
|
1550
|
+
# 信号处理
|
|
1551
|
+
signal.signal(signal.SIGINT, self._signal_handler)
|
|
1552
|
+
signal.signal(signal.SIGTERM, self._signal_handler)
|
|
1553
|
+
|
|
1554
|
+
# 清理线程
|
|
1555
|
+
self._cleanup_thread = threading.Thread(
|
|
1556
|
+
target=self._cleanup_loop,
|
|
1557
|
+
daemon=True
|
|
1558
|
+
)
|
|
1559
|
+
self._cleanup_thread.start()
|
|
1560
|
+
|
|
1561
|
+
# 健康检查线程
|
|
1562
|
+
self._health_check_thread = threading.Thread(
|
|
1563
|
+
target=self._health_check_loop,
|
|
1564
|
+
daemon=True
|
|
1565
|
+
)
|
|
1566
|
+
self._health_check_thread.start()
|
|
1567
|
+
|
|
1568
|
+
self.logger.info(f"Scheduler initialized with backend={self.default_backend}, max_concurrent={self.max_concurrent}")
|
|
1569
|
+
|
|
1570
|
+
def _signal_handler(self, signum, frame):
|
|
1571
|
+
"""处理中断信号"""
|
|
1572
|
+
self.logger.warning(f"Received signal {signum}, shutting down...")
|
|
1573
|
+
self.shutdown()
|
|
1574
|
+
|
|
1575
|
+
def _cleanup_loop(self):
|
|
1576
|
+
"""清理循环"""
|
|
1577
|
+
while True:
|
|
1578
|
+
time.sleep(self.config.cleanup_interval)
|
|
1579
|
+
try:
|
|
1580
|
+
self._cleanup_stale_jobs()
|
|
1581
|
+
if self.result_cache:
|
|
1582
|
+
self.result_cache.cleanup(
|
|
1583
|
+
max_size_mb=self.config.cache_max_size_mb,
|
|
1584
|
+
max_age_days=self.config.cache_max_age_days
|
|
1585
|
+
)
|
|
1586
|
+
if self.job_store:
|
|
1587
|
+
self.job_store.cleanup_old_jobs(days=self.config.db_cleanup_days)
|
|
1588
|
+
except Exception as e:
|
|
1589
|
+
self.logger.error(f"Cleanup error: {e}")
|
|
1590
|
+
|
|
1591
|
+
def _health_check_loop(self):
|
|
1592
|
+
"""健康检查循环"""
|
|
1593
|
+
while True:
|
|
1594
|
+
try:
|
|
1595
|
+
time.sleep(self.config.health_check_interval)
|
|
1596
|
+
health = self.health_check()
|
|
1597
|
+
if health["status"] != "healthy":
|
|
1598
|
+
self.logger.warning(f"Health check failed: {health}")
|
|
1599
|
+
except Exception as e:
|
|
1600
|
+
self.logger.error(f"Health check error: {e}")
|
|
1601
|
+
|
|
1602
|
+
def _cleanup_stale_jobs(self):
|
|
1603
|
+
"""清理过时的作业"""
|
|
1604
|
+
with self._lock:
|
|
1605
|
+
stale_time = datetime.now() - timedelta(hours=24)
|
|
1606
|
+
stale_jobs = []
|
|
1607
|
+
|
|
1608
|
+
for job_id, future in list(self._futures.items()):
|
|
1609
|
+
if future.done():
|
|
1610
|
+
stale_jobs.append(job_id)
|
|
1611
|
+
|
|
1612
|
+
for job_id in stale_jobs:
|
|
1613
|
+
if job_id in self._futures:
|
|
1614
|
+
del self._futures[job_id]
|
|
1615
|
+
|
|
1616
|
+
def _process_queue_loop(self):
|
|
1617
|
+
"""处理优先级队列的循环"""
|
|
1618
|
+
while True:
|
|
1619
|
+
try:
|
|
1620
|
+
if self._priority_queue:
|
|
1621
|
+
job = self._priority_queue.pop()
|
|
1622
|
+
if job:
|
|
1623
|
+
# 检查是否有足够的资源
|
|
1624
|
+
if self._has_enough_resources(job):
|
|
1625
|
+
self.submit(job, wait=False)
|
|
1626
|
+
else:
|
|
1627
|
+
# 放回队列稍后重试
|
|
1628
|
+
time.sleep(5)
|
|
1629
|
+
self._priority_queue.push(job)
|
|
1630
|
+
|
|
1631
|
+
time.sleep(0.1)
|
|
1632
|
+
|
|
1633
|
+
except Exception as e:
|
|
1634
|
+
self.logger.error(f"Queue processing error: {e}")
|
|
1635
|
+
time.sleep(1)
|
|
1636
|
+
|
|
1637
|
+
def _has_enough_resources(self, job: JobDefinition) -> bool:
|
|
1638
|
+
"""检查是否有足够资源运行作业"""
|
|
1639
|
+
try:
|
|
1640
|
+
if not PSUTIL_AVAILABLE:
|
|
1641
|
+
return True
|
|
1642
|
+
|
|
1643
|
+
# 获取系统资源
|
|
1644
|
+
system_usage = self.resource_monitor.get_system_usage()
|
|
1645
|
+
|
|
1646
|
+
if "error" in system_usage:
|
|
1647
|
+
return True
|
|
1648
|
+
|
|
1649
|
+
# 检查CPU
|
|
1650
|
+
if "cpu_percent" in system_usage:
|
|
1651
|
+
if system_usage["cpu_percent"] > self.config.max_cpu_percent:
|
|
1652
|
+
self.logger.debug(f"CPU usage too high: {system_usage['cpu_percent']}%")
|
|
1653
|
+
return False
|
|
1654
|
+
|
|
1655
|
+
# 检查内存
|
|
1656
|
+
if job.resource.memory_gb:
|
|
1657
|
+
available_memory_gb = (system_usage.get("memory_total_gb", 0) -
|
|
1658
|
+
system_usage.get("memory_used_gb", 0))
|
|
1659
|
+
if available_memory_gb < job.resource.memory_gb:
|
|
1660
|
+
self.logger.debug(f"Insufficient memory: {available_memory_gb:.1f}GB available, {job.resource.memory_gb:.1f}GB required")
|
|
1661
|
+
return False
|
|
1662
|
+
|
|
1663
|
+
return True
|
|
1664
|
+
|
|
1665
|
+
except Exception as e:
|
|
1666
|
+
self.logger.error(f"Resource check error: {e}")
|
|
1667
|
+
return True
|
|
1668
|
+
|
|
1669
|
+
def _build_command(
|
|
1670
|
+
self,
|
|
1671
|
+
job: JobDefinition
|
|
1672
|
+
) -> Tuple[List[str], Optional[Path]]:
|
|
1673
|
+
"""
|
|
1674
|
+
Build the execution command.
|
|
1675
|
+
|
|
1676
|
+
Args:
|
|
1677
|
+
job: 作业定义
|
|
1678
|
+
|
|
1679
|
+
Returns:
|
|
1680
|
+
(命令部分列表, 临时脚本路径)
|
|
1681
|
+
"""
|
|
1682
|
+
backend = job.backend
|
|
1683
|
+
cmd = job.cmd
|
|
1684
|
+
image = job.image or self.default_image
|
|
1685
|
+
config = job.config
|
|
1686
|
+
resource = job.resource
|
|
1687
|
+
|
|
1688
|
+
if backend == Backend.LOCAL:
|
|
1689
|
+
return [config.shell, "-c", cmd], None
|
|
1690
|
+
|
|
1691
|
+
elif backend == Backend.DOCKER:
|
|
1692
|
+
if not image:
|
|
1693
|
+
raise ValueError("Docker backend requires image")
|
|
1694
|
+
|
|
1695
|
+
parts = ["docker", "run", "--rm", "-i"]
|
|
1696
|
+
|
|
1697
|
+
# 挂载卷
|
|
1698
|
+
for host_path, container_path in config.mounts.items():
|
|
1699
|
+
parts += ["-v", f"{host_path}:{container_path}"]
|
|
1700
|
+
|
|
1701
|
+
# 工作目录
|
|
1702
|
+
if config.workdir:
|
|
1703
|
+
parts += ["-w", str(config.workdir)]
|
|
1704
|
+
|
|
1705
|
+
# 环境变量
|
|
1706
|
+
for key, value in config.env.items():
|
|
1707
|
+
parts += ["-e", f"{key}={shlex.quote(str(value))}"]
|
|
1708
|
+
|
|
1709
|
+
# 用户
|
|
1710
|
+
if config.user:
|
|
1711
|
+
parts += ["-u", config.user]
|
|
1712
|
+
|
|
1713
|
+
# 网络
|
|
1714
|
+
if config.network_mode:
|
|
1715
|
+
parts += ["--network", config.network_mode]
|
|
1716
|
+
|
|
1717
|
+
# 安全选项
|
|
1718
|
+
if config.security_opts:
|
|
1719
|
+
for opt in config.security_opts:
|
|
1720
|
+
parts += ["--security-opt", opt]
|
|
1721
|
+
|
|
1722
|
+
# 资源限制
|
|
1723
|
+
if resource:
|
|
1724
|
+
parts += ["--cpus", str(resource.cpus)]
|
|
1725
|
+
if resource.memory_mb:
|
|
1726
|
+
parts += ["--memory", f"{resource.memory_mb}m"]
|
|
1727
|
+
|
|
1728
|
+
parts.append(image)
|
|
1729
|
+
parts += [config.shell, "-c", cmd]
|
|
1730
|
+
|
|
1731
|
+
return parts, None
|
|
1732
|
+
|
|
1733
|
+
elif backend == Backend.APPTAINER:
|
|
1734
|
+
if not image:
|
|
1735
|
+
raise ValueError("Apptainer backend requires image")
|
|
1736
|
+
|
|
1737
|
+
parts = ["apptainer", "exec", "--containall"]
|
|
1738
|
+
|
|
1739
|
+
# 挂载卷
|
|
1740
|
+
for host_path, container_path in config.mounts.items():
|
|
1741
|
+
parts += ["--bind", f"{host_path}:{container_path}"]
|
|
1742
|
+
|
|
1743
|
+
# 工作目录
|
|
1744
|
+
if config.workdir:
|
|
1745
|
+
parts += ["--pwd", str(config.workdir)]
|
|
1746
|
+
|
|
1747
|
+
# 环境变量
|
|
1748
|
+
for key, value in config.env.items():
|
|
1749
|
+
parts += ["--env", f"{key}={shlex.quote(str(value))}"]
|
|
1750
|
+
|
|
1751
|
+
# 确保镜像存在
|
|
1752
|
+
image_path = Path(image)
|
|
1753
|
+
if not image_path.exists():
|
|
1754
|
+
self.logger.info(f"Pulling image: {image}")
|
|
1755
|
+
pull_cmd = ["apptainer", "pull", "--force", str(image_path), f"docker://{image}"]
|
|
1756
|
+
subprocess.run(pull_cmd, check=False, capture_output=True)
|
|
1757
|
+
|
|
1758
|
+
parts.append(str(image_path))
|
|
1759
|
+
parts += [config.shell, "-c", cmd]
|
|
1760
|
+
|
|
1761
|
+
return parts, None
|
|
1762
|
+
|
|
1763
|
+
elif backend == Backend.SLURM:
|
|
1764
|
+
# 创建SLURM脚本
|
|
1765
|
+
return self._build_slurm_script(job)
|
|
1766
|
+
|
|
1767
|
+
elif backend == Backend.PBS:
|
|
1768
|
+
# 创建PBS脚本
|
|
1769
|
+
return self._build_pbs_script(job)
|
|
1770
|
+
|
|
1771
|
+
else:
|
|
1772
|
+
raise ValueError(f"Unsupported backend: {backend}")
|
|
1773
|
+
|
|
1774
|
+
def _build_slurm_script(self, job: JobDefinition) -> Tuple[List[str], Path]:
|
|
1775
|
+
"""构建SLURM脚本"""
|
|
1776
|
+
# 创建临时目录
|
|
1777
|
+
temp_dir = Path(tempfile.mkdtemp(prefix="slurm_"))
|
|
1778
|
+
script_path = temp_dir / f"{job.job_id}.sh"
|
|
1779
|
+
|
|
1780
|
+
# 构建脚本内容
|
|
1781
|
+
script_lines = ["#!/bin/bash"]
|
|
1782
|
+
|
|
1783
|
+
# SLURM指令
|
|
1784
|
+
directives = job.resource.to_slurm_directives()
|
|
1785
|
+
for key, value in directives.items():
|
|
1786
|
+
if value:
|
|
1787
|
+
script_lines.append(f"#SBATCH {key}={value}")
|
|
1788
|
+
else:
|
|
1789
|
+
script_lines.append(f"#SBATCH {key}")
|
|
1790
|
+
|
|
1791
|
+
# 输出文件
|
|
1792
|
+
if job.config.stdout:
|
|
1793
|
+
script_lines.append(f"#SBATCH --output={job.config.stdout}")
|
|
1794
|
+
if job.config.stderr:
|
|
1795
|
+
script_lines.append(f"#SBATCH --error={job.config.stderr}")
|
|
1796
|
+
|
|
1797
|
+
script_lines.append("")
|
|
1798
|
+
|
|
1799
|
+
# 环境变量
|
|
1800
|
+
for key, value in job.config.env.items():
|
|
1801
|
+
script_lines.append(f"export {key}={shlex.quote(str(value))}")
|
|
1802
|
+
|
|
1803
|
+
script_lines.append("")
|
|
1804
|
+
|
|
1805
|
+
# 容器命令
|
|
1806
|
+
if job.image:
|
|
1807
|
+
container_cmd = ["apptainer", "exec"]
|
|
1808
|
+
for host_path, container_path in job.config.mounts.items():
|
|
1809
|
+
container_cmd += ["--bind", f"{host_path}:{container_path}"]
|
|
1810
|
+
if job.config.workdir:
|
|
1811
|
+
container_cmd += ["--pwd", str(job.config.workdir)]
|
|
1812
|
+
container_cmd.append(job.image)
|
|
1813
|
+
container_cmd += [job.config.shell, "-c", shlex.quote(job.cmd)]
|
|
1814
|
+
script_lines.append(" ".join(container_cmd))
|
|
1815
|
+
else:
|
|
1816
|
+
if job.config.workdir:
|
|
1817
|
+
script_lines.append(f"cd {job.config.workdir}")
|
|
1818
|
+
script_lines.append(job.cmd)
|
|
1819
|
+
|
|
1820
|
+
# 写入文件
|
|
1821
|
+
script_path.write_text("\n".join(script_lines))
|
|
1822
|
+
script_path.chmod(0o755)
|
|
1823
|
+
|
|
1824
|
+
return ["sbatch", "--parsable", str(script_path)], script_path
|
|
1825
|
+
|
|
1826
|
+
def _build_pbs_script(self, job: JobDefinition) -> Tuple[List[str], Path]:
|
|
1827
|
+
"""构建PBS脚本"""
|
|
1828
|
+
temp_dir = Path(tempfile.mkdtemp(prefix="pbs_"))
|
|
1829
|
+
script_path = temp_dir / f"{job.job_id}.pbs"
|
|
1830
|
+
|
|
1831
|
+
script_lines = ["#!/bin/bash"]
|
|
1832
|
+
|
|
1833
|
+
# PBS指令
|
|
1834
|
+
directives = job.resource.to_pbs_directives()
|
|
1835
|
+
for key, value in directives.items():
|
|
1836
|
+
script_lines.append(f"#PBS {key} {value}")
|
|
1837
|
+
|
|
1838
|
+
# 输出文件
|
|
1839
|
+
if job.config.stdout:
|
|
1840
|
+
script_lines.append(f"#PBS -o {job.config.stdout}")
|
|
1841
|
+
if job.config.stderr:
|
|
1842
|
+
script_lines.append(f"#PBS -e {job.config.stderr}")
|
|
1843
|
+
|
|
1844
|
+
script_lines.append("")
|
|
1845
|
+
|
|
1846
|
+
# 环境变量
|
|
1847
|
+
for key, value in job.config.env.items():
|
|
1848
|
+
script_lines.append(f"export {key}={shlex.quote(str(value))}")
|
|
1849
|
+
|
|
1850
|
+
script_lines.append("")
|
|
1851
|
+
script_lines.append("cd $PBS_O_WORKDIR")
|
|
1852
|
+
|
|
1853
|
+
if job.config.workdir:
|
|
1854
|
+
script_lines.append(f"cd {job.config.workdir}")
|
|
1855
|
+
|
|
1856
|
+
script_lines.append("")
|
|
1857
|
+
|
|
1858
|
+
# 容器命令
|
|
1859
|
+
if job.image:
|
|
1860
|
+
container_cmd = ["apptainer", "exec"]
|
|
1861
|
+
for host_path, container_path in job.config.mounts.items():
|
|
1862
|
+
container_cmd += ["--bind", f"{host_path}:{container_path}"]
|
|
1863
|
+
container_cmd.append(job.image)
|
|
1864
|
+
container_cmd += [job.config.shell, "-c", shlex.quote(job.cmd)]
|
|
1865
|
+
script_lines.append(" ".join(container_cmd))
|
|
1866
|
+
else:
|
|
1867
|
+
script_lines.append(job.cmd)
|
|
1868
|
+
|
|
1869
|
+
# 写入文件
|
|
1870
|
+
script_path.write_text("\n".join(script_lines))
|
|
1871
|
+
script_path.chmod(0o755)
|
|
1872
|
+
|
|
1873
|
+
return ["qsub", str(script_path)], script_path
|
|
1874
|
+
|
|
1875
|
+
def _execute_job(
|
|
1876
|
+
self,
|
|
1877
|
+
job: JobDefinition,
|
|
1878
|
+
use_cache: bool = True
|
|
1879
|
+
) -> JobResult:
|
|
1880
|
+
"""
|
|
1881
|
+
Execute a single job.
|
|
1882
|
+
|
|
1883
|
+
Args:
|
|
1884
|
+
job: 作业定义
|
|
1885
|
+
use_cache: 是否使用缓存
|
|
1886
|
+
|
|
1887
|
+
Returns:
|
|
1888
|
+
作业结果
|
|
1889
|
+
"""
|
|
1890
|
+
job_id = job.job_id
|
|
1891
|
+
|
|
1892
|
+
# 触发插件事件
|
|
1893
|
+
for plugin in self.plugins:
|
|
1894
|
+
try:
|
|
1895
|
+
plugin.on_job_submit(job)
|
|
1896
|
+
except Exception as e:
|
|
1897
|
+
self.logger.error(f"Plugin on_job_submit error: {e}")
|
|
1898
|
+
|
|
1899
|
+
# 指标收集
|
|
1900
|
+
self.metrics_collector.record_job_start(job)
|
|
1901
|
+
|
|
1902
|
+
# 检查缓存
|
|
1903
|
+
if use_cache and self.result_cache:
|
|
1904
|
+
cache_key = self.result_cache.compute_key(
|
|
1905
|
+
cmd=job.cmd,
|
|
1906
|
+
backend=job.backend,
|
|
1907
|
+
image=job.image,
|
|
1908
|
+
mounts=job.config.mounts,
|
|
1909
|
+
workdir=job.config.workdir,
|
|
1910
|
+
env=job.config.env,
|
|
1911
|
+
resource=job.resource
|
|
1912
|
+
)
|
|
1913
|
+
|
|
1914
|
+
if self.result_cache.has(cache_key):
|
|
1915
|
+
cached_result = self.result_cache.get(cache_key)
|
|
1916
|
+
if cached_result:
|
|
1917
|
+
self.logger.info(f"[{job_id}] Using cached result")
|
|
1918
|
+
|
|
1919
|
+
# 更新作业ID
|
|
1920
|
+
cached_result.job_id = job_id
|
|
1921
|
+
cached_result.attempts = 1
|
|
1922
|
+
|
|
1923
|
+
# 触发完成事件
|
|
1924
|
+
for plugin in self.plugins:
|
|
1925
|
+
try:
|
|
1926
|
+
plugin.on_job_complete(cached_result)
|
|
1927
|
+
except Exception as e:
|
|
1928
|
+
self.logger.error(f"Plugin on_job_complete error: {e}")
|
|
1929
|
+
|
|
1930
|
+
self.metrics_collector.record_job_complete(cached_result, job)
|
|
1931
|
+
|
|
1932
|
+
# 保存到存储
|
|
1933
|
+
if self.job_store:
|
|
1934
|
+
self.job_store.save_job(job)
|
|
1935
|
+
self.job_store.save_result(cached_result)
|
|
1936
|
+
|
|
1937
|
+
return cached_result
|
|
1938
|
+
|
|
1939
|
+
# 构建命令
|
|
1940
|
+
cmd_parts, script_path = self._build_command(job)
|
|
1941
|
+
|
|
1942
|
+
result = JobResult(
|
|
1943
|
+
job_id=job_id,
|
|
1944
|
+
status=JobStatus.RUNNING,
|
|
1945
|
+
backend=job.backend.value,
|
|
1946
|
+
command=job.cmd,
|
|
1947
|
+
tags=job.tags.copy()
|
|
1948
|
+
)
|
|
1949
|
+
|
|
1950
|
+
result.start_time = datetime.now()
|
|
1951
|
+
|
|
1952
|
+
try:
|
|
1953
|
+
# 触发开始事件
|
|
1954
|
+
for plugin in self.plugins:
|
|
1955
|
+
try:
|
|
1956
|
+
plugin.on_job_start(job_id)
|
|
1957
|
+
except Exception as e:
|
|
1958
|
+
self.logger.error(f"Plugin on_job_start error: {e}")
|
|
1959
|
+
|
|
1960
|
+
# 添加超时监控
|
|
1961
|
+
if self._timeout_monitor and job.config.timeout:
|
|
1962
|
+
self._timeout_monitor.add_job(job_id, job.config.timeout)
|
|
1963
|
+
|
|
1964
|
+
self.logger.info(f"[{job_id}] Starting job: {job.name}")
|
|
1965
|
+
if not job.config.silent:
|
|
1966
|
+
self.logger.info(f"[{job_id}] Command: {' '.join(cmd_parts)}")
|
|
1967
|
+
|
|
1968
|
+
# 准备输出
|
|
1969
|
+
stdout_dest = None
|
|
1970
|
+
stderr_dest = None
|
|
1971
|
+
stdout_file = None
|
|
1972
|
+
stderr_file = None
|
|
1973
|
+
|
|
1974
|
+
if job.config.capture_output:
|
|
1975
|
+
stdout_dest = subprocess.PIPE
|
|
1976
|
+
stderr_dest = subprocess.PIPE
|
|
1977
|
+
elif job.config.stdout:
|
|
1978
|
+
stdout_file = open(job.config.stdout, 'w')
|
|
1979
|
+
stdout_dest = stdout_file
|
|
1980
|
+
if job.config.stderr and not job.config.capture_output:  # allow separate stdout and stderr files
|
|
1981
|
+
stderr_file = open(job.config.stderr, 'w')
|
|
1982
|
+
stderr_dest = stderr_file
|
|
1983
|
+
|
|
1984
|
+
# 执行命令
|
|
1985
|
+
stdin_input = job.config.stdin.encode() if job.config.stdin else None
|
|
1986
|
+
|
|
1987
|
+
process = subprocess.run(
|
|
1988
|
+
cmd_parts,
|
|
1989
|
+
shell=False,
|
|
1990
|
+
check=False,  # we check the exit code manually
|
|
1991
|
+
timeout=job.config.timeout,
|
|
1992
|
+
stdout=stdout_dest,
|
|
1993
|
+
stderr=stderr_dest,
|
|
1994
|
+
# stdin content is supplied via the input= argument below; passing stdin= as well would make subprocess.run raise ValueError
|
|
1995
|
+
text=not job.config.stdin,  # stdin (if provided) is passed as bytes, so switch to binary mode in that case
|
|
1996
|
+
cwd=job.config.workdir,
|
|
1997
|
+
input=stdin_input
|
|
1998
|
+
)
|
|
1999
|
+
|
|
2000
|
+
# 收集输出
|
|
2001
|
+
if job.config.capture_output:
|
|
2002
|
+
result.stdout = process.stdout
|
|
2003
|
+
result.stderr = process.stderr
|
|
2004
|
+
if result.stdout and len(result.stdout) > 10000: # 限制输出大小
|
|
2005
|
+
result.stdout = result.stdout[:10000] + "... [truncated]"
|
|
2006
|
+
if result.stderr and len(result.stderr) > 10000:
|
|
2007
|
+
result.stderr = result.stderr[:10000] + "... [truncated]"
|
|
2008
|
+
|
|
2009
|
+
result.exit_code = process.returncode
|
|
2010
|
+
|
|
2011
|
+
if process.returncode == 0:
|
|
2012
|
+
result.status = JobStatus.COMPLETED
|
|
2013
|
+
else:
|
|
2014
|
+
result.status = JobStatus.FAILED
|
|
2015
|
+
result.error_message = f"Command failed with exit code {process.returncode}"
|
|
2016
|
+
|
|
2017
|
+
except subprocess.TimeoutExpired:
|
|
2018
|
+
result.status = JobStatus.TIMEOUT
|
|
2019
|
+
result.error_message = f"Command timed out after {job.config.timeout} seconds"
|
|
2020
|
+
except subprocess.CalledProcessError as e:
|
|
2021
|
+
result.status = JobStatus.FAILED
|
|
2022
|
+
result.exit_code = e.returncode
|
|
2023
|
+
result.error_message = str(e)
|
|
2024
|
+
if e.stdout:
|
|
2025
|
+
result.stdout = e.stdout
|
|
2026
|
+
if e.stderr:
|
|
2027
|
+
result.stderr = e.stderr
|
|
2028
|
+
except FileNotFoundError as e:
|
|
2029
|
+
result.status = JobStatus.FAILED
|
|
2030
|
+
result.error_message = f"Command not found: {e}"
|
|
2031
|
+
except PermissionError as e:
|
|
2032
|
+
result.status = JobStatus.FAILED
|
|
2033
|
+
result.error_message = f"Permission denied: {e}"
|
|
2034
|
+
except Exception as e:
|
|
2035
|
+
result.status = JobStatus.FAILED
|
|
2036
|
+
result.error_message = f"Unexpected error: {e}"
|
|
2037
|
+
|
|
2038
|
+
finally:
|
|
2039
|
+
# 关闭文件
|
|
2040
|
+
if stdout_file:
|
|
2041
|
+
stdout_file.close()
|
|
2042
|
+
if stderr_file:
|
|
2043
|
+
stderr_file.close()
|
|
2044
|
+
|
|
2045
|
+
# 移除超时监控
|
|
2046
|
+
if self._timeout_monitor:
|
|
2047
|
+
self._timeout_monitor.remove_job(job_id)
|
|
2048
|
+
|
|
2049
|
+
result.end_time = datetime.now()
|
|
2050
|
+
if result.start_time and result.end_time:
|
|
2051
|
+
result.duration = (result.end_time - result.start_time).total_seconds()
|
|
2052
|
+
|
|
2053
|
+
# 清理临时脚本
|
|
2054
|
+
if script_path and job.config.clean_temp:
|
|
2055
|
+
try:
|
|
2056
|
+
script_path.unlink()
|
|
2057
|
+
script_path.parent.rmdir()
|
|
2058
|
+
except Exception as e:
|
|
2059
|
+
self.logger.warning(f"[{job_id}] Failed to clean temp files: {e}")
|
|
2060
|
+
|
|
2061
|
+
# 结果处理
|
|
2062
|
+
if job.result_handler:
|
|
2063
|
+
try:
|
|
2064
|
+
result = job.result_handler(result)
|
|
2065
|
+
except Exception as e:
|
|
2066
|
+
self.logger.error(f"[{job_id}] Result handler failed: {e}")
|
|
2067
|
+
|
|
2068
|
+
# 触发完成事件
|
|
2069
|
+
for plugin in self.plugins:
|
|
2070
|
+
try:
|
|
2071
|
+
plugin.on_job_complete(result)
|
|
2072
|
+
except Exception as e:
|
|
2073
|
+
self.logger.error(f"Plugin on_job_complete error: {e}")
|
|
2074
|
+
|
|
2075
|
+
# 指标收集
|
|
2076
|
+
self.metrics_collector.record_job_complete(result, job)
|
|
2077
|
+
|
|
2078
|
+
# 保存结果
|
|
2079
|
+
if self.job_store:
|
|
2080
|
+
self.job_store.save_job(job)
|
|
2081
|
+
self.job_store.save_result(result)
|
|
2082
|
+
|
|
2083
|
+
# 缓存结果
|
|
2084
|
+
if use_cache and self.result_cache and result.success():
|
|
2085
|
+
cache_key = self.result_cache.compute_key(
|
|
2086
|
+
cmd=job.cmd,
|
|
2087
|
+
backend=job.backend,
|
|
2088
|
+
image=job.image,
|
|
2089
|
+
mounts=job.config.mounts,
|
|
2090
|
+
workdir=job.config.workdir,
|
|
2091
|
+
env=job.config.env,
|
|
2092
|
+
resource=job.resource
|
|
2093
|
+
)
|
|
2094
|
+
self.result_cache.set(cache_key, result)
|
|
2095
|
+
|
|
2096
|
+
# 回调函数
|
|
2097
|
+
if job.callback:
|
|
2098
|
+
try:
|
|
2099
|
+
job.callback(result)
|
|
2100
|
+
except Exception as e:
|
|
2101
|
+
self.logger.error(f"[{job_id}] Callback failed: {e}")
|
|
2102
|
+
|
|
2103
|
+
self.logger.info(f"[{job_id}] Job completed with status: {result.status.value}")
|
|
2104
|
+
|
|
2105
|
+
# 存储结果
|
|
2106
|
+
with self._lock:
|
|
2107
|
+
self._results[job_id] = result
|
|
2108
|
+
|
|
2109
|
+
return result
|
|
2110
|
+
|
|
2111
|
+
def submit(self, job: JobDefinition, wait: bool = True, use_cache: bool = True) -> Union[JobResult, Future]:
|
|
2112
|
+
"""
|
|
2113
|
+
Submit a job.
|
|
2114
|
+
|
|
2115
|
+
Args:
|
|
2116
|
+
job: 作业定义
|
|
2117
|
+
wait: 是否等待作业完成
|
|
2118
|
+
|
|
2119
|
+
Returns:
|
|
2120
|
+
如果wait=True返回JobResult,否则返回Future
|
|
2121
|
+
"""
|
|
2122
|
+
# 保存作业定义
|
|
2123
|
+
with self._lock:
|
|
2124
|
+
self._job_definitions[job.job_id] = job
|
|
2125
|
+
|
|
2126
|
+
# 提交到线程池
|
|
2127
|
+
future = self._executor.submit(self._execute_job, job, use_cache)
|
|
2128
|
+
|
|
2129
|
+
with self._lock:
|
|
2130
|
+
self._futures[job.job_id] = future
|
|
2131
|
+
|
|
2132
|
+
if wait:
|
|
2133
|
+
try:
|
|
2134
|
+
return future.result()
|
|
2135
|
+
except Exception as e:
|
|
2136
|
+
self.logger.error(f"[{job.job_id}] Job execution failed: {e}")
|
|
2137
|
+
result = JobResult(
|
|
2138
|
+
job_id=job.job_id,
|
|
2139
|
+
status=JobStatus.FAILED,
|
|
2140
|
+
error_message=str(e)
|
|
2141
|
+
)
|
|
2142
|
+
return result
|
|
2143
|
+
else:
|
|
2144
|
+
return future
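# Non-blocking usage sketch (hedged: the job definition is illustrative):
#
#   job = JobDefinition(cmd="sleep 30 && echo done", backend=Backend.LOCAL)
#   future = scheduler.submit(job, wait=False)
#   # ... do other work ...
#   result = future.result(timeout=120)   # JobResult produced by _execute_job
#   print(result.status, result.stdout)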
|
|
2145
|
+
|
|
2146
|
+
def run(
|
|
2147
|
+
self,
|
|
2148
|
+
cmd: str,
|
|
2149
|
+
backend: Union[Backend, str] = None,
|
|
2150
|
+
image: str = None,
|
|
2151
|
+
mounts: Dict[Union[str, Path], Union[str, Path]] = None,
|
|
2152
|
+
workdir: Union[str, Path] = None,
|
|
2153
|
+
env: Dict[str, str] = None,
|
|
2154
|
+
dry_run: bool = False,
|
|
2155
|
+
resource: Union[ResourceRequest, Dict[str, Any]] = None,
|
|
2156
|
+
retry: Union[RetryConfig, Dict[str, Any]] = None,
|
|
2157
|
+
config: Union[ExecutionConfig, Dict[str, Any]] = None,
|
|
2158
|
+
job_id: str = None,
|
|
2159
|
+
wait: bool = True,
|
|
2160
|
+
name: str = None,
|
|
2161
|
+
description: str = None,
|
|
2162
|
+
tags: Dict[str, str] = None,
|
|
2163
|
+
priority: Union[JobPriority, int] = JobPriority.NORMAL,
|
|
2164
|
+
use_cache: bool = True
|
|
2165
|
+
) -> Union[JobResult, Future]:
|
|
2166
|
+
"""
|
|
2167
|
+
Run a command (simplified interface).
|
|
2168
|
+
|
|
2169
|
+
Args:
|
|
2170
|
+
cmd: 要执行的命令
|
|
2171
|
+
backend: 执行后端,默认使用调度器默认后端
|
|
2172
|
+
image: 容器镜像
|
|
2173
|
+
mounts: 挂载映射 {主机路径: 容器路径}
|
|
2174
|
+
workdir: 工作目录
|
|
2175
|
+
env: 环境变量
|
|
2176
|
+
dry_run: 只打印命令不执行
|
|
2177
|
+
resource: 资源请求配置或字典
|
|
2178
|
+
retry: 重试配置或字典
|
|
2179
|
+
config: 执行配置或字典
|
|
2180
|
+
job_id: 作业ID,自动生成如果未提供
|
|
2181
|
+
wait: 是否等待作业完成
|
|
2182
|
+
name: 作业名称
|
|
2183
|
+
description: 作业描述
|
|
2184
|
+
tags: 作业标签
|
|
2185
|
+
priority: 作业优先级
|
|
2186
|
+
use_cache: 是否使用结果缓存
|
|
2187
|
+
|
|
2188
|
+
Returns:
|
|
2189
|
+
如果wait=True返回JobResult,否则返回Future
|
|
2190
|
+
"""
|
|
2191
|
+
# 参数转换
|
|
2192
|
+
if backend is None:
|
|
2193
|
+
backend = self.default_backend
|
|
2194
|
+
elif isinstance(backend, str):
|
|
2195
|
+
backend = Backend(backend)
|
|
2196
|
+
|
|
2197
|
+
if isinstance(resource, dict):
|
|
2198
|
+
resource = ResourceRequest(**resource)
|
|
2199
|
+
elif resource is None:
|
|
2200
|
+
resource = ResourceRequest()
|
|
2201
|
+
|
|
2202
|
+
if isinstance(retry, dict):
|
|
2203
|
+
retry = RetryConfig(**retry)
|
|
2204
|
+
elif retry is None:
|
|
2205
|
+
retry = RetryConfig()
|
|
2206
|
+
|
|
2207
|
+
if isinstance(config, dict):
|
|
2208
|
+
config = ExecutionConfig(**config)
|
|
2209
|
+
elif config is None:
|
|
2210
|
+
config = ExecutionConfig()
|
|
2211
|
+
|
|
2212
|
+
# 挂载转换
|
|
2213
|
+
mounts_dict = {}
|
|
2214
|
+
if mounts:
|
|
2215
|
+
for host_path, container_path in mounts.items():
|
|
2216
|
+
mounts_dict[Path(host_path)] = Path(container_path)
|
|
2217
|
+
|
|
2218
|
+
# 工作目录转换
|
|
2219
|
+
if workdir:
|
|
2220
|
+
config.workdir = Path(workdir)
|
|
2221
|
+
|
|
2222
|
+
# 环境变量
|
|
2223
|
+
if env:
|
|
2224
|
+
config.env.update(env)
|
|
2225
|
+
|
|
2226
|
+
# 挂载点
|
|
2227
|
+
if mounts_dict:
|
|
2228
|
+
config.mounts.update(mounts_dict)
|
|
2229
|
+
|
|
2230
|
+
# 优先级转换
|
|
2231
|
+
if isinstance(priority, int):
|
|
2232
|
+
priority = JobPriority(priority)
|
|
2233
|
+
|
|
2234
|
+
# 创建作业定义
|
|
2235
|
+
job = JobDefinition(
|
|
2236
|
+
cmd=cmd,
|
|
2237
|
+
backend=backend,
|
|
2238
|
+
image=image,
|
|
2239
|
+
config=config,
|
|
2240
|
+
resource=resource,
|
|
2241
|
+
retry=retry,
|
|
2242
|
+
job_id=job_id,
|
|
2243
|
+
name=name,
|
|
2244
|
+
description=description,
|
|
2245
|
+
priority=priority,
|
|
2246
|
+
tags=tags or {}
|
|
2247
|
+
)
|
|
2248
|
+
|
|
2249
|
+
# 干运行
|
|
2250
|
+
if dry_run:
|
|
2251
|
+
cmd_parts, _ = self._build_command(job)
|
|
2252
|
+
self.logger.info(f"[DRY RUN] Command: {' '.join(cmd_parts)}")
|
|
2253
|
+
|
|
2254
|
+
result = JobResult(
|
|
2255
|
+
job_id=job.job_id,
|
|
2256
|
+
status=JobStatus.COMPLETED,
|
|
2257
|
+
command=cmd
|
|
2258
|
+
)
|
|
2259
|
+
return result
|
|
2260
|
+
|
|
2261
|
+
# 提交作业
|
|
2262
|
+
return self.submit(job, wait=wait, use_cache=use_cache)
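# Typical call through this simplified interface (hedged: image, mounts and
# resource values are placeholders):
#
#   result = scheduler.run(
#       "python -V",
#       backend="docker",
#       image="python:3.9-slim",
#       mounts={"/tmp": "/tmp"},
#       resource={"cpus": 2, "memory_gb": 4},
#       retry={"max_attempts": 3},
#   )
#   if result.success():
#       print(result.stdout)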
|
|
2263
|
+
|
|
2264
|
+
def enqueue(self, job: JobDefinition) -> str:
|
|
2265
|
+
"""
|
|
2266
|
+
Put the job on the priority queue (instead of executing it immediately).
|
|
2267
|
+
|
|
2268
|
+
Args:
|
|
2269
|
+
job: 作业定义
|
|
2270
|
+
|
|
2271
|
+
Returns:
|
|
2272
|
+
作业ID
|
|
2273
|
+
"""
|
|
2274
|
+
if not self._priority_queue:
|
|
2275
|
+
raise RuntimeError("Priority queue is not enabled")
|
|
2276
|
+
|
|
2277
|
+
# 保存作业定义
|
|
2278
|
+
with self._lock:
|
|
2279
|
+
self._job_definitions[job.job_id] = job
|
|
2280
|
+
|
|
2281
|
+
# 加入优先级队列
|
|
2282
|
+
self._priority_queue.push(job)
|
|
2283
|
+
|
|
2284
|
+
self.logger.info(f"[{job.job_id}] Job enqueued with priority {job.priority}")
|
|
2285
|
+
|
|
2286
|
+
return job.job_id
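# Queueing sketch (requires enable_priority_queue=True at construction; the
# command is a placeholder):
#
#   job = JobDefinition(cmd="./long_job.sh", priority=JobPriority.HIGH)
#   job_id = scheduler.enqueue(job)
#   # _process_queue_loop picks it up once enough resources are free
#   print(scheduler.get_status(job_id))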
|
|
2287
|
+
|
|
2288
|
+
def run_many(
|
|
2289
|
+
self,
|
|
2290
|
+
commands: List[Union[str, Dict[str, Any]]],
|
|
2291
|
+
backend: Backend = None,
|
|
2292
|
+
max_workers: int = None,
|
|
2293
|
+
progress_callback: Callable[[int, int], None] = None,
|
|
2294
|
+
stop_on_error: bool = False,
|
|
2295
|
+
use_cache: bool = True
|
|
2296
|
+
) -> List[JobResult]:
|
|
2297
|
+
"""
|
|
2298
|
+
Run multiple commands as a batch.
|
|
2299
|
+
|
|
2300
|
+
Args:
|
|
2301
|
+
commands: 命令列表,可以是字符串或配置字典
|
|
2302
|
+
backend: 执行后端,覆盖单个命令的后端设置
|
|
2303
|
+
max_workers: 最大工作线程数,默认使用调度器设置
|
|
2304
|
+
progress_callback: 进度回调函数 (completed, total)
|
|
2305
|
+
stop_on_error: 遇到错误是否停止
|
|
2306
|
+
use_cache: 是否使用结果缓存
|
|
2307
|
+
|
|
2308
|
+
Returns:
|
|
2309
|
+
作业结果列表
|
|
2310
|
+
"""
|
|
2311
|
+
if max_workers is None:
|
|
2312
|
+
max_workers = self.max_concurrent
|
|
2313
|
+
|
|
2314
|
+
# 转换命令为作业定义
|
|
2315
|
+
jobs = []
|
|
2316
|
+
for i, cmd_spec in enumerate(commands):
|
|
2317
|
+
if isinstance(cmd_spec, JobDefinition):
|
|
2318
|
+
# 已经是JobDefinition对象
|
|
2319
|
+
job = cmd_spec
|
|
2320
|
+
# 确保有作业ID
|
|
2321
|
+
if not job.job_id:
|
|
2322
|
+
job.job_id = f"batch_{i}_{uuid.uuid4().hex[:4]}"
|
|
2323
|
+
elif isinstance(cmd_spec, str):
|
|
2324
|
+
# 简单字符串命令
|
|
2325
|
+
job = JobDefinition(
|
|
2326
|
+
cmd=cmd_spec,
|
|
2327
|
+
backend=backend or self.default_backend,
|
|
2328
|
+
name=f"batch_{i}",
|
|
2329
|
+
job_id=f"batch_{i}_{uuid.uuid4().hex[:4]}"
|
|
2330
|
+
)
|
|
2331
|
+
else:
|
|
2332
|
+
# 配置字典
|
|
2333
|
+
cmd_spec = cmd_spec.copy()
|
|
2334
|
+
|
|
2335
|
+
# 提取命令
|
|
2336
|
+
cmd = cmd_spec.pop("cmd")
|
|
2337
|
+
|
|
2338
|
+
# 处理后端
|
|
2339
|
+
if backend is not None and "backend" not in cmd_spec:
|
|
2340
|
+
cmd_spec["backend"] = backend
|
|
2341
|
+
|
|
2342
|
+
# 创建作业定义
|
|
2343
|
+
try:
|
|
2344
|
+
job = JobDefinition(cmd=cmd, **cmd_spec)
|
|
2345
|
+
except TypeError as e:
|
|
2346
|
+
self.logger.error(f"Error creating job from spec {cmd_spec}: {e}")
|
|
2347
|
+
continue
|
|
2348
|
+
|
|
2349
|
+
# 如果没有作业ID,生成一个
|
|
2350
|
+
if not job.job_id:
|
|
2351
|
+
job.job_id = f"batch_{i}_{uuid.uuid4().hex[:4]}"
|
|
2352
|
+
|
|
2353
|
+
# 如果没有名称,使用作业ID
|
|
2354
|
+
if not job.name:
|
|
2355
|
+
job.name = job.job_id
|
|
2356
|
+
|
|
2357
|
+
jobs.append(job)
|
|
2358
|
+
|
|
2359
|
+
total = len(jobs)
|
|
2360
|
+
results = []
|
|
2361
|
+
completed = 0
|
|
2362
|
+
|
|
2363
|
+
# 使用执行器并行运行
|
|
2364
|
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
2365
|
+
futures = []
|
|
2366
|
+
|
|
2367
|
+
for job in jobs:
|
|
2368
|
+
future = executor.submit(self._execute_job, job, use_cache)
|
|
2369
|
+
futures.append((job.job_id, future))
|
|
2370
|
+
|
|
2371
|
+
for job_id, future in futures:
|
|
2372
|
+
try:
|
|
2373
|
+
result = future.result()
|
|
2374
|
+
results.append(result)
|
|
2375
|
+
|
|
2376
|
+
completed += 1
|
|
2377
|
+
if progress_callback:
|
|
2378
|
+
progress_callback(completed, total)
|
|
2379
|
+
|
|
2380
|
+
# 检查是否需要停止
|
|
2381
|
+
if stop_on_error and result.failed():
|
|
2382
|
+
self.logger.warning(f"Stopping batch due to failed job: {job_id}")
|
|
2383
|
+
break
|
|
2384
|
+
|
|
2385
|
+
except Exception as e:
|
|
2386
|
+
self.logger.error(f"Job {job_id} failed with exception: {e}")
|
|
2387
|
+
|
|
2388
|
+
error_result = JobResult(
|
|
2389
|
+
job_id=job_id,
|
|
2390
|
+
status=JobStatus.FAILED,
|
|
2391
|
+
error_message=str(e)
|
|
2392
|
+
)
|
|
2393
|
+
results.append(error_result)
|
|
2394
|
+
|
|
2395
|
+
completed += 1
|
|
2396
|
+
if progress_callback:
|
|
2397
|
+
progress_callback(completed, total)
|
|
2398
|
+
|
|
2399
|
+
if stop_on_error:
|
|
2400
|
+
break
|
|
2401
|
+
|
|
2402
|
+
return results
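# Batch sketch with a progress callback (hedged: commands are illustrative):
#
#   def show_progress(done, total):
#       print(f"{done}/{total} finished")
#
#   results = scheduler.run_many(
#       ["echo a", "echo b", {"cmd": "echo c", "name": "third"}],
#       max_workers=2,
#       progress_callback=show_progress,
#   )
#   failures = [r for r in results if r.failed()]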
|
|
2403
|
+
|
|
2404
|
+
def cancel(self, job_id: str):
|
|
2405
|
+
"""取消作业"""
|
|
2406
|
+
with self._lock:
|
|
2407
|
+
if job_id in self._futures:
|
|
2408
|
+
future = self._futures[job_id]
|
|
2409
|
+
future.cancel()
|
|
2410
|
+
|
|
2411
|
+
# 更新结果
|
|
2412
|
+
if job_id in self._results:
|
|
2413
|
+
self._results[job_id].status = JobStatus.CANCELLED
|
|
2414
|
+
|
|
2415
|
+
self.logger.info(f"[{job_id}] Job cancelled")
|
|
2416
|
+
else:
|
|
2417
|
+
self.logger.warning(f"[{job_id}] Job not found or already completed")
|
|
2418
|
+
|
|
2419
|
+
def cancel_all(self):
|
|
2420
|
+
"""取消所有作业"""
|
|
2421
|
+
with self._lock:
|
|
2422
|
+
for job_id in list(self._futures.keys()):
|
|
2423
|
+
self.cancel(job_id)
|
|
2424
|
+
|
|
2425
|
+
def wait_all(self, timeout: float = None) -> List[JobResult]:
|
|
2426
|
+
"""等待所有作业完成"""
|
|
2427
|
+
results = []
|
|
2428
|
+
with self._lock:
|
|
2429
|
+
futures = list(self._futures.items())
|
|
2430
|
+
|
|
2431
|
+
for job_id, future in futures:
|
|
2432
|
+
try:
|
|
2433
|
+
result = future.result(timeout=timeout)
|
|
2434
|
+
results.append(result)
|
|
2435
|
+
except Exception as e:
|
|
2436
|
+
self.logger.error(f"Error waiting for job {job_id}: {e}")
|
|
2437
|
+
|
|
2438
|
+
return results
|
|
2439
|
+
|
|
2440
|
+
def get_job(self, job_id: str) -> Optional[JobDefinition]:
|
|
2441
|
+
"""获取作业定义"""
|
|
2442
|
+
with self._lock:
|
|
2443
|
+
return self._job_definitions.get(job_id)
|
|
2444
|
+
|
|
2445
|
+
def get_result(self, job_id: str) -> Optional[JobResult]:
|
|
2446
|
+
"""获取作业结果"""
|
|
2447
|
+
# 首先检查内存中的结果
|
|
2448
|
+
with self._lock:
|
|
2449
|
+
if job_id in self._results:
|
|
2450
|
+
return self._results[job_id]
|
|
2451
|
+
|
|
2452
|
+
# 然后检查存储
|
|
2453
|
+
if self.job_store:
|
|
2454
|
+
history = self.job_store.get_job_history(job_id)
|
|
2455
|
+
if history:
|
|
2456
|
+
return history[-1] # 返回最新结果
|
|
2457
|
+
|
|
2458
|
+
return None
|
|
2459
|
+
|
|
2460
|
+
def get_status(self, job_id: str) -> Optional[JobStatus]:
|
|
2461
|
+
"""获取作业状态"""
|
|
2462
|
+
result = self.get_result(job_id)
|
|
2463
|
+
if result:
|
|
2464
|
+
return result.status
|
|
2465
|
+
|
|
2466
|
+
# 检查是否在运行
|
|
2467
|
+
with self._lock:
|
|
2468
|
+
if job_id in self._futures and not self._futures[job_id].done():
|
|
2469
|
+
return JobStatus.RUNNING
|
|
2470
|
+
|
|
2471
|
+
return None
|
|
2472
|
+
|
|
2473
|
+
def health_check(self) -> Dict[str, Any]:
|
|
2474
|
+
"""检查调度器健康状况"""
|
|
2475
|
+
health_status = {
|
|
2476
|
+
"status": "healthy",
|
|
2477
|
+
"timestamp": datetime.now().isoformat(),
|
|
2478
|
+
"components": {}
|
|
2479
|
+
}
|
|
2480
|
+
|
|
2481
|
+
# 检查执行器
|
|
2482
|
+
health_status["components"]["executor"] = {
|
|
2483
|
+
"running": not self._executor._shutdown,
|
|
2484
|
+
"active_threads": threading.active_count(),
|
|
2485
|
+
"max_workers": self.max_concurrent
|
|
2486
|
+
}
|
|
2487
|
+
|
|
2488
|
+
# 检查后端可用性
|
|
2489
|
+
backend_health = {}
|
|
2490
|
+
for backend in [Backend.LOCAL, Backend.DOCKER, Backend.APPTAINER, Backend.SLURM, Backend.PBS]:
|
|
2491
|
+
backend_health[backend.value] = self._check_backend_health(backend)
|
|
2492
|
+
|
|
2493
|
+
health_status["components"]["backends"] = backend_health
|
|
2494
|
+
|
|
2495
|
+
# 检查系统资源
|
|
2496
|
+
try:
|
|
2497
|
+
system_usage = self.resource_monitor.get_system_usage()
|
|
2498
|
+
if "error" in system_usage:
|
|
2499
|
+
health_status["components"]["system"] = {"error": system_usage["error"]}
|
|
2500
|
+
else:
|
|
2501
|
+
health_status["components"]["system"] = {
|
|
2502
|
+
"cpu_percent": system_usage["cpu_percent"],
|
|
2503
|
+
"memory_percent": system_usage["memory_percent"],
|
|
2504
|
+
"disk_percent": system_usage["disk_percent"],
|
|
2505
|
+
"status": "healthy" if system_usage["cpu_percent"] < 95 and
|
|
2506
|
+
system_usage["memory_percent"] < 95 else "warning"
|
|
2507
|
+
}
|
|
2508
|
+
except Exception as e:
|
|
2509
|
+
health_status["components"]["system"] = {"error": str(e)}
|
|
2510
|
+
|
|
2511
|
+
# 检查存储
|
|
2512
|
+
if self.job_store:
|
|
2513
|
+
try:
|
|
2514
|
+
with sqlite3.connect(self.job_store.db_path) as conn:
|
|
2515
|
+
conn.execute("SELECT 1")
|
|
2516
|
+
health_status["components"]["storage"] = {"status": "healthy"}
|
|
2517
|
+
except Exception as e:
|
|
2518
|
+
health_status["components"]["storage"] = {"error": str(e), "status": "unhealthy"}
|
|
2519
|
+
health_status["status"] = "degraded"
|
|
2520
|
+
|
|
2521
|
+
# 检查缓存
|
|
2522
|
+
if self.result_cache:
|
|
2523
|
+
try:
|
|
2524
|
+
stats = self.result_cache.get_stats()
|
|
2525
|
+
health_status["components"]["cache"] = {
|
|
2526
|
+
"status": "healthy",
|
|
2527
|
+
"count": stats["count"],
|
|
2528
|
+
"size_mb": stats["total_size_mb"]
|
|
2529
|
+
}
|
|
2530
|
+
except Exception as e:
|
|
2531
|
+
health_status["components"]["cache"] = {"error": str(e), "status": "unhealthy"}
|
|
2532
|
+
health_status["status"] = "degraded"
|
|
2533
|
+
|
|
2534
|
+
# 如果任何组件不健康,更新整体状态
|
|
2535
|
+
for component, status in health_status["components"].items():
|
|
2536
|
+
if status.get("status") == "unhealthy":
|
|
2537
|
+
health_status["status"] = "unhealthy"
|
|
2538
|
+
break
|
|
2539
|
+
elif status.get("status") == "warning":
|
|
2540
|
+
health_status["status"] = "degraded"
|
|
2541
|
+
|
|
2542
|
+
return health_status
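# The returned payload has roughly this shape (values are examples, not output
# captured from a real run):
#
#   {
#       "status": "healthy",
#       "timestamp": "2024-01-01T00:00:00",
#       "components": {
#           "executor": {"running": True, "active_threads": 5, "max_workers": 4},
#           "backends": {"local": {"available": True, "status": "healthy"}, ...},
#           "system": {"cpu_percent": 12.0, "memory_percent": 40.0, "disk_percent": 55.0, "status": "healthy"},
#       },
#   }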
|
|
2543
|
+
|
|
2544
|
+
def _check_backend_health(self, backend: Backend) -> Dict[str, Any]:
|
|
2545
|
+
"""检查后端健康状态"""
|
|
2546
|
+
try:
|
|
2547
|
+
if backend == Backend.LOCAL:
|
|
2548
|
+
return {"available": True, "status": "healthy"}
|
|
2549
|
+
|
|
2550
|
+
elif backend == Backend.DOCKER:
|
|
2551
|
+
result = subprocess.run(
|
|
2552
|
+
["docker", "info"],
|
|
2553
|
+
capture_output=True,
|
|
2554
|
+
text=True
|
|
2555
|
+
)
|
|
2556
|
+
return {
|
|
2557
|
+
"available": result.returncode == 0,
|
|
2558
|
+
"status": "healthy" if result.returncode == 0 else "unhealthy"
|
|
2559
|
+
}
|
|
2560
|
+
|
|
2561
|
+
elif backend == Backend.APPTAINER:
|
|
2562
|
+
result = subprocess.run(
|
|
2563
|
+
["apptainer", "version"],
|
|
2564
|
+
capture_output=True,
|
|
2565
|
+
text=True
|
|
2566
|
+
)
|
|
2567
|
+
return {
|
|
2568
|
+
"available": result.returncode == 0,
|
|
2569
|
+
"status": "healthy" if result.returncode == 0 else "unhealthy"
|
|
2570
|
+
}
|
|
2571
|
+
|
|
2572
|
+
elif backend == Backend.SLURM:
|
|
2573
|
+
result = subprocess.run(
|
|
2574
|
+
["sinfo", "--version"],
|
|
2575
|
+
capture_output=True,
|
|
2576
|
+
text=True
|
|
2577
|
+
)
|
|
2578
|
+
return {
|
|
2579
|
+
"available": result.returncode == 0,
|
|
2580
|
+
"status": "healthy" if result.returncode == 0 else "unhealthy"
|
|
2581
|
+
}
|
|
2582
|
+
|
|
2583
|
+
elif backend == Backend.PBS:
|
|
2584
|
+
result = subprocess.run(
|
|
2585
|
+
["qstat", "--version"],
|
|
2586
|
+
capture_output=True,
|
|
2587
|
+
text=True
|
|
2588
|
+
)
|
|
2589
|
+
return {
|
|
2590
|
+
"available": result.returncode == 0,
|
|
2591
|
+
"status": "healthy" if result.returncode == 0 else "unhealthy"
|
|
2592
|
+
}
|
|
2593
|
+
|
|
2594
|
+
return {"available": False, "status": "unknown"}
|
|
2595
|
+
|
|
2596
|
+
except Exception as e:
|
|
2597
|
+
return {"available": False, "status": "error", "error": str(e)}
|
|
2598
|
+
|
|
2599
|
+
def get_metrics(self) -> Dict[str, Any]:
|
|
2600
|
+
"""获取调度器指标"""
|
|
2601
|
+
metrics = self.metrics_collector.get_metrics()
|
|
2602
|
+
|
|
2603
|
+
# 添加当前状态
|
|
2604
|
+
with self._lock:
|
|
2605
|
+
current = {
|
|
2606
|
+
"jobs_total": len(self._job_definitions),
|
|
2607
|
+
"jobs_pending": len([f for f in self._futures.values() if not f.done()]),
|
|
2608
|
+
"jobs_running": len([f for f in self._futures.values() if not f.done()]),
|
|
2609
|
+
"jobs_completed": len(self._results),
|
|
2610
|
+
"backends_available": [b.value for b in Backend]
|
|
2611
|
+
}
|
|
2612
|
+
|
|
2613
|
+
metrics["current"] = current
|
|
2614
|
+
|
|
2615
|
+
# 添加缓存统计
|
|
2616
|
+
if self.result_cache:
|
|
2617
|
+
metrics["cache"] = self.result_cache.get_stats()
|
|
2618
|
+
|
|
2619
|
+
# 添加队列统计
|
|
2620
|
+
if self._priority_queue:
|
|
2621
|
+
metrics["queue"] = self._priority_queue.get_stats()
|
|
2622
|
+
|
|
2623
|
+
# 添加超时监控统计
|
|
2624
|
+
if self._timeout_monitor:
|
|
2625
|
+
metrics["timeout_monitor"] = self._timeout_monitor.get_stats()
|
|
2626
|
+
|
|
2627
|
+
return metrics
|
|
2628
|
+
|
|
2629
|
+
def get_queue_status(self) -> Dict[str, Any]:
|
|
2630
|
+
"""获取队列状态"""
|
|
2631
|
+
if not self._priority_queue:
|
|
2632
|
+
return {"enabled": False}
|
|
2633
|
+
|
|
2634
|
+
return {
|
|
2635
|
+
"enabled": True,
|
|
2636
|
+
"stats": self._priority_queue.get_stats(),
|
|
2637
|
+
"snapshot": self._priority_queue.get_queue_snapshot()
|
|
2638
|
+
}
|
|
2639
|
+
|
|
2640
|
+
def shutdown(self, wait: bool = True, cancel_jobs: bool = True):
|
|
2641
|
+
"""
|
|
2642
|
+
Shut down the scheduler.
|
|
2643
|
+
|
|
2644
|
+
Args:
|
|
2645
|
+
wait: 是否等待正在进行的作业完成
|
|
2646
|
+
cancel_jobs: 是否取消正在运行的作业
|
|
2647
|
+
"""
|
|
2648
|
+
self.logger.info("Shutting down scheduler...")
|
|
2649
|
+
|
|
2650
|
+
if cancel_jobs:
|
|
2651
|
+
self.cancel_all()
|
|
2652
|
+
|
|
2653
|
+
self._executor.shutdown(wait=wait)
|
|
2654
|
+
|
|
2655
|
+
# 保存插件数据
|
|
2656
|
+
for plugin in self.plugins:
|
|
2657
|
+
if isinstance(plugin, ResourceLogger):
|
|
2658
|
+
plugin.save_logs()
|
|
2659
|
+
|
|
2660
|
+
self.logger.info("Scheduler shut down")
|
|
2661
|
+
|
|
2662
|
+
def __del__(self):
|
|
2663
|
+
"""析构函数"""
|
|
2664
|
+
try:
|
|
2665
|
+
self.shutdown(wait=False, cancel_jobs=True)
|
|
2666
|
+
except:
|
|
2667
|
+
pass
|
|
2668
|
+
def run_workflow(
|
|
2669
|
+
self,
|
|
2670
|
+
workflow: List[Dict[str, Any]],
|
|
2671
|
+
max_workers: int = None,
|
|
2672
|
+
stop_on_error: bool = True,
|
|
2673
|
+
name: str = "workflow"
|
|
2674
|
+
) -> Dict[str, JobResult]:
|
|
2675
|
+
"""
|
|
2676
|
+
Run a workflow (jobs with dependencies).
|
|
2677
|
+
|
|
2678
|
+
Args:
|
|
2679
|
+
workflow: 工作流定义列表,每个元素包含命令和依赖关系
|
|
2680
|
+
max_workers: 最大工作线程数
|
|
2681
|
+
stop_on_error: 遇到错误是否停止整个工作流
|
|
2682
|
+
name: 工作流名称
|
|
2683
|
+
|
|
2684
|
+
Returns:
|
|
2685
|
+
作业ID到结果的映射
|
|
2686
|
+
|
|
2687
|
+
Examples:
|
|
2688
|
+
>>> # 简单线性工作流
|
|
2689
|
+
>>> workflow = [
|
|
2690
|
+
... {"cmd": "download_data.sh", "job_id": "download"},
|
|
2691
|
+
... {"cmd": "process_data.py", "job_id": "process",
|
|
2692
|
+
... "dependencies": ["download"]},
|
|
2693
|
+
... {"cmd": "analyze.py", "job_id": "analyze",
|
|
2694
|
+
... "dependencies": ["process"]}
|
|
2695
|
+
... ]
|
|
2696
|
+
>>> results = scheduler.run_workflow(workflow)
|
|
2697
|
+
|
|
2698
|
+
>>> # 并行工作流
|
|
2699
|
+
>>> workflow = [
|
|
2700
|
+
... {"cmd": "preprocess.py --input data1.csv", "job_id": "preprocess1"},
|
|
2701
|
+
... {"cmd": "preprocess.py --input data2.csv", "job_id": "preprocess2"},
|
|
2702
|
+
... {"cmd": "merge_results.py", "job_id": "merge",
|
|
2703
|
+
... "dependencies": ["preprocess1", "preprocess2"]}
|
|
2704
|
+
... ]
|
|
2705
|
+
>>> results = scheduler.run_workflow(
|
|
2706
|
+
... workflow,
|
|
2707
|
+
... max_workers=2,
|
|
2708
|
+
... name="data_pipeline"
|
|
2709
|
+
... )
|
|
2710
|
+
"""
|
|
2711
|
+
if max_workers is None:
|
|
2712
|
+
max_workers = self.max_concurrent
|
|
2713
|
+
|
|
2714
|
+
self.logger.info(f"Starting workflow: {name} with {len(workflow)} jobs")
|
|
2715
|
+
|
|
2716
|
+
# 创建工作流作业
|
|
2717
|
+
jobs = {}
|
|
2718
|
+
job_dependencies = {}
|
|
2719
|
+
|
|
2720
|
+
for spec in workflow:
|
|
2721
|
+
spec = spec.copy()
|
|
2722
|
+
|
|
2723
|
+
# 提取作业ID
|
|
2724
|
+
job_id = spec.pop("job_id", str(uuid.uuid4().hex[:8]))
|
|
2725
|
+
|
|
2726
|
+
# 提取依赖
|
|
2727
|
+
dependencies = spec.pop("dependencies", [])
|
|
2728
|
+
job_dependencies[job_id] = dependencies
|
|
2729
|
+
|
|
2730
|
+
# 创建作业定义
|
|
2731
|
+
cmd = spec.pop("cmd")
|
|
2732
|
+
# 处理其他参数
|
|
2733
|
+
backend = spec.pop("backend", None)
|
|
2734
|
+
if isinstance(backend, str):
|
|
2735
|
+
backend = Backend(backend)
|
|
2736
|
+
elif backend is None:
|
|
2737
|
+
backend = self.default_backend
|
|
2738
|
+
resource = spec.pop("resource", None)
|
|
2739
|
+
retry = spec.pop("retry", None)
|
|
2740
|
+
config = spec.pop("config", None)
|
|
2741
|
+
|
|
2742
|
+
# 创建配置对象
|
|
2743
|
+
if isinstance(resource, dict):
|
|
2744
|
+
resource = ResourceRequest(**resource)
|
|
2745
|
+
elif resource is None:
|
|
2746
|
+
resource = ResourceRequest()
|
|
2747
|
+
|
|
2748
|
+
if isinstance(retry, dict):
|
|
2749
|
+
retry = RetryConfig(**retry)
|
|
2750
|
+
elif retry is None:
|
|
2751
|
+
retry = RetryConfig()
|
|
2752
|
+
|
|
2753
|
+
if isinstance(config, dict):
|
|
2754
|
+
config = ExecutionConfig(**config)
|
|
2755
|
+
elif config is None:
|
|
2756
|
+
config = ExecutionConfig()
|
|
2757
|
+
|
|
2758
|
+
# (backend was already normalized above)
|
|
2763
|
+
|
|
2764
|
+
# 创建作业定义
|
|
2765
|
+
job = JobDefinition(
|
|
2766
|
+
cmd=cmd,
|
|
2767
|
+
backend=backend,
|
|
2768
|
+
config=config,
|
|
2769
|
+
resource=resource,
|
|
2770
|
+
retry=retry,
|
|
2771
|
+
job_id=job_id,
|
|
2772
|
+
**spec # 其他参数如name, tags等
|
|
2773
|
+
)
|
|
2774
|
+
jobs[job_id] = job
|
|
2775
|
+
|
|
2776
|
+
# 结果存储
|
|
2777
|
+
results = {}
|
|
2778
|
+
completed = set()
|
|
2779
|
+
failed = set()
|
|
2780
|
+
|
|
2781
|
+
# 工作流执行循环
|
|
2782
|
+
while len(results) < len(jobs):
|
|
2783
|
+
# 找到可以执行的作业(依赖都已满足)
|
|
2784
|
+
ready_jobs = []
|
|
2785
|
+
|
|
2786
|
+
for job_id, job in jobs.items():
|
|
2787
|
+
if job_id in results:
|
|
2788
|
+
continue # 已经完成
|
|
2789
|
+
|
|
2790
|
+
# 检查依赖
|
|
2791
|
+
dependencies = job_dependencies.get(job_id, [])
|
|
2792
|
+
can_run = True
|
|
2793
|
+
|
|
2794
|
+
for dep_id in dependencies:
|
|
2795
|
+
if dep_id not in results:
|
|
2796
|
+
can_run = False
|
|
2797
|
+
break
|
|
2798
|
+
elif results[dep_id].failed():
|
|
2799
|
+
can_run = False
|
|
2800
|
+
break
|
|
2801
|
+
|
|
2802
|
+
if can_run:
|
|
2803
|
+
ready_jobs.append(job)
|
|
2804
|
+
|
|
2805
|
+
if not ready_jobs:
    # No job can run. Results only change inside this loop, so an empty
    # ready set means the remaining jobs are permanently blocked
    # (failed dependencies, missing dependencies, or a dependency cycle);
    # sleeping and retrying would spin forever.
    if failed:
        self.logger.warning(f"Workflow {name} stopped due to failed dependencies")
    else:
        self.logger.warning(f"Workflow {name} has unsatisfiable dependencies (cycle or missing job)")
    break
|
|
2814
|
+
|
|
2815
|
+
# 执行就绪的作业
|
|
2816
|
+
batch_results = self.run_many(
|
|
2817
|
+
ready_jobs,
|
|
2818
|
+
max_workers=min(max_workers, len(ready_jobs)),
|
|
2819
|
+
stop_on_error=stop_on_error,
|
|
2820
|
+
use_cache=False # 工作流作业通常不使用缓存
|
|
2821
|
+
)
|
|
2822
|
+
|
|
2823
|
+
# 更新结果
|
|
2824
|
+
for result in batch_results:
|
|
2825
|
+
results[result.job_id] = result
|
|
2826
|
+
|
|
2827
|
+
if result.success():
|
|
2828
|
+
completed.add(result.job_id)
|
|
2829
|
+
else:
|
|
2830
|
+
failed.add(result.job_id)
|
|
2831
|
+
|
|
2832
|
+
if stop_on_error:
|
|
2833
|
+
self.logger.warning(f"Workflow job failed: {result.job_id}")
|
|
2834
|
+
|
|
2835
|
+
# 进度日志
|
|
2836
|
+
self.logger.info(
|
|
2837
|
+
f"Workflow {name} progress: {len(results)}/{len(jobs)} "
|
|
2838
|
+
f"(completed: {len(completed)}, failed: {len(failed)})"
|
|
2839
|
+
)
|
|
2840
|
+
|
|
2841
|
+
self.logger.info(f"Workflow {name} completed")
|
|
2842
|
+
return results
|
|
2843
|
+
# ============================================================================
|
|
2844
|
+
# Simplified interface functions
|
|
2845
|
+
# ============================================================================
|
|
2846
|
+
|
|
2847
|
+
def run_command(
|
|
2848
|
+
cmd: str,
|
|
2849
|
+
backend: str = "local",
|
|
2850
|
+
image: str = None,
|
|
2851
|
+
mounts: Dict[str, str] = None,
|
|
2852
|
+
workdir: str = None,
|
|
2853
|
+
env: Dict[str, str] = None,
|
|
2854
|
+
dry_run: bool = False,
|
|
2855
|
+
resource: Dict[str, Any] = None,
|
|
2856
|
+
retry: Dict[str, Any] = None,
|
|
2857
|
+
**kwargs
|
|
2858
|
+
) -> JobResult:
|
|
2859
|
+
"""
|
|
2860
|
+
Simplified interface function - kept compatible with the original version.
|
|
2861
|
+
|
|
2862
|
+
Args:
|
|
2863
|
+
cmd: 要执行的命令
|
|
2864
|
+
backend: local | docker | apptainer | slurm | pbs
|
|
2865
|
+
image: 容器镜像
|
|
2866
|
+
mounts: 挂载映射
|
|
2867
|
+
workdir: 工作目录
|
|
2868
|
+
env: 环境变量
|
|
2869
|
+
dry_run: 只打印不执行
|
|
2870
|
+
resource: 资源请求配置
|
|
2871
|
+
retry: 重试配置
|
|
2872
|
+
|
|
2873
|
+
Returns:
|
|
2874
|
+
作业结果
|
|
2875
|
+
"""
|
|
2876
|
+
# 转换参数
|
|
2877
|
+
mounts_dict = None
|
|
2878
|
+
if mounts:
|
|
2879
|
+
mounts_dict = {Path(k): Path(v) for k, v in mounts.items()}
|
|
2880
|
+
|
|
2881
|
+
workdir_path = Path(workdir) if workdir else None
|
|
2882
|
+
|
|
2883
|
+
# 创建调度器实例
|
|
2884
|
+
scheduler = ContainerScheduler()
|
|
2885
|
+
|
|
2886
|
+
# 资源请求
|
|
2887
|
+
resource_obj = None
|
|
2888
|
+
if resource:
|
|
2889
|
+
resource_obj = ResourceRequest(**resource)
|
|
2890
|
+
|
|
2891
|
+
# 重试配置
|
|
2892
|
+
retry_obj = None
|
|
2893
|
+
if retry:
|
|
2894
|
+
retry_obj = RetryConfig(**retry)
|
|
2895
|
+
|
|
2896
|
+
# 执行配置
|
|
2897
|
+
config_kwargs = {}
|
|
2898
|
+
for key in ["timeout", "stdout", "stderr", "capture_output", "check", "silent"]:
|
|
2899
|
+
if key in kwargs:
|
|
2900
|
+
config_kwargs[key] = kwargs[key]
|
|
2901
|
+
|
|
2902
|
+
config = ExecutionConfig(
|
|
2903
|
+
workdir=workdir_path,
|
|
2904
|
+
env=env or {},
|
|
2905
|
+
mounts=mounts_dict or {},
|
|
2906
|
+
**config_kwargs
|
|
2907
|
+
)
|
|
2908
|
+
|
|
2909
|
+
# 运行命令
|
|
2910
|
+
return scheduler.run(
|
|
2911
|
+
cmd=cmd,
|
|
2912
|
+
backend=backend,
|
|
2913
|
+
image=image,
|
|
2914
|
+
config=config,
|
|
2915
|
+
resource=resource_obj,
|
|
2916
|
+
retry=retry_obj,
|
|
2917
|
+
dry_run=dry_run
|
|
2918
|
+
)
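# One-off usage sketch for this module-level helper (note that each call builds
# a fresh ContainerScheduler, so prefer a shared scheduler instance for many
# jobs; paths and image are placeholders):
#
#   result = run_command(
#       "grep -c '^>' /scratch/input.fasta",
#       backend="apptainer",
#       image="/images/tool.sif",
#       mounts={"/scratch": "/scratch"},
#       timeout=600,
#   )
#   print(result.exit_code, result.stdout)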
|
|
2919
|
+
|
|
2920
|
+
# ============================================================================
|
|
2921
|
+
# Command-line interface
|
|
2922
|
+
# ============================================================================
|
|
2923
|
+
|
|
2924
|
+
def main():
|
|
2925
|
+
"""命令行入口点"""
|
|
2926
|
+
import argparse
|
|
2927
|
+
|
|
2928
|
+
parser = argparse.ArgumentParser(
|
|
2929
|
+
description="Universal Container Scheduler CLI",
|
|
2930
|
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
2931
|
+
epilog="""
|
|
2932
|
+
Examples:
|
|
2933
|
+
%(prog)s "echo Hello World"
|
|
2934
|
+
%(prog)s "python script.py" --backend docker --image python:3.9
|
|
2935
|
+
%(prog)s --health
|
|
2936
|
+
%(prog)s --stats
|
|
2937
|
+
%(prog)s "long_job.sh" --queue --priority high
|
|
2938
|
+
"""
|
|
2939
|
+
)
|
|
2940
|
+
|
|
2941
|
+
# 主要模式
|
|
2942
|
+
parser.add_argument("command", nargs="?", help="Command to execute")
|
|
2943
|
+
parser.add_argument("--config", help="Configuration file path")
|
|
2944
|
+
|
|
2945
|
+
# 执行参数
|
|
2946
|
+
parser.add_argument("--backend", default="local",
|
|
2947
|
+
choices=["local", "docker", "apptainer", "slurm", "pbs"],
|
|
2948
|
+
help="Execution backend")
|
|
2949
|
+
parser.add_argument("--image", help="Container image")
|
|
2950
|
+
parser.add_argument("--workdir", help="Working directory")
|
|
2951
|
+
parser.add_argument("--mount", action="append",
|
|
2952
|
+
help="Mount mapping (host:container)")
|
|
2953
|
+
parser.add_argument("--env", action="append",
|
|
2954
|
+
help="Environment variable (KEY=VALUE)")
|
|
2955
|
+
parser.add_argument("--cpus", type=int, default=1, help="CPU cores")
|
|
2956
|
+
parser.add_argument("--memory", type=float, help="Memory in GB")
|
|
2957
|
+
parser.add_argument("--timeout", type=int, help="Timeout in seconds")
|
|
2958
|
+
parser.add_argument("--dry-run", action="store_true", help="Dry run")
|
|
2959
|
+
parser.add_argument("--output", help="Output file")
|
|
2960
|
+
parser.add_argument("--error", help="Error file")
|
|
2961
|
+
parser.add_argument("--retry", type=int, default=1, help="Max retry attempts")
|
|
2962
|
+
|
|
2963
|
+
# 队列和调度
|
|
2964
|
+
parser.add_argument("--queue", action="store_true", help="Add job to queue instead of immediate execution")
|
|
2965
|
+
parser.add_argument("--priority", default="normal",
|
|
2966
|
+
choices=["lowest", "low", "normal", "high", "highest", "critical"],
|
|
2967
|
+
help="Job priority")
|
|
2968
|
+
|
|
2969
|
+
# 监控和管理
|
|
2970
|
+
parser.add_argument("--health", action="store_true", help="Check scheduler health")
|
|
2971
|
+
parser.add_argument("--stats", action="store_true", help="Show scheduler statistics")
|
|
2972
|
+
parser.add_argument("--list-jobs", action="store_true", help="List all jobs")
|
|
2973
|
+
parser.add_argument("--queue-status", action="store_true", help="Show queue status")
|
|
2974
|
+
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
|
|
2975
|
+
|
|
2976
|
+
args = parser.parse_args()
|
|
2977
|
+
|
|
2978
|
+
# 设置日志
|
|
2979
|
+
log_level = logging.DEBUG if args.verbose else logging.INFO
|
|
2980
|
+
logging.basicConfig(
|
|
2981
|
+
level=log_level,
|
|
2982
|
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
|
2983
|
+
)
|
|
2984
|
+
|
|
2985
|
+
    logger = logging.getLogger(__name__)

    # Load configuration
    config = None
    if args.config:
        config = SchedulerConfig.load(Path(args.config))

    # Create the scheduler
    scheduler = ContainerScheduler(
        config=config,
        enable_priority_queue=True if args.queue else False,
        enable_timeout_monitor=True
    )

    try:
        # Dispatch on the selected mode
        if args.health:
            # Health-check mode
            health = scheduler.health_check()
            print(json.dumps(health, indent=2, default=str))
            sys.exit(0 if health["status"] == "healthy" else 1)

        elif args.stats:
            # Statistics mode
            metrics = scheduler.get_metrics()
            print(json.dumps(metrics, indent=2, default=str))
            sys.exit(0)

        elif args.queue_status:
            # Queue status
            queue_status = scheduler.get_queue_status()
            print(json.dumps(queue_status, indent=2, default=str))
            sys.exit(0)

        elif args.list_jobs:
            # List jobs
            if scheduler.job_store:
                jobs = scheduler.job_store.search_jobs(limit=100)
                for job in jobs:
                    result = scheduler.get_result(job.job_id)
                    status = result.status.value if result else "unknown"
                    print(f"{job.job_id}: {job.name} ({job.backend.value}) - {status}")
            else:
                print("Job store not enabled")
            sys.exit(0)

        elif args.command:
            # Command execution mode
            # Parse mounts
            mounts = {}
            if args.mount:
                for mount in args.mount:
                    if ":" in mount:
                        host, container = mount.split(":", 1)
                        mounts[host] = container

            # Parse environment variables
            env = {}
            if args.env:
                for env_var in args.env:
                    if "=" in env_var:
                        key, value = env_var.split("=", 1)
                        env[key] = value

            # Resource configuration
            resource = {}
            if args.cpus > 1:
                resource["cpus"] = args.cpus
            if args.memory:
                resource["memory_gb"] = args.memory

            # Execution configuration
            config_dict = {}
            if args.timeout:
                config_dict["timeout"] = args.timeout
            if args.output:
                config_dict["stdout"] = args.output
            if args.error:
                config_dict["stderr"] = args.error

            # Priority conversion
            priority_map = {
                "lowest": JobPriority.LOWEST,
                "low": JobPriority.LOW,
                "normal": JobPriority.NORMAL,
                "high": JobPriority.HIGH,
                "highest": JobPriority.HIGHEST,
                "critical": JobPriority.CRITICAL
            }
            priority = priority_map[args.priority]

            if args.queue:
                # Add to the queue
                job = JobDefinition(
                    cmd=args.command,
                    backend=Backend(args.backend),
                    image=args.image,
                    config=ExecutionConfig(
                        workdir=Path(args.workdir) if args.workdir else None,
                        env=env,
                        mounts={Path(k): Path(v) for k, v in mounts.items()},
                        **config_dict
                    ),
                    resource=ResourceRequest(**resource),
                    retry=RetryConfig(max_attempts=args.retry),
                    priority=priority,
                    name="cli_job"
                )
                job_id = scheduler.enqueue(job)
                print(f"Job enqueued with ID: {job_id}")
                print("Use --queue-status to check queue status")
            else:
                # Execute immediately
                result = scheduler.run(
                    cmd=args.command,
                    backend=args.backend,
                    image=args.image,
                    mounts=mounts,
                    workdir=args.workdir,
                    env=env,
                    dry_run=args.dry_run,
                    resource=resource,
                    retry={"max_attempts": args.retry},
                    config=config_dict,
                    priority=priority
                )

                # Print results
                if result.stdout:
                    print(result.stdout)

                if result.stderr:
                    print(result.stderr, file=sys.stderr)

                exit_code = result.exit_code or 0
                if result.success():
                    print(f"Command completed successfully in {result.duration:.2f}s")
                else:
                    print(f"Command failed with exit code {exit_code}: {result.error_message}")

                sys.exit(exit_code)
        else:
            # Interactive mode
            print("Universal Container Scheduler")
            print("No command specified. Available modes:")
            print("  --health        Check scheduler health")
            print("  --stats         Show scheduler statistics")
            print("  --queue         Add job to queue")
            print("  --list-jobs     List all jobs")
            print("  --queue-status  Show queue status")
            sys.exit(1)

    except KeyboardInterrupt:
        print("\nInterrupted by user")
        scheduler.shutdown(wait=False, cancel_jobs=True)
        sys.exit(130)
    except Exception as e:
        logger.error(f"Error: {e}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
    finally:
        scheduler.shutdown(wait=True, cancel_jobs=False)

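# A minimal sketch (not part of the CLI above) of how the repeated "HOST:CONTAINER"
# and "KEY=VALUE" string arguments are turned into the dictionaries consumed by
# scheduler.run(); the helper name is illustrative only and is not used elsewhere
# in this module.
def _parse_pairs_example(pairs, sep):
    """Turn ["/host/data:/data"] (sep=":") or ["KEY=VALUE"] (sep="=") into a dict."""
    parsed = {}
    for item in pairs or []:
        if sep in item:
            key, value = item.split(sep, 1)
            parsed[key] = value
    return parsed

# Example: _parse_pairs_example(["/host/data:/data"], ":") -> {"/host/data": "/data"}
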
# ============================================================================
# Usage examples
# ============================================================================

# if __name__ == "__main__":
#     # Example code
#     print("Universal Container Scheduler - Example Usage")
#     print("=" * 50)

#     # Example 1: basic usage
#     print("\n1. Basic Usage:")
#     scheduler = ContainerScheduler()
#     result = scheduler.run("echo 'Hello, World!'")
#     print(f"   Result: {result.status.value}, Output: {result.stdout}")

#     # Example 2: using a Docker container
#     print("\n2. Docker Container Example:")
#     try:
#         result = scheduler.run(
#             cmd="python -c 'import sys; print(f\"Python {sys.version}\")'",
#             backend="docker",
#             image="python:3.9-slim",
#             dry_run=True  # dry run, nothing is actually executed
#         )
#         print("   Dry run completed for Docker command")
#     except Exception as e:
#         print(f"   Docker not available: {e}")

#     # Example 3: job with retries
#     print("\n3. Job with Retry:")
#     result = scheduler.run(
#         cmd="echo 'Test with retry' && exit 1",  # this command fails on purpose
#         retry={"max_attempts": 3, "delay_seconds": 1}
#     )
#     print(f"   Final status: {result.status.value}, Attempts: {result.attempts}")

#     # Example 4: batch jobs
#     print("\n4. Batch Jobs:")
#     commands = [f"echo 'Job {i}'" for i in range(3)]
#     results = scheduler.run_many(commands, max_workers=2)
#     print(f"   Completed {len(results)} jobs, {sum(1 for r in results if r.success())} successful")

#     # Example 5: health check
#     print("\n5. Health Check:")
#     health = scheduler.health_check()
#     print(f"   Status: {health['status']}")

#     # Example 6: metrics collection
#     print("\n6. Metrics:")
#     metrics = scheduler.get_metrics()
#     print(f"   Total jobs: {metrics['jobs_total']}")
#     print(f"   Success rate: {metrics.get('success_rate', 0):.1%}")

#     # Cleanup
#     scheduler.shutdown()

#     print("\n" + "=" * 50)
#     print("All examples completed!")
#     print("\nTo use the command line interface:")
#     print("  python universal_scheduler.py --help")
#     print("\nExample commands:")
#     print("  python universal_scheduler.py 'echo Hello'")
#     print("  python universal_scheduler.py --health")
#     print("  python universal_scheduler.py --stats")

# ============================================================================
# Advanced usage scenario examples
# ============================================================================

def example_data_processing_pipeline():
    """
    Example 1: data processing pipeline
    Simulates a realistic data workflow: download -> preprocess -> analyze -> report
    """
    print("\n" + "="*60)
    print("Example 1: data processing pipeline")
    print("="*60)

    # Create a configured scheduler
    scheduler = ContainerScheduler(
        max_concurrent=4,
        job_store=JobStore("data_pipeline.db"),
        result_cache=ResultCache(".pipeline_cache"),
        plugins=[
            NotificationPlugin(),
            ResourceLogger()
        ],
        enable_priority_queue=True
    )

    # Simulated data files
    data_files = [
        "sales_2023_q1.csv",
        "sales_2023_q2.csv",
        "sales_2023_q3.csv",
        "sales_2023_q4.csv"
    ]

    try:
        # Stage 1: download the data in parallel (simulated)
        print("\nStage 1: downloading data files...")
        download_jobs = []
        for data_file in data_files:
            job = JobDefinition(
                cmd=f"curl -s https://example.com/data/{data_file} -o {data_file}",
                name=f"download_{data_file}",
                resource=ResourceRequest(cpus=1, memory_gb=2),
                retry=RetryConfig(max_attempts=3, delay_seconds=5),
                tags={"stage": "download", "file": data_file}
            )
            download_jobs.append(job)

        # Submit the download jobs as a batch
        download_results = scheduler.run_many(
            [{"cmd": f"echo 'Simulated download of {f}' && sleep 1"} for f in data_files],
            progress_callback=lambda c, t: print(f"   Download progress: {c}/{t}")
        )

        # Stage 2: data preprocessing
        print("\nStage 2: preprocessing data...")
        preprocess_jobs = []
        for data_file in data_files:
            output_file = data_file.replace(".csv", "_processed.parquet")
            job = JobDefinition(
                cmd=f"python preprocess.py --input {data_file} --output {output_file}",
                name=f"preprocess_{data_file}",
                backend=Backend.DOCKER,
                image="python:3.9-data-science",
                mounts={"/data": "/data"},
                config=ExecutionConfig(
                    workdir=Path("/data"),
                    env={"PYTHONPATH": "/data/scripts"}
                ),
                resource=ResourceRequest(cpus=4, memory_gb=8),
                retry=RetryConfig(max_attempts=2),
                tags={"stage": "preprocess", "file": data_file}
            )
            preprocess_jobs.append(job)
            scheduler.enqueue(job)  # add to the queue

        # Wait for preprocessing to finish
        print("Waiting for preprocessing jobs to complete...")
        scheduler.wait_all()

        # Stage 3: aggregate analysis
        print("\nStage 3: aggregate analysis...")
        analysis_job = JobDefinition(
            cmd="python analyze.py --pattern *_processed.parquet --output analysis_results.json",
            name="aggregate_analysis",
            backend=Backend.SLURM,
            resource=ResourceRequest(
                cpus=8,
                memory_gb=32,
                time_hours=2,
                partition="analysis"
            ),
            tags={"stage": "analysis"}
        )

        analysis_result = scheduler.submit(analysis_job, wait=True)

        # Stage 4: report generation
        print("\nStage 4: generating the report...")
        report_job = JobDefinition(
            cmd="python generate_report.py --input analysis_results.json --output report.html",
            name="generate_report",
            resource=ResourceRequest(cpus=2, memory_gb=4),
            tags={"stage": "report"}
        )

        report_result = scheduler.submit(report_job, wait=True)

        # Check the final result
        if report_result.success():
            print("\n✅ Data processing pipeline finished!")
            print(f"   Total jobs: {scheduler.metrics_collector.get_metrics()['jobs_total']}")
            print(f"   Successful jobs: {scheduler.metrics_collector.get_metrics()['jobs_completed']}")
            print(f"   Total duration: {report_result.duration:.1f}s")
        else:
            print("\n❌ Data processing pipeline failed!")
            print(f"   Error message: {report_result.error_message}")

    finally:
        scheduler.shutdown()
        print("Scheduler shut down")

def example_machine_learning_training():
    """
    Example 2: machine learning model training and hyperparameter search
    Distributed model training and hyperparameter optimization
    """
    print("\n" + "="*60)
    print("Example 2: machine learning training")
    print("="*60)

    scheduler = ContainerScheduler(
        max_concurrent=8,
        job_store=JobStore("ml_training.db"),
        enable_priority_queue=True
    )

    # Hyperparameter search space
    hyperparameters = [
        {"model": "resnet50", "lr": 0.001, "batch_size": 32, "epochs": 50},
        {"model": "resnet50", "lr": 0.01, "batch_size": 64, "epochs": 50},
        {"model": "efficientnet", "lr": 0.001, "batch_size": 32, "epochs": 50},
        {"model": "efficientnet", "lr": 0.01, "batch_size": 64, "epochs": 50},
        {"model": "vit", "lr": 0.0005, "batch_size": 16, "epochs": 100},
        {"model": "vit", "lr": 0.005, "batch_size": 32, "epochs": 100},
    ]

    try:
        print(f"Starting hyperparameter search over {len(hyperparameters)} configurations...")

        # Create a training job for each hyperparameter set
        training_jobs = []
        for i, params in enumerate(hyperparameters):
            params_str = " ".join([f"--{k} {v}" for k, v in params.items()])

            job = JobDefinition(
                cmd=f"python train_model.py {params_str} --data /data/imagenet --output /output/model_{i}.pth",
                name=f"train_model_{i}",
                backend=Backend.SLURM,
                image="pytorch/pytorch:latest",
                config=ExecutionConfig(
                    mounts={
                        Path("/datasets/imagenet"): Path("/data"),
                        Path("/models"): Path("/output")
                    }
                ),
                resource=ResourceRequest(
                    cpus=8,
                    memory_gb=32,
                    gpus=2,
                    gpu_type="a100",
                    time_hours=12,
                    partition="gpu",
                    exclusive=True
                ),
                retry=RetryConfig(
                    max_attempts=2,
                    retry_on_memory_error=True,
                    retry_on_network_error=True
                ),
                tags={
                    "task": "hyperparameter_search",
                    "model": params["model"],
                    "config_id": str(i)
                },
                callback=lambda result, i=i: print(f"   Configuration {i} finished training: {result.status.value}")
            )
            training_jobs.append(job)

        # Submit the training jobs as a batch (executed in parallel)
        print("Submitting training jobs...")
        training_results = scheduler.run_many(
            training_jobs,
            max_workers=4,  # train at most 4 models at the same time
            progress_callback=lambda c, t: print(f"   Training progress: {c}/{t}")
        )

        # Collect the best model
        best_model = None
        best_accuracy = 0.0

        for result in training_results:
            if result.success() and result.stdout:
                try:
                    # Parse metrics from the job output
                    import json
                    metrics = json.loads(result.stdout)
                    accuracy = metrics.get("val_accuracy", 0)

                    if accuracy > best_accuracy:
                        best_accuracy = accuracy
                        best_model = result
                except Exception:
                    pass

        if best_model:
            print("\n🎉 Best model found!")
            print(f"   Config ID: {best_model.job_id}")
            print(f"   Validation accuracy: {best_accuracy:.2%}")

            # Evaluate the best model
            print("\nEvaluating the best model...")
            eval_job = JobDefinition(
                cmd=f"python evaluate_model.py --model /output/{best_model.job_id}.pth --test_data /data/imagenet_test",
                name="evaluate_best_model",
                backend=Backend.SLURM,
                resource=ResourceRequest(
                    cpus=4,
                    memory_gb=16,
                    gpus=1,
                    time_hours=2
                ),
                tags={"task": "evaluation", "best_model": best_model.job_id}
            )

            eval_result = scheduler.submit(eval_job, wait=True)

            if eval_result.success():
                print("   Evaluation finished!")
                print(f"   Test accuracy: {eval_result.stdout}")

    finally:
        scheduler.shutdown()
        print("\nML training finished!")

def example_bioinformatics_workflow():
    """
    Example 3: bioinformatics workflow
    DNA sequencing data analysis pipeline
    """
    print("\n" + "="*60)
    print("Example 3: bioinformatics workflow")
    print("="*60)

    scheduler = ContainerScheduler(
        max_concurrent=6,
        default_backend=Backend.SLURM,
        job_store=JobStore("bioinformatics.db"),
        result_cache=ResultCache(".bio_cache")
    )

    # Sample list
    samples = [
        "sample_001", "sample_002", "sample_003",
        "sample_004", "sample_005", "sample_006"
    ]

    try:
        # Workflow definition
        workflow = []

        # 1. Quality control (parallel)
        for sample in samples:
            workflow.append({
                "job_id": f"qc_{sample}",
                "cmd": f"fastqc /data/raw/{sample}.fastq.gz -o /data/qc/{sample}",
                "backend": Backend.LOCAL,
                "resource": {"cpus": 2, "memory_gb": 4},
                "tags": {"stage": "quality_control", "sample": sample}
            })

        # 2. Sequence alignment (depends on QC)
        for sample in samples:
            workflow.append({
                "job_id": f"align_{sample}",
                "cmd": f"bwa mem -t 8 /data/reference/hg38.fasta /data/raw/{sample}.fastq.gz > /data/aligned/{sample}.sam",
                "dependencies": [f"qc_{sample}"],
                "backend": Backend.SLURM,
                "resource": {"cpus": 8, "memory_gb": 16, "time_hours": 4},
                "tags": {"stage": "alignment", "sample": sample}
            })

        # 3. Variant calling (batch over all samples)
        workflow.append({
            "job_id": "variant_calling",
            "cmd": "gatk HaplotypeCaller -R /data/reference/hg38.fasta -I /data/aligned/*.bam -O /data/variants/all_variants.vcf",
            "dependencies": [f"align_{sample}" for sample in samples],
            "backend": Backend.SLURM,
            "resource": {"cpus": 32, "memory_gb": 64, "time_hours": 8, "partition": "large"},
            "tags": {"stage": "variant_calling"}
        })

        # 4. Annotation
        workflow.append({
            "job_id": "annotation",
            "cmd": "annovar /data/variants/all_variants.vcf /data/annotations/ -buildver hg38",
            "dependencies": ["variant_calling"],
            "backend": Backend.LOCAL,
            "resource": {"cpus": 4, "memory_gb": 8},
            "tags": {"stage": "annotation"}
        })

        # 5. Report generation
        workflow.append({
            "job_id": "generate_report",
            "cmd": "python generate_report.py --vcf /data/variants/all_variants.vcf --output /data/report/final_report.html",
            "dependencies": ["annotation"],
            "tags": {"stage": "report"}
        })

        print(f"Starting the bioinformatics workflow with {len(workflow)} steps...")

        # Run the workflow
        results = scheduler.run_workflow(
            workflow=workflow,
            max_workers=3,
            name="bioinformatics_pipeline",
            stop_on_error=True
        )

        # Analyze the results
        successful = sum(1 for r in results.values() if r.success())
        total = len(results)

        print(f"\nWorkflow finished: {successful}/{total} steps succeeded")

        if successful == total:
            final_result = results["generate_report"]
            print("🎉 Analysis complete! Report has been generated")
            print(f"   Total duration: {sum(r.duration or 0 for r in results.values()):.1f}s")

            # Show per-stage durations
            print("\nPer-stage durations:")
            for job_id, result in results.items():
                if result.duration:
                    print(f"   {job_id}: {result.duration:.1f}s")

    finally:
        scheduler.shutdown()

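# A minimal sketch of the dictionary convention that run_workflow() consumes in the
# example above: each step carries a "job_id", a "cmd", and an optional
# "dependencies" list naming the job_ids it must wait for. The two-step workflow
# below is illustrative only; step names and commands are made up.
_EXAMPLE_TWO_STEP_WORKFLOW = [
    {"job_id": "prepare", "cmd": "echo 'prepare inputs'"},
    {"job_id": "analyze", "cmd": "echo 'analyze inputs'", "dependencies": ["prepare"]},
]
# Usage sketch (assuming a scheduler instance is available):
#     results = scheduler.run_workflow(workflow=_EXAMPLE_TWO_STEP_WORKFLOW)
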
def example_cloud_batch_processing():
    """
    Example 4: cloud batch processing
    Simulates an AWS Batch or Azure Batch style scenario
    """
    print("\n" + "="*60)
    print("Example 4: cloud batch processing")
    print("="*60)

    # Simulated cloud job scheduling
    scheduler = ContainerScheduler(
        max_concurrent=20,  # high concurrency
        job_store=JobStore("cloud_batch.db"),
        plugins=[
            NotificationPlugin(),
            ResourceLogger("cloud_resources.json")
        ]
    )

    # Simulate a large number of data processing tasks
    tasks = []
    for i in range(100):
        task = {
            "task_id": f"task_{i:03d}",
            "input_file": f"s3://bucket/input/data_{i}.json",
            "output_file": f"s3://bucket/output/processed_{i}.parquet",
            "complexity": random.choice(["simple", "medium", "complex"])
        }
        tasks.append(task)

    try:
        print(f"Starting to process {len(tasks)} cloud tasks...")

        # Assign resources according to task complexity
        job_definitions = []
        for task in tasks:
            if task["complexity"] == "simple":
                cpus = 2
                memory_gb = 4
                priority = JobPriority.LOW
            elif task["complexity"] == "medium":
                cpus = 4
                memory_gb = 8
                priority = JobPriority.NORMAL
            else:  # complex
                cpus = 8
                memory_gb = 16
                priority = JobPriority.HIGH

            job = JobDefinition(
                cmd=f"python cloud_processor.py --input {task['input_file']} --output {task['output_file']}",
                name=f"cloud_task_{task['task_id']}",
                backend=Backend.DOCKER,
                image="python:3.9-cloud",
                config=ExecutionConfig(
                    env={
                        "AWS_ACCESS_KEY_ID": "xxx",
                        "AWS_SECRET_ACCESS_KEY": "xxx",
                        "AWS_DEFAULT_REGION": "us-east-1"
                    }
                ),
                resource=ResourceRequest(cpus=cpus, memory_gb=memory_gb),
                retry=RetryConfig(
                    max_attempts=3,
                    backoff_factor=2.0,
                    retry_on_network_error=True
                ),
                priority=priority,
                tags={
                    "cloud": "aws",
                    "task_type": "batch_processing",
                    "complexity": task["complexity"],
                    "task_id": task["task_id"]
                },
                callback=lambda r, t=task: print(f"   Task {t['task_id']} finished: {r.status.value}")
            )
            job_definitions.append(job)

        # Submit in batches (simulating cloud batch jobs)
        print("Submitting tasks to the cloud queue...")
        batch_size = 10  # process 10 tasks per batch
        all_results = []

        for i in range(0, len(job_definitions), batch_size):
            batch = job_definitions[i:i+batch_size]
            print(f"Processing batch {i//batch_size + 1}/{(len(job_definitions)+batch_size-1)//batch_size}...")

            batch_results = scheduler.run_many(
                batch,
                max_workers=10,
                progress_callback=lambda c, t: None  # silent progress
            )
            all_results.extend(batch_results)

            # Short pause between batches
            time.sleep(2)

        # Summarize the results
        successful = sum(1 for r in all_results if r.success())
        failed = len(all_results) - successful

        print("\n📊 Batch processing summary:")
        print(f"   Total tasks: {len(all_results)}")
        print(f"   Successful: {successful}")
        print(f"   Failed: {failed}")

        if failed > 0:
            print("\nFailed tasks:")
            for result in all_results:
                if result.failed():
                    print(f"   {result.job_id}: {result.error_message}")

        # Show resource usage
        metrics = scheduler.get_metrics()
        print("\n💻 Resource usage statistics:")
        print(f"   CPU hours: {metrics['total_cpu_hours']:.2f}")
        print(f"   Memory GB-hours: {metrics['total_memory_gb_hours']:.2f}")
        print(f"   Average job duration: {metrics.get('avg_duration', 0):.1f}s")

    finally:
        scheduler.shutdown()

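# A small sketch of the batching arithmetic used above: slicing a job list into
# fixed-size batches with range(0, len(jobs), batch_size). The helper below is
# illustrative only and is not referenced by the examples.
def _chunked_example(items, batch_size):
    """Yield consecutive slices of at most batch_size items."""
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]

# Example: list(_chunked_example(list(range(7)), 3)) -> [[0, 1, 2], [3, 4, 5], [6]]
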
def example_real_time_monitoring():
    """
    Example 5: real-time monitoring and alerting system
    Simulates a production monitoring scenario
    """
    print("\n" + "="*60)
    print("Example 5: real-time monitoring system")
    print("="*60)

    # Create a scheduler with full monitoring enabled
    scheduler = ContainerScheduler(
        max_concurrent=10,
        job_store=JobStore("monitoring.db"),
        plugins=[
            NotificationPlugin(webhook_url="https://hooks.slack.com/services/XXX"),
            ResourceLogger("monitoring_logs.json")
        ],
        enable_timeout_monitor=True,
        enable_priority_queue=True
    )

    # Monitoring task definitions
    monitoring_tasks = [
        {
            "name": "database_health_check",
            "cmd": "python check_database.py --host db-prod --timeout 30",
            "interval": 60,  # run every 60 seconds
            "timeout": 45,
            "priority": JobPriority.HIGH
        },
        {
            "name": "api_endpoint_check",
            "cmd": "curl -f https://api.example.com/health",
            "interval": 30,
            "timeout": 10,
            "retry": {"max_attempts": 2}
        },
        {
            "name": "disk_space_check",
            "cmd": "python check_disk.py --path / --threshold 90",
            "interval": 300,
            "priority": JobPriority.NORMAL
        },
        {
            "name": "service_metrics_collect",
            "cmd": "python collect_metrics.py --services web,api,cache,queue",
            "interval": 60,
            "resource": {"cpus": 2, "memory_gb": 4}
        },
        {
            "name": "log_analysis",
            "cmd": "python analyze_logs.py --logfile /var/log/app.log --pattern ERROR",
            "interval": 120,
            "backend": Backend.LOCAL,
            "resource": {"cpus": 4, "memory_gb": 8}
        }
    ]

    try:
        print("Starting the real-time monitoring system...")
        print(f"Number of monitoring tasks: {len(monitoring_tasks)}")

        # Create periodically executed tasks
        monitor_threads = []
        stop_event = threading.Event()

        for task_def in monitoring_tasks:
            def monitor_loop(def_copy=task_def, stop=stop_event):
                """Monitoring loop"""
                task_name = def_copy["name"]
                interval = def_copy["interval"]

                print(f"   Starting monitor: {task_name} (interval: {interval}s)")

                execution_count = 0
                while not stop.is_set():
                    try:
                        # Build the job definition
                        job = JobDefinition(
                            cmd=def_copy["cmd"],
                            name=f"monitor_{task_name}_{execution_count}",
                            backend=def_copy.get("backend", Backend.LOCAL),
                            config=ExecutionConfig(
                                timeout=def_copy.get("timeout", 30),
                                capture_output=True
                            ),
                            resource=ResourceRequest(**def_copy.get("resource", {"cpus": 1, "memory_gb": 1})),
                            retry=RetryConfig(**def_copy.get("retry", {"max_attempts": 1})),
                            priority=def_copy.get("priority", JobPriority.NORMAL),
                            tags={
                                "monitoring": "true",
                                "task": task_name,
                                "execution": str(execution_count)
                            }
                        )

                        # Submit the job (asynchronously)
                        future = scheduler.submit(job, wait=False)

                        # Record the execution
                        execution_count += 1

                        # Wait for the configured interval
                        for _ in range(interval):
                            if stop.is_set():
                                break
                            time.sleep(1)

                    except Exception as e:
                        print(f"Monitoring task {task_name} error: {e}")
                        time.sleep(interval)

            thread = threading.Thread(target=monitor_loop, daemon=True)
            thread.start()
            monitor_threads.append(thread)

        # Run the monitors for a while
        print("\nMonitoring system running... (30 second demo)")
        print("Press Ctrl+C to stop monitoring")

        # Show live status during the demo
        for i in range(6):  # run for 30 seconds
            if stop_event.is_set():
                break

            time.sleep(5)

            # Show the current status
            health = scheduler.health_check()
            metrics = scheduler.get_metrics()

            print(f"\n[{datetime.now().strftime('%H:%M:%S')}] System status:")
            print(f"   Health: {health['status']}")
            print(f"   Running jobs: {metrics['current']['jobs_running']}")
            print(f"   Total jobs: {metrics['jobs_total']}")
            print(f"   Success rate: {metrics.get('success_rate', 0):.1%}")

            # Warn if there are failed jobs
            failed_jobs = []
            if scheduler.job_store:
                failed_jobs = scheduler.job_store.search_jobs(status=JobStatus.FAILED, limit=3)

            if failed_jobs:
                print("   ⚠️ Recently failed jobs:")
                for job in failed_jobs:
                    result = scheduler.get_result(job.job_id)
                    if result:
                        print(f"      {job.name}: {result.error_message}")

        # Stop monitoring
        print("\nStopping the monitoring system...")
        stop_event.set()

        # Wait for all monitor threads to finish
        for thread in monitor_threads:
            thread.join(timeout=5)

        # Final report
        print("\n📈 Monitoring report:")
        print("=" * 40)

        metrics = scheduler.get_metrics()
        print("Monitoring period: 30 seconds")
        print(f"Jobs executed: {metrics['jobs_total']}")
        print(f"Successful jobs: {metrics['jobs_completed']}")
        print(f"Failed jobs: {metrics['jobs_failed']}")
        print(f"Success rate: {metrics.get('success_rate', 0):.1%}")

        # Show backend usage
        if metrics.get('backend_stats'):
            print("\nBackend usage statistics:")
            for backend, count in metrics['backend_stats'].items():
                print(f"   {backend}: {count} runs")

    except KeyboardInterrupt:
        print("\nMonitoring interrupted by the user")
    finally:
        scheduler.shutdown()
        print("Monitoring system shut down")

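# A minimal sketch of the interruptible wait pattern used in monitor_loop() above,
# written as a standalone variant that relies on threading.Event.wait() instead of
# time.sleep(). The function name is illustrative only and is not referenced by the
# examples.
def _interruptible_sleep_example(stop_event, interval_seconds):
    """Return True if the full interval elapsed, False as soon as stop_event is set."""
    for _ in range(interval_seconds):
        if stop_event.wait(timeout=1):  # wakes up early when the event is set
            return False
    return True
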
def example_custom_workflow_orchestrator():
    """
    Example 6: custom workflow orchestrator
    Complex dependencies and conditional execution
    """
    print("\n" + "="*60)
    print("Example 6: custom workflow orchestrator")
    print("="*60)

    scheduler = ContainerScheduler(
        max_concurrent=8,
        job_store=JobStore("workflow_orchestrator.db"),
        enable_priority_queue=True
    )

    # Define a complex workflow
    workflow_def = {
        "name": "ml_pipeline_with_validation",
        "stages": [
            {
                "id": "data_extraction",
                "description": "Extract data from multiple sources",
                "parallel_tasks": [
                    {
                        "id": "extract_db",
                        "cmd": "python extract_from_database.py --config db_config.yaml",
                        "resource": {"cpus": 4, "memory_gb": 8}
                    },
                    {
                        "id": "extract_api",
                        "cmd": "python extract_from_api.py --endpoints api_endpoints.json",
                        "resource": {"cpus": 2, "memory_gb": 4}
                    },
                    {
                        "id": "extract_files",
                        "cmd": "python extract_from_files.py --input /data/raw/ --pattern *.csv",
                        "resource": {"cpus": 2, "memory_gb": 4}
                    }
                ]
            },
            {
                "id": "data_validation",
                "description": "Data validation and quality checks",
                "dependencies": ["data_extraction"],
                "cmd": "python validate_data.py --sources extracted/ --output validation_report.json",
                "resource": {"cpus": 4, "memory_gb": 8},
                "condition": lambda r: r.exit_code == 0  # continue only on success
            },
            {
                "id": "feature_engineering",
                "description": "Feature engineering (parallel feature extraction)",
                "dependencies": ["data_validation"],
                "parallel_tasks": [
                    {
                        "id": "numeric_features",
                        "cmd": "python extract_numeric_features.py --input validated_data.parquet",
                        "resource": {"cpus": 4, "memory_gb": 8}
                    },
                    {
                        "id": "text_features",
                        "cmd": "python extract_text_features.py --input validated_data.parquet",
                        "resource": {"cpus": 4, "memory_gb": 16}
                    },
                    {
                        "id": "time_features",
                        "cmd": "python extract_time_features.py --input validated_data.parquet",
                        "resource": {"cpus": 2, "memory_gb": 4}
                    }
                ]
            },
            {
                "id": "model_training",
                "description": "Model training and validation",
                "dependencies": ["feature_engineering"],
                "parallel_tasks": [
                    {
                        "id": "train_xgboost",
                        "cmd": "python train_xgboost.py --features features/ --output models/xgboost.pkl",
                        "resource": {"cpus": 8, "memory_gb": 16}
                    },
                    {
                        "id": "train_nn",
                        "cmd": "python train_neural_network.py --features features/ --output models/nn.h5",
                        "backend": Backend.SLURM,
                        "resource": {"cpus": 8, "memory_gb": 32, "gpus": 1}
                    }
                ]
            },
            {
                "id": "model_evaluation",
                "description": "Model evaluation and selection",
                "dependencies": ["model_training"],
                "cmd": "python evaluate_models.py --models models/ --test_data test_set.parquet",
                "resource": {"cpus": 4, "memory_gb": 8}
            },
            {
                "id": "deployment_prep",
                "description": "Deployment preparation",
                "dependencies": ["model_evaluation"],
                "cmd": "python prepare_deployment.py --best_model best_model.pkl --output deployment/",
                "resource": {"cpus": 2, "memory_gb": 4}
            }
        ]
    }

    try:
        print(f"Starting workflow: {workflow_def['name']}")
        print(f"Number of stages: {len(workflow_def['stages'])}")

        # Track the job-ID mapping
        job_mapping = {}
        all_results = {}

        # Execute each stage
        for stage in workflow_def['stages']:
            print(f"\n➤ Stage: {stage['id']} - {stage['description']}")

            # Check whether the dependencies are satisfied
            if 'dependencies' in stage:
                deps_satisfied = True
                for dep in stage['dependencies']:
                    if dep not in all_results or not all_results[dep].success():
                        deps_satisfied = False
                        print(f"   Waiting on dependency: {dep}")
                        break

                if not deps_satisfied:
                    print(f"   Skipping stage {stage['id']} (dependencies not satisfied)")
                    continue

            # Parallel tasks
            if 'parallel_tasks' in stage:
                print(f"   Parallel tasks: {len(stage['parallel_tasks'])}")

                # Build the parallel jobs
                parallel_jobs = []
                for task in stage['parallel_tasks']:
                    job = JobDefinition(
                        cmd=task['cmd'],
                        name=f"{stage['id']}_{task['id']}",
                        backend=task.get('backend', Backend.LOCAL),
                        resource=ResourceRequest(**task.get('resource', {"cpus": 1, "memory_gb": 1})),
                        tags={
                            "workflow": workflow_def['name'],
                            "stage": stage['id'],
                            "task": task['id']
                        }
                    )
                    parallel_jobs.append(job)
                    job_mapping[job.job_id] = f"{stage['id']}.{task['id']}"

                # Execute the parallel tasks
                results = scheduler.run_many(
                    parallel_jobs,
                    max_workers=len(parallel_jobs),
                    stop_on_error=True
                )

                # Store the results
                for job, result in zip(parallel_jobs, results):
                    task_id = job_mapping[job.job_id]
                    all_results[task_id] = result
                    print(f"   Task {task_id}: {result.status.value}")

                # Check whether every parallel task succeeded
                all_success = all(r.success() for r in results)
                if not all_success:
                    print(f"   Stage {stage['id']} has failed tasks, stopping the workflow")
                    break

                # Mark the whole stage as completed or failed
                stage_result = JobResult(
                    job_id=stage['id'],
                    status=JobStatus.COMPLETED if all_success else JobStatus.FAILED
                )
                all_results[stage['id']] = stage_result

            # Single task
            elif 'cmd' in stage:
                job = JobDefinition(
                    cmd=stage['cmd'],
                    name=stage['id'],
                    resource=ResourceRequest(**stage.get('resource', {"cpus": 1, "memory_gb": 1})),
                    tags={
                        "workflow": workflow_def['name'],
                        "stage": stage['id']
                    }
                )

                result = scheduler.submit(job, wait=True)
                all_results[stage['id']] = result

                print(f"   Result: {result.status.value}, duration: {result.duration:.1f}s")

                # Check the stage condition
                if 'condition' in stage and callable(stage['condition']):
                    if not stage['condition'](result):
                        print("   Condition not met, stopping the workflow")
                        break

        # Workflow completion report
        print("\n" + "="*40)
        print("Workflow completion report")
        print("="*40)

        successful_stages = sum(1 for k, v in all_results.items() if '.' not in k and v.success())
        total_stages = sum(1 for k in all_results.keys() if '.' not in k)

        print(f"Completed stages: {successful_stages}/{total_stages}")
        print(f"Total jobs: {scheduler.metrics_collector.get_metrics()['jobs_total']}")

        if successful_stages == total_stages:
            print("✅ Workflow fully succeeded!")
        else:
            print("⚠️ Workflow only partially completed")

            # Show the failed stages
            print("\nFailed stages:")
            for stage_id, result in all_results.items():
                if '.' not in stage_id and result.failed():
                    print(f"   {stage_id}: {result.error_message}")

    finally:
        scheduler.shutdown()

def example_disaster_recovery_drills():
    """
    Example 7: disaster recovery drill
    Simulates a system failure and the recovery process
    """
    print("\n" + "="*60)
    print("Example 7: disaster recovery drill")
    print("="*60)

    # Create a scheduler with high-availability features
    scheduler = ContainerScheduler(
        max_concurrent=5,
        job_store=JobStore("dr_drill.db"),
        result_cache=ResultCache(".dr_cache"),
        plugins=[
            NotificationPlugin(email="admin@example.com")
        ],
        enable_priority_queue=True,
        enable_timeout_monitor=True
    )

    try:
        print("Starting the disaster recovery drill...")

        # Simulate normal operations
        print("\nStage 1: normal operations")
        normal_operations = [
            "python process_transactions.py --batch-size 1000",
            "python generate_reports.py --date $(date +%Y-%m-%d)",
            "python backup_database.py --incremental",
            "python monitor_services.py --all",
            "python cleanup_logs.py --older-than 7d"
        ]

        normal_results = scheduler.run_many(
            normal_operations,
            progress_callback=lambda c, t: print(f"   Normal operations progress: {c}/{t}")
        )

        # Fault injection
        print("\nStage 2: fault injection and detection")
        fault_jobs = [
            {
                "name": "simulate_network_partition",
                "cmd": "python simulate_fault.py --type network --duration 30",
                "retry": {"max_attempts": 5, "delay_seconds": 10},
                "tags": {"dr_test": "network_failure"}
            },
            {
                "name": "simulate_disk_failure",
                "cmd": "python simulate_fault.py --type disk --path /data --severity high",
                "priority": JobPriority.HIGH,
                "tags": {"dr_test": "disk_failure"}
            },
            {
                "name": "simulate_service_outage",
                "cmd": "python simulate_fault.py --type service --services db,cache,queue",
                "timeout": 60,
                "tags": {"dr_test": "service_outage"}
            }
        ]

        fault_results = scheduler.run_many(
            fault_jobs,
            stop_on_error=True
        )

        # Check system health
        print("\nStage 3: system health check")
        health = scheduler.health_check()

        if health["status"] != "healthy":
            print(f"⚠️ System health status: {health['status']}")
            print("Triggering the recovery procedure...")

            # Execute the recovery steps
            recovery_steps = [
                {
                    "step": "1. Fault isolation",
                    "cmd": "python isolate_fault.py --diagnosis fault_report.json",
                    "priority": JobPriority.CRITICAL
                },
                {
                    "step": "2. Start backup systems",
                    "cmd": "python start_backup_systems.py --components db,cache",
                    "resource": {"cpus": 8, "memory_gb": 16}
                },
                {
                    "step": "3. Data restore",
                    "cmd": "python restore_data.py --backup latest --target /data",
                    "timeout": 300,
                    "retry": {"max_attempts": 3}
                },
                {
                    "step": "4. Service restore",
                    "cmd": "python restore_services.py --services all --validate",
                    "priority": JobPriority.HIGH
                },
                {
                    "step": "5. Data synchronization",
                    "cmd": "python sync_data.py --source backup --target production",
                    "timeout": 600
                }
            ]

            print("\nRunning the recovery steps:")
            recovery_results = []

            for step in recovery_steps:
                print(f"   {step['step']}...")

                job = JobDefinition(
                    cmd=step['cmd'],
                    name=f"recovery_{step['step'].split('.')[0]}",
                    priority=step.get('priority', JobPriority.NORMAL),
                    config=ExecutionConfig(
                        timeout=step.get('timeout', 60)
                    ),
                    resource=ResourceRequest(**step.get('resource', {"cpus": 2, "memory_gb": 4})),
                    retry=RetryConfig(**step.get('retry', {"max_attempts": 1})),
                    tags={"dr_test": "recovery", "step": step['step']}
                )

                result = scheduler.submit(job, wait=True)
                recovery_results.append(result)

                if result.success():
                    print("      ✅ Done")
                else:
                    print(f"      ❌ Failed: {result.error_message}")

            # Verify the recovery
            print("\nStage 4: recovery verification")
            verification_jobs = [
                "python verify_system.py --full-check",
                "python verify_data.py --integrity --consistency",
                "python verify_services.py --all --timeout 30",
                "python verify_performance.py --baseline baseline_metrics.json"
            ]

            verification_results = scheduler.run_many(verification_jobs)

            successful_verifications = sum(1 for r in verification_results if r.success())

            if successful_verifications == len(verification_results):
                print("🎉 Disaster recovery drill completed successfully!")
                print("   All system functions recovered")
            else:
                print("⚠️ Recovery verification partially failed")
                print(f"   Successful verifications: {successful_verifications}/{len(verification_results)}")

        else:
            print("The system is still healthy; the fault was recovered automatically")

        # Generate the drill report
        print("\n📋 Disaster recovery drill report:")
        print("="*40)

        metrics = scheduler.get_metrics()
        total_jobs = metrics['jobs_total']
        successful_jobs = metrics['jobs_completed']
        success_rate = metrics.get('success_rate', 0)

        print(f"Total jobs: {total_jobs}")
        print(f"Successful jobs: {successful_jobs}")
        print(f"Success rate: {success_rate:.1%}")
        print(f"Retries: {metrics.get('retries_total', 0)}")

        # Show the total drill duration
        if scheduler.job_store:
            all_jobs = scheduler.job_store.search_jobs(tags={"dr_test": True})
            total_duration = 0
            for job in all_jobs:
                result = scheduler.get_result(job.job_id)
                if result and result.duration:
                    total_duration += result.duration

            print(f"Total drill duration: {total_duration:.1f}s")

    finally:
        scheduler.shutdown()
        print("\nDisaster recovery drill finished")

def example_edge_computing_scenario():
    """
    Example 8: edge computing scenario
    Task scheduling across distributed edge nodes
    """
    print("\n" + "="*60)
    print("Example 8: edge computing scenario")
    print("="*60)

    # Simulated edge nodes
    edge_nodes = [
        {"id": "edge-01", "location": "factory-floor", "cpus": 8, "memory_gb": 16, "gpus": 1},
        {"id": "edge-02", "location": "warehouse", "cpus": 4, "memory_gb": 8, "gpus": 0},
        {"id": "edge-03", "location": "retail-store", "cpus": 2, "memory_gb": 4, "gpus": 0},
        {"id": "edge-04", "location": "field-office", "cpus": 4, "memory_gb": 8, "gpus": 0},
        {"id": "edge-05", "location": "research-lab", "cpus": 16, "memory_gb": 32, "gpus": 2}
    ]

    # Create the master scheduler
    master_scheduler = ContainerScheduler(
        max_concurrent=10,
        job_store=JobStore("edge_computing.db"),
        plugins=[NotificationPlugin()]
    )

    # One sub-scheduler per edge node (simulated)
    edge_schedulers = {}

    try:
        print(f"Initializing {len(edge_nodes)} edge nodes...")

        # Edge computing tasks
        edge_tasks = []

        # 1. Real-time video analytics
        for camera_id in range(5):
            task = {
                "type": "video_analytics",
                "cmd": f"python analyze_video.py --camera {camera_id} --model person_detection",
                "requirements": {"gpus": 1, "latency": "low"},
                "priority": JobPriority.HIGH
            }
            edge_tasks.append(task)

        # 2. Sensor data processing
        for sensor_group in ["temperature", "humidity", "vibration", "pressure"]:
            task = {
                "type": "sensor_processing",
                "cmd": f"python process_sensors.py --type {sensor_group} --window 60",
                "requirements": {"cpus": 2, "interval": 60},
                "priority": JobPriority.NORMAL
            }
            edge_tasks.append(task)

        # 3. Predictive maintenance
        task = {
            "type": "predictive_maintenance",
            "cmd": "python predictive_maintenance.py --equipment all --horizon 24",
            "requirements": {"cpus": 4, "memory_gb": 8},
            "priority": JobPriority.HIGH
        }
        edge_tasks.append(task)

        # 4. Local AI inference
        for model in ["defect_detection", "quality_inspection", "anomaly_detection"]:
            task = {
                "type": "ai_inference",
                "cmd": f"python run_inference.py --model {model} --input /data/latest",
                "requirements": {"gpus": 1, "memory_gb": 4},
                "priority": JobPriority.CRITICAL
            }
            edge_tasks.append(task)

        print(f"A total of {len(edge_tasks)} edge computing tasks")

        # Task placement strategy
        print("\nDispatching tasks to edge nodes...")

        scheduled_tasks = []
        for task in edge_tasks:
            # Pick the most suitable edge node
            suitable_nodes = []
            for node in edge_nodes:
                suitable = True

                # Check the GPU requirement
                if task["requirements"].get("gpus", 0) > 0 and node["gpus"] == 0:
                    suitable = False

                # Check the CPU requirement
                if task["requirements"].get("cpus", 1) > node["cpus"]:
                    suitable = False

                # Check the memory requirement
                if task["requirements"].get("memory_gb", 1) > node["memory_gb"]:
                    suitable = False

                if suitable:
                    suitable_nodes.append(node)

            if suitable_nodes:
                # Pick the least-loaded node (simplified strategy)
                selected_node = suitable_nodes[0]

                job = JobDefinition(
                    cmd=task["cmd"],
                    name=f"edge_{task['type']}_{selected_node['id']}",
                    backend=Backend.LOCAL,  # assume edge nodes execute locally
                    resource=ResourceRequest(
                        cpus=task["requirements"].get("cpus", 1),
                        memory_gb=task["requirements"].get("memory_gb", 1),
                        gpus=task["requirements"].get("gpus", 0)
                    ),
                    priority=task["priority"],
                    tags={
                        "edge_computing": "true",
                        "node_id": selected_node["id"],
                        "location": selected_node["location"],
                        "task_type": task["type"],
                        "latency": task["requirements"].get("latency", "normal")
                    }
                )

                scheduled_tasks.append(job)
                print(f"   Task '{task['type']}' assigned to node '{selected_node['id']}'")
            else:
                print(f"   ⚠️ No suitable node for task '{task['type']}', scheduling to the cloud")

                # Schedule to the cloud
                cloud_job = JobDefinition(
                    cmd=task["cmd"],
                    name=f"cloud_{task['type']}",
                    backend=Backend.AWS_BATCH,  # assume AWS Batch is used
                    resource=ResourceRequest(
                        cpus=task["requirements"].get("cpus", 1),
                        memory_gb=task["requirements"].get("memory_gb", 1),
                        gpus=task["requirements"].get("gpus", 0)
                    ),
                    priority=task["priority"],
                    tags={
                        "edge_computing": "true",
                        "node_id": "cloud",
                        "task_type": task["type"]
                    }
                )
                scheduled_tasks.append(cloud_job)

        # Execute all tasks
        print(f"\nStarting execution of {len(scheduled_tasks)} edge computing tasks...")

        results = master_scheduler.run_many(
            scheduled_tasks,
            max_workers=5,
            progress_callback=lambda c, t: print(f"   Execution progress: {c}/{t}")
        )

        # Analyze the results
        print("\n📊 Edge computing task execution statistics:")

        # Per-node statistics
        node_stats = {}
        for job, result in zip(scheduled_tasks, results):
            node_id = job.tags.get("node_id", "unknown")
            if node_id not in node_stats:
                node_stats[node_id] = {"total": 0, "success": 0}

            node_stats[node_id]["total"] += 1
            if result.success():
                node_stats[node_id]["success"] += 1

        for node_id, stats in node_stats.items():
            success_rate = stats["success"] / stats["total"] if stats["total"] > 0 else 0
            print(f"   Node {node_id}: {stats['success']}/{stats['total']} succeeded ({success_rate:.1%})")

        # Per-task-type statistics
        type_stats = {}
        for job, result in zip(scheduled_tasks, results):
            task_type = job.tags.get("task_type", "unknown")
            if task_type not in type_stats:
                type_stats[task_type] = {"total": 0, "success": 0}

            type_stats[task_type]["total"] += 1
            if result.success():
                type_stats[task_type]["success"] += 1

        print("\n📈 Statistics by task type:")
        for task_type, stats in type_stats.items():
            success_rate = stats["success"] / stats["total"] if stats["total"] > 0 else 0
            print(f"   {task_type}: {stats['success']}/{stats['total']} succeeded ({success_rate:.1%})")

        # Overall statistics
        total_success = sum(1 for r in results if r.success())
        total_tasks = len(results)
        overall_success_rate = total_success / total_tasks if total_tasks > 0 else 0

        print("\n🎯 Overall statistics:")
        print(f"   Total tasks: {total_tasks}")
        print(f"   Successful tasks: {total_success}")
        print(f"   Success rate: {overall_success_rate:.1%}")

        # Average latency
        successful_results = [r for r in results if r.success() and r.duration]
        if successful_results:
            avg_duration = sum(r.duration for r in successful_results) / len(successful_results)
            print(f"   Average execution time: {avg_duration:.2f}s")

        # Low-latency task statistics
        low_latency_tasks = [job for job in scheduled_tasks if job.tags.get("latency") == "low"]
        if low_latency_tasks:
            low_latency_durations = []
            for job in low_latency_tasks:
                result = next((r for r in results if r.job_id == job.job_id), None)
                if result and result.duration:
                    low_latency_durations.append(result.duration)

            if low_latency_durations:
                avg_low_latency = sum(low_latency_durations) / len(low_latency_durations)
                print(f"   Low-latency task average time: {avg_low_latency:.2f}s")

    finally:
        master_scheduler.shutdown()
        print("\nEdge computing scenario simulation finished")

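# A minimal sketch of the capability check applied inside the placement loop above,
# pulled out as a standalone predicate. The function name is illustrative only and
# is not referenced by the examples.
def _node_can_run_example(node, requirements):
    """Return True if a node dict satisfies a task's cpu/memory/gpu requirements."""
    if requirements.get("gpus", 0) > 0 and node.get("gpus", 0) == 0:
        return False
    if requirements.get("cpus", 1) > node.get("cpus", 0):
        return False
    if requirements.get("memory_gb", 1) > node.get("memory_gb", 0):
        return False
    return True

# Example: _node_can_run_example({"cpus": 4, "memory_gb": 8, "gpus": 0}, {"cpus": 2}) -> True
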
# ============================================================================
# Run all examples
# ============================================================================

def run_all_examples():
    """Run all example scenarios"""
    print("Universal Container Scheduler - advanced usage scenarios")
    print("="*70)

    examples = [
        ("Data processing pipeline", example_data_processing_pipeline),
        ("Machine learning training", example_machine_learning_training),
        ("Bioinformatics workflow", example_bioinformatics_workflow),
        ("Cloud batch processing", example_cloud_batch_processing),
        ("Real-time monitoring system", example_real_time_monitoring),
        ("Custom workflow orchestration", example_custom_workflow_orchestrator),
        ("Disaster recovery drill", example_disaster_recovery_drills),
        ("Edge computing scenario", example_edge_computing_scenario),
    ]

    for i, (name, func) in enumerate(examples, 1):
        print(f"\nExample {i}: {name}")
        print("-"*40)

        try:
            func()
            print(f"✅ {name} example finished")
        except KeyboardInterrupt:
            print(f"⏹️ {name} example interrupted")
            break
        except Exception as e:
            print(f"❌ {name} example error: {e}")
            import traceback
            traceback.print_exc()

        # Pause between examples
        if i < len(examples):
            print("\n" + "="*70)
            input("Press Enter to continue to the next example...")

    print("\n" + "="*70)
    print("All examples finished!")

def quick_demo():
    """Quick demo of the core features"""
    print("Quick demo - core features")
    print("="*50)

    # 1. Basic usage
    print("\n1. Basic command execution:")
    scheduler = ContainerScheduler()
    result = scheduler.run("echo 'Hello from Universal Scheduler!'")
    print(f"   Status: {result.status.value}, output: {result.stdout}")

    # 2. Batch processing
    print("\n2. Batch job processing:")
    commands = [f"echo 'Task {i}' && sleep 0.1" for i in range(5)]
    results = scheduler.run_many(commands, max_workers=3)
    print(f"   Completed {len(results)} tasks, {sum(1 for r in results if r.success())} succeeded")

    # 3. Workflow example
    print("\n3. Simple workflow:")
    workflow = [
        {"cmd": "echo 'Step 1: Data extraction'", "job_id": "step1"},
        {"cmd": "echo 'Step 2: Processing'", "job_id": "step2", "dependencies": ["step1"]},
        {"cmd": "echo 'Step 3: Analysis'", "job_id": "step3", "dependencies": ["step2"]},
    ]
    results = scheduler.run_workflow(workflow)
    print(f"   Workflow finished: {len(results)}/{len(workflow)} steps succeeded")

    # 4. Health check
    print("\n4. System health check:")
    health = scheduler.health_check()
    print(f"   Health status: {health['status']}")

    # 5. Metrics
    print("\n5. Performance metrics:")
    metrics = scheduler.get_metrics()
    print(f"   Total jobs: {metrics['jobs_total']}")
    print(f"   Success rate: {metrics.get('success_rate', 0):.1%}")

    scheduler.shutdown()
    print("\n✅ Quick demo finished!")

# ============================================================================
# Main entry point
# ============================================================================

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Universal container scheduler examples")
    parser.add_argument("--demo", action="store_true", help="run the quick demo")
    parser.add_argument("--all", action="store_true", help="run all examples")
    parser.add_argument("--example", type=int, choices=range(1, 9),
                        help="run a specific example (1-8)")

    args = parser.parse_args()

    if args.demo:
        quick_demo()
    elif args.all:
        run_all_examples()
    elif args.example:
        examples = [
            example_data_processing_pipeline,
            example_machine_learning_training,
            example_bioinformatics_workflow,
            example_cloud_batch_processing,
            example_real_time_monitoring,
            example_custom_workflow_orchestrator,
            example_disaster_recovery_drills,
            example_edge_computing_scenario,
        ]
        if 1 <= args.example <= len(examples):
            examples[args.example - 1]()
        else:
            print(f"Example number {args.example} is invalid; valid range: 1-{len(examples)}")
    else:
        print("Universal Container Scheduler - usage examples")
        print("\nUsage:")
        print("  python examples.py --demo          # quick demo")
        print("  python examples.py --all           # run all examples")
        print("  python examples.py --example N     # run a specific example")
        print("\nExample list:")
        examples = [
            "1. Data processing pipeline",
            "2. Machine learning training",
            "3. Bioinformatics workflow",
            "4. Cloud batch processing",
            "5. Real-time monitoring system",
            "6. Custom workflow orchestration",
            "7. Disaster recovery drill",
            "8. Edge computing scenario",
        ]
        for example in examples:
            print(f"   {example}")