py2ls 0.1.10.12__py3-none-any.whl → 0.2.7.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of py2ls might be problematic.

Files changed (72)
  1. py2ls/.DS_Store +0 -0
  2. py2ls/.git/.DS_Store +0 -0
  3. py2ls/.git/index +0 -0
  4. py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
  5. py2ls/.git/objects/.DS_Store +0 -0
  6. py2ls/.git/refs/.DS_Store +0 -0
  7. py2ls/ImageLoader.py +621 -0
  8. py2ls/__init__.py +7 -5
  9. py2ls/apptainer2ls.py +3940 -0
  10. py2ls/batman.py +164 -42
  11. py2ls/bio.py +2595 -0
  12. py2ls/cell_image_clf.py +1632 -0
  13. py2ls/container2ls.py +4635 -0
  14. py2ls/corr.py +475 -0
  15. py2ls/data/.DS_Store +0 -0
  16. py2ls/data/email/email_html_template.html +88 -0
  17. py2ls/data/hyper_param_autogluon_zeroshot2024.json +2383 -0
  18. py2ls/data/hyper_param_tabrepo_2024.py +1753 -0
  19. py2ls/data/mygenes_fields_241022.txt +355 -0
  20. py2ls/data/re_common_pattern.json +173 -0
  21. py2ls/data/sns_info.json +74 -0
  22. py2ls/data/styles/.DS_Store +0 -0
  23. py2ls/data/styles/example/.DS_Store +0 -0
  24. py2ls/data/styles/stylelib/.DS_Store +0 -0
  25. py2ls/data/styles/stylelib/grid.mplstyle +15 -0
  26. py2ls/data/styles/stylelib/high-contrast.mplstyle +6 -0
  27. py2ls/data/styles/stylelib/high-vis.mplstyle +4 -0
  28. py2ls/data/styles/stylelib/ieee.mplstyle +15 -0
  29. py2ls/data/styles/stylelib/light.mplstyl +6 -0
  30. py2ls/data/styles/stylelib/muted.mplstyle +6 -0
  31. py2ls/data/styles/stylelib/nature-reviews-latex.mplstyle +616 -0
  32. py2ls/data/styles/stylelib/nature-reviews.mplstyle +616 -0
  33. py2ls/data/styles/stylelib/nature.mplstyle +31 -0
  34. py2ls/data/styles/stylelib/no-latex.mplstyle +10 -0
  35. py2ls/data/styles/stylelib/notebook.mplstyle +36 -0
  36. py2ls/data/styles/stylelib/paper.mplstyle +290 -0
  37. py2ls/data/styles/stylelib/paper2.mplstyle +305 -0
  38. py2ls/data/styles/stylelib/retro.mplstyle +4 -0
  39. py2ls/data/styles/stylelib/sans.mplstyle +10 -0
  40. py2ls/data/styles/stylelib/scatter.mplstyle +7 -0
  41. py2ls/data/styles/stylelib/science.mplstyle +48 -0
  42. py2ls/data/styles/stylelib/std-colors.mplstyle +4 -0
  43. py2ls/data/styles/stylelib/vibrant.mplstyle +6 -0
  44. py2ls/data/tiles.csv +146 -0
  45. py2ls/data/usages_pd.json +1417 -0
  46. py2ls/data/usages_sns.json +31 -0
  47. py2ls/docker2ls.py +5446 -0
  48. py2ls/ec2ls.py +61 -0
  49. py2ls/fetch_update.py +145 -0
  50. py2ls/ich2ls.py +1955 -296
  51. py2ls/im2.py +8242 -0
  52. py2ls/image_ml2ls.py +2100 -0
  53. py2ls/ips.py +33909 -3418
  54. py2ls/ml2ls.py +7700 -0
  55. py2ls/mol.py +289 -0
  56. py2ls/mount2ls.py +1307 -0
  57. py2ls/netfinder.py +873 -351
  58. py2ls/nl2ls.py +283 -0
  59. py2ls/ocr.py +1581 -458
  60. py2ls/plot.py +10394 -314
  61. py2ls/rna2ls.py +311 -0
  62. py2ls/ssh2ls.md +456 -0
  63. py2ls/ssh2ls.py +5933 -0
  64. py2ls/ssh2ls_v01.py +2204 -0
  65. py2ls/stats.py +66 -172
  66. py2ls/temp20251124.py +509 -0
  67. py2ls/translator.py +2 -0
  68. py2ls/utils/decorators.py +3564 -0
  69. py2ls/utils_bio.py +3453 -0
  70. {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/METADATA +113 -224
  71. {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/RECORD +72 -16
  72. {py2ls-0.1.10.12.dist-info → py2ls-0.2.7.10.dist-info}/WHEEL +0 -0
py2ls/container2ls.py ADDED
@@ -0,0 +1,4635 @@
+ """
+ Universal Container Scheduler
+ =============================================
+
+ An enterprise-grade, production-ready command execution framework that supports
+ local execution, containerized execution (Docker/Apptainer), and HPC cluster
+ scheduling (SLURM/PBS/Torque). Provides full job management, monitoring, retry,
+ and resource-management functionality.
+
+ Core features:
+ - Multi-backend support: Local, Docker, Apptainer, SLURM, PBS
+ - Full job management: submit, execute, monitor, cancel
+ - Smart retries: exponential backoff, conditional retries
+ - Resource management: CPU, memory, GPU requests and limits
+ - Priority queue: priority-based job scheduling
+ - Timeout monitoring: automatically detect and cancel timed-out jobs
+ - Result caching: avoid re-running identical commands
+ - Persistent storage: jobs and results stored in SQLite
+ - Plugin system: extensible plugin architecture
+ - Health checks: comprehensive system health monitoring
+ - Metrics collection: performance metrics and statistics
+ - Command-line interface: full CLI support
+
+ Design principles:
+ ------------------
+ - **Unified interface**: the same API across all backends
+ - **Configuration driven**: complex parameters managed through config classes
+ - **Type safe**: complete type annotations
+ - **Observability**: built-in monitoring and logging
+ - **Extensibility**: plugin architecture, easy to add new functionality
+
+ Quick start:
+ ------------
+ ```python
+ from universal_scheduler import ContainerScheduler, Backend, ResourceRequest
+
+ # Create a scheduler instance
+ scheduler = ContainerScheduler()
+
+ # Run a simple command
+ result = scheduler.run("echo 'Hello, World!'", backend=Backend.LOCAL)
+
+ # Use a Docker container
+ result = scheduler.run(
+     cmd="python -c 'import numpy; print(numpy.__version__)'",
+     backend=Backend.DOCKER,
+     image="python:3.9-slim",
+     mounts={"/data": "/data"}
+ )
+
+ # Submit a SLURM job
+ result = scheduler.run(
+     cmd="python train_model.py",
+     backend=Backend.SLURM,
+     resource=ResourceRequest(cpus=8, memory_gb=32, gpus=1),
+     job_name="model_training"
+ )
+ ```
+
+ Suggested file layout:
+ universal_scheduler/
+ ├── __init__.py          # main module
+ ├── scheduler.py         # main scheduler class
+ ├── models.py            # data models (ResourceRequest, JobResult, ...)
+ ├── plugins.py           # plugin system
+ ├── storage.py           # storage and caching
+ ├── monitors.py          # monitoring and metrics
+ ├── backends.py          # backend implementations
+ ├── cli.py               # command-line interface
+ ├── config/
+ │   └── default.yaml     # default configuration
+ └── examples/            # example code
+
+ """
+
+ import subprocess
+ import shlex
+ import time
+ import logging
+ import os
+ import signal
+ import sys
+ import tempfile
+ import json
+ import hashlib
+ import pickle
+ import sqlite3
+ import uuid
+ import socket
+ import threading
+ import inspect
+ import asyncio
+ import heapq
+ import random
+ import multiprocessing as mp
+ from pathlib import Path
+ from typing import Dict, List, Optional, Union, Callable, Any, Tuple, Set, Type
+ from dataclasses import dataclass, field, asdict
+ from enum import Enum, auto
+ from datetime import datetime, timedelta
+ from concurrent.futures import ThreadPoolExecutor, Future, ProcessPoolExecutor
+ from abc import ABC, abstractmethod
+ from functools import wraps, lru_cache
+ from contextlib import contextmanager
+ import warnings
+
+ try:
+     import yaml
+     YAML_AVAILABLE = True
+ except ImportError:
+     YAML_AVAILABLE = False
+     warnings.warn("PyYAML not installed, YAML config support disabled")
+
+ try:
+     import psutil
+     PSUTIL_AVAILABLE = True
+ except ImportError:
+     PSUTIL_AVAILABLE = False
+     warnings.warn("psutil not installed, resource monitoring limited")
+
+ # ============================================================================
+ # Core enums and data types
+ # ============================================================================
+
+ class Backend(Enum):
+     """Supported execution backends."""
+     LOCAL = "local"
+     DOCKER = "docker"
+     APPTAINER = "apptainer"
+     SLURM = "slurm"
+     PBS = "pbs"
+     KUBERNETES = "kubernetes"
+     AWS_BATCH = "aws_batch"
+     AZURE_BATCH = "azure_batch"
+
+ class JobPriority(Enum):
+     """Job priority levels."""
+     LOWEST = 0
+     LOW = 1
+     NORMAL = 2
+     HIGH = 3
+     HIGHEST = 4
+     CRITICAL = 5
+
+ class JobStatus(Enum):
+     """Job status."""
+     CREATED = "created"
+     PENDING = "pending"
+     QUEUED = "queued"
+     RUNNING = "running"
+     COMPLETED = "completed"
+     FAILED = "failed"
+     CANCELLED = "cancelled"
+     TIMEOUT = "timeout"
+     SUSPENDED = "suspended"
+     UNKNOWN = "unknown"
+
+ class ResourceType(Enum):
+     """Resource types."""
+     CPU = "cpu"
+     GPU = "gpu"
+     MEMORY = "memory"
+     DISK = "disk"
+     NETWORK = "network"
+     TIME = "time"
+
+ # ============================================================================
+ # Core data classes
+ # ============================================================================
+
+ @dataclass
+ class ResourceRequest:
+     """
+     Resource request configuration.
+     Defines the compute resources a job needs (CPU, memory, GPU, ...).
+     Supports mapping the request onto the different backends.
+     """
+     cpus: int = 1
+     memory_gb: float = 1.0
+     memory_mb: Optional[int] = None
+     gpus: int = 0
+     gpu_type: Optional[str] = None
+     time_minutes: Optional[int] = None
+     time_hours: Optional[int] = None
+     partition: Optional[str] = None
+     queue: Optional[str] = None
+     nodes: int = 1
+     tasks_per_node: int = 1
+     account: Optional[str] = None
+     reservation: Optional[str] = None
+     qos: Optional[str] = None
+     walltime: Optional[str] = None
+     exclusive: bool = False
+     constraints: Optional[str] = None
+     features: Optional[str] = None
+
+     def __post_init__(self):
+         """Post-init: keep memory units consistent."""
+         if self.memory_mb is None and self.memory_gb is not None:
+             self.memory_mb = int(self.memory_gb * 1024)
+
+         # Keep time units consistent
+         if self.time_hours is not None and self.time_minutes is None:
+             self.time_minutes = self.time_hours * 60
+         elif self.time_minutes is not None and self.time_hours is None:
+             self.time_hours = self.time_minutes / 60
+
+     @property
+     def total_cpus(self) -> int:
+         """Total number of CPU cores."""
+         return self.cpus * self.nodes * self.tasks_per_node
+
+     def to_slurm_directives(self) -> Dict[str, str]:
+         """Convert to SLURM directives."""
+         directives = {}
+         if self.cpus > 1:
+             directives["--cpus-per-task"] = str(self.cpus)
+         if self.memory_mb:
+             directives["--mem"] = f"{self.memory_mb}M"
+         if self.gpus > 0:
+             gres = f"gpu:{self.gpus}"
+             if self.gpu_type:
+                 gres = f"gpu:{self.gpu_type}:{self.gpus}"
+             directives["--gres"] = gres
+         if self.time_minutes:
+             directives["--time"] = str(self.time_minutes)
+         if self.partition:
+             directives["--partition"] = self.partition
+         if self.account:
+             directives["--account"] = self.account
+         if self.qos:
+             directives["--qos"] = self.qos
+         if self.nodes > 1:
+             directives["--nodes"] = str(self.nodes)
+         if self.tasks_per_node > 1:
+             directives["--ntasks-per-node"] = str(self.tasks_per_node)
+         if self.exclusive:
+             directives["--exclusive"] = ""
+         if self.constraints:
+             directives["--constraint"] = self.constraints
+         return directives
+
+     def to_pbs_directives(self) -> Dict[str, str]:
+         """Convert to PBS directives."""
+         directives = {}
+         directives["-l nodes"] = f"{self.nodes}:ppn={self.cpus}"
+         if self.memory_mb:
+             directives["-l mem"] = f"{self.memory_mb}mb"
+         if self.time_hours:
+             directives["-l walltime"] = f"{self.time_hours}:00:00"
+         if self.queue:
+             directives["-q"] = self.queue
+         if self.gpus > 0:
+             directives["-l gpus"] = str(self.gpus)
+         if self.gpu_type:
+             directives["-l gputype"] = self.gpu_type
+         return directives
+
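+ # Illustrative sketch (not part of the original module): for a request such as
+ # ResourceRequest(cpus=8, memory_gb=32, gpus=1, time_minutes=120, partition="gpu"),
+ # __post_init__ fills memory_mb=32768, and to_slurm_directives() would yield
+ # {"--cpus-per-task": "8", "--mem": "32768M", "--gres": "gpu:1",
+ #  "--time": "120", "--partition": "gpu"}.
+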
+ @dataclass
+ class RetryConfig:
+     """
+     Retry policy configuration.
+     Defines the retry behaviour when a job fails; supports exponential backoff,
+     conditional retries, and similar strategies.
+     """
+     max_attempts: int = 1
+     delay_seconds: float = 1.0
+     backoff_factor: float = 2.0
+     max_delay_seconds: float = 300.0
+     jitter_seconds: float = 0.0
+     retry_on_exit_codes: List[int] = field(default_factory=list)
+     retry_on_timeout: bool = True
+     retry_on_signal: bool = False
+     retry_on_memory_error: bool = True
+     retry_on_disk_full: bool = True
+     retry_on_network_error: bool = True
+     retry_condition: Optional[Callable[['JobResult'], bool]] = None
+
+     def get_delay(self, attempt: int) -> float:
+         """Compute the delay before the given retry attempt."""
+         delay = self.delay_seconds * (self.backoff_factor ** (attempt - 1))
+         delay = min(delay, self.max_delay_seconds)
+
+         # Add jitter
+         if self.jitter_seconds > 0:
+             delay += random.uniform(-self.jitter_seconds, self.jitter_seconds)
+             delay = max(0, delay)
+
+         return delay
+
+     def should_retry(self, result: 'JobResult', attempt: int) -> bool:
+         """Decide whether the job should be retried."""
+         if attempt >= self.max_attempts:
+             return False
+
+         # Check the exit code
+         if result.exit_code is not None:
+             if self.retry_on_exit_codes:
+                 if result.exit_code in self.retry_on_exit_codes:
+                     return True
+             elif result.exit_code != 0:
+                 return True
+
+         # Check for timeout
+         if result.status == JobStatus.TIMEOUT and self.retry_on_timeout:
+             return True
+
+         # Check for signals
+         if result.exit_code is not None and result.exit_code < 0 and self.retry_on_signal:
+             return True
+
+         # Check for keywords in the error messages
+         error_msg = (result.error_message or "").lower()
+         stderr = (result.stderr or "").lower()
+
+         if self.retry_on_memory_error and any(
+             keyword in error_msg or keyword in stderr
+             for keyword in ["memory", "oom", "out of memory"]
+         ):
+             return True
+
+         if self.retry_on_disk_full and any(
+             keyword in error_msg or keyword in stderr
+             for keyword in ["disk full", "no space", "quota exceeded"]
+         ):
+             return True
+
+         if self.retry_on_network_error and any(
+             keyword in error_msg or keyword in stderr
+             for keyword in ["network", "connection", "timeout", "refused"]
+         ):
+             return True
+
+         # Custom condition
+         if self.retry_condition and self.retry_condition(result):
+             return True
+
+         return False
+
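+ # Illustrative sketch (not part of the original module): with
+ # RetryConfig(max_attempts=4, delay_seconds=2.0, backoff_factor=2.0, max_delay_seconds=60.0),
+ # get_delay(1) == 2.0, get_delay(2) == 4.0, get_delay(3) == 8.0 -- exponential backoff,
+ # capped at max_delay_seconds; random jitter is added only if jitter_seconds > 0.
+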
+ @dataclass
+ class ExecutionConfig:
+     """
+     Execution configuration.
+     Detailed settings for running a command: working directory, environment
+     variables, mount points, and so on.
+     """
+     workdir: Optional[Path] = None
+     env: Dict[str, str] = field(default_factory=dict)
+     mounts: Dict[Path, Path] = field(default_factory=dict)
+     shell: str = "/bin/bash"
+     clean_temp: bool = True
+     capture_output: bool = True
+     stdout: Optional[Path] = None
+     stderr: Optional[Path] = None
+     stdin: Optional[str] = None
+     timeout: Optional[int] = None
+     check: bool = False
+     silent: bool = False
+     user: Optional[str] = None
+     group: Optional[str] = None
+     network_mode: Optional[str] = None
+     security_opts: Optional[List[str]] = None
+     ulimits: Optional[Dict[str, Tuple[int, int]]] = None
+     tmpfs: Optional[Dict[str, str]] = None
+     read_only: bool = False
+     detach: bool = False
+
+     def __post_init__(self):
+         """Resolve paths to absolute paths."""
+         if self.workdir is not None:
+             self.workdir = Path(self.workdir).resolve()
+         if self.stdout is not None:
+             self.stdout = Path(self.stdout).resolve()
+         if self.stderr is not None:
+             self.stderr = Path(self.stderr).resolve()
+
+         # Normalize mount paths
+         mounts = {}
+         for host_path, container_path in self.mounts.items():
+             mounts[Path(host_path).resolve()] = Path(container_path)
+         self.mounts = mounts
+
+ @dataclass
+ class JobResult:
+     """
+     Job execution result.
+     Wraps the outcome of a job: status, exit code, output, timing statistics, etc.
+     """
+     job_id: str
+     status: JobStatus
+     exit_code: Optional[int] = None
+     stdout: Optional[str] = None
+     stderr: Optional[str] = None
+     start_time: Optional[datetime] = None
+     end_time: Optional[datetime] = None
+     duration: Optional[float] = None
+     attempts: int = 0
+     error_message: Optional[str] = None
+     backend: Optional[str] = None
+     command: Optional[str] = None
+     resource_usage: Optional[Dict[str, Any]] = None
+     metrics: Dict[str, Any] = field(default_factory=dict)
+     tags: Dict[str, str] = field(default_factory=dict)
+     parent_job_id: Optional[str] = None
+
+     def success(self) -> bool:
+         """Whether the job completed successfully."""
+         return self.status == JobStatus.COMPLETED and (self.exit_code == 0 or self.exit_code is None)
+
+     def failed(self) -> bool:
+         """Whether the job failed."""
+         return self.status in {JobStatus.FAILED, JobStatus.TIMEOUT}
+
+     def running(self) -> bool:
+         """Whether the job is still running."""
+         return self.status == JobStatus.RUNNING
+
+     def cancelled(self) -> bool:
+         """Whether the job was cancelled."""
+         return self.status == JobStatus.CANCELLED
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to a dict for serialization."""
+         data = asdict(self)
+         data['status'] = self.status.value
+         return data
+
+     def to_json(self) -> str:
+         """Convert to a JSON string."""
+         return json.dumps(self.to_dict(), default=str)
+
+     @classmethod
+     def from_json(cls, json_str: str) -> 'JobResult':
+         """Create a JobResult from a JSON string."""
+         data = json.loads(json_str)
+         data['status'] = JobStatus(data['status'])
+         if data['start_time']:
+             data['start_time'] = datetime.fromisoformat(data['start_time'].replace('Z', '+00:00'))
+         if data['end_time']:
+             data['end_time'] = datetime.fromisoformat(data['end_time'].replace('Z', '+00:00'))
+         return cls(**data)
+
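+ # Illustrative sketch (not part of the original module): a finished job can be
+ # round-tripped through JSON, e.g.
+ #   r = JobResult(job_id="job_demo", status=JobStatus.COMPLETED, exit_code=0)
+ #   r.success()                                        -> True
+ #   JobResult.from_json(r.to_json()).status            -> JobStatus.COMPLETED
+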
+ @dataclass
+ class JobDependency:
+     """
+     Job dependency.
+     Defines dependencies between jobs to support complex workflows.
+     """
+     job_id: str
+     condition: Optional[Callable[[JobResult], bool]] = None
+     timeout: Optional[float] = None
+     propagate_status: bool = True
+
+ @dataclass
+ class JobDefinition:
+     """
+     Job definition.
+     Fully describes a job's properties and configuration.
+     """
+     cmd: str
+     backend: Backend = Backend.LOCAL
+     image: Optional[str] = None
+     config: ExecutionConfig = field(default_factory=ExecutionConfig)
+     resource: ResourceRequest = field(default_factory=ResourceRequest)
+     retry: RetryConfig = field(default_factory=RetryConfig)
+     job_id: Optional[str] = None
+     name: Optional[str] = None
+     description: Optional[str] = None
+     priority: JobPriority = JobPriority.NORMAL
+     dependencies: List[JobDependency] = field(default_factory=list)
+     tags: Dict[str, str] = field(default_factory=dict)
+     metadata: Dict[str, Any] = field(default_factory=dict)
+     callback: Optional[Callable[[JobResult], Any]] = None
+     result_handler: Optional[Callable[[JobResult], JobResult]] = None
+
+     def __post_init__(self):
+         if self.job_id is None:
+             self.job_id = f"job_{uuid.uuid4().hex[:8]}"
+         if self.name is None:
+             self.name = self.job_id
+
+ # ============================================================================
+ # Configuration management
+ # ============================================================================
+
+ @dataclass
+ class SchedulerConfig:
+     """
+     Scheduler configuration.
+     Complete scheduler configuration; can be loaded from and saved to YAML files.
+     """
+     default_backend: Backend = Backend.LOCAL
+     default_image: Optional[str] = None
+     max_concurrent: int = 4
+     database_url: Optional[str] = None
+     cache_dir: Optional[Path] = None
+     log_level: str = "INFO"
+     log_file: Optional[Path] = None
+     enable_priority_queue: bool = False
+     enable_timeout_monitor: bool = True
+     health_check_interval: int = 30
+     cleanup_interval: int = 300
+     web_monitor_enabled: bool = False
+     web_monitor_port: int = 8080
+
+     # Resource limits
+     max_cpu_percent: float = 90.0
+     max_memory_percent: float = 90.0
+     max_disk_percent: float = 85.0
+
+     # Cache settings
+     cache_max_size_mb: int = 1024
+     cache_max_age_days: int = 30
+
+     # Database settings
+     db_cleanup_days: int = 90
+     db_backup_days: int = 7
+
+     def __post_init__(self):
+         if self.cache_dir is not None:
+             self.cache_dir = Path(self.cache_dir)
+         if self.log_file is not None:
+             self.log_file = Path(self.log_file)
+
+     @classmethod
+     def load(cls, config_path: Optional[Path] = None) -> 'SchedulerConfig':
+         """Load configuration from a file."""
+         if not YAML_AVAILABLE:
+             return cls()
+
+         if config_path is None:
+             # Search the default config file locations
+             possible_paths = [
+                 Path("config/scheduler.yaml"),
+                 Path("scheduler.yaml"),
+                 Path("~/.universal-scheduler/config.yaml").expanduser(),
+                 Path("/etc/universal-scheduler/config.yaml")
+             ]
+
+             for path in possible_paths:
+                 if path.exists():
+                     config_path = path
+                     break
+
+         if config_path and config_path.exists():
+             with open(config_path) as f:
+                 data = yaml.safe_load(f)
+
+             # Convert strings to enums
+             if "default_backend" in data and isinstance(data["default_backend"], str):
+                 data["default_backend"] = Backend(data["default_backend"])
+
+             return cls(**data)
+
+         return cls()  # Fall back to the default configuration
+
+     def save(self, config_path: Path):
+         """Save configuration to a file."""
+         if not YAML_AVAILABLE:
+             raise ImportError("PyYAML is required to save configuration")
+
+         config_path.parent.mkdir(parents=True, exist_ok=True)
+         with open(config_path, 'w') as f:
+             data = self.to_dict()
+             # Convert enums to strings for YAML serialization
+             if isinstance(data.get("default_backend"), Backend):
+                 data["default_backend"] = data["default_backend"].value
+             yaml.dump(data, f, default_flow_style=False)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to a dict."""
+         data = asdict(self)
+         # Convert enums
+         if isinstance(data["default_backend"], Backend):
+             data["default_backend"] = data["default_backend"].value
+         # Convert paths
+         for key in ["cache_dir", "log_file"]:
+             if data[key] is not None:
+                 data[key] = str(data[key])
+         return data
+
+     def validate(self) -> List[str]:
+         """Validate the configuration and return a list of errors."""
+         errors = []
+
+         if self.max_concurrent <= 0:
+             errors.append("max_concurrent must be positive")
+
+         if self.max_cpu_percent <= 0 or self.max_cpu_percent > 100:
+             errors.append("max_cpu_percent must be between 0 and 100")
+
+         if self.cache_dir and not self.cache_dir.parent.exists():
+             errors.append(f"Cache directory parent does not exist: {self.cache_dir.parent}")
+
+         return errors
+
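+ # Illustrative sketch (not part of the original module): a minimal scheduler.yaml
+ # that SchedulerConfig.load() could consume, assuming PyYAML is installed:
+ #
+ #   default_backend: docker
+ #   max_concurrent: 8
+ #   cache_dir: /tmp/scheduler_cache
+ #   log_level: DEBUG
+ #   cache_max_size_mb: 2048
+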
+ # ============================================================================
+ # Plugin system
+ # ============================================================================
+
+ class Plugin(ABC):
+     """
+     Plugin base class.
+     Interface for extending the scheduler with monitoring, logging, caching,
+     and other functionality.
+     """
+
+     @abstractmethod
+     def on_job_submit(self, job: JobDefinition) -> None:
+         """Called when a job is submitted."""
+         pass
+
+     @abstractmethod
+     def on_job_start(self, job_id: str) -> None:
+         """Called when a job starts."""
+         pass
+
+     @abstractmethod
+     def on_job_complete(self, result: JobResult) -> None:
+         """Called when a job completes."""
+         pass
+
+     @abstractmethod
+     def on_error(self, error: Exception) -> None:
+         """Called when an error occurs."""
+         pass
+
+ class NotificationPlugin(Plugin):
+     """Plugin that sends notifications."""
+
+     def __init__(self, webhook_url: Optional[str] = None, email: Optional[str] = None):
+         self.webhook_url = webhook_url
+         self.email = email
+         self.notification_count = 0
+
+     def on_job_submit(self, job: JobDefinition) -> None:
+         self._send_notification(f"Job submitted: {job.name} ({job.job_id})")
+
+     def on_job_start(self, job_id: str) -> None:
+         self._send_notification(f"Job started: {job_id}")
+
+     def on_job_complete(self, result: JobResult) -> None:
+         status = "SUCCESS" if result.success() else "FAILED"
+         self._send_notification(f"Job completed: {result.job_id} - {status}")
+
+     def on_error(self, error: Exception) -> None:
+         self._send_notification(f"Error: {str(error)}", level="ERROR")
+
+     def _send_notification(self, message: str, level: str = "INFO"):
+         """Send a notification."""
+         self.notification_count += 1
+         timestamp = datetime.now().isoformat()
+         full_message = f"[{timestamp}] [{level}] {message}"
+
+         # Console output
+         print(full_message)
+
+         # Webhook notification
+         if self.webhook_url:
+             try:
+                 import requests
+                 requests.post(self.webhook_url, json={"message": full_message}, timeout=5)
+             except ImportError:
+                 print("requests module not installed, webhook disabled")
+             except Exception as e:
+                 print(f"Webhook failed: {e}")
+
+         # Email notification (simplified example)
+         if self.email and level == "ERROR":
+             print(f"Would send email to {self.email}: {full_message}")
+
+ class ResourceLogger(Plugin):
+     """Plugin that logs resource usage."""
+
+     def __init__(self, log_file: Optional[Path] = None):
+         self.log_file = log_file
+         self.resource_logs = []
+
+     def on_job_submit(self, job: JobDefinition) -> None:
+         log_entry = {
+             "timestamp": datetime.now().isoformat(),
+             "event": "submit",
+             "job_id": job.job_id,
+             "name": job.name,
+             "resource": {
+                 "cpus": job.resource.cpus,
+                 "memory_gb": job.resource.memory_gb,
+                 "gpus": job.resource.gpus
+             }
+         }
+         self.resource_logs.append(log_entry)
+         print(f"[RESOURCE] Job {job.job_id} submitted with resource: {job.resource}")
+
+     def on_job_start(self, job_id: str) -> None:
+         print(f"[RESOURCE] Job {job_id} started")
+
+     def on_job_complete(self, result: JobResult) -> None:
+         if result.resource_usage:
+             log_entry = {
+                 "timestamp": datetime.now().isoformat(),
+                 "event": "complete",
+                 "job_id": result.job_id,
+                 "resource_usage": result.resource_usage,
+                 "duration": result.duration
+             }
+             self.resource_logs.append(log_entry)
+             print(f"[RESOURCE] Job {result.job_id} used: {result.resource_usage}")
+
+     def on_error(self, error: Exception) -> None:
+         print(f"[RESOURCE] Error: {error}")
+
+     def save_logs(self):
+         """Save the resource logs to a file."""
+         if self.log_file:
+             with open(self.log_file, 'w') as f:
+                 json.dump(self.resource_logs, f, indent=2, default=str)
+
+ # ============================================================================
+ # Monitoring and metrics collection
+ # ============================================================================
+
+ class MetricsCollector:
+     """
+     Metrics collector.
+     Collects and reports metrics about job execution.
+     """
+
+     def __init__(self):
+         self._metrics = {
+             "jobs_total": 0,
+             "jobs_completed": 0,
+             "jobs_failed": 0,
+             "jobs_running": 0,
+             "total_duration": 0.0,
+             "total_cpu_hours": 0.0,
+             "total_memory_gb_hours": 0.0,
+             "retries_total": 0,
+             "backend_stats": {},
+             "resource_stats": {},
+             "timestamps": []
+         }
+         self._lock = threading.Lock()
+
+     def record_job_start(self, job: JobDefinition):
+         """Record that a job has started."""
+         with self._lock:
+             self._metrics["jobs_total"] += 1
+             self._metrics["jobs_running"] += 1
+
+             # Track per-backend counts
+             backend = job.backend.value
+             self._metrics["backend_stats"].setdefault(backend, 0)
+             self._metrics["backend_stats"][backend] += 1
+
+     def record_job_complete(self, result: JobResult, job: JobDefinition):
+         """Record that a job has completed."""
+         with self._lock:
+             self._metrics["jobs_running"] -= 1
+
+             if result.success():
+                 self._metrics["jobs_completed"] += 1
+             else:
+                 self._metrics["jobs_failed"] += 1
+
+             # Track duration
+             if result.duration:
+                 self._metrics["total_duration"] += result.duration
+
+             # Compute resource usage
+             if job.resource and result.duration:
+                 cpu_hours = job.resource.total_cpus * result.duration / 3600
+                 self._metrics["total_cpu_hours"] += cpu_hours
+
+                 if job.resource.memory_gb:
+                     mem_hours = job.resource.memory_gb * result.duration / 3600
+                     self._metrics["total_memory_gb_hours"] += mem_hours
+
+             # Track retries
+             if result.attempts > 1:
+                 self._metrics["retries_total"] += (result.attempts - 1)
+
+             # Track timestamps
+             self._metrics["timestamps"].append({
+                 "job_id": result.job_id,
+                 "start_time": result.start_time.isoformat() if result.start_time else None,
+                 "end_time": result.end_time.isoformat() if result.end_time else None,
+                 "status": result.status.value,
+                 "backend": job.backend.value
+             })
+
+     def get_metrics(self) -> Dict[str, Any]:
+         """Return all collected metrics."""
+         with self._lock:
+             metrics = self._metrics.copy()
+
+             # Compute the success rate
+             if metrics["jobs_total"] > 0:
+                 metrics["success_rate"] = metrics["jobs_completed"] / metrics["jobs_total"]
+             else:
+                 metrics["success_rate"] = 0.0
+
+             # Compute the average duration
+             completed = metrics["jobs_completed"] + metrics["jobs_failed"]
+             if completed > 0:
+                 metrics["avg_duration"] = metrics["total_duration"] / completed
+             else:
+                 metrics["avg_duration"] = 0.0
+
+             return metrics
+
+     def reset(self):
+         """Reset the metrics."""
+         with self._lock:
+             self._metrics = {
+                 "jobs_total": 0,
+                 "jobs_completed": 0,
+                 "jobs_failed": 0,
+                 "jobs_running": 0,
+                 "total_duration": 0.0,
+                 "total_cpu_hours": 0.0,
+                 "total_memory_gb_hours": 0.0,
+                 "retries_total": 0,
+                 "backend_stats": {},
+                 "resource_stats": {},
+                 "timestamps": []
+             }
+
+ class ResourceMonitor:
+     """System and per-job resource usage monitor."""
+     def __init__(self):
+         self.start_time = datetime.now()
+         self.resource_usage = {}
+         self._lock = threading.Lock()
+
+     def record_usage(self, job_id: str, usage: Dict[str, Any]):
+         """Record resource usage for a job."""
+         with self._lock:
+             self.resource_usage[job_id] = {
+                 "timestamp": datetime.now(),
+                 "usage": usage
+             }
+
+     def get_system_usage(self) -> Dict[str, Any]:
+         """Return current system resource usage."""
+         if not PSUTIL_AVAILABLE:
+             return {"error": "psutil not installed"}
+
+         try:
+             cpu_percent = psutil.cpu_percent(interval=0.1)
+             memory = psutil.virtual_memory()
+             disk = psutil.disk_usage('/')
+
+             return {
+                 "cpu_percent": cpu_percent,
+                 "memory_total_gb": memory.total / (1024**3),
+                 "memory_used_gb": memory.used / (1024**3),
+                 "memory_percent": memory.percent,
+                 "disk_total_gb": disk.total / (1024**3),
+                 "disk_used_gb": disk.used / (1024**3),
+                 "disk_percent": disk.percent,
+                 "uptime_seconds": (datetime.now() - self.start_time).total_seconds()
+             }
+         except Exception as e:
+             return {"error": str(e)}
+
+     def get_job_usage(self, job_id: str) -> Optional[Dict[str, Any]]:
+         """Return recorded resource usage for a job."""
+         with self._lock:
+             return self.resource_usage.get(job_id)
+
+ # ============================================================================
+ # Storage and caching
+ # ============================================================================
+
+ class JobStore:
+     """
+     Job store.
+     Persists job definitions and results, with support for queries and history.
+     """
+
+     def __init__(self, db_path: Union[str, Path] = "jobs.db"):
+         self.db_path = Path(db_path)
+         self._init_db()
+
+     def _init_db(self):
+         """Initialize the database schema."""
+         with sqlite3.connect(self.db_path) as conn:
+             # Jobs table
+             conn.execute("""
+                 CREATE TABLE IF NOT EXISTS jobs (
+                     job_id TEXT PRIMARY KEY,
+                     name TEXT,
+                     description TEXT,
+                     cmd TEXT,
+                     backend TEXT,
+                     image TEXT,
+                     config_json TEXT,
+                     resource_json TEXT,
+                     retry_json TEXT,
+                     priority INTEGER,
+                     tags_json TEXT,
+                     metadata_json TEXT,
+                     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                     updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+                 )
+             """)
+
+             # Results table
+             conn.execute("""
+                 CREATE TABLE IF NOT EXISTS job_results (
+                     id INTEGER PRIMARY KEY AUTOINCREMENT,
+                     job_id TEXT,
+                     attempt INTEGER,
+                     status TEXT,
+                     exit_code INTEGER,
+                     stdout TEXT,
+                     stderr TEXT,
+                     error_message TEXT,
+                     start_time TIMESTAMP,
+                     end_time TIMESTAMP,
+                     duration REAL,
+                     resource_usage_json TEXT,
+                     metrics_json TEXT,
+                     tags_json TEXT,
+                     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                     FOREIGN KEY (job_id) REFERENCES jobs (job_id)
+                 )
+             """)
+
+             # Indexes
+             conn.execute("CREATE INDEX IF NOT EXISTS idx_job_results_job_id ON job_results (job_id)")
+             conn.execute("CREATE INDEX IF NOT EXISTS idx_job_results_status ON job_results (status)")
+             conn.execute("CREATE INDEX IF NOT EXISTS idx_job_results_start_time ON job_results (start_time)")
+
+             conn.commit()
+
+     def save_job(self, job: JobDefinition):
+         """Persist a job definition."""
+         with sqlite3.connect(self.db_path) as conn:
+             conn.execute("""
+                 INSERT OR REPLACE INTO jobs
+                 (job_id, name, description, cmd, backend, image, config_json,
+                  resource_json, retry_json, priority, tags_json, metadata_json)
+                 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+             """, (
+                 job.job_id,
+                 job.name,
+                 job.description,
+                 job.cmd,
+                 job.backend.value,
+                 job.image,
+                 json.dumps(asdict(job.config)),
+                 json.dumps(asdict(job.resource)),
+                 json.dumps(asdict(job.retry)),
+                 job.priority.value,
+                 json.dumps(job.tags),
+                 json.dumps(job.metadata)
+             ))
+             conn.commit()
+
+     def save_result(self, result: JobResult):
+         """Persist a job result."""
+         with sqlite3.connect(self.db_path) as conn:
+             conn.execute("""
+                 INSERT INTO job_results
+                 (job_id, attempt, status, exit_code, stdout, stderr, error_message,
+                  start_time, end_time, duration, resource_usage_json, metrics_json, tags_json)
+                 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+             """, (
+                 result.job_id,
+                 result.attempts,
+                 result.status.value,
+                 result.exit_code,
+                 result.stdout,
+                 result.stderr,
+                 result.error_message,
+                 result.start_time.isoformat() if result.start_time else None,
+                 result.end_time.isoformat() if result.end_time else None,
+                 result.duration,
+                 json.dumps(result.resource_usage) if result.resource_usage else None,
+                 json.dumps(result.metrics),
+                 json.dumps(result.tags)
+             ))
+             conn.commit()
+
+     def get_job(self, job_id: str) -> Optional[JobDefinition]:
+         """Load a job definition."""
+         with sqlite3.connect(self.db_path) as conn:
+             cursor = conn.execute("""
+                 SELECT * FROM jobs WHERE job_id = ?
+             """, (job_id,))
+             row = cursor.fetchone()
+
+             if not row:
+                 return None
+
+             # Rebuild the job definition
+             try:
+                 config_data = json.loads(row[6])
+                 resource_data = json.loads(row[7])
+                 retry_data = json.loads(row[8])
+                 tags = json.loads(row[10])
+                 metadata = json.loads(row[11])
+             except json.JSONDecodeError as e:
+                 print(f"Error decoding JSON for job {job_id}: {e}")
+                 return None
+
+             job = JobDefinition(
+                 job_id=row[0],
+                 name=row[1],
+                 description=row[2],
+                 cmd=row[3],
+                 backend=Backend(row[4]),
+                 image=row[5],
+                 config=ExecutionConfig(**config_data),
+                 resource=ResourceRequest(**resource_data),
+                 retry=RetryConfig(**retry_data),
+                 priority=JobPriority(row[9]),
+                 tags=tags,
+                 metadata=metadata
+             )
+
+             return job
+
+     def get_job_history(self, job_id: str) -> List[JobResult]:
+         """Load the result history for a job."""
+         with sqlite3.connect(self.db_path) as conn:
+             cursor = conn.execute("""
+                 SELECT * FROM job_results
+                 WHERE job_id = ?
+                 ORDER BY attempt
+             """, (job_id,))
+
+             results = []
+             for row in cursor.fetchall():
+                 try:
+                     result = JobResult(
+                         job_id=row[1],
+                         status=JobStatus(row[3]),
+                         exit_code=row[4],
+                         stdout=row[5],
+                         stderr=row[6],
+                         error_message=row[7],
+                         start_time=datetime.fromisoformat(row[8].replace('Z', '+00:00')) if row[8] else None,
+                         end_time=datetime.fromisoformat(row[9].replace('Z', '+00:00')) if row[9] else None,
+                         duration=row[10],
+                         attempts=row[2],
+                         resource_usage=json.loads(row[11]) if row[11] else None,
+                         metrics=json.loads(row[12]) if row[12] else {},
+                         tags=json.loads(row[13]) if row[13] else {}
+                     )
+                     results.append(result)
+                 except Exception as e:
+                     print(f"Error loading result for job {job_id}: {e}")
+
+             return results
+
+     def search_jobs(
+         self,
+         status: Optional[JobStatus] = None,
+         backend: Optional[Backend] = None,
+         tags: Optional[Dict[str, str]] = None,
+         limit: int = 100,
+         offset: int = 0
+     ) -> List[JobDefinition]:
+         """Search for jobs matching the given filters."""
+         query = "SELECT job_id FROM jobs WHERE 1=1"
+         params = []
+
+         if status:
+             # Needs a join against the results table
+             query += " AND job_id IN (SELECT DISTINCT job_id FROM job_results WHERE status = ?)"
+             params.append(status.value)
+
+         if backend:
+             query += " AND backend = ?"
+             params.append(backend.value)
+
+         if tags:
+             for key, value in tags.items():
+                 query += f" AND json_extract(tags_json, '$.{key}') = ?"
+                 params.append(value)
+
+         query += " ORDER BY created_at DESC LIMIT ? OFFSET ?"
+         params.extend([limit, offset])
+
+         with sqlite3.connect(self.db_path) as conn:
+             cursor = conn.execute(query, params)
+             job_ids = [row[0] for row in cursor.fetchall()]
+
+         jobs = []
+         for job_id in job_ids:
+             job = self.get_job(job_id)
+             if job:
+                 jobs.append(job)
+
+         return jobs
+
+     def cleanup_old_jobs(self, days: int = 30):
+         """Delete jobs older than the given number of days."""
+         cutoff = datetime.now() - timedelta(days=days)
+
+         with sqlite3.connect(self.db_path) as conn:
+             # Delete old results
+             conn.execute("""
+                 DELETE FROM job_results
+                 WHERE job_id IN (
+                     SELECT job_id FROM jobs
+                     WHERE created_at < ?
+                 )
+             """, (cutoff.isoformat(),))
+
+             # Delete old jobs
+             conn.execute("""
+                 DELETE FROM jobs WHERE created_at < ?
+             """, (cutoff.isoformat(),))
+
+             conn.commit()
+
+ class ResultCache:
+     """
+     Result cache.
+     Caches job results so they can be reused and support incremental computation.
+     """
+
+     def __init__(self, cache_dir: Union[str, Path] = ".job_cache"):
+         self.cache_dir = Path(cache_dir)
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+         self._index_file = self.cache_dir / "index.json"
+         self._index = self._load_index()
+         self._lock = threading.RLock()
+
+     def _load_index(self) -> Dict[str, Dict[str, Any]]:
+         """Load the cache index."""
+         if self._index_file.exists():
+             try:
+                 with open(self._index_file) as f:
+                     return json.load(f)
+             except json.JSONDecodeError:
+                 return {}
+         return {}
+
+     def _save_index(self):
+         """Save the cache index."""
+         with open(self._index_file, 'w') as f:
+             json.dump(self._index, f, indent=2)
+
+     def compute_key(
+         self,
+         cmd: str,
+         backend: Backend = Backend.LOCAL,
+         image: Optional[str] = None,
+         mounts: Optional[Dict[Path, Path]] = None,
+         workdir: Optional[Path] = None,
+         env: Optional[Dict[str, str]] = None,
+         resource: Optional[ResourceRequest] = None
+     ) -> str:
+         """
+         Compute a cache key.
+         A unique key derived from the command and configuration; identical
+         configurations produce identical keys.
+         """
+         import hashlib
+
+         components = {
+             "cmd": cmd,
+             "backend": backend.value,
+             "image": image or "",
+             "mounts": json.dumps(sorted((str(k), str(v)) for k, v in (mounts or {}).items())),
+             "workdir": str(workdir) if workdir else "",
+             "env": json.dumps(sorted((k, v) for k, v in (env or {}).items())),
+             "resource": json.dumps(asdict(resource)) if resource else ""
+         }
+
+         content = json.dumps(components, sort_keys=True)
+         return hashlib.sha256(content.encode()).hexdigest()
+
+     def set(self, key: str, result: JobResult, ttl_seconds: int = 86400):
+         """Store a result in the cache."""
+         with self._lock:
+             # Save the result
+             cache_file = self.cache_dir / f"{key}.pkl"
+             try:
+                 with open(cache_file, 'wb') as f:
+                     pickle.dump(result, f)
+
+                 # Update the index
+                 self._index[key] = {
+                     "created": datetime.now().isoformat(),
+                     "expires": (datetime.now() + timedelta(seconds=ttl_seconds)).isoformat(),
+                     "size": cache_file.stat().st_size,
+                     "job_id": result.job_id,
+                     "status": result.status.value
+                 }
+                 self._save_index()
+             except Exception as e:
+                 print(f"Error saving cache: {e}")
+
+     def get(self, key: str) -> Optional[JobResult]:
+         """Retrieve a cached result, if present."""
+         with self._lock:
+             if not self.has(key):
+                 return None
+
+             cache_file = self.cache_dir / f"{key}.pkl"
+             try:
+                 with open(cache_file, 'rb') as f:
+                     return pickle.load(f)
+             except Exception:
+                 # Cache entry is corrupted; drop it
+                 self.delete(key)
+                 return None
+
+     def has(self, key: str) -> bool:
+         """Check whether a valid cache entry exists."""
+         with self._lock:
+             if key not in self._index:
+                 return False
+
+             entry = self._index[key]
+             expires = datetime.fromisoformat(entry["expires"])
+
+             if datetime.now() > expires:
+                 self.delete(key)
+                 return False
+
+             cache_file = self.cache_dir / f"{key}.pkl"
+             return cache_file.exists()
+
+     def delete(self, key: str):
+         """Delete a cache entry."""
+         with self._lock:
+             if key in self._index:
+                 cache_file = self.cache_dir / f"{key}.pkl"
+                 if cache_file.exists():
+                     cache_file.unlink()
+                 del self._index[key]
+                 self._save_index()
+
+     def cleanup(self, max_size_mb: int = 1024, max_age_days: int = 30):
+         """Evict expired, stale, or oversized cache entries."""
+         with self._lock:
+             now = datetime.now()
+             total_size_mb = 0
+
+             # Collect the keys to delete
+             to_delete = []
+
+             for key, entry in list(self._index.items()):
+                 cache_file = self.cache_dir / f"{key}.pkl"
+
+                 # Check expiry
+                 expires = datetime.fromisoformat(entry["expires"])
+                 created = datetime.fromisoformat(entry["created"])
+
+                 if now > expires:
+                     to_delete.append(key)
+                 elif (now - created).days > max_age_days:
+                     to_delete.append(key)
+                 elif cache_file.exists():
+                     total_size_mb += entry["size"] / (1024 * 1024)
+                     if total_size_mb > max_size_mb:
+                         to_delete.append(key)
+
+             # Delete
+             for key in to_delete:
+                 self.delete(key)
+
+             # Sort by creation time and delete the oldest entries
+             if total_size_mb > max_size_mb:
+                 sorted_keys = sorted(
+                     self._index.keys(),
+                     key=lambda k: datetime.fromisoformat(self._index[k]["created"])
+                 )
+                 while total_size_mb > max_size_mb and sorted_keys:
+                     key = sorted_keys.pop(0)
+                     if key in self._index:
+                         total_size_mb -= self._index[key]["size"] / (1024 * 1024)
+                         self.delete(key)
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Return cache statistics."""
+         with self._lock:
+             total_size = 0
+             count = 0
+             status_counts = {}
+
+             for entry in self._index.values():
+                 total_size += entry["size"]
+                 count += 1
+                 status = entry["status"]
+                 status_counts[status] = status_counts.get(status, 0) + 1
+
+             return {
+                 "count": count,
+                 "total_size_bytes": total_size,
+                 "total_size_mb": total_size / (1024 * 1024),
+                 "status_counts": status_counts
+             }
+
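+ # Illustrative sketch (not part of the original module): identical command/config
+ # combinations map to the same cache key, so a scheduler can reuse results:
+ #   cache = ResultCache(".job_cache")
+ #   key = cache.compute_key("echo hi", backend=Backend.LOCAL)
+ #   if not cache.has(key):
+ #       cache.set(key, some_result, ttl_seconds=3600)   # some_result: a JobResult
+ #   cached = cache.get(key)
+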
+ # ============================================================================
+ # Queues and monitoring
+ # ============================================================================
+
+ class PriorityJobQueue:
+     """Priority job queue."""
+     def __init__(self):
+         self._queue = []
+         self._lock = threading.Lock()
+         self._stats = {
+             "jobs_pushed": 0,
+             "jobs_popped": 0,
+             "max_size": 0
+         }
+
+     def push(self, job: JobDefinition):
+         with self._lock:
+             heapq.heappush(self._queue, (-job.priority.value, time.time(), job.job_id, job))
+             self._stats["jobs_pushed"] += 1
+             self._stats["max_size"] = max(self._stats["max_size"], len(self._queue))
+
+     def pop(self) -> Optional[JobDefinition]:
+         with self._lock:
+             if self._queue:
+                 _, _, job_id, job = heapq.heappop(self._queue)
+                 self._stats["jobs_popped"] += 1
+                 return job
+             return None
+
+     def peek(self) -> Optional[JobDefinition]:
+         """Return the next job without removing it."""
+         with self._lock:
+             if self._queue:
+                 _, _, _, job = self._queue[0]
+                 return job
+             return None
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Return queue statistics."""
+         with self._lock:
+             stats = self._stats.copy()
+             stats["current_size"] = len(self._queue)
+             stats["avg_wait_time"] = self._calculate_avg_wait_time()
+             return stats
+
+     def _calculate_avg_wait_time(self) -> float:
+         """Compute the average wait time of queued jobs."""
+         if not self._queue:
+             return 0.0
+
+         now = time.time()
+         total_wait = 0.0
+         count = 0
+
+         for _, enqueue_time, _, _ in self._queue:
+             total_wait += (now - enqueue_time)
+             count += 1
+
+         return total_wait / count if count > 0 else 0.0
+
+     def get_queue_snapshot(self, limit: int = 10) -> List[Dict[str, Any]]:
+         """Return a snapshot of the queue."""
+         with self._lock:
+             snapshot = []
+             # Copy and sort the queue
+             sorted_queue = sorted(self._queue, key=lambda x: (-x[0], x[1]))
+             for priority_neg, enqueue_time, job_id, job in sorted_queue[:limit]:
+                 snapshot.append({
+                     "job_id": job_id,
+                     "name": job.name,
+                     "priority": JobPriority(-priority_neg).name,
+                     "enqueued_at": datetime.fromtimestamp(enqueue_time).isoformat(),
+                     "wait_time_seconds": time.time() - enqueue_time,
+                     "backend": job.backend.value,
+                     "cmd_preview": job.cmd[:100] + ("..." if len(job.cmd) > 100 else "")
+                 })
+             return snapshot
+
+ class TimeoutMonitor:
+     """Monitors running jobs for timeouts."""
+     def __init__(self, scheduler):
+         self.scheduler = scheduler
+         self._monitored_jobs = {}
+         self._lock = threading.Lock()
+         self._stats = {
+             "jobs_timed_out": 0,
+             "preemptive_cancellations": 0
+         }
+         self._thread = threading.Thread(target=self._monitor_loop, daemon=True)
+         self._thread.start()
+
+     def add_job(self, job_id: str, timeout_seconds: Optional[float] = None):
+         """Register a job for timeout monitoring."""
+         if timeout_seconds is None or timeout_seconds <= 0:
+             return
+
+         with self._lock:
+             self._monitored_jobs[job_id] = {
+                 "start_time": time.time(),
+                 "timeout": timeout_seconds,
+                 "warned": False
+             }
+
+     def remove_job(self, job_id: str):
+         """Stop monitoring a job."""
+         with self._lock:
+             if job_id in self._monitored_jobs:
+                 job_info = self._monitored_jobs[job_id]
+                 if job_info.get("warned", False):
+                     self._stats["preemptive_cancellations"] += 1
+                 del self._monitored_jobs[job_id]
+
+     def _monitor_loop(self):
+         """Background monitoring loop."""
+         while True:
+             try:
+                 now = time.time()
+                 jobs_to_cancel = []
+
+                 with self._lock:
+                     for job_id, job_info in list(self._monitored_jobs.items()):
+                         elapsed = now - job_info["start_time"]
+                         timeout = job_info["timeout"]
+
+                         # Early warning (80% of the timeout)
+                         if not job_info["warned"] and elapsed > timeout * 0.8:
+                             self._warn_about_timeout(job_id, elapsed, timeout)
+                             job_info["warned"] = True
+
+                         # Cancel on timeout
+                         if elapsed > timeout:
+                             jobs_to_cancel.append(job_id)
+                             self._stats["jobs_timed_out"] += 1
+
+                 # Cancel jobs outside the lock
+                 for job_id in jobs_to_cancel:
+                     self.scheduler.logger.warning(f"[{job_id}] Job timeout, cancelling")
+                     self.scheduler.cancel(job_id)
+                     self.remove_job(job_id)
+
+                 time.sleep(1)
+
+             except Exception as e:
+                 self.scheduler.logger.error(f"Timeout monitor error: {e}")
+                 time.sleep(5)
+
+     def _warn_about_timeout(self, job_id: str, elapsed: float, timeout: float):
+         """Log a warning for a job approaching its timeout."""
+         self.scheduler.logger.warning(
+             f"[{job_id}] Job is approaching timeout: "
+             f"{elapsed:.1f}/{timeout:.1f} seconds ({elapsed/timeout:.1%})"
+         )
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Return monitoring statistics."""
+         with self._lock:
+             stats = self._stats.copy()
+             stats["currently_monitored"] = len(self._monitored_jobs)
+             return stats
+
+ # ============================================================================
+ # Main scheduler class
+ # ============================================================================
+
+ class ContainerScheduler:
+     """
+     Universal container scheduler.
+     An enterprise-grade command execution framework supporting multiple backends,
+     resource management, retries, monitoring, and more.
+     """
+
+     def __init__(
+         self,
+         default_backend: Backend = Backend.LOCAL,
+         default_image: Optional[str] = None,
+         logger: Optional[logging.Logger] = None,
+         max_concurrent: int = 4,
+         metrics_collector: Optional[MetricsCollector] = None,
+         job_store: Optional[JobStore] = None,
+         result_cache: Optional[ResultCache] = None,
+         plugins: Optional[List[Plugin]] = None,
+         enable_web_monitor: bool = False,
+         web_port: int = 8080,
+         config: Optional[SchedulerConfig] = None,
+         enable_priority_queue: bool = False,
+         enable_timeout_monitor: bool = True,
+     ):
+         """
+         Initialize the scheduler.
+
+         Args:
+             default_backend: Default execution backend
+             default_image: Default container image
+             logger: Logger instance
+             max_concurrent: Maximum number of concurrent jobs
+             metrics_collector: Metrics collector
+             job_store: Job store
+             result_cache: Result cache
+             plugins: List of plugins
+             enable_web_monitor: Whether to enable the web monitor
+             web_port: Web monitor port
+             config: Scheduler configuration
+             enable_priority_queue: Whether to enable the priority queue
+             enable_timeout_monitor: Whether to enable timeout monitoring
+         """
+         # Configuration
+         self.config = config or SchedulerConfig(
+             default_backend=default_backend,
+             max_concurrent=max_concurrent
+         )
+         self.default_backend = self.config.default_backend
+         self.default_image = default_image
+         self.max_concurrent = self.config.max_concurrent
+
+         # Logging
+         if logger is None:
+             logger = logging.getLogger(__name__)
+             if not logger.handlers:
+                 handler = logging.StreamHandler()
+                 formatter = logging.Formatter(
+                     '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+                 )
+                 handler.setFormatter(formatter)
+                 logger.addHandler(handler)
+             logger.setLevel(getattr(logging, self.config.log_level))
+         self.logger = logger
+
+         # Core components
+         self.metrics_collector = metrics_collector or MetricsCollector()
+         self.job_store = job_store
+         self.result_cache = result_cache
+         self.plugins = plugins or []
+         self.resource_monitor = ResourceMonitor()
+
+         # Queue system
+         self._priority_queue = None
+         if enable_priority_queue:
+             self._priority_queue = PriorityJobQueue()
+             self._queue_thread = threading.Thread(
+                 target=self._process_queue_loop,
+                 daemon=True
+             )
+             self._queue_thread.start()
+
+         # Timeout monitoring
+         self._timeout_monitor = None
+         if enable_timeout_monitor:
+             self._timeout_monitor = TimeoutMonitor(self)
+
+         # Executor and state management
+         self._executor = ThreadPoolExecutor(max_workers=max_concurrent)
+         self._futures: Dict[str, Future] = {}
+         self._results: Dict[str, JobResult] = {}
+         self._job_definitions: Dict[str, JobDefinition] = {}
+         self._lock = threading.RLock()
+
+         # Signal handling
+         signal.signal(signal.SIGINT, self._signal_handler)
+         signal.signal(signal.SIGTERM, self._signal_handler)
+
+         # Cleanup thread
+         self._cleanup_thread = threading.Thread(
+             target=self._cleanup_loop,
+             daemon=True
+         )
+         self._cleanup_thread.start()
+
+         # Health-check thread
+         self._health_check_thread = threading.Thread(
+             target=self._health_check_loop,
+             daemon=True
+         )
+         self._health_check_thread.start()
+
+         self.logger.info(f"Scheduler initialized with backend={self.default_backend}, max_concurrent={self.max_concurrent}")
+
+     def _signal_handler(self, signum, frame):
+         """Handle interrupt signals."""
+         self.logger.warning(f"Received signal {signum}, shutting down...")
+         self.shutdown()
+
+     def _cleanup_loop(self):
+         """Background cleanup loop."""
+         while True:
+             time.sleep(self.config.cleanup_interval)
+             try:
+                 self._cleanup_stale_jobs()
+                 if self.result_cache:
+                     self.result_cache.cleanup(
+                         max_size_mb=self.config.cache_max_size_mb,
+                         max_age_days=self.config.cache_max_age_days
+                     )
+                 if self.job_store:
+                     self.job_store.cleanup_old_jobs(days=self.config.db_cleanup_days)
+             except Exception as e:
+                 self.logger.error(f"Cleanup error: {e}")
+
+     def _health_check_loop(self):
+         """Background health-check loop."""
+         while True:
+             try:
+                 time.sleep(self.config.health_check_interval)
+                 health = self.health_check()
+                 if health["status"] != "healthy":
+                     self.logger.warning(f"Health check failed: {health}")
+             except Exception as e:
+                 self.logger.error(f"Health check error: {e}")
+
+     def _cleanup_stale_jobs(self):
+         """Drop references to finished (stale) jobs."""
+         with self._lock:
+             stale_time = datetime.now() - timedelta(hours=24)
+             stale_jobs = []
+
+             for job_id, future in list(self._futures.items()):
+                 if future.done():
+                     stale_jobs.append(job_id)
+
+             for job_id in stale_jobs:
+                 if job_id in self._futures:
+                     del self._futures[job_id]
+
+     def _process_queue_loop(self):
+         """Loop that drains the priority queue."""
+         while True:
+             try:
+                 if self._priority_queue:
+                     job = self._priority_queue.pop()
+                     if job:
+                         # Check whether enough resources are available
+                         if self._has_enough_resources(job):
+                             self.submit(job, wait=False)
+                         else:
+                             # Put it back in the queue and retry later
+                             time.sleep(5)
+                             self._priority_queue.push(job)
+
+                 time.sleep(0.1)
+
+             except Exception as e:
+                 self.logger.error(f"Queue processing error: {e}")
+                 time.sleep(1)
+
+     def _has_enough_resources(self, job: JobDefinition) -> bool:
+         """Check whether enough system resources are available for the job."""
+         try:
+             if not PSUTIL_AVAILABLE:
+                 return True
+
+             # Get system resource usage
+             system_usage = self.resource_monitor.get_system_usage()
+
+             if "error" in system_usage:
+                 return True
+
+             # Check CPU
+             if "cpu_percent" in system_usage:
+                 if system_usage["cpu_percent"] > self.config.max_cpu_percent:
+                     self.logger.debug(f"CPU usage too high: {system_usage['cpu_percent']}%")
+                     return False
+
+             # Check memory
+             if job.resource.memory_gb:
+                 available_memory_gb = (system_usage.get("memory_total_gb", 0) -
+                                        system_usage.get("memory_used_gb", 0))
+                 if available_memory_gb < job.resource.memory_gb:
+                     self.logger.debug(f"Insufficient memory: {available_memory_gb:.1f}GB available, {job.resource.memory_gb:.1f}GB required")
+                     return False
+
+             return True
+
+         except Exception as e:
+             self.logger.error(f"Resource check error: {e}")
+             return True
+
1669
+ def _build_command(
1670
+ self,
1671
+ job: JobDefinition
1672
+ ) -> Tuple[List[str], Optional[Path]]:
1673
+ """
1674
+ 构建执行命令
1675
+
1676
+ Args:
1677
+ job: 作业定义
1678
+
1679
+ Returns:
1680
+ (命令部分列表, 临时脚本路径)
1681
+ """
1682
+ backend = job.backend
1683
+ cmd = job.cmd
1684
+ image = job.image or self.default_image
1685
+ config = job.config
1686
+ resource = job.resource
1687
+
1688
+ if backend == Backend.LOCAL:
1689
+ return [config.shell, "-c", cmd], None
1690
+
1691
+ elif backend == Backend.DOCKER:
1692
+ if not image:
1693
+ raise ValueError("Docker backend requires image")
1694
+
1695
+ parts = ["docker", "run", "--rm", "-i"]
1696
+
1697
+ # 挂载卷
1698
+ for host_path, container_path in config.mounts.items():
1699
+ parts += ["-v", f"{host_path}:{container_path}"]
1700
+
1701
+ # 工作目录
1702
+ if config.workdir:
1703
+ parts += ["-w", str(config.workdir)]
1704
+
1705
+ # 环境变量
1706
+ for key, value in config.env.items():
1707
+ parts += ["-e", f"{key}={shlex.quote(str(value))}"]
1708
+
1709
+ # 用户
1710
+ if config.user:
1711
+ parts += ["-u", config.user]
1712
+
1713
+ # 网络
1714
+ if config.network_mode:
1715
+ parts += ["--network", config.network_mode]
1716
+
1717
+ # 安全选项
1718
+ if config.security_opts:
1719
+ for opt in config.security_opts:
1720
+ parts += ["--security-opt", opt]
1721
+
1722
+ # 资源限制
1723
+ if resource:
1724
+ parts += ["--cpus", str(resource.cpus)]
1725
+ if resource.memory_mb:
1726
+ parts += ["--memory", f"{resource.memory_mb}m"]
1727
+
1728
+ parts.append(image)
1729
+ parts += [config.shell, "-c", cmd]
1730
+
1731
+ return parts, None
1732
+
1733
+ elif backend == Backend.APPTAINER:
1734
+ if not image:
1735
+ raise ValueError("Apptainer backend requires image")
1736
+
1737
+ parts = ["apptainer", "exec", "--containall"]
1738
+
1739
+ # 挂载卷
1740
+ for host_path, container_path in config.mounts.items():
1741
+ parts += ["--bind", f"{host_path}:{container_path}"]
1742
+
1743
+ # 工作目录
1744
+ if config.workdir:
1745
+ parts += ["--pwd", str(config.workdir)]
1746
+
1747
+ # 环境变量
1748
+ for key, value in config.env.items():
1749
+ parts += ["--env", f"{key}={shlex.quote(str(value))}"]
1750
+
1751
+ # 确保镜像存在
1752
+ image_path = Path(image)
1753
+ if not image_path.exists():
1754
+ self.logger.info(f"Pulling image: {image}")
1755
+ pull_cmd = ["apptainer", "pull", "--force", str(image_path), f"docker://{image}"]
1756
+ subprocess.run(pull_cmd, check=False, capture_output=True)
1757
+
1758
+ parts.append(str(image_path))
1759
+ parts += [config.shell, "-c", cmd]
1760
+
1761
+ return parts, None
1762
+
1763
+ elif backend == Backend.SLURM:
1764
+ # 创建SLURM脚本
1765
+ return self._build_slurm_script(job)
1766
+
1767
+ elif backend == Backend.PBS:
1768
+ # 创建PBS脚本
1769
+ return self._build_pbs_script(job)
1770
+
1771
+ else:
1772
+ raise ValueError(f"Unsupported backend: {backend}")
1773
+
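+ # Sketch of the argv assembled above for a Docker job (illustrative values;
+ # the trailing shell comes from config.shell):
+ #   _build_command(job) -> (
+ #       ["docker", "run", "--rm", "-i",
+ #        "-v", "/srv/data:/data",            # mounts
+ #        "-w", "/data",                      # workdir
+ #        "-e", "N_JOBS=4",                   # env
+ #        "--cpus", "2", "--memory", "4096m", # resource limits
+ #        "python:3.11-slim",
+ #        config.shell, "-c", "python train.py"],
+ #       None)                                # Docker needs no temporary script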
1774
+ def _build_slurm_script(self, job: JobDefinition) -> Tuple[List[str], Path]:
1775
+ """构建SLURM脚本"""
1776
+ # 创建临时目录
1777
+ temp_dir = Path(tempfile.mkdtemp(prefix="slurm_"))
1778
+ script_path = temp_dir / f"{job.job_id}.sh"
1779
+
1780
+ # 构建脚本内容
1781
+ script_lines = ["#!/bin/bash"]
1782
+
1783
+ # SLURM指令
1784
+ directives = job.resource.to_slurm_directives()
1785
+ for key, value in directives.items():
1786
+ if value:
1787
+ script_lines.append(f"#SBATCH {key}={value}")
1788
+ else:
1789
+ script_lines.append(f"#SBATCH {key}")
1790
+
1791
+ # 输出文件
1792
+ if job.config.stdout:
1793
+ script_lines.append(f"#SBATCH --output={job.config.stdout}")
1794
+ if job.config.stderr:
1795
+ script_lines.append(f"#SBATCH --error={job.config.stderr}")
1796
+
1797
+ script_lines.append("")
1798
+
1799
+ # 环境变量
1800
+ for key, value in job.config.env.items():
1801
+ script_lines.append(f"export {key}={shlex.quote(str(value))}")
1802
+
1803
+ script_lines.append("")
1804
+
1805
+ # 容器命令
1806
+ if job.image:
1807
+ container_cmd = ["apptainer", "exec"]
1808
+ for host_path, container_path in job.config.mounts.items():
1809
+ container_cmd += ["--bind", f"{host_path}:{container_path}"]
1810
+ if job.config.workdir:
1811
+ container_cmd += ["--pwd", str(job.config.workdir)]
1812
+ container_cmd.append(job.image)
1813
+ container_cmd += [job.config.shell, "-c", shlex.quote(job.cmd)]
1814
+ script_lines.append(" ".join(container_cmd))
1815
+ else:
1816
+ if job.config.workdir:
1817
+ script_lines.append(f"cd {job.config.workdir}")
1818
+ script_lines.append(job.cmd)
1819
+
1820
+ # 写入文件
1821
+ script_path.write_text("\n".join(script_lines))
1822
+ script_path.chmod(0o755)
1823
+
1824
+ return ["sbatch", "--parsable", str(script_path)], script_path
1825
+
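+ # Sketch of the script written above for a plain (non-container) SLURM job;
+ # the #SBATCH directives come from ResourceRequest.to_slurm_directives() plus
+ # optional --output/--error from the ExecutionConfig (values are illustrative):
+ #   #!/bin/bash
+ #   #SBATCH --cpus-per-task=8
+ #   #SBATCH --mem=32G
+ #   #SBATCH --output=/logs/analysis.out
+ #
+ #   export PYTHONPATH=/data/scripts
+ #
+ #   cd /data
+ #   python analyze.py
+ # The returned command is ["sbatch", "--parsable", script_path], so the stdout
+ # of the submission is essentially just the SLURM job id.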
1826
+ def _build_pbs_script(self, job: JobDefinition) -> Tuple[List[str], Path]:
1827
+ """构建PBS脚本"""
1828
+ temp_dir = Path(tempfile.mkdtemp(prefix="pbs_"))
1829
+ script_path = temp_dir / f"{job.job_id}.pbs"
1830
+
1831
+ script_lines = ["#!/bin/bash"]
1832
+
1833
+ # PBS指令
1834
+ directives = job.resource.to_pbs_directives()
1835
+ for key, value in directives.items():
1836
+ script_lines.append(f"#PBS {key} {value}")
1837
+
1838
+ # 输出文件
1839
+ if job.config.stdout:
1840
+ script_lines.append(f"#PBS -o {job.config.stdout}")
1841
+ if job.config.stderr:
1842
+ script_lines.append(f"#PBS -e {job.config.stderr}")
1843
+
1844
+ script_lines.append("")
1845
+
1846
+ # 环境变量
1847
+ for key, value in job.config.env.items():
1848
+ script_lines.append(f"export {key}={shlex.quote(str(value))}")
1849
+
1850
+ script_lines.append("")
1851
+ script_lines.append("cd $PBS_O_WORKDIR")
1852
+
1853
+ if job.config.workdir:
1854
+ script_lines.append(f"cd {job.config.workdir}")
1855
+
1856
+ script_lines.append("")
1857
+
1858
+ # 容器命令
1859
+ if job.image:
1860
+ container_cmd = ["apptainer", "exec"]
1861
+ for host_path, container_path in job.config.mounts.items():
1862
+ container_cmd += ["--bind", f"{host_path}:{container_path}"]
1863
+ container_cmd.append(job.image)
1864
+ container_cmd += [job.config.shell, "-c", shlex.quote(job.cmd)]
1865
+ script_lines.append(" ".join(container_cmd))
1866
+ else:
1867
+ script_lines.append(job.cmd)
1868
+
1869
+ # 写入文件
1870
+ script_path.write_text("\n".join(script_lines))
1871
+ script_path.chmod(0o755)
1872
+
1873
+ return ["qsub", str(script_path)], script_path
1874
+
1875
+ def _execute_job(
1876
+ self,
1877
+ job: JobDefinition,
1878
+ use_cache: bool = True
1879
+ ) -> JobResult:
1880
+ """
1881
+ Execute a single job.
1882
+
1883
+ Args:
1884
+ job: job definition
1885
+ use_cache: whether to use the result cache
1886
+
1887
+ Returns:
1888
+ the job result
1889
+ """
1890
+ job_id = job.job_id
1891
+
1892
+ # 触发插件事件
1893
+ for plugin in self.plugins:
1894
+ try:
1895
+ plugin.on_job_submit(job)
1896
+ except Exception as e:
1897
+ self.logger.error(f"Plugin on_job_submit error: {e}")
1898
+
1899
+ # 指标收集
1900
+ self.metrics_collector.record_job_start(job)
1901
+
1902
+ # 检查缓存
1903
+ if use_cache and self.result_cache:
1904
+ cache_key = self.result_cache.compute_key(
1905
+ cmd=job.cmd,
1906
+ backend=job.backend,
1907
+ image=job.image,
1908
+ mounts=job.config.mounts,
1909
+ workdir=job.config.workdir,
1910
+ env=job.config.env,
1911
+ resource=job.resource
1912
+ )
1913
+
1914
+ if self.result_cache.has(cache_key):
1915
+ cached_result = self.result_cache.get(cache_key)
1916
+ if cached_result:
1917
+ self.logger.info(f"[{job_id}] Using cached result")
1918
+
1919
+ # 更新作业ID
1920
+ cached_result.job_id = job_id
1921
+ cached_result.attempts = 1
1922
+
1923
+ # 触发完成事件
1924
+ for plugin in self.plugins:
1925
+ try:
1926
+ plugin.on_job_complete(cached_result)
1927
+ except Exception as e:
1928
+ self.logger.error(f"Plugin on_job_complete error: {e}")
1929
+
1930
+ self.metrics_collector.record_job_complete(cached_result, job)
1931
+
1932
+ # 保存到存储
1933
+ if self.job_store:
1934
+ self.job_store.save_job(job)
1935
+ self.job_store.save_result(cached_result)
1936
+
1937
+ return cached_result
1938
+
1939
+ # 构建命令
1940
+ cmd_parts, script_path = self._build_command(job)
1941
+
1942
+ result = JobResult(
1943
+ job_id=job_id,
1944
+ status=JobStatus.RUNNING,
1945
+ backend=job.backend.value,
1946
+ command=job.cmd,
1947
+ tags=job.tags.copy()
1948
+ )
1949
+
1950
+ result.start_time = datetime.now()
1951
+
1952
+ try:
1953
+ # 触发开始事件
1954
+ for plugin in self.plugins:
1955
+ try:
1956
+ plugin.on_job_start(job_id)
1957
+ except Exception as e:
1958
+ self.logger.error(f"Plugin on_job_start error: {e}")
1959
+
1960
+ # 添加超时监控
1961
+ if self._timeout_monitor and job.config.timeout:
1962
+ self._timeout_monitor.add_job(job_id, job.config.timeout)
1963
+
1964
+ self.logger.info(f"[{job_id}] Starting job: {job.name}")
1965
+ if not job.config.silent:
1966
+ self.logger.info(f"[{job_id}] Command: {' '.join(cmd_parts)}")
1967
+
1968
+ # Prepare output destinations
+ stdout_dest = None
+ stderr_dest = None
+ stdout_file = None
+ stderr_file = None
+
+ if job.config.capture_output:
+ stdout_dest = subprocess.PIPE
+ stderr_dest = subprocess.PIPE
+ if not job.config.capture_output and job.config.stdout:
+ stdout_file = open(job.config.stdout, 'w')
+ stdout_dest = stdout_file
+ if not job.config.capture_output and job.config.stderr:
+ stderr_file = open(job.config.stderr, 'w')
+ stderr_dest = stderr_file
+
+ # Execute the command. Any stdin content is passed via `input`;
+ # passing both `stdin=` and `input=` to subprocess.run raises ValueError.
+ stdin_input = job.config.stdin if job.config.stdin else None
+
+ process = subprocess.run(
+ cmd_parts,
+ shell=False,
+ check=False, # exit code is checked manually below
+ timeout=job.config.timeout,
+ stdout=stdout_dest,
+ stderr=stderr_dest,
+ text=True,
+ cwd=job.config.workdir,
+ input=stdin_input
1998
+ )
1999
+
2000
+ # 收集输出
2001
+ if job.config.capture_output:
2002
+ result.stdout = process.stdout
2003
+ result.stderr = process.stderr
2004
+ if result.stdout and len(result.stdout) > 10000:  # cap the stored output size
2005
+ result.stdout = result.stdout[:10000] + "... [truncated]"
2006
+ if result.stderr and len(result.stderr) > 10000:
2007
+ result.stderr = result.stderr[:10000] + "... [truncated]"
2008
+
2009
+ result.exit_code = process.returncode
2010
+
2011
+ if process.returncode == 0:
2012
+ result.status = JobStatus.COMPLETED
2013
+ else:
2014
+ result.status = JobStatus.FAILED
2015
+ result.error_message = f"Command failed with exit code {process.returncode}"
2016
+
2017
+ except subprocess.TimeoutExpired:
2018
+ result.status = JobStatus.TIMEOUT
2019
+ result.error_message = f"Command timed out after {job.config.timeout} seconds"
2020
+ except subprocess.CalledProcessError as e:
2021
+ result.status = JobStatus.FAILED
2022
+ result.exit_code = e.returncode
2023
+ result.error_message = str(e)
2024
+ if e.stdout:
2025
+ result.stdout = e.stdout
2026
+ if e.stderr:
2027
+ result.stderr = e.stderr
2028
+ except FileNotFoundError as e:
2029
+ result.status = JobStatus.FAILED
2030
+ result.error_message = f"Command not found: {e}"
2031
+ except PermissionError as e:
2032
+ result.status = JobStatus.FAILED
2033
+ result.error_message = f"Permission denied: {e}"
2034
+ except Exception as e:
2035
+ result.status = JobStatus.FAILED
2036
+ result.error_message = f"Unexpected error: {e}"
2037
+
2038
+ finally:
2039
+ # 关闭文件
2040
+ if stdout_file:
2041
+ stdout_file.close()
2042
+ if stderr_file:
2043
+ stderr_file.close()
2044
+
2045
+ # 移除超时监控
2046
+ if self._timeout_monitor:
2047
+ self._timeout_monitor.remove_job(job_id)
2048
+
2049
+ result.end_time = datetime.now()
2050
+ if result.start_time and result.end_time:
2051
+ result.duration = (result.end_time - result.start_time).total_seconds()
2052
+
2053
+ # 清理临时脚本
2054
+ if script_path and job.config.clean_temp:
2055
+ try:
2056
+ script_path.unlink()
2057
+ script_path.parent.rmdir()
2058
+ except Exception as e:
2059
+ self.logger.warning(f"[{job_id}] Failed to clean temp files: {e}")
2060
+
2061
+ # 结果处理
2062
+ if job.result_handler:
2063
+ try:
2064
+ result = job.result_handler(result)
2065
+ except Exception as e:
2066
+ self.logger.error(f"[{job_id}] Result handler failed: {e}")
2067
+
2068
+ # 触发完成事件
2069
+ for plugin in self.plugins:
2070
+ try:
2071
+ plugin.on_job_complete(result)
2072
+ except Exception as e:
2073
+ self.logger.error(f"Plugin on_job_complete error: {e}")
2074
+
2075
+ # 指标收集
2076
+ self.metrics_collector.record_job_complete(result, job)
2077
+
2078
+ # 保存结果
2079
+ if self.job_store:
2080
+ self.job_store.save_job(job)
2081
+ self.job_store.save_result(result)
2082
+
2083
+ # 缓存结果
2084
+ if use_cache and self.result_cache and result.success():
2085
+ cache_key = self.result_cache.compute_key(
2086
+ cmd=job.cmd,
2087
+ backend=job.backend,
2088
+ image=job.image,
2089
+ mounts=job.config.mounts,
2090
+ workdir=job.config.workdir,
2091
+ env=job.config.env,
2092
+ resource=job.resource
2093
+ )
2094
+ self.result_cache.set(cache_key, result)
2095
+
2096
+ # 回调函数
2097
+ if job.callback:
2098
+ try:
2099
+ job.callback(result)
2100
+ except Exception as e:
2101
+ self.logger.error(f"[{job_id}] Callback failed: {e}")
2102
+
2103
+ self.logger.info(f"[{job_id}] Job completed with status: {result.status.value}")
2104
+
2105
+ # 存储结果
2106
+ with self._lock:
2107
+ self._results[job_id] = result
2108
+
2109
+ return result
2110
+
2111
+ def submit(self, job: JobDefinition, wait: bool = True) -> Union[JobResult, Future]:
2112
+ """
2113
+ Submit a job for execution.
2114
+
2115
+ Args:
2116
+ job: job definition
2117
+ wait: whether to block until the job finishes
2118
+
2119
+ Returns:
2120
+ a JobResult if wait=True, otherwise a Future
2121
+ """
2122
+ # 保存作业定义
2123
+ with self._lock:
2124
+ self._job_definitions[job.job_id] = job
2125
+
2126
+ # 提交到线程池
2127
+ future = self._executor.submit(self._execute_job, job)
2128
+
2129
+ with self._lock:
2130
+ self._futures[job.job_id] = future
2131
+
2132
+ if wait:
2133
+ try:
2134
+ return future.result()
2135
+ except Exception as e:
2136
+ self.logger.error(f"[{job.job_id}] Job execution failed: {e}")
2137
+ result = JobResult(
2138
+ job_id=job.job_id,
2139
+ status=JobStatus.FAILED,
2140
+ error_message=str(e)
2141
+ )
2142
+ return result
2143
+ else:
2144
+ return future
2145
+
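+ # Minimal usage sketch for submit() with wait=False (assumes a scheduler
+ # instance and the classes defined above):
+ #   job = JobDefinition(cmd="sleep 5 && echo done", name="async_demo")
+ #   future = scheduler.submit(job, wait=False)   # concurrent.futures.Future
+ #   ...                                          # do other work
+ #   result = future.result(timeout=60)           # block for at most 60 s
+ #   print(result.status, result.exit_code)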
2146
+ def run(
2147
+ self,
2148
+ cmd: str,
2149
+ backend: Union[Backend, str] = None,
2150
+ image: str = None,
2151
+ mounts: Dict[Union[str, Path], Union[str, Path]] = None,
2152
+ workdir: Union[str, Path] = None,
2153
+ env: Dict[str, str] = None,
2154
+ dry_run: bool = False,
2155
+ resource: Union[ResourceRequest, Dict[str, Any]] = None,
2156
+ retry: Union[RetryConfig, Dict[str, Any]] = None,
2157
+ config: Union[ExecutionConfig, Dict[str, Any]] = None,
2158
+ job_id: str = None,
2159
+ wait: bool = True,
2160
+ name: str = None,
2161
+ description: str = None,
2162
+ tags: Dict[str, str] = None,
2163
+ priority: Union[JobPriority, int] = JobPriority.NORMAL,
2164
+ use_cache: bool = True
2165
+ ) -> Union[JobResult, Future]:
2166
+ """
2167
+ Run a command (simplified interface).
2168
+
2169
+ Args:
2170
+ cmd: command to execute
2171
+ backend: execution backend; defaults to the scheduler's default backend
2172
+ image: container image
2173
+ mounts: mount mapping {host path: container path}
2174
+ workdir: working directory
2175
+ env: environment variables
2176
+ dry_run: only print the command, do not execute it
2177
+ resource: resource request object or dict
2178
+ retry: retry configuration object or dict
2179
+ config: execution configuration object or dict
2180
+ job_id: job ID; generated automatically if not provided
2181
+ wait: whether to block until the job finishes
2182
+ name: job name
2183
+ description: job description
2184
+ tags: job tags
2185
+ priority: job priority
2186
+ use_cache: whether to use the result cache
2187
+
2188
+ Returns:
2189
+ a JobResult if wait=True, otherwise a Future
2190
+ """
2191
+ # 参数转换
2192
+ if backend is None:
2193
+ backend = self.default_backend
2194
+ elif isinstance(backend, str):
2195
+ backend = Backend(backend)
2196
+
2197
+ if isinstance(resource, dict):
2198
+ resource = ResourceRequest(**resource)
2199
+ elif resource is None:
2200
+ resource = ResourceRequest()
2201
+
2202
+ if isinstance(retry, dict):
2203
+ retry = RetryConfig(**retry)
2204
+ elif retry is None:
2205
+ retry = RetryConfig()
2206
+
2207
+ if isinstance(config, dict):
2208
+ config = ExecutionConfig(**config)
2209
+ elif config is None:
2210
+ config = ExecutionConfig()
2211
+
2212
+ # 挂载转换
2213
+ mounts_dict = {}
2214
+ if mounts:
2215
+ for host_path, container_path in mounts.items():
2216
+ mounts_dict[Path(host_path)] = Path(container_path)
2217
+
2218
+ # 工作目录转换
2219
+ if workdir:
2220
+ config.workdir = Path(workdir)
2221
+
2222
+ # 环境变量
2223
+ if env:
2224
+ config.env.update(env)
2225
+
2226
+ # 挂载点
2227
+ if mounts_dict:
2228
+ config.mounts.update(mounts_dict)
2229
+
2230
+ # 优先级转换
2231
+ if isinstance(priority, int):
2232
+ priority = JobPriority(priority)
2233
+
2234
+ # 创建作业定义
2235
+ job = JobDefinition(
2236
+ cmd=cmd,
2237
+ backend=backend,
2238
+ image=image,
2239
+ config=config,
2240
+ resource=resource,
2241
+ retry=retry,
2242
+ job_id=job_id,
2243
+ name=name,
2244
+ description=description,
2245
+ priority=priority,
2246
+ tags=tags or {}
2247
+ )
2248
+
2249
+ # 干运行
2250
+ if dry_run:
2251
+ cmd_parts, _ = self._build_command(job)
2252
+ self.logger.info(f"[DRY RUN] Command: {' '.join(cmd_parts)}")
2253
+
2254
+ result = JobResult(
2255
+ job_id=job.job_id,
2256
+ status=JobStatus.COMPLETED,
2257
+ command=cmd
2258
+ )
2259
+ return result
2260
+
2261
+ # 提交作业
2262
+ return self.submit(job, wait=wait)
2263
+
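+ # Minimal usage sketch for run(); dicts are accepted for resource/retry/config
+ # and converted to the dataclasses above (field names as used in this module):
+ #   result = scheduler.run(
+ #       "python preprocess.py --input /data/raw.csv",
+ #       backend="docker",
+ #       image="python:3.11-slim",
+ #       mounts={"/srv/data": "/data"},
+ #       env={"N_JOBS": "4"},
+ #       resource={"cpus": 4, "memory_gb": 8},
+ #       retry={"max_attempts": 2},
+ #       config={"timeout": 600, "capture_output": True},
+ #   )
+ #   if result.success():
+ #       print(result.stdout)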
2264
+ def enqueue(self, job: JobDefinition) -> str:
2265
+ """
2266
+ Add a job to the priority queue instead of executing it immediately.
2267
+
2268
+ Args:
2269
+ job: job definition
2270
+
2271
+ Returns:
2272
+ the job ID
2273
+ """
2274
+ if not self._priority_queue:
2275
+ raise RuntimeError("Priority queue is not enabled")
2276
+
2277
+ # 保存作业定义
2278
+ with self._lock:
2279
+ self._job_definitions[job.job_id] = job
2280
+
2281
+ # 加入优先级队列
2282
+ self._priority_queue.push(job)
2283
+
2284
+ self.logger.info(f"[{job.job_id}] Job enqueued with priority {job.priority}")
2285
+
2286
+ return job.job_id
2287
+
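+ # Usage sketch for enqueue(); it only places the job on the priority queue
+ # (the scheduler must be created with enable_priority_queue=True) and does
+ # not execute it itself:
+ #   job_id = scheduler.enqueue(JobDefinition(cmd="make report", priority=JobPriority.HIGH))
+ #   print(scheduler.get_status(job_id))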
2288
+ def run_many(
2289
+ self,
2290
+ commands: List[Union[str, Dict[str, Any]]],
2291
+ backend: Backend = None,
2292
+ max_workers: int = None,
2293
+ progress_callback: Callable[[int, int], None] = None,
2294
+ stop_on_error: bool = False,
2295
+ use_cache: bool = True
2296
+ ) -> List[JobResult]:
2297
+ """
2298
+ Run multiple commands in parallel.
2299
+
2300
+ Args:
2301
+ commands: list of commands; each entry may be a string, a config dict, or a JobDefinition
2302
+ backend: execution backend; overrides the backend of individual commands
2303
+ max_workers: maximum number of worker threads; defaults to the scheduler setting
2304
+ progress_callback: progress callback (completed, total)
2305
+ stop_on_error: whether to stop when a job fails
2306
+ use_cache: whether to use the result cache
2307
+
2308
+ Returns:
2309
+ list of job results
2310
+ """
2311
+ if max_workers is None:
2312
+ max_workers = self.max_concurrent
2313
+
2314
+ # 转换命令为作业定义
2315
+ jobs = []
2316
+ for i, cmd_spec in enumerate(commands):
2317
+ if isinstance(cmd_spec, JobDefinition):
2318
+ # 已经是JobDefinition对象
2319
+ job = cmd_spec
2320
+ # 确保有作业ID
2321
+ if not job.job_id:
2322
+ job.job_id = f"batch_{i}_{uuid.uuid4().hex[:4]}"
2323
+ elif isinstance(cmd_spec, str):
2324
+ # 简单字符串命令
2325
+ job = JobDefinition(
2326
+ cmd=cmd_spec,
2327
+ backend=backend or self.default_backend,
2328
+ name=f"batch_{i}",
2329
+ job_id=f"batch_{i}_{uuid.uuid4().hex[:4]}"
2330
+ )
2331
+ else:
2332
+ # 配置字典
2333
+ cmd_spec = cmd_spec.copy()
2334
+
2335
+ # 提取命令
2336
+ cmd = cmd_spec.pop("cmd")
2337
+
2338
+ # 处理后端
2339
+ if backend is not None and "backend" not in cmd_spec:
2340
+ cmd_spec["backend"] = backend
2341
+
2342
+ # 创建作业定义
2343
+ try:
2344
+ job = JobDefinition(cmd=cmd, **cmd_spec)
2345
+ except TypeError as e:
2346
+ self.logger.error(f"Error creating job from spec {cmd_spec}: {e}")
2347
+ continue
2348
+
2349
+ # 如果没有作业ID,生成一个
2350
+ if not job.job_id:
2351
+ job.job_id = f"batch_{i}_{uuid.uuid4().hex[:4]}"
2352
+
2353
+ # 如果没有名称,使用作业ID
2354
+ if not job.name:
2355
+ job.name = job.job_id
2356
+
2357
+ jobs.append(job)
2358
+
2359
+ total = len(jobs)
2360
+ results = []
2361
+ completed = 0
2362
+
2363
+ # 使用执行器并行运行
2364
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
2365
+ futures = []
2366
+
2367
+ for job in jobs:
2368
+ future = executor.submit(self._execute_job, job, use_cache)
2369
+ futures.append((job.job_id, future))
2370
+
2371
+ for job_id, future in futures:
2372
+ try:
2373
+ result = future.result()
2374
+ results.append(result)
2375
+
2376
+ completed += 1
2377
+ if progress_callback:
2378
+ progress_callback(completed, total)
2379
+
2380
+ # 检查是否需要停止
2381
+ if stop_on_error and result.failed():
2382
+ self.logger.warning(f"Stopping batch due to failed job: {job_id}")
2383
+ break
2384
+
2385
+ except Exception as e:
2386
+ self.logger.error(f"Job {job_id} failed with exception: {e}")
2387
+
2388
+ error_result = JobResult(
2389
+ job_id=job_id,
2390
+ status=JobStatus.FAILED,
2391
+ error_message=str(e)
2392
+ )
2393
+ results.append(error_result)
2394
+
2395
+ completed += 1
2396
+ if progress_callback:
2397
+ progress_callback(completed, total)
2398
+
2399
+ if stop_on_error:
2400
+ break
2401
+
2402
+ return results
2403
+
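+ # Usage sketch for run_many(); entries may be plain strings, config dicts, or
+ # JobDefinition objects, and the progress callback receives (completed, total):
+ #   results = scheduler.run_many(
+ #       [
+ #           "echo step-1",
+ #           {"cmd": "echo step-2", "name": "named_step"},
+ #           JobDefinition(cmd="echo step-3"),
+ #       ],
+ #       max_workers=2,
+ #       progress_callback=lambda done, total: print(f"{done}/{total}"),
+ #   )
+ #   ok = sum(1 for r in results if r.success())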
2404
+ def cancel(self, job_id: str):
2405
+ """取消作业"""
2406
+ with self._lock:
2407
+ if job_id in self._futures:
2408
+ future = self._futures[job_id]
2409
+ future.cancel()
2410
+
2411
+ # 更新结果
2412
+ if job_id in self._results:
2413
+ self._results[job_id].status = JobStatus.CANCELLED
2414
+
2415
+ self.logger.info(f"[{job_id}] Job cancelled")
2416
+ else:
2417
+ self.logger.warning(f"[{job_id}] Job not found or already completed")
2418
+
2419
+ def cancel_all(self):
2420
+ """取消所有作业"""
2421
+ with self._lock:
2422
+ for job_id in list(self._futures.keys()):
2423
+ self.cancel(job_id)
2424
+
2425
+ def wait_all(self, timeout: float = None) -> List[JobResult]:
2426
+ """等待所有作业完成"""
2427
+ results = []
2428
+ with self._lock:
2429
+ futures = list(self._futures.items())
2430
+
2431
+ for job_id, future in futures:
2432
+ try:
2433
+ result = future.result(timeout=timeout)
2434
+ results.append(result)
2435
+ except Exception as e:
2436
+ self.logger.error(f"Error waiting for job {job_id}: {e}")
2437
+
2438
+ return results
2439
+
2440
+ def get_job(self, job_id: str) -> Optional[JobDefinition]:
2441
+ """获取作业定义"""
2442
+ with self._lock:
2443
+ return self._job_definitions.get(job_id)
2444
+
2445
+ def get_result(self, job_id: str) -> Optional[JobResult]:
2446
+ """获取作业结果"""
2447
+ # 首先检查内存中的结果
2448
+ with self._lock:
2449
+ if job_id in self._results:
2450
+ return self._results[job_id]
2451
+
2452
+ # 然后检查存储
2453
+ if self.job_store:
2454
+ history = self.job_store.get_job_history(job_id)
2455
+ if history:
2456
+ return history[-1] # 返回最新结果
2457
+
2458
+ return None
2459
+
2460
+ def get_status(self, job_id: str) -> Optional[JobStatus]:
2461
+ """获取作业状态"""
2462
+ result = self.get_result(job_id)
2463
+ if result:
2464
+ return result.status
2465
+
2466
+ # 检查是否在运行
2467
+ with self._lock:
2468
+ if job_id in self._futures and not self._futures[job_id].done():
2469
+ return JobStatus.RUNNING
2470
+
2471
+ return None
2472
+
2473
+ def health_check(self) -> Dict[str, Any]:
2474
+ """检查调度器健康状况"""
2475
+ health_status = {
2476
+ "status": "healthy",
2477
+ "timestamp": datetime.now().isoformat(),
2478
+ "components": {}
2479
+ }
2480
+
2481
+ # 检查执行器
2482
+ health_status["components"]["executor"] = {
2483
+ "running": not self._executor._shutdown,
2484
+ "active_threads": threading.active_count(),
2485
+ "max_workers": self.max_concurrent
2486
+ }
2487
+
2488
+ # 检查后端可用性
2489
+ backend_health = {}
2490
+ for backend in [Backend.LOCAL, Backend.DOCKER, Backend.APPTAINER, Backend.SLURM, Backend.PBS]:
2491
+ backend_health[backend.value] = self._check_backend_health(backend)
2492
+
2493
+ health_status["components"]["backends"] = backend_health
2494
+
2495
+ # 检查系统资源
2496
+ try:
2497
+ system_usage = self.resource_monitor.get_system_usage()
2498
+ if "error" in system_usage:
2499
+ health_status["components"]["system"] = {"error": system_usage["error"]}
2500
+ else:
2501
+ health_status["components"]["system"] = {
2502
+ "cpu_percent": system_usage["cpu_percent"],
2503
+ "memory_percent": system_usage["memory_percent"],
2504
+ "disk_percent": system_usage["disk_percent"],
2505
+ "status": "healthy" if system_usage["cpu_percent"] < 95 and
2506
+ system_usage["memory_percent"] < 95 else "warning"
2507
+ }
2508
+ except Exception as e:
2509
+ health_status["components"]["system"] = {"error": str(e)}
2510
+
2511
+ # 检查存储
2512
+ if self.job_store:
2513
+ try:
2514
+ with sqlite3.connect(self.job_store.db_path) as conn:
2515
+ conn.execute("SELECT 1")
2516
+ health_status["components"]["storage"] = {"status": "healthy"}
2517
+ except Exception as e:
2518
+ health_status["components"]["storage"] = {"error": str(e), "status": "unhealthy"}
2519
+ health_status["status"] = "degraded"
2520
+
2521
+ # 检查缓存
2522
+ if self.result_cache:
2523
+ try:
2524
+ stats = self.result_cache.get_stats()
2525
+ health_status["components"]["cache"] = {
2526
+ "status": "healthy",
2527
+ "count": stats["count"],
2528
+ "size_mb": stats["total_size_mb"]
2529
+ }
2530
+ except Exception as e:
2531
+ health_status["components"]["cache"] = {"error": str(e), "status": "unhealthy"}
2532
+ health_status["status"] = "degraded"
2533
+
2534
+ # 如果任何组件不健康,更新整体状态
2535
+ for component, status in health_status["components"].items():
2536
+ if status.get("status") == "unhealthy":
2537
+ health_status["status"] = "unhealthy"
2538
+ break
2539
+ elif status.get("status") == "warning":
2540
+ health_status["status"] = "degraded"
2541
+
2542
+ return health_status
2543
+
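+ # Usage sketch: the returned dict can drive a simple readiness check:
+ #   health = scheduler.health_check()
+ #   if health["status"] != "healthy":
+ #       for name, component in health["components"].items():
+ #           if component.get("status") not in (None, "healthy"):
+ #               print(f"{name}: {component}")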
2544
+ def _check_backend_health(self, backend: Backend) -> Dict[str, Any]:
2545
+ """检查后端健康状态"""
2546
+ try:
2547
+ if backend == Backend.LOCAL:
2548
+ return {"available": True, "status": "healthy"}
2549
+
2550
+ elif backend == Backend.DOCKER:
2551
+ result = subprocess.run(
2552
+ ["docker", "info"],
2553
+ capture_output=True,
2554
+ text=True
2555
+ )
2556
+ return {
2557
+ "available": result.returncode == 0,
2558
+ "status": "healthy" if result.returncode == 0 else "unhealthy"
2559
+ }
2560
+
2561
+ elif backend == Backend.APPTAINER:
2562
+ result = subprocess.run(
2563
+ ["apptainer", "version"],
2564
+ capture_output=True,
2565
+ text=True
2566
+ )
2567
+ return {
2568
+ "available": result.returncode == 0,
2569
+ "status": "healthy" if result.returncode == 0 else "unhealthy"
2570
+ }
2571
+
2572
+ elif backend == Backend.SLURM:
2573
+ result = subprocess.run(
2574
+ ["sinfo", "--version"],
2575
+ capture_output=True,
2576
+ text=True
2577
+ )
2578
+ return {
2579
+ "available": result.returncode == 0,
2580
+ "status": "healthy" if result.returncode == 0 else "unhealthy"
2581
+ }
2582
+
2583
+ elif backend == Backend.PBS:
2584
+ result = subprocess.run(
2585
+ ["qstat", "--version"],
2586
+ capture_output=True,
2587
+ text=True
2588
+ )
2589
+ return {
2590
+ "available": result.returncode == 0,
2591
+ "status": "healthy" if result.returncode == 0 else "unhealthy"
2592
+ }
2593
+
2594
+ return {"available": False, "status": "unknown"}
2595
+
2596
+ except Exception as e:
2597
+ return {"available": False, "status": "error", "error": str(e)}
2598
+
2599
+ def get_metrics(self) -> Dict[str, Any]:
2600
+ """获取调度器指标"""
2601
+ metrics = self.metrics_collector.get_metrics()
2602
+
2603
+ # 添加当前状态
2604
+ with self._lock:
2605
+ current = {
2606
+ "jobs_total": len(self._job_definitions),
2607
+ "jobs_pending": len([f for f in self._futures.values() if not f.done()]),
2608
+ "jobs_running": len([f for f in self._futures.values() if not f.done()]),
2609
+ "jobs_completed": len(self._results),
2610
+ "backends_available": [b.value for b in Backend]
2611
+ }
2612
+
2613
+ metrics["current"] = current
2614
+
2615
+ # 添加缓存统计
2616
+ if self.result_cache:
2617
+ metrics["cache"] = self.result_cache.get_stats()
2618
+
2619
+ # 添加队列统计
2620
+ if self._priority_queue:
2621
+ metrics["queue"] = self._priority_queue.get_stats()
2622
+
2623
+ # 添加超时监控统计
2624
+ if self._timeout_monitor:
2625
+ metrics["timeout_monitor"] = self._timeout_monitor.get_stats()
2626
+
2627
+ return metrics
2628
+
2629
+ def get_queue_status(self) -> Dict[str, Any]:
2630
+ """获取队列状态"""
2631
+ if not self._priority_queue:
2632
+ return {"enabled": False}
2633
+
2634
+ return {
2635
+ "enabled": True,
2636
+ "stats": self._priority_queue.get_stats(),
2637
+ "snapshot": self._priority_queue.get_queue_snapshot()
2638
+ }
2639
+
2640
+ def shutdown(self, wait: bool = True, cancel_jobs: bool = True):
2641
+ """
2642
+ Shut down the scheduler.
2643
+
2644
+ Args:
2645
+ wait: whether to wait for in-progress jobs to finish
2646
+ cancel_jobs: whether to cancel running jobs
2647
+ """
2648
+ self.logger.info("Shutting down scheduler...")
2649
+
2650
+ if cancel_jobs:
2651
+ self.cancel_all()
2652
+
2653
+ self._executor.shutdown(wait=wait)
2654
+
2655
+ # 保存插件数据
2656
+ for plugin in self.plugins:
2657
+ if isinstance(plugin, ResourceLogger):
2658
+ plugin.save_logs()
2659
+
2660
+ self.logger.info("Scheduler shut down")
2661
+
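+ # Because the scheduler owns a thread pool and background monitors, scripts
+ # should pair it with try/finally so shutdown() always runs (the examples
+ # further below follow this pattern):
+ #   scheduler = ContainerScheduler(max_concurrent=4)
+ #   try:
+ #       scheduler.run("echo hello")
+ #   finally:
+ #       scheduler.shutdown(wait=True, cancel_jobs=False)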
2662
+ def __del__(self):
2663
+ """析构函数"""
2664
+ try:
2665
+ self.shutdown(wait=False, cancel_jobs=True)
2666
+ except:
2667
+ pass
2668
+ def run_workflow(
2669
+ self,
2670
+ workflow: List[Dict[str, Any]],
2671
+ max_workers: int = None,
2672
+ stop_on_error: bool = True,
2673
+ name: str = "workflow"
2674
+ ) -> Dict[str, JobResult]:
2675
+ """
2676
+ Run a workflow (jobs with dependencies).
2677
+
2678
+ Args:
2679
+ workflow: list of workflow step definitions; each entry contains a command and its dependencies
2680
+ max_workers: maximum number of worker threads
2681
+ stop_on_error: whether to stop the whole workflow when a job fails
2682
+ name: workflow name
2683
+
2684
+ Returns:
2685
+ mapping from job ID to result
2686
+
2687
+ Examples:
2688
+ >>> # Simple linear workflow
2689
+ >>> workflow = [
2690
+ ... {"cmd": "download_data.sh", "job_id": "download"},
2691
+ ... {"cmd": "process_data.py", "job_id": "process",
2692
+ ... "dependencies": ["download"]},
2693
+ ... {"cmd": "analyze.py", "job_id": "analyze",
2694
+ ... "dependencies": ["process"]}
2695
+ ... ]
2696
+ >>> results = scheduler.run_workflow(workflow)
2697
+
2698
+ >>> # Parallel workflow
2699
+ >>> workflow = [
2700
+ ... {"cmd": "preprocess.py --input data1.csv", "job_id": "preprocess1"},
2701
+ ... {"cmd": "preprocess.py --input data2.csv", "job_id": "preprocess2"},
2702
+ ... {"cmd": "merge_results.py", "job_id": "merge",
2703
+ ... "dependencies": ["preprocess1", "preprocess2"]}
2704
+ ... ]
2705
+ >>> results = scheduler.run_workflow(
2706
+ ... workflow,
2707
+ ... max_workers=2,
2708
+ ... name="data_pipeline"
2709
+ ... )
2710
+ """
2711
+ if max_workers is None:
2712
+ max_workers = self.max_concurrent
2713
+
2714
+ self.logger.info(f"Starting workflow: {name} with {len(workflow)} jobs")
2715
+
2716
+ # 创建工作流作业
2717
+ jobs = {}
2718
+ job_dependencies = {}
2719
+
2720
+ for spec in workflow:
2721
+ spec = spec.copy()
2722
+
2723
+ # 提取作业ID
2724
+ job_id = spec.pop("job_id", str(uuid.uuid4().hex[:8]))
2725
+
2726
+ # 提取依赖
2727
+ dependencies = spec.pop("dependencies", [])
2728
+ job_dependencies[job_id] = dependencies
2729
+
2730
+ # 创建作业定义
2731
+ cmd = spec.pop("cmd")
2732
+ # 处理其他参数
2733
+ backend = spec.pop("backend", None)
2734
+ if isinstance(backend, str):
2735
+ backend = Backend(backend)
2736
+ elif backend is None:
2737
+ backend = self.default_backend
2738
+ resource = spec.pop("resource", None)
2739
+ retry = spec.pop("retry", None)
2740
+ config = spec.pop("config", None)
2741
+
2742
+ # 创建配置对象
2743
+ if isinstance(resource, dict):
2744
+ resource = ResourceRequest(**resource)
2745
+ elif resource is None:
2746
+ resource = ResourceRequest()
2747
+
2748
+ if isinstance(retry, dict):
2749
+ retry = RetryConfig(**retry)
2750
+ elif retry is None:
2751
+ retry = RetryConfig()
2752
+
2753
+ if isinstance(config, dict):
2754
+ config = ExecutionConfig(**config)
2755
+ elif config is None:
2756
+ config = ExecutionConfig()
2757
+
2764
+ # 创建作业定义
2765
+ job = JobDefinition(
2766
+ cmd=cmd,
2767
+ backend=backend,
2768
+ config=config,
2769
+ resource=resource,
2770
+ retry=retry,
2771
+ job_id=job_id,
2772
+ **spec # 其他参数如name, tags等
2773
+ )
2774
+ jobs[job_id] = job
2775
+
2776
+ # 结果存储
2777
+ results = {}
2778
+ completed = set()
2779
+ failed = set()
2780
+
2781
+ # 工作流执行循环
2782
+ while len(results) < len(jobs):
2783
+ # 找到可以执行的作业(依赖都已满足)
2784
+ ready_jobs = []
2785
+
2786
+ for job_id, job in jobs.items():
2787
+ if job_id in results:
2788
+ continue # 已经完成
2789
+
2790
+ # 检查依赖
2791
+ dependencies = job_dependencies.get(job_id, [])
2792
+ can_run = True
2793
+
2794
+ for dep_id in dependencies:
2795
+ if dep_id not in results:
2796
+ can_run = False
2797
+ break
2798
+ elif results[dep_id].failed():
2799
+ can_run = False
2800
+ break
2801
+
2802
+ if can_run:
2803
+ ready_jobs.append(job)
2804
+
2805
+ if not ready_jobs:
2806
+ # 没有可运行的作业,可能是有循环依赖或依赖失败
2807
+ if stop_on_error and failed:
2808
+ self.logger.warning(f"Workflow {name} stopped due to failed dependencies")
2809
+ break
2810
+ else:
2811
+ # 检查是否有作业因为循环依赖而无法运行
2812
+ time.sleep(1)
2813
+ continue
2814
+
2815
+ # 执行就绪的作业
2816
+ batch_results = self.run_many(
2817
+ ready_jobs,
2818
+ max_workers=min(max_workers, len(ready_jobs)),
2819
+ stop_on_error=stop_on_error,
2820
+ use_cache=False # 工作流作业通常不使用缓存
2821
+ )
2822
+
2823
+ # 更新结果
2824
+ for result in batch_results:
2825
+ results[result.job_id] = result
2826
+
2827
+ if result.success():
2828
+ completed.add(result.job_id)
2829
+ else:
2830
+ failed.add(result.job_id)
2831
+
2832
+ if stop_on_error:
2833
+ self.logger.warning(f"Workflow job failed: {result.job_id}")
2834
+
2835
+ # 进度日志
2836
+ self.logger.info(
2837
+ f"Workflow {name} progress: {len(results)}/{len(jobs)} "
2838
+ f"(completed: {len(completed)}, failed: {len(failed)})"
2839
+ )
2840
+
2841
+ self.logger.info(f"Workflow {name} completed")
2842
+ return results
2843
+ # ============================================================================
2844
+ # Convenience functions
2845
+ # ============================================================================
2846
+
2847
+ def run_command(
2848
+ cmd: str,
2849
+ backend: str = "local",
2850
+ image: str = None,
2851
+ mounts: Dict[str, str] = None,
2852
+ workdir: str = None,
2853
+ env: Dict[str, str] = None,
2854
+ dry_run: bool = False,
2855
+ resource: Dict[str, Any] = None,
2856
+ retry: Dict[str, Any] = None,
2857
+ **kwargs
2858
+ ) -> JobResult:
2859
+ """
2860
+ Simplified interface function - kept compatible with the original version.
2861
+
2862
+ Args:
2863
+ cmd: command to execute
2864
+ backend: local | docker | apptainer | slurm | pbs
2865
+ image: container image
2866
+ mounts: mount mapping
2867
+ workdir: working directory
2868
+ env: environment variables
2869
+ dry_run: only print the command, do not execute it
2870
+ resource: resource request configuration
2871
+ retry: retry configuration
2872
+
2873
+ Returns:
2874
+ the job result
2875
+ """
2876
+ # 转换参数
2877
+ mounts_dict = None
2878
+ if mounts:
2879
+ mounts_dict = {Path(k): Path(v) for k, v in mounts.items()}
2880
+
2881
+ workdir_path = Path(workdir) if workdir else None
2882
+
2883
+ # 创建调度器实例
2884
+ scheduler = ContainerScheduler()
2885
+
2886
+ # 资源请求
2887
+ resource_obj = None
2888
+ if resource:
2889
+ resource_obj = ResourceRequest(**resource)
2890
+
2891
+ # 重试配置
2892
+ retry_obj = None
2893
+ if retry:
2894
+ retry_obj = RetryConfig(**retry)
2895
+
2896
+ # 执行配置
2897
+ config_kwargs = {}
2898
+ for key in ["timeout", "stdout", "stderr", "capture_output", "check", "silent"]:
2899
+ if key in kwargs:
2900
+ config_kwargs[key] = kwargs[key]
2901
+
2902
+ config = ExecutionConfig(
2903
+ workdir=workdir_path,
2904
+ env=env or {},
2905
+ mounts=mounts_dict or {},
2906
+ **config_kwargs
2907
+ )
2908
+
2909
+ # 运行命令
2910
+ return scheduler.run(
2911
+ cmd=cmd,
2912
+ backend=backend,
2913
+ image=image,
2914
+ config=config,
2915
+ resource=resource_obj,
2916
+ retry=retry_obj,
2917
+ dry_run=dry_run
2918
+ )
2919
+
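+ # Usage sketch for the module-level helper; note that each call constructs its
+ # own ContainerScheduler (and does not shut it down), so it is best suited to
+ # one-off commands:
+ #   res = run_command(
+ #       "ls -la /data",
+ #       backend="docker",
+ #       image="ubuntu:22.04",
+ #       mounts={"/srv/data": "/data"},
+ #       timeout=60,
+ #       capture_output=True,
+ #   )
+ #   print(res.exit_code, res.stdout)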
2920
+ # ============================================================================
2921
+ # Command-line interface
2922
+ # ============================================================================
2923
+
2924
+ def main():
2925
+ """命令行入口点"""
2926
+ import argparse
2927
+
2928
+ parser = argparse.ArgumentParser(
2929
+ description="Universal Container Scheduler CLI",
2930
+ formatter_class=argparse.RawDescriptionHelpFormatter,
2931
+ epilog="""
2932
+ Examples:
2933
+ %(prog)s "echo Hello World"
2934
+ %(prog)s "python script.py" --backend docker --image python:3.9
2935
+ %(prog)s --health
2936
+ %(prog)s --stats
2937
+ %(prog)s "long_job.sh" --queue --priority high
2938
+ """
2939
+ )
2940
+
2941
+ # 主要模式
2942
+ parser.add_argument("command", nargs="?", help="Command to execute")
2943
+ parser.add_argument("--config", help="Configuration file path")
2944
+
2945
+ # 执行参数
2946
+ parser.add_argument("--backend", default="local",
2947
+ choices=["local", "docker", "apptainer", "slurm", "pbs"],
2948
+ help="Execution backend")
2949
+ parser.add_argument("--image", help="Container image")
2950
+ parser.add_argument("--workdir", help="Working directory")
2951
+ parser.add_argument("--mount", action="append",
2952
+ help="Mount mapping (host:container)")
2953
+ parser.add_argument("--env", action="append",
2954
+ help="Environment variable (KEY=VALUE)")
2955
+ parser.add_argument("--cpus", type=int, default=1, help="CPU cores")
2956
+ parser.add_argument("--memory", type=float, help="Memory in GB")
2957
+ parser.add_argument("--timeout", type=int, help="Timeout in seconds")
2958
+ parser.add_argument("--dry-run", action="store_true", help="Dry run")
2959
+ parser.add_argument("--output", help="Output file")
2960
+ parser.add_argument("--error", help="Error file")
2961
+ parser.add_argument("--retry", type=int, default=1, help="Max retry attempts")
2962
+
2963
+ # 队列和调度
2964
+ parser.add_argument("--queue", action="store_true", help="Add job to queue instead of immediate execution")
2965
+ parser.add_argument("--priority", default="normal",
2966
+ choices=["lowest", "low", "normal", "high", "highest", "critical"],
2967
+ help="Job priority")
2968
+
2969
+ # 监控和管理
2970
+ parser.add_argument("--health", action="store_true", help="Check scheduler health")
2971
+ parser.add_argument("--stats", action="store_true", help="Show scheduler statistics")
2972
+ parser.add_argument("--list-jobs", action="store_true", help="List all jobs")
2973
+ parser.add_argument("--queue-status", action="store_true", help="Show queue status")
2974
+ parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
2975
+
2976
+ args = parser.parse_args()
2977
+
2978
+ # 设置日志
2979
+ log_level = logging.DEBUG if args.verbose else logging.INFO
2980
+ logging.basicConfig(
2981
+ level=log_level,
2982
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
2983
+ )
2984
+
2985
+ logger = logging.getLogger(__name__)
2986
+
2987
+ # 加载配置
2988
+ config = None
2989
+ if args.config:
2990
+ config = SchedulerConfig.load(Path(args.config))
2991
+
2992
+ # 创建调度器
2993
+ scheduler = ContainerScheduler(
2994
+ config=config,
2995
+ enable_priority_queue=True if args.queue else False,
2996
+ enable_timeout_monitor=True
2997
+ )
2998
+
2999
+ try:
3000
+ # 处理不同模式
3001
+ if args.health:
3002
+ # 健康检查模式
3003
+ health = scheduler.health_check()
3004
+ print(json.dumps(health, indent=2, default=str))
3005
+ sys.exit(0 if health["status"] == "healthy" else 1)
3006
+
3007
+ elif args.stats:
3008
+ # 统计模式
3009
+ metrics = scheduler.get_metrics()
3010
+ print(json.dumps(metrics, indent=2, default=str))
3011
+ sys.exit(0)
3012
+
3013
+ elif args.queue_status:
3014
+ # 队列状态
3015
+ queue_status = scheduler.get_queue_status()
3016
+ print(json.dumps(queue_status, indent=2, default=str))
3017
+ sys.exit(0)
3018
+
3019
+ elif args.list_jobs:
3020
+ # 列出作业
3021
+ if scheduler.job_store:
3022
+ jobs = scheduler.job_store.search_jobs(limit=100)
3023
+ for job in jobs:
3024
+ result = scheduler.get_result(job.job_id)
3025
+ status = result.status.value if result else "unknown"
3026
+ print(f"{job.job_id}: {job.name} ({job.backend.value}) - {status}")
3027
+ else:
3028
+ print("Job store not enabled")
3029
+ sys.exit(0)
3030
+
3031
+ elif args.command:
3032
+ # 执行命令模式
3033
+ # 解析挂载
3034
+ mounts = {}
3035
+ if args.mount:
3036
+ for mount in args.mount:
3037
+ if ":" in mount:
3038
+ host, container = mount.split(":", 1)
3039
+ mounts[host] = container
3040
+
3041
+ # 解析环境变量
3042
+ env = {}
3043
+ if args.env:
3044
+ for env_var in args.env:
3045
+ if "=" in env_var:
3046
+ key, value = env_var.split("=", 1)
3047
+ env[key] = value
3048
+
3049
+ # 资源配置
3050
+ resource = {}
3051
+ if args.cpus > 1:
3052
+ resource["cpus"] = args.cpus
3053
+ if args.memory:
3054
+ resource["memory_gb"] = args.memory
3055
+
3056
+ # 执行配置
3057
+ config_dict = {}
3058
+ if args.timeout:
3059
+ config_dict["timeout"] = args.timeout
3060
+ if args.output:
3061
+ config_dict["stdout"] = args.output
3062
+ if args.error:
3063
+ config_dict["stderr"] = args.error
3064
+
3065
+ # 优先级转换
3066
+ priority_map = {
3067
+ "lowest": JobPriority.LOWEST,
3068
+ "low": JobPriority.LOW,
3069
+ "normal": JobPriority.NORMAL,
3070
+ "high": JobPriority.HIGH,
3071
+ "highest": JobPriority.HIGHEST,
3072
+ "critical": JobPriority.CRITICAL
3073
+ }
3074
+ priority = priority_map[args.priority]
3075
+
3076
+ if args.queue:
3077
+ # 加入队列
3078
+ job = JobDefinition(
3079
+ cmd=args.command,
3080
+ backend=Backend(args.backend),
3081
+ image=args.image,
3082
+ config=ExecutionConfig(
3083
+ workdir=Path(args.workdir) if args.workdir else None,
3084
+ env=env,
3085
+ mounts={Path(k): Path(v) for k, v in mounts.items()},
3086
+ **config_dict
3087
+ ),
3088
+ resource=ResourceRequest(**resource),
3089
+ retry=RetryConfig(max_attempts=args.retry),
3090
+ priority=priority,
3091
+ name="cli_job"
3092
+ )
3093
+ job_id = scheduler.enqueue(job)
3094
+ print(f"Job enqueued with ID: {job_id}")
3095
+ print(f"Use --queue-status to check queue status")
3096
+ else:
3097
+ # 立即执行
3098
+ result = scheduler.run(
3099
+ cmd=args.command,
3100
+ backend=args.backend,
3101
+ image=args.image,
3102
+ mounts=mounts,
3103
+ workdir=args.workdir,
3104
+ env=env,
3105
+ dry_run=args.dry_run,
3106
+ resource=resource,
3107
+ retry={"max_attempts": args.retry},
3108
+ config=config_dict,
3109
+ priority=priority
3110
+ )
3111
+
3112
+ # 输出结果
3113
+ if result.stdout:
3114
+ print(result.stdout)
3115
+
3116
+ if result.stderr:
3117
+ print(result.stderr, file=sys.stderr)
3118
+
3119
+ exit_code = result.exit_code or 0
3120
+ if result.success():
3121
+ print(f"Command completed successfully in {result.duration:.2f}s")
3122
+ else:
3123
+ print(f"Command failed with exit code {exit_code}: {result.error_message}")
3124
+
3125
+ sys.exit(exit_code)
3126
+ else:
3127
+ # 交互模式
3128
+ print("Universal Container Scheduler")
3129
+ print("No command specified. Available modes:")
3130
+ print(" --health Check scheduler health")
3131
+ print(" --stats Show scheduler statistics")
3132
+ print(" --queue Add job to queue")
3133
+ print(" --list-jobs List all jobs")
3134
+ print(" --queue-status Show queue status")
3135
+ sys.exit(1)
3136
+
3137
+ except KeyboardInterrupt:
3138
+ print("\nInterrupted by user")
3139
+ scheduler.shutdown(wait=False, cancel_jobs=True)
3140
+ sys.exit(130)
3141
+ except Exception as e:
3142
+ logger.error(f"Error: {e}")
3143
+ if args.verbose:
3144
+ import traceback
3145
+ traceback.print_exc()
3146
+ sys.exit(1)
3147
+ finally:
3148
+ scheduler.shutdown(wait=True, cancel_jobs=False)
3149
+
3150
+
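+ # The CLI above is a thin wrapper over the Python API; for example
+ #   <prog> "python train.py" --backend docker --image python:3.11-slim \
+ #          --mount /srv/data:/data --env N_JOBS=4 --cpus 4 --memory 8 --timeout 600
+ # maps onto a scheduler.run(...) call with the same backend, image, mounts,
+ # env, resource and timeout settings.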
3151
+ # ============================================================================
3152
+ # Usage examples
3153
+ # ============================================================================
3154
+
3155
+ # if __name__ == "__main__":
3156
+ # # 示例代码
3157
+ # print("Universal Container Scheduler - Example Usage")
3158
+ # print("=" * 50)
3159
+
3160
+ # # 示例1: 基本使用
3161
+ # print("\n1. Basic Usage:")
3162
+ # scheduler = ContainerScheduler()
3163
+ # result = scheduler.run("echo 'Hello, World!'")
3164
+ # print(f" Result: {result.status.value}, Output: {result.stdout}")
3165
+
3166
+ # # 示例2: 使用Docker容器
3167
+ # print("\n2. Docker Container Example:")
3168
+ # try:
3169
+ # result = scheduler.run(
3170
+ # cmd="python -c 'import sys; print(f\"Python {sys.version}\")'",
3171
+ # backend="docker",
3172
+ # image="python:3.9-slim",
3173
+ # dry_run=True # 干运行,不实际执行
3174
+ # )
3175
+ # print(f" Dry run completed for Docker command")
3176
+ # except Exception as e:
3177
+ # print(f" Docker not available: {e}")
3178
+
3179
+ # # 示例3: 带重试的作业
3180
+ # print("\n3. Job with Retry:")
3181
+ # result = scheduler.run(
3182
+ # cmd="echo 'Test with retry' && exit 1", # 这个命令会失败
3183
+ # retry={"max_attempts": 3, "delay_seconds": 1}
3184
+ # )
3185
+ # print(f" Final status: {result.status.value}, Attempts: {result.attempts}")
3186
+
3187
+ # # 示例4: 批量作业
3188
+ # print("\n4. Batch Jobs:")
3189
+ # commands = [f"echo 'Job {i}'" for i in range(3)]
3190
+ # results = scheduler.run_many(commands, max_workers=2)
3191
+ # print(f" Completed {len(results)} jobs, {sum(1 for r in results if r.success())} successful")
3192
+
3193
+ # # 示例5: 健康检查
3194
+ # print("\n5. Health Check:")
3195
+ # health = scheduler.health_check()
3196
+ # print(f" Status: {health['status']}")
3197
+
3198
+ # # 示例6: 指标收集
3199
+ # print("\n6. Metrics:")
3200
+ # metrics = scheduler.get_metrics()
3201
+ # print(f" Total jobs: {metrics['jobs_total']}")
3202
+ # print(f" Success rate: {metrics.get('success_rate', 0):.1%}")
3203
+
3204
+ # # 清理
3205
+ # scheduler.shutdown()
3206
+
3207
+ # print("\n" + "=" * 50)
3208
+ # print("All examples completed!")
3209
+ # print("\nTo use the command line interface:")
3210
+ # print(" python universal_scheduler.py --help")
3211
+ # print("\nExample commands:")
3212
+ # print(" python universal_scheduler.py 'echo Hello'")
3213
+ # print(" python universal_scheduler.py --health")
3214
+ # print(" python universal_scheduler.py --stats")
3215
+
3216
+
3217
+ # ============================================================================
3218
+ # Advanced usage examples
3219
+ # ============================================================================
3220
+
3221
+ def example_data_processing_pipeline():
3222
+ """
3223
+ Example 1: data processing pipeline.
3224
+ Simulates a realistic data-processing workflow: download -> preprocess -> analyze -> report.
3225
+ """
3226
+ print("\n" + "="*60)
3227
+ print("示例1: 数据处理流水线")
3228
+ print("="*60)
3229
+
3230
+ # 创建配置化的调度器
3231
+ scheduler = ContainerScheduler(
3232
+ max_concurrent=4,
3233
+ job_store=JobStore("data_pipeline.db"),
3234
+ result_cache=ResultCache(".pipeline_cache"),
3235
+ plugins=[
3236
+ NotificationPlugin(),
3237
+ ResourceLogger()
3238
+ ],
3239
+ enable_priority_queue=True
3240
+ )
3241
+
3242
+ # 模拟数据文件
3243
+ data_files = [
3244
+ "sales_2023_q1.csv",
3245
+ "sales_2023_q2.csv",
3246
+ "sales_2023_q3.csv",
3247
+ "sales_2023_q4.csv"
3248
+ ]
3249
+
3250
+ try:
3251
+ # 阶段1: 并行下载数据(模拟)
3252
+ print("\n阶段1: 下载数据文件...")
3253
+ download_jobs = []
3254
+ for data_file in data_files:
3255
+ job = JobDefinition(
3256
+ cmd=f"curl -s https://example.com/data/{data_file} -o {data_file}",
3257
+ name=f"download_{data_file}",
3258
+ resource=ResourceRequest(cpus=1, memory_gb=2),
3259
+ retry=RetryConfig(max_attempts=3, delay_seconds=5),
3260
+ tags={"stage": "download", "file": data_file}
3261
+ )
3262
+ download_jobs.append(job)
3263
+
3264
+ # Batch-submit the (simulated) download jobs
3265
+ download_results = scheduler.run_many(
3266
+ [{"cmd": f"echo 'simulating download of {f}' && sleep 1"} for f in data_files],
3267
+ progress_callback=lambda c, t: print(f" download progress: {c}/{t}")
3268
+ )
3269
+
3270
+ # 阶段2: 数据预处理
3271
+ print("\n阶段2: 数据预处理...")
3272
+ preprocess_jobs = []
3273
+ for data_file in data_files:
3274
+ output_file = data_file.replace(".csv", "_processed.parquet")
3275
+ job = JobDefinition(
3276
+ cmd=f"python preprocess.py --input {data_file} --output {output_file}",
3277
+ name=f"preprocess_{data_file}",
3278
+ backend=Backend.DOCKER,
3279
+ image="python:3.9-data-science",
3280
+ mounts={"/data": "/data"},
3281
+ config=ExecutionConfig(
3282
+ workdir=Path("/data"),
3283
+ env={"PYTHONPATH": "/data/scripts"}
3284
+ ),
3285
+ resource=ResourceRequest(cpus=4, memory_gb=8),
3286
+ retry=RetryConfig(max_attempts=2),
3287
+ tags={"stage": "preprocess", "file": data_file}
3288
+ )
3289
+ preprocess_jobs.append(job)
3290
+ scheduler.enqueue(job) # 加入队列
3291
+
3292
+ # 等待预处理完成
3293
+ print("等待预处理作业完成...")
3294
+ scheduler.wait_all()
3295
+
3296
+ # 阶段3: 聚合分析
3297
+ print("\n阶段3: 聚合分析...")
3298
+ analysis_job = JobDefinition(
3299
+ cmd="python analyze.py --pattern *_processed.parquet --output analysis_results.json",
3300
+ name="aggregate_analysis",
3301
+ backend=Backend.SLURM,
3302
+ resource=ResourceRequest(
3303
+ cpus=8,
3304
+ memory_gb=32,
3305
+ time_hours=2,
3306
+ partition="analysis"
3307
+ ),
3308
+ tags={"stage": "analysis"}
3309
+ )
3310
+
3311
+ analysis_result = scheduler.submit(analysis_job, wait=True)
3312
+
3313
+ # 阶段4: 生成报告
3314
+ print("\n阶段4: 生成报告...")
3315
+ report_job = JobDefinition(
3316
+ cmd="python generate_report.py --input analysis_results.json --output report.html",
3317
+ name="generate_report",
3318
+ resource=ResourceRequest(cpus=2, memory_gb=4),
3319
+ tags={"stage": "report"}
3320
+ )
3321
+
3322
+ report_result = scheduler.submit(report_job, wait=True)
3323
+
3324
+ # 检查最终结果
3325
+ if report_result.success():
3326
+ print(f"\n✅ 数据处理流水线完成!")
3327
+ print(f" 总作业数: {scheduler.metrics_collector.get_metrics()['jobs_total']}")
3328
+ print(f" 成功作业: {scheduler.metrics_collector.get_metrics()['jobs_completed']}")
3329
+ print(f" 总耗时: {report_result.duration:.1f}s")
3330
+ else:
3331
+ print(f"\n❌ 数据处理流水线失败!")
3332
+ print(f" 错误信息: {report_result.error_message}")
3333
+
3334
+ finally:
3335
+ scheduler.shutdown()
3336
+ print("调度器已关闭")
3337
+
3338
+ def example_machine_learning_training():
3339
+ """
3340
+ Example 2: machine learning model training and hyperparameter search.
3341
+ Distributed model training and hyperparameter optimization.
3342
+ """
3343
+ print("\n" + "="*60)
3344
+ print("示例2: 机器学习模型训练")
3345
+ print("="*60)
3346
+
3347
+ scheduler = ContainerScheduler(
3348
+ max_concurrent=8,
3349
+ job_store=JobStore("ml_training.db"),
3350
+ enable_priority_queue=True
3351
+ )
3352
+
3353
+ # 超参数搜索空间
3354
+ hyperparameters = [
3355
+ {"model": "resnet50", "lr": 0.001, "batch_size": 32, "epochs": 50},
3356
+ {"model": "resnet50", "lr": 0.01, "batch_size": 64, "epochs": 50},
3357
+ {"model": "efficientnet", "lr": 0.001, "batch_size": 32, "epochs": 50},
3358
+ {"model": "efficientnet", "lr": 0.01, "batch_size": 64, "epochs": 50},
3359
+ {"model": "vit", "lr": 0.0005, "batch_size": 16, "epochs": 100},
3360
+ {"model": "vit", "lr": 0.005, "batch_size": 32, "epochs": 100},
3361
+ ]
3362
+
3363
+ try:
3364
+ print(f"开始超参数搜索,共 {len(hyperparameters)} 组配置...")
3365
+
3366
+ # 为每组超参数创建训练任务
3367
+ training_jobs = []
3368
+ for i, params in enumerate(hyperparameters):
3369
+ params_str = " ".join([f"--{k} {v}" for k, v in params.items()])
3370
+
3371
+ job = JobDefinition(
3372
+ cmd=f"python train_model.py {params_str} --data /data/imagenet --output /output/model_{i}.pth",
3373
+ name=f"train_model_{i}",
3374
+ backend=Backend.SLURM,
3375
+ image="pytorch/pytorch:latest",
3376
+ config=ExecutionConfig(
3377
+ mounts={
3378
+ Path("/datasets/imagenet"): Path("/data"),
3379
+ Path("/models"): Path("/output")
3380
+ }
3381
+ ),
3382
+ resource=ResourceRequest(
3383
+ cpus=8,
3384
+ memory_gb=32,
3385
+ gpus=2,
3386
+ gpu_type="a100",
3387
+ time_hours=12,
3388
+ partition="gpu",
3389
+ exclusive=True
3390
+ ),
3391
+ retry=RetryConfig(
3392
+ max_attempts=2,
3393
+ retry_on_memory_error=True,
3394
+ retry_on_network_error=True
3395
+ ),
3396
+ tags={
3397
+ "task": "hyperparameter_search",
3398
+ "model": params["model"],
3399
+ "config_id": str(i)
3400
+ },
3401
+ callback=lambda result, i=i: print(f" 配置 {i} 训练完成: {result.status.value}")
3402
+ )
3403
+ training_jobs.append(job)
3404
+
3405
+ # 批量提交训练作业(并行执行)
3406
+ print("提交训练作业...")
3407
+ training_results = scheduler.run_many(
3408
+ training_jobs,
3409
+ max_workers=4, # 最多同时训练4个模型
3410
+ progress_callback=lambda c, t: print(f" 训练进度: {c}/{t}")
3411
+ )
3412
+
3413
+ # 收集最佳模型
3414
+ best_model = None
3415
+ best_accuracy = 0.0
3416
+
3417
+ for result in training_results:
3418
+ if result.success() and result.stdout:
3419
+ try:
3420
+ # 从输出中解析指标
3421
+ import json
3422
+ metrics = json.loads(result.stdout)
3423
+ accuracy = metrics.get("val_accuracy", 0)
3424
+
3425
+ if accuracy > best_accuracy:
3426
+ best_accuracy = accuracy
3427
+ best_model = result
3428
+ except:
3429
+ pass
3430
+
3431
+ if best_model:
3432
+ print(f"\n🎉 找到最佳模型!")
3433
+ print(f" 配置ID: {best_model.job_id}")
3434
+ print(f" 验证准确率: {best_accuracy:.2%}")
3435
+
3436
+ # 评估最佳模型
3437
+ print("\n评估最佳模型...")
3438
+ eval_job = JobDefinition(
3439
+ cmd=f"python evaluate_model.py --model /output/{best_model.job_id}.pth --test_data /data/imagenet_test",
3440
+ name="evaluate_best_model",
3441
+ backend=Backend.SLURM,
3442
+ resource=ResourceRequest(
3443
+ cpus=4,
3444
+ memory_gb=16,
3445
+ gpus=1,
3446
+ time_hours=2
3447
+ ),
3448
+ tags={"task": "evaluation", "best_model": best_model.job_id}
3449
+ )
3450
+
3451
+ eval_result = scheduler.submit(eval_job, wait=True)
3452
+
3453
+ if eval_result.success():
3454
+ print(" 评估完成!")
3455
+ print(f" 测试准确率: {eval_result.stdout}")
3456
+
3457
+ finally:
3458
+ scheduler.shutdown()
3459
+ print("\nML训练完成!")
3460
+
3461
+ def example_bioinformatics_workflow():
3462
+ """
3463
+ Example 3: bioinformatics workflow.
3464
+ DNA sequencing data analysis pipeline.
3465
+ """
3466
+ print("\n" + "="*60)
3467
+ print("示例3: 生物信息学工作流")
3468
+ print("="*60)
3469
+
3470
+ scheduler = ContainerScheduler(
3471
+ max_concurrent=6,
3472
+ default_backend=Backend.SLURM,
3473
+ job_store=JobStore("bioinformatics.db"),
3474
+ result_cache=ResultCache(".bio_cache")
3475
+ )
3476
+
3477
+ # 样本列表
3478
+ samples = [
3479
+ "sample_001", "sample_002", "sample_003",
3480
+ "sample_004", "sample_005", "sample_006"
3481
+ ]
3482
+
3483
+ try:
3484
+ # 工作流定义
3485
+ workflow = []
3486
+
3487
+ # 1. 质量控制(并行)
3488
+ for sample in samples:
3489
+ workflow.append({
3490
+ "job_id": f"qc_{sample}",
3491
+ "cmd": f"fastqc /data/raw/{sample}.fastq.gz -o /data/qc/{sample}",
3492
+ "backend": Backend.LOCAL,
3493
+ "resource": {"cpus": 2, "memory_gb": 4},
3494
+ "tags": {"stage": "quality_control", "sample": sample}
3495
+ })
3496
+
3497
+ # 2. 序列比对(有依赖关系)
3498
+ for sample in samples:
3499
+ workflow.append({
3500
+ "job_id": f"align_{sample}",
3501
+ "cmd": f"bwa mem -t 8 /data/reference/hg38.fasta /data/raw/{sample}.fastq.gz > /data/aligned/{sample}.sam",
3502
+ "dependencies": [f"qc_{sample}"],
3503
+ "backend": Backend.SLURM,
3504
+ "resource": {"cpus": 8, "memory_gb": 16, "time_hours": 4},
3505
+ "tags": {"stage": "alignment", "sample": sample}
3506
+ })
3507
+
3508
+ # 3. 变异检测(批量处理)
3509
+ workflow.append({
3510
+ "job_id": "variant_calling",
3511
+ "cmd": "gatk HaplotypeCaller -R /data/reference/hg38.fasta -I /data/aligned/*.bam -O /data/variants/all_variants.vcf",
3512
+ "dependencies": [f"align_{sample}" for sample in samples],
3513
+ "backend": Backend.SLURM,
3514
+ "resource": {"cpus": 32, "memory_gb": 64, "time_hours": 8, "partition": "large"},
3515
+ "tags": {"stage": "variant_calling"}
3516
+ })
3517
+
3518
+ # 4. 注释分析
3519
+ workflow.append({
3520
+ "job_id": "annotation",
3521
+ "cmd": "annovar /data/variants/all_variants.vcf /data/annotations/ -buildver hg38",
3522
+ "dependencies": ["variant_calling"],
3523
+ "backend": Backend.LOCAL,
3524
+ "resource": {"cpus": 4, "memory_gb": 8},
3525
+ "tags": {"stage": "annotation"}
3526
+ })
3527
+
3528
+ # 5. 生成报告
3529
+ workflow.append({
3530
+ "job_id": "generate_report",
3531
+ "cmd": "python generate_report.py --vcf /data/variants/all_variants.vcf --output /data/report/final_report.html",
3532
+ "dependencies": ["annotation"],
3533
+ "tags": {"stage": "report"}
3534
+ })
3535
+
3536
+ print(f"开始生物信息学工作流,共 {len(workflow)} 个步骤...")
3537
+
3538
+ # 运行工作流
3539
+ results = scheduler.run_workflow(
3540
+ workflow=workflow,
3541
+ max_workers=3,
3542
+ name="bioinformatics_pipeline",
3543
+ stop_on_error=True
3544
+ )
3545
+
3546
+ # 分析结果
3547
+ successful = sum(1 for r in results.values() if r.success())
3548
+ total = len(results)
3549
+
3550
+ print(f"\n工作流完成: {successful}/{total} 个步骤成功")
3551
+
3552
+ if successful == total:
3553
+ final_result = results["generate_report"]
3554
+ print(f"🎉 分析完成! 报告已生成")
3555
+ print(f" 总耗时: {sum(r.duration or 0 for r in results.values()):.1f}秒")
3556
+
3557
+ # 显示各阶段耗时
3558
+ print("\n各阶段耗时:")
3559
+ for job_id, result in results.items():
3560
+ if result.duration:
3561
+ print(f" {job_id}: {result.duration:.1f}s")
3562
+
3563
+ finally:
3564
+ scheduler.shutdown()
3565
+
3566
+ def example_cloud_batch_processing():
3567
+ """
3568
+ Example 4: cloud batch processing.
3569
+ Simulates an AWS Batch or Azure Batch style scenario.
3570
+ """
3571
+ print("\n" + "="*60)
3572
+ print("示例4: 云批量处理")
3573
+ print("="*60)
3574
+
3575
+ # 模拟云作业调度
3576
+ scheduler = ContainerScheduler(
3577
+ max_concurrent=20, # 高并发
3578
+ job_store=JobStore("cloud_batch.db"),
3579
+ plugins=[
3580
+ NotificationPlugin(),
3581
+ ResourceLogger("cloud_resources.json")
3582
+ ]
3583
+ )
3584
+
3585
+ # Simulate a large number of data-processing tasks
3586
+ import random  # used by random.choice below; assumed not already imported at module level
+ tasks = []
3587
+ for i in range(100):
3588
+ task = {
3589
+ "task_id": f"task_{i:03d}",
3590
+ "input_file": f"s3://bucket/input/data_{i}.json",
3591
+ "output_file": f"s3://bucket/output/processed_{i}.parquet",
3592
+ "complexity": random.choice(["simple", "medium", "complex"])
3593
+ }
3594
+ tasks.append(task)
3595
+
3596
+ try:
3597
+ print(f"开始处理 {len(tasks)} 个云任务...")
3598
+
3599
+ # 根据任务复杂度分配资源
3600
+ job_definitions = []
3601
+ for task in tasks:
3602
+ if task["complexity"] == "simple":
3603
+ cpus = 2
3604
+ memory_gb = 4
3605
+ priority = JobPriority.LOW
3606
+ elif task["complexity"] == "medium":
3607
+ cpus = 4
3608
+ memory_gb = 8
3609
+ priority = JobPriority.NORMAL
3610
+ else: # complex
3611
+ cpus = 8
3612
+ memory_gb = 16
3613
+ priority = JobPriority.HIGH
3614
+
3615
+ job = JobDefinition(
3616
+ cmd=f"python cloud_processor.py --input {task['input_file']} --output {task['output_file']}",
3617
+ name=f"cloud_task_{task['task_id']}",
3618
+ backend=Backend.DOCKER,
3619
+ image="python:3.9-cloud",
3620
+ config=ExecutionConfig(
3621
+ env={
3622
+ "AWS_ACCESS_KEY_ID": "xxx",
3623
+ "AWS_SECRET_ACCESS_KEY": "xxx",
3624
+ "AWS_DEFAULT_REGION": "us-east-1"
3625
+ }
3626
+ ),
3627
+ resource=ResourceRequest(cpus=cpus, memory_gb=memory_gb),
3628
+ retry=RetryConfig(
3629
+ max_attempts=3,
3630
+ backoff_factor=2.0,
3631
+ retry_on_network_error=True
3632
+ ),
3633
+ priority=priority,
3634
+ tags={
3635
+ "cloud": "aws",
3636
+ "task_type": "batch_processing",
3637
+ "complexity": task["complexity"],
3638
+ "task_id": task["task_id"]
3639
+ },
3640
+ callback=lambda r, t=task: print(f" 任务 {t['task_id']} 完成: {r.status.value}")
3641
+ )
3642
+ job_definitions.append(job)
3643
+
3644
+ # 批量提交(模拟云批量作业)
3645
+ print("提交任务到云队列...")
3646
+ batch_size = 10 # 每批处理10个任务
3647
+ all_results = []
3648
+
3649
+ for i in range(0, len(job_definitions), batch_size):
3650
+ batch = job_definitions[i:i+batch_size]
3651
+ print(f"处理批次 {i//batch_size + 1}/{(len(job_definitions)+batch_size-1)//batch_size}...")
3652
+
3653
+ batch_results = scheduler.run_many(
3654
+ batch,
3655
+ max_workers=10,
3656
+ progress_callback=lambda c, t: None # 静默进度
3657
+ )
3658
+ all_results.extend(batch_results)
3659
+
3660
+ # 批次间短暂暂停
3661
+ time.sleep(2)
3662
+
3663
+ # 统计结果
3664
+ successful = sum(1 for r in all_results if r.success())
3665
+ failed = len(all_results) - successful
3666
+
3667
+ print(f"\n📊 批量处理完成统计:")
3668
+ print(f" 总任务数: {len(all_results)}")
3669
+ print(f" 成功: {successful}")
3670
+ print(f" 失败: {failed}")
3671
+
3672
+ if failed > 0:
3673
+ print("\n失败任务:")
3674
+ for result in all_results:
3675
+ if result.failed():
3676
+ print(f" {result.job_id}: {result.error_message}")
3677
+
3678
+ # 显示资源使用情况
3679
+ metrics = scheduler.get_metrics()
3680
+ print(f"\n💻 资源使用统计:")
3681
+ print(f" CPU小时: {metrics['total_cpu_hours']:.2f}")
3682
+ print(f" 内存GB小时: {metrics['total_memory_gb_hours']:.2f}")
3683
+ print(f" 平均作业时长: {metrics.get('avg_duration', 0):.1f}s")
3684
+
3685
+ finally:
3686
+ scheduler.shutdown()
3687
+
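The batching loop above slices the job list into fixed-size chunks, reports progress as batch i of ceil(n/batch_size), and pauses briefly between chunks. A standalone sketch of the same pattern, with run_batch standing in for scheduler.run_many (an assumption, not the package's API):

import time
from typing import Callable, List, Sequence

def run_in_batches(jobs: Sequence[str],
                   run_batch: Callable[[Sequence[str]], List[str]],
                   batch_size: int = 10,
                   pause_s: float = 0.0) -> List[str]:
    """Submit jobs in fixed-size chunks and collect all results in order."""
    results: List[str] = []
    n_batches = (len(jobs) + batch_size - 1) // batch_size  # ceiling division
    for i in range(0, len(jobs), batch_size):
        batch = jobs[i:i + batch_size]
        print(f"batch {i // batch_size + 1}/{n_batches}: {len(batch)} job(s)")
        results.extend(run_batch(batch))
        if pause_s and i + batch_size < len(jobs):
            time.sleep(pause_s)  # brief pause between batches, as in the example
    return results

if __name__ == "__main__":
    def fake_run(batch):
        return [f"done:{cmd}" for cmd in batch]

    out = run_in_batches([f"task_{i:03d}" for i in range(25)], fake_run, batch_size=10)
    print(f"{len(out)} results collected")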
3688
+ def example_real_time_monitoring():
3689
+ """
3690
+ Example 5: Real-time monitoring and alerting
3691
+ Simulates a production-environment monitoring scenario
3692
+ """
3693
+ print("\n" + "="*60)
3694
+ print("示例5: 实时监控系统")
3695
+ print("="*60)
3696
+
3697
+ # 创建带完整监控的调度器
3698
+ scheduler = ContainerScheduler(
3699
+ max_concurrent=10,
3700
+ job_store=JobStore("monitoring.db"),
3701
+ plugins=[
3702
+ NotificationPlugin(webhook_url="https://hooks.slack.com/services/XXX"),
3703
+ ResourceLogger("monitoring_logs.json")
3704
+ ],
3705
+ enable_timeout_monitor=True,
3706
+ enable_priority_queue=True
3707
+ )
3708
+
3709
+ # 监控任务定义
3710
+ monitoring_tasks = [
3711
+ {
3712
+ "name": "database_health_check",
3713
+ "cmd": "python check_database.py --host db-prod --timeout 30",
3714
+ "interval": 60, # 每60秒执行一次
3715
+ "timeout": 45,
3716
+ "priority": JobPriority.HIGH
3717
+ },
3718
+ {
3719
+ "name": "api_endpoint_check",
3720
+ "cmd": "curl -f https://api.example.com/health",
3721
+ "interval": 30,
3722
+ "timeout": 10,
3723
+ "retry": {"max_attempts": 2}
3724
+ },
3725
+ {
3726
+ "name": "disk_space_check",
3727
+ "cmd": "python check_disk.py --path / --threshold 90",
3728
+ "interval": 300,
3729
+ "priority": JobPriority.NORMAL
3730
+ },
3731
+ {
3732
+ "name": "service_metrics_collect",
3733
+ "cmd": "python collect_metrics.py --services web,api,cache,queue",
3734
+ "interval": 60,
3735
+ "resource": {"cpus": 2, "memory_gb": 4}
3736
+ },
3737
+ {
3738
+ "name": "log_analysis",
3739
+ "cmd": "python analyze_logs.py --logfile /var/log/app.log --pattern ERROR",
3740
+ "interval": 120,
3741
+ "backend": Backend.LOCAL,
3742
+ "resource": {"cpus": 4, "memory_gb": 8}
3743
+ }
3744
+ ]
3745
+
3746
+ try:
3747
+ print("启动实时监控系统...")
3748
+ print(f"监控任务数: {len(monitoring_tasks)}")
3749
+
3750
+ # 创建定期执行的任务
3751
+ monitor_threads = []
3752
+ stop_event = threading.Event()
3753
+
3754
+ for task_def in monitoring_tasks:
3755
+ def monitor_loop(def_copy=task_def, stop=stop_event):
3756
+ """监控循环"""
3757
+ task_name = def_copy["name"]
3758
+ interval = def_copy["interval"]
3759
+
3760
+ print(f" 启动监控: {task_name} (间隔: {interval}s)")
3761
+
3762
+ execution_count = 0
3763
+ while not stop.is_set():
3764
+ try:
3765
+ # 创建作业定义
3766
+ job = JobDefinition(
3767
+ cmd=def_copy["cmd"],
3768
+ name=f"monitor_{task_name}_{execution_count}",
3769
+ backend=def_copy.get("backend", Backend.LOCAL),
3770
+ config=ExecutionConfig(
3771
+ timeout=def_copy.get("timeout", 30),
3772
+ capture_output=True
3773
+ ),
3774
+ resource=ResourceRequest(**def_copy.get("resource", {"cpus": 1, "memory_gb": 1})),
3775
+ retry=RetryConfig(**def_copy.get("retry", {"max_attempts": 1})),
3776
+ priority=def_copy.get("priority", JobPriority.NORMAL),
3777
+ tags={
3778
+ "monitoring": "true",
3779
+ "task": task_name,
3780
+ "execution": str(execution_count)
3781
+ }
3782
+ )
3783
+
3784
+ # 提交作业(异步)
3785
+ future = scheduler.submit(job, wait=False)
3786
+
3787
+ # 记录执行
3788
+ execution_count += 1
3789
+
3790
+ # 等待间隔时间
3791
+ for _ in range(interval):
3792
+ if stop.is_set():
3793
+ break
3794
+ time.sleep(1)
3795
+
3796
+ except Exception as e:
3797
+ print(f"监控任务 {task_name} 错误: {e}")
3798
+ time.sleep(interval)
3799
+
3800
+ thread = threading.Thread(target=monitor_loop, daemon=True)
3801
+ thread.start()
3802
+ monitor_threads.append(thread)
3803
+
3804
+ # 运行监控一段时间
3805
+ print("\n监控系统运行中... (运行30秒演示)")
3806
+ print("按 Ctrl+C 停止监控")
3807
+
3808
+ # 演示期间显示实时状态
3809
+ for i in range(6): # 运行30秒
3810
+ if stop_event.is_set():
3811
+ break
3812
+
3813
+ time.sleep(5)
3814
+
3815
+ # 显示当前状态
3816
+ health = scheduler.health_check()
3817
+ metrics = scheduler.get_metrics()
3818
+
3819
+ print(f"\n[{datetime.now().strftime('%H:%M:%S')}] 系统状态:")
3820
+ print(f" 健康状态: {health['status']}")
3821
+ print(f" 运行作业: {metrics['current']['jobs_running']}")
3822
+ print(f" 总作业数: {metrics['jobs_total']}")
3823
+ print(f" 成功率: {metrics.get('success_rate', 0):.1%}")
3824
+
3825
+ # 如果有失败作业,显示警告
3826
+ failed_jobs = []
3827
+ if scheduler.job_store:
3828
+ failed_jobs = scheduler.job_store.search_jobs(status=JobStatus.FAILED, limit=3)
3829
+
3830
+ if failed_jobs:
3831
+ print(f" ⚠️ 最近失败作业:")
3832
+ for job in failed_jobs:
3833
+ result = scheduler.get_result(job.job_id)
3834
+ if result:
3835
+ print(f" {job.name}: {result.error_message}")
3836
+
3837
+ # 停止监控
3838
+ print("\n停止监控系统...")
3839
+ stop_event.set()
3840
+
3841
+ # 等待所有监控线程结束
3842
+ for thread in monitor_threads:
3843
+ thread.join(timeout=5)
3844
+
3845
+ # 最终报告
3846
+ print("\n📈 监控报告:")
3847
+ print("=" * 40)
3848
+
3849
+ metrics = scheduler.get_metrics()
3850
+ print(f"监控周期: 30秒")
3851
+ print(f"执行作业数: {metrics['jobs_total']}")
3852
+ print(f"成功作业: {metrics['jobs_completed']}")
3853
+ print(f"失败作业: {metrics['jobs_failed']}")
3854
+ print(f"成功率: {metrics.get('success_rate', 0):.1%}")
3855
+
3856
+ # 显示后端使用情况
3857
+ if metrics.get('backend_stats'):
3858
+ print("\n后端使用统计:")
3859
+ for backend, count in metrics['backend_stats'].items():
3860
+ print(f" {backend}: {count} 次")
3861
+
3862
+ except KeyboardInterrupt:
3863
+ print("\n监控被用户中断")
3864
+ finally:
3865
+ scheduler.shutdown()
3866
+ print("监控系统已关闭")
3867
+
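The monitor loops above sleep in one-second slices so they can notice the stop flag quickly; threading.Event.wait(timeout) gives the same interruptible pause in a single call. A minimal sketch, with submit standing in for scheduler.submit (an assumption, not the package's API):

import threading
import time
from typing import Callable

def monitor_loop(submit: Callable[[int], None],
                 interval: float,
                 stop: threading.Event) -> None:
    """Call submit roughly every `interval` seconds until `stop` is set."""
    execution_count = 0
    while not stop.is_set():
        submit(execution_count)
        execution_count += 1
        # Blocks up to `interval` seconds, but returns immediately once stop is set.
        stop.wait(timeout=interval)

if __name__ == "__main__":
    stop = threading.Event()
    worker = threading.Thread(
        target=monitor_loop,
        args=(lambda n: print(f"check #{n}"), 0.2, stop),
        daemon=True,
    )
    worker.start()
    time.sleep(1)      # let a few checks run
    stop.set()         # request shutdown; the loop exits within one interval
    worker.join(timeout=2)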
3868
+ def example_custom_workflow_orchestrator():
3869
+ """
3870
+ Example 6: Custom workflow orchestrator
3871
+ Complex dependency chains and conditional execution
3872
+ """
3873
+ print("\n" + "="*60)
3874
+ print("示例6: 自定义工作流编排器")
3875
+ print("="*60)
3876
+
3877
+ scheduler = ContainerScheduler(
3878
+ max_concurrent=8,
3879
+ job_store=JobStore("workflow_orchestrator.db"),
3880
+ enable_priority_queue=True
3881
+ )
3882
+
3883
+ # 定义复杂工作流
3884
+ workflow_def = {
3885
+ "name": "ml_pipeline_with_validation",
3886
+ "stages": [
3887
+ {
3888
+ "id": "data_extraction",
3889
+ "description": "从多个源提取数据",
3890
+ "parallel_tasks": [
3891
+ {
3892
+ "id": "extract_db",
3893
+ "cmd": "python extract_from_database.py --config db_config.yaml",
3894
+ "resource": {"cpus": 4, "memory_gb": 8}
3895
+ },
3896
+ {
3897
+ "id": "extract_api",
3898
+ "cmd": "python extract_from_api.py --endpoints api_endpoints.json",
3899
+ "resource": {"cpus": 2, "memory_gb": 4}
3900
+ },
3901
+ {
3902
+ "id": "extract_files",
3903
+ "cmd": "python extract_from_files.py --input /data/raw/ --pattern *.csv",
3904
+ "resource": {"cpus": 2, "memory_gb": 4}
3905
+ }
3906
+ ]
3907
+ },
3908
+ {
3909
+ "id": "data_validation",
3910
+ "description": "数据验证和质量检查",
3911
+ "dependencies": ["data_extraction"],
3912
+ "cmd": "python validate_data.py --sources extracted/ --output validation_report.json",
3913
+ "resource": {"cpus": 4, "memory_gb": 8},
3914
+ "condition": lambda r: r.exit_code == 0 # 只有成功才继续
3915
+ },
3916
+ {
3917
+ "id": "feature_engineering",
3918
+ "description": "特征工程(并行特征提取)",
3919
+ "dependencies": ["data_validation"],
3920
+ "parallel_tasks": [
3921
+ {
3922
+ "id": "numeric_features",
3923
+ "cmd": "python extract_numeric_features.py --input validated_data.parquet",
3924
+ "resource": {"cpus": 4, "memory_gb": 8}
3925
+ },
3926
+ {
3927
+ "id": "text_features",
3928
+ "cmd": "python extract_text_features.py --input validated_data.parquet",
3929
+ "resource": {"cpus": 4, "memory_gb": 16}
3930
+ },
3931
+ {
3932
+ "id": "time_features",
3933
+ "cmd": "python extract_time_features.py --input validated_data.parquet",
3934
+ "resource": {"cpus": 2, "memory_gb": 4}
3935
+ }
3936
+ ]
3937
+ },
3938
+ {
3939
+ "id": "model_training",
3940
+ "description": "模型训练和验证",
3941
+ "dependencies": ["feature_engineering"],
3942
+ "parallel_tasks": [
3943
+ {
3944
+ "id": "train_xgboost",
3945
+ "cmd": "python train_xgboost.py --features features/ --output models/xgboost.pkl",
3946
+ "resource": {"cpus": 8, "memory_gb": 16}
3947
+ },
3948
+ {
3949
+ "id": "train_nn",
3950
+ "cmd": "python train_neural_network.py --features features/ --output models/nn.h5",
3951
+ "backend": Backend.SLURM,
3952
+ "resource": {"cpus": 8, "memory_gb": 32, "gpus": 1}
3953
+ }
3954
+ ]
3955
+ },
3956
+ {
3957
+ "id": "model_evaluation",
3958
+ "description": "模型评估和选择",
3959
+ "dependencies": ["model_training"],
3960
+ "cmd": "python evaluate_models.py --models models/ --test_data test_set.parquet",
3961
+ "resource": {"cpus": 4, "memory_gb": 8}
3962
+ },
3963
+ {
3964
+ "id": "deployment_prep",
3965
+ "description": "部署准备",
3966
+ "dependencies": ["model_evaluation"],
3967
+ "cmd": "python prepare_deployment.py --best_model best_model.pkl --output deployment/",
3968
+ "resource": {"cpus": 2, "memory_gb": 4}
3969
+ }
3970
+ ]
3971
+ }
3972
+
3973
+ try:
3974
+ print(f"开始工作流: {workflow_def['name']}")
3975
+ print(f"阶段数: {len(workflow_def['stages'])}")
3976
+
3977
+ # 跟踪作业ID映射
3978
+ job_mapping = {}
3979
+ all_results = {}
3980
+
3981
+ # 执行每个阶段
3982
+ for stage in workflow_def['stages']:
3983
+ print(f"\n➤ 阶段: {stage['id']} - {stage['description']}")
3984
+
3985
+ # 检查依赖是否满足
3986
+ if 'dependencies' in stage:
3987
+ deps_satisfied = True
3988
+ for dep in stage['dependencies']:
3989
+ if dep not in all_results or not all_results[dep].success():
3990
+ deps_satisfied = False
3991
+ print(f" 等待依赖: {dep}")
3992
+ break
3993
+
3994
+ if not deps_satisfied:
3995
+ print(f" 跳过阶段 {stage['id']} (依赖未满足)")
3996
+ continue
3997
+
3998
+ # 并行任务
3999
+ if 'parallel_tasks' in stage:
4000
+ print(f" 并行任务数: {len(stage['parallel_tasks'])}")
4001
+
4002
+ # 创建并行作业
4003
+ parallel_jobs = []
4004
+ for task in stage['parallel_tasks']:
4005
+ job = JobDefinition(
4006
+ cmd=task['cmd'],
4007
+ name=f"{stage['id']}_{task['id']}",
4008
+ backend=task.get('backend', Backend.LOCAL),
4009
+ resource=ResourceRequest(**task.get('resource', {"cpus": 1, "memory_gb": 1})),
4010
+ tags={
4011
+ "workflow": workflow_def['name'],
4012
+ "stage": stage['id'],
4013
+ "task": task['id']
4014
+ }
4015
+ )
4016
+ parallel_jobs.append(job)
4017
+ job_mapping[job.job_id] = f"{stage['id']}.{task['id']}"
4018
+
4019
+ # 执行并行任务
4020
+ results = scheduler.run_many(
4021
+ parallel_jobs,
4022
+ max_workers=len(parallel_jobs),
4023
+ stop_on_error=True
4024
+ )
4025
+
4026
+ # 存储结果
4027
+ for job, result in zip(parallel_jobs, results):
4028
+ task_id = job_mapping[job.job_id]
4029
+ all_results[task_id] = result
4030
+ print(f" 任务 {task_id}: {result.status.value}")
4031
+
4032
+ # 检查是否所有并行任务都成功
4033
+ all_success = all(r.success() for r in results)
4034
+ if not all_success:
4035
+ print(f" 阶段 {stage['id']} 有任务失败,停止工作流")
4036
+ break
4037
+
4038
+ # 将整个阶段标记为成功
4039
+ stage_result = JobResult(
4040
+ job_id=stage['id'],
4041
+ status=JobStatus.COMPLETED if all_success else JobStatus.FAILED
4042
+ )
4043
+ all_results[stage['id']] = stage_result
4044
+
4045
+ # 单一任务
4046
+ elif 'cmd' in stage:
4047
+ job = JobDefinition(
4048
+ cmd=stage['cmd'],
4049
+ name=stage['id'],
4050
+ resource=ResourceRequest(**stage.get('resource', {"cpus": 1, "memory_gb": 1})),
4051
+ tags={
4052
+ "workflow": workflow_def['name'],
4053
+ "stage": stage['id']
4054
+ }
4055
+ )
4056
+
4057
+ result = scheduler.submit(job, wait=True)
4058
+ all_results[stage['id']] = result
4059
+
4060
+ print(f" 结果: {result.status.value}, 耗时: {result.duration:.1f}s")
4061
+
4062
+ # 检查条件
4063
+ if 'condition' in stage and callable(stage['condition']):
4064
+ if not stage['condition'](result):
4065
+ print(f" 条件不满足,停止工作流")
4066
+ break
4067
+
4068
+ # 工作流完成报告
4069
+ print("\n" + "="*40)
4070
+ print("工作流完成报告")
4071
+ print("="*40)
4072
+
4073
+ successful_stages = sum(1 for k, v in all_results.items() if '.' not in k and v.success())
4074
+ total_stages = sum(1 for k in all_results.keys() if '.' not in k)
4075
+
4076
+ print(f"完成阶段: {successful_stages}/{total_stages}")
4077
+ print(f"总作业数: {scheduler.metrics_collector.get_metrics()['jobs_total']}")
4078
+
4079
+ if successful_stages == total_stages:
4080
+ print("✅ 工作流完全成功!")
4081
+ else:
4082
+ print("⚠️ 工作流部分完成")
4083
+
4084
+ # 显示失败阶段
4085
+ print("\n失败阶段:")
4086
+ for stage_id, result in all_results.items():
4087
+ if '.' not in stage_id and result.failed():
4088
+ print(f" {stage_id}: {result.error_message}")
4089
+
4090
+ finally:
4091
+ scheduler.shutdown()
4092
+
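The orchestrator above runs a stage only when every entry in its dependencies list has already finished successfully. Reduced to plain dicts and booleans, the gate looks roughly like this (a sketch, not the package's API):

from typing import Dict, List, Mapping

def deps_satisfied(stage: Mapping, finished: Dict[str, bool]) -> bool:
    """True if every declared dependency already completed successfully."""
    return all(finished.get(dep, False) for dep in stage.get("dependencies", []))

def run_stages(stages: List[Mapping]) -> Dict[str, bool]:
    """Walk stages in list order, skipping any whose dependencies are unmet."""
    finished: Dict[str, bool] = {}
    for stage in stages:
        if not deps_satisfied(stage, finished):
            print(f"skip {stage['id']} (unmet dependencies)")
            continue
        print(f"run  {stage['id']}")
        finished[stage["id"]] = True  # a real runner would store the JobResult here
    return finished

if __name__ == "__main__":
    run_stages([
        {"id": "extract"},
        {"id": "validate", "dependencies": ["extract"]},
        {"id": "train", "dependencies": ["validate", "features"]},  # "features" never ran
    ])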
4093
+ def example_disaster_recovery_drills():
4094
+ """
4095
+ Example 7: Disaster-recovery drill
4096
+ Simulates system failure and the recovery process
4097
+ """
4098
+ print("\n" + "="*60)
4099
+ print("示例7: 灾难恢复演练")
4100
+ print("="*60)
4101
+
4102
+ # 创建具有高可用性特性的调度器
4103
+ scheduler = ContainerScheduler(
4104
+ max_concurrent=5,
4105
+ job_store=JobStore("dr_drill.db"),
4106
+ result_cache=ResultCache(".dr_cache"),
4107
+ plugins=[
4108
+ NotificationPlugin(email="admin@example.com")
4109
+ ],
4110
+ enable_priority_queue=True,
4111
+ enable_timeout_monitor=True
4112
+ )
4113
+
4114
+ try:
4115
+ print("开始灾难恢复演练...")
4116
+
4117
+ # 模拟正常操作
4118
+ print("\n阶段1: 正常操作")
4119
+ normal_operations = [
4120
+ "python process_transactions.py --batch-size 1000",
4121
+ "python generate_reports.py --date $(date +%Y-%m-%d)",
4122
+ "python backup_database.py --incremental",
4123
+ "python monitor_services.py --all",
4124
+ "python cleanup_logs.py --older-than 7d"
4125
+ ]
4126
+
4127
+ normal_results = scheduler.run_many(
4128
+ normal_operations,
4129
+ progress_callback=lambda c, t: print(f" 正常操作进度: {c}/{t}")
4130
+ )
4131
+
4132
+ # 模拟故障注入
4133
+ print("\n阶段2: 故障注入和检测")
4134
+ fault_jobs = [
4135
+ {
4136
+ "name": "simulate_network_partition",
4137
+ "cmd": "python simulate_fault.py --type network --duration 30",
4138
+ "retry": {"max_attempts": 5, "delay_seconds": 10},
4139
+ "tags": {"dr_test": "network_failure"}
4140
+ },
4141
+ {
4142
+ "name": "simulate_disk_failure",
4143
+ "cmd": "python simulate_fault.py --type disk --path /data --severity high",
4144
+ "priority": JobPriority.HIGH,
4145
+ "tags": {"dr_test": "disk_failure"}
4146
+ },
4147
+ {
4148
+ "name": "simulate_service_outage",
4149
+ "cmd": "python simulate_fault.py --type service --services db,cache,queue",
4150
+ "timeout": 60,
4151
+ "tags": {"dr_test": "service_outage"}
4152
+ }
4153
+ ]
4154
+
4155
+ fault_results = scheduler.run_many(
4156
+ fault_jobs,
4157
+ stop_on_error=True
4158
+ )
4159
+
4160
+ # 检查系统健康状态
4161
+ print("\n阶段3: 系统健康检查")
4162
+ health = scheduler.health_check()
4163
+
4164
+ if health["status"] != "healthy":
4165
+ print(f"⚠️ 系统健康状态: {health['status']}")
4166
+ print("触发恢复程序...")
4167
+
4168
+ # 执行恢复步骤
4169
+ recovery_steps = [
4170
+ {
4171
+ "step": "1. 故障隔离",
4172
+ "cmd": "python isolate_fault.py --diagnosis fault_report.json",
4173
+ "priority": JobPriority.CRITICAL
4174
+ },
4175
+ {
4176
+ "step": "2. 启动备用系统",
4177
+ "cmd": "python start_backup_systems.py --components db,cache",
4178
+ "resource": {"cpus": 8, "memory_gb": 16}
4179
+ },
4180
+ {
4181
+ "step": "3. 数据恢复",
4182
+ "cmd": "python restore_data.py --backup latest --target /data",
4183
+ "timeout": 300,
4184
+ "retry": {"max_attempts": 3}
4185
+ },
4186
+ {
4187
+ "step": "4. 服务恢复",
4188
+ "cmd": "python restore_services.py --services all --validate",
4189
+ "priority": JobPriority.HIGH
4190
+ },
4191
+ {
4192
+ "step": "5. 数据同步",
4193
+ "cmd": "python sync_data.py --source backup --target production",
4194
+ "timeout": 600
4195
+ }
4196
+ ]
4197
+
4198
+ print("\n执行恢复步骤:")
4199
+ recovery_results = []
4200
+
4201
+ for step in recovery_steps:
4202
+ print(f" {step['step']}...")
4203
+
4204
+ job = JobDefinition(
4205
+ cmd=step['cmd'],
4206
+ name=f"recovery_{step['step'].split('.')[0]}",
4207
+ priority=step.get('priority', JobPriority.NORMAL),
4208
+ config=ExecutionConfig(
4209
+ timeout=step.get('timeout', 60)
4210
+ ),
4211
+ resource=ResourceRequest(**step.get('resource', {"cpus": 2, "memory_gb": 4})),
4212
+ retry=RetryConfig(**step.get('retry', {"max_attempts": 1})),
4213
+ tags={"dr_test": "recovery", "step": step['step']}
4214
+ )
4215
+
4216
+ result = scheduler.submit(job, wait=True)
4217
+ recovery_results.append(result)
4218
+
4219
+ if result.success():
4220
+ print(f" ✅ 完成")
4221
+ else:
4222
+ print(f" ❌ 失败: {result.error_message}")
4223
+
4224
+ # 验证恢复
4225
+ print("\n阶段4: 恢复验证")
4226
+ verification_jobs = [
4227
+ "python verify_system.py --full-check",
4228
+ "python verify_data.py --integrity --consistency",
4229
+ "python verify_services.py --all --timeout 30",
4230
+ "python verify_performance.py --baseline baseline_metrics.json"
4231
+ ]
4232
+
4233
+ verification_results = scheduler.run_many(verification_jobs)
4234
+
4235
+ successful_verifications = sum(1 for r in verification_results if r.success())
4236
+
4237
+ if successful_verifications == len(verification_results):
4238
+ print("🎉 灾难恢复演练成功完成!")
4239
+ print(" 所有系统功能正常恢复")
4240
+ else:
4241
+ print("⚠️ 恢复验证部分失败")
4242
+ print(f" 成功验证: {successful_verifications}/{len(verification_results)}")
4243
+
4244
+ else:
4245
+ print("系统仍然健康,故障被自动恢复")
4246
+
4247
+ # 生成演练报告
4248
+ print("\n📋 灾难恢复演练报告:")
4249
+ print("="*40)
4250
+
4251
+ metrics = scheduler.get_metrics()
4252
+ total_jobs = metrics['jobs_total']
4253
+ successful_jobs = metrics['jobs_completed']
4254
+ success_rate = metrics.get('success_rate', 0)
4255
+
4256
+ print(f"总作业数: {total_jobs}")
4257
+ print(f"成功作业: {successful_jobs}")
4258
+ print(f"成功率: {success_rate:.1%}")
4259
+ print(f"重试次数: {metrics.get('retries_total', 0)}")
4260
+
4261
+ # 显示演练耗时
4262
+ if scheduler.job_store:
4263
+ all_jobs = scheduler.job_store.search_jobs(tags={"dr_test": True})
4264
+ total_duration = 0
4265
+ for job in all_jobs:
4266
+ result = scheduler.get_result(job.job_id)
4267
+ if result and result.duration:
4268
+ total_duration += result.duration
4269
+
4270
+ print(f"总演练耗时: {total_duration:.1f}秒")
4271
+
4272
+ finally:
4273
+ scheduler.shutdown()
4274
+ print("\n灾难恢复演练完成")
4275
+
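The retry settings used in these examples (max_attempts, delay_seconds, backoff_factor) suggest an exponential backoff schedule. One plausible reading as a standalone helper — the actual RetryConfig semantics live in the package itself:

import time
from typing import Callable, TypeVar

T = TypeVar("T")

def call_with_retry(fn: Callable[[], T],
                    max_attempts: int = 3,
                    delay_seconds: float = 1.0,
                    backoff_factor: float = 2.0) -> T:
    """Retry fn(); wait delay_seconds * backoff_factor**(attempt - 1) between attempts."""
    for attempt in range(1, max_attempts + 1):
        try:
            return fn()
        except Exception as exc:
            if attempt == max_attempts:
                raise
            wait = delay_seconds * backoff_factor ** (attempt - 1)
            print(f"attempt {attempt} failed ({exc}); retrying in {wait:.1f}s")
            time.sleep(wait)
    raise RuntimeError("unreachable for max_attempts >= 1")

if __name__ == "__main__":
    outcomes = iter([RuntimeError("boom"), RuntimeError("boom"), "ok"])

    def flaky_step():
        value = next(outcomes)
        if isinstance(value, Exception):
            raise value
        return value

    print(call_with_retry(flaky_step, max_attempts=3, delay_seconds=0.1))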
4276
+ def example_edge_computing_scenario():
4277
+ """
4278
+ Example 8: Edge-computing scenario
4279
+ Distributed task scheduling across edge nodes
4280
+ """
4281
+ print("\n" + "="*60)
4282
+ print("示例8: 边缘计算场景")
4283
+ print("="*60)
4284
+
4285
+ # 模拟多个边缘节点
4286
+ edge_nodes = [
4287
+ {"id": "edge-01", "location": "factory-floor", "cpus": 8, "memory_gb": 16, "gpus": 1},
4288
+ {"id": "edge-02", "location": "warehouse", "cpus": 4, "memory_gb": 8, "gpus": 0},
4289
+ {"id": "edge-03", "location": "retail-store", "cpus": 2, "memory_gb": 4, "gpus": 0},
4290
+ {"id": "edge-04", "location": "field-office", "cpus": 4, "memory_gb": 8, "gpus": 0},
4291
+ {"id": "edge-05", "location": "research-lab", "cpus": 16, "memory_gb": 32, "gpus": 2}
4292
+ ]
4293
+
4294
+ # 创建主调度器
4295
+ master_scheduler = ContainerScheduler(
4296
+ max_concurrent=10,
4297
+ job_store=JobStore("edge_computing.db"),
4298
+ plugins=[NotificationPlugin()]
4299
+ )
4300
+
4301
+ # 为每个边缘节点创建子调度器(模拟)
4302
+ edge_schedulers = {}
4303
+
4304
+ try:
4305
+ print(f"初始化 {len(edge_nodes)} 个边缘节点...")
4306
+
4307
+ # 边缘计算任务
4308
+ edge_tasks = []
4309
+
4310
+ # 1. 实时视频分析
4311
+ for camera_id in range(5):
4312
+ task = {
4313
+ "type": "video_analytics",
4314
+ "cmd": f"python analyze_video.py --camera {camera_id} --model person_detection",
4315
+ "requirements": {"gpus": 1, "latency": "low"},
4316
+ "priority": JobPriority.HIGH
4317
+ }
4318
+ edge_tasks.append(task)
4319
+
4320
+ # 2. 传感器数据处理
4321
+ for sensor_group in ["temperature", "humidity", "vibration", "pressure"]:
4322
+ task = {
4323
+ "type": "sensor_processing",
4324
+ "cmd": f"python process_sensors.py --type {sensor_group} --window 60",
4325
+ "requirements": {"cpus": 2, "interval": 60},
4326
+ "priority": JobPriority.NORMAL
4327
+ }
4328
+ edge_tasks.append(task)
4329
+
4330
+ # 3. 预测性维护
4331
+ task = {
4332
+ "type": "predictive_maintenance",
4333
+ "cmd": "python predictive_maintenance.py --equipment all --horizon 24",
4334
+ "requirements": {"cpus": 4, "memory_gb": 8},
4335
+ "priority": JobPriority.HIGH
4336
+ }
4337
+ edge_tasks.append(task)
4338
+
4339
+ # 4. 本地AI推理
4340
+ for model in ["defect_detection", "quality_inspection", "anomaly_detection"]:
4341
+ task = {
4342
+ "type": "ai_inference",
4343
+ "cmd": f"python run_inference.py --model {model} --input /data/latest",
4344
+ "requirements": {"gpus": 1, "memory_gb": 4},
4345
+ "priority": JobPriority.CRITICAL
4346
+ }
4347
+ edge_tasks.append(task)
4348
+
4349
+ print(f"总共 {len(edge_tasks)} 个边缘计算任务")
4350
+
4351
+ # 任务分发策略
4352
+ print("\n任务分发到边缘节点...")
4353
+
4354
+ scheduled_tasks = []
4355
+ for task in edge_tasks:
4356
+ # 选择最适合的边缘节点
4357
+ suitable_nodes = []
4358
+ for node in edge_nodes:
4359
+ suitable = True
4360
+
4361
+ # 检查GPU需求
4362
+ if task["requirements"].get("gpus", 0) > 0 and node["gpus"] == 0:
4363
+ suitable = False
4364
+
4365
+ # 检查CPU需求
4366
+ if task["requirements"].get("cpus", 1) > node["cpus"]:
4367
+ suitable = False
4368
+
4369
+ # 检查内存需求
4370
+ if task["requirements"].get("memory_gb", 1) > node["memory_gb"]:
4371
+ suitable = False
4372
+
4373
+ if suitable:
4374
+ suitable_nodes.append(node)
4375
+
4376
+ if suitable_nodes:
4377
+ # 选择负载最低的节点(简化策略)
4378
+ selected_node = suitable_nodes[0]
4379
+
4380
+ job = JobDefinition(
4381
+ cmd=task["cmd"],
4382
+ name=f"edge_{task['type']}_{selected_node['id']}",
4383
+ backend=Backend.LOCAL, # 假设边缘节点使用本地执行
4384
+ resource=ResourceRequest(
4385
+ cpus=task["requirements"].get("cpus", 1),
4386
+ memory_gb=task["requirements"].get("memory_gb", 1),
4387
+ gpus=task["requirements"].get("gpus", 0)
4388
+ ),
4389
+ priority=task["priority"],
4390
+ tags={
4391
+ "edge_computing": "true",
4392
+ "node_id": selected_node["id"],
4393
+ "location": selected_node["location"],
4394
+ "task_type": task["type"],
4395
+ "latency": task["requirements"].get("latency", "normal")
4396
+ }
4397
+ )
4398
+
4399
+ scheduled_tasks.append(job)
4400
+ print(f" 任务 '{task['type']}' 分配到节点 '{selected_node['id']}'")
4401
+ else:
4402
+ print(f" ⚠️ 任务 '{task['type']}' 无合适节点,调度到云端")
4403
+
4404
+ # 调度到云
4405
+ cloud_job = JobDefinition(
4406
+ cmd=task["cmd"],
4407
+ name=f"cloud_{task['type']}",
4408
+ backend=Backend.AWS_BATCH, # 假设使用AWS Batch
4409
+ resource=ResourceRequest(
4410
+ cpus=task["requirements"].get("cpus", 1),
4411
+ memory_gb=task["requirements"].get("memory_gb", 1),
4412
+ gpus=task["requirements"].get("gpus", 0)
4413
+ ),
4414
+ priority=task["priority"],
4415
+ tags={
4416
+ "edge_computing": "true",
4417
+ "node_id": "cloud",
4418
+ "task_type": task["type"]
4419
+ }
4420
+ )
4421
+ scheduled_tasks.append(cloud_job)
4422
+
4423
+ # 执行所有任务
4424
+ print(f"\n开始执行 {len(scheduled_tasks)} 个边缘计算任务...")
4425
+
4426
+ results = master_scheduler.run_many(
4427
+ scheduled_tasks,
4428
+ max_workers=5,
4429
+ progress_callback=lambda c, t: print(f" 执行进度: {c}/{t}")
4430
+ )
4431
+
4432
+ # 分析结果
4433
+ print("\n📊 边缘计算任务执行统计:")
4434
+
4435
+ # 按节点统计
4436
+ node_stats = {}
4437
+ for job, result in zip(scheduled_tasks, results):
4438
+ node_id = job.tags.get("node_id", "unknown")
4439
+ if node_id not in node_stats:
4440
+ node_stats[node_id] = {"total": 0, "success": 0}
4441
+
4442
+ node_stats[node_id]["total"] += 1
4443
+ if result.success():
4444
+ node_stats[node_id]["success"] += 1
4445
+
4446
+ for node_id, stats in node_stats.items():
4447
+ success_rate = stats["success"] / stats["total"] if stats["total"] > 0 else 0
4448
+ print(f" 节点 {node_id}: {stats['success']}/{stats['total']} 成功 ({success_rate:.1%})")
4449
+
4450
+ # 按任务类型统计
4451
+ type_stats = {}
4452
+ for job, result in zip(scheduled_tasks, results):
4453
+ task_type = job.tags.get("task_type", "unknown")
4454
+ if task_type not in type_stats:
4455
+ type_stats[task_type] = {"total": 0, "success": 0}
4456
+
4457
+ type_stats[task_type]["total"] += 1
4458
+ if result.success():
4459
+ type_stats[task_type]["success"] += 1
4460
+
4461
+ print("\n📈 按任务类型统计:")
4462
+ for task_type, stats in type_stats.items():
4463
+ success_rate = stats["success"] / stats["total"] if stats["total"] > 0 else 0
4464
+ print(f" {task_type}: {stats['success']}/{stats['total']} 成功 ({success_rate:.1%})")
4465
+
4466
+ # 总体统计
4467
+ total_success = sum(1 for r in results if r.success())
4468
+ total_tasks = len(results)
4469
+ overall_success_rate = total_success / total_tasks if total_tasks > 0 else 0
4470
+
4471
+ print(f"\n🎯 总体统计:")
4472
+ print(f" 总任务数: {total_tasks}")
4473
+ print(f" 成功任务: {total_success}")
4474
+ print(f" 成功率: {overall_success_rate:.1%}")
4475
+
4476
+ # 计算平均延迟
4477
+ successful_results = [r for r in results if r.success() and r.duration]
4478
+ if successful_results:
4479
+ avg_duration = sum(r.duration for r in successful_results) / len(successful_results)
4480
+ print(f" 平均执行时间: {avg_duration:.2f}秒")
4481
+
4482
+ # 低延迟任务统计
4483
+ low_latency_tasks = [job for job in scheduled_tasks if job.tags.get("latency") == "low"]
4484
+ if low_latency_tasks:
4485
+ low_latency_durations = []
4486
+ for job in low_latency_tasks:
4487
+ result = next((r for r in results if r.job_id == job.job_id), None)
4488
+ if result and result.duration:
4489
+ low_latency_durations.append(result.duration)
4490
+
4491
+ if low_latency_durations:
4492
+ avg_low_latency = sum(low_latency_durations) / len(low_latency_durations)
4493
+ print(f" 低延迟任务平均时间: {avg_low_latency:.2f}秒")
4494
+
4495
+ finally:
4496
+ master_scheduler.shutdown()
4497
+ print("\n边缘计算场景模拟完成")
4498
+
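The placement logic above filters edge nodes on GPU, CPU, and memory requirements, takes the first match, and falls back to a cloud backend when nothing fits. The same check as a standalone sketch (plain dicts; not the package's API):

from typing import Dict, List, Optional

def pick_node(requirements: Dict[str, int], nodes: List[Dict]) -> Optional[Dict]:
    """Return the first node satisfying the CPU/memory/GPU requirements, else None."""
    for node in nodes:
        if requirements.get("gpus", 0) > node.get("gpus", 0):
            continue
        if requirements.get("cpus", 1) > node["cpus"]:
            continue
        if requirements.get("memory_gb", 1) > node["memory_gb"]:
            continue
        return node
    return None  # caller falls back to a cloud backend, as in the example

if __name__ == "__main__":
    nodes = [
        {"id": "edge-02", "cpus": 4, "memory_gb": 8, "gpus": 0},
        {"id": "edge-01", "cpus": 8, "memory_gb": 16, "gpus": 1},
    ]
    print(pick_node({"gpus": 1, "memory_gb": 4}, nodes))   # -> edge-01
    print(pick_node({"cpus": 32}, nodes))                  # -> None (offload to cloud)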
4499
+ # ============================================================================
4500
+ # 运行所有示例
4501
+ # ============================================================================
4502
+
4503
+ def run_all_examples():
4504
+ """运行所有示例场景"""
4505
+ print("通用容器调度器 - 高级使用场景示例")
4506
+ print("="*70)
4507
+
4508
+ examples = [
4509
+ ("数据处理流水线", example_data_processing_pipeline),
4510
+ ("机器学习训练", example_machine_learning_training),
4511
+ ("生物信息学工作流", example_bioinformatics_workflow),
4512
+ ("云批量处理", example_cloud_batch_processing),
4513
+ ("实时监控系统", example_real_time_monitoring),
4514
+ ("自定义工作流编排", example_custom_workflow_orchestrator),
4515
+ ("灾难恢复演练", example_disaster_recovery_drills),
4516
+ ("边缘计算场景", example_edge_computing_scenario),
4517
+ ]
4518
+
4519
+ for i, (name, func) in enumerate(examples, 1):
4520
+ print(f"\n示例 {i}: {name}")
4521
+ print("-"*40)
4522
+
4523
+ try:
4524
+ func()
4525
+ print(f"✅ {name} 示例完成")
4526
+ except KeyboardInterrupt:
4527
+ print(f"⏹️ {name} 示例被中断")
4528
+ break
4529
+ except Exception as e:
4530
+ print(f"❌ {name} 示例错误: {e}")
4531
+ import traceback
4532
+ traceback.print_exc()
4533
+
4534
+ # 示例间暂停
4535
+ if i < len(examples):
4536
+ print("\n" + "="*70)
4537
+ input("按 Enter 键继续下一个示例...")
4538
+
4539
+ print("\n" + "="*70)
4540
+ print("所有示例运行完成!")
4541
+
4542
+ def quick_demo():
4543
+ """快速演示核心功能"""
4544
+ print("快速演示 - 核心功能")
4545
+ print("="*50)
4546
+
4547
+ # 1. 基本使用
4548
+ print("\n1. 基本命令执行:")
4549
+ scheduler = ContainerScheduler()
4550
+ result = scheduler.run("echo 'Hello from Universal Scheduler!'")
4551
+ print(f" 状态: {result.status.value}, 输出: {result.stdout}")
4552
+
4553
+ # 2. 批量处理
4554
+ print("\n2. 批量作业处理:")
4555
+ commands = [f"echo 'Task {i}' && sleep 0.1" for i in range(5)]
4556
+ results = scheduler.run_many(commands, max_workers=3)
4557
+ print(f" 完成 {len(results)} 个任务, {sum(1 for r in results if r.success())} 个成功")
4558
+
4559
+ # 3. 工作流示例
4560
+ print("\n3. 简单工作流:")
4561
+ workflow = [
4562
+ {"cmd": "echo 'Step 1: Data extraction'", "job_id": "step1"},
4563
+ {"cmd": "echo 'Step 2: Processing'", "job_id": "step2", "dependencies": ["step1"]},
4564
+ {"cmd": "echo 'Step 3: Analysis'", "job_id": "step3", "dependencies": ["step2"]},
4565
+ ]
4566
+ results = scheduler.run_workflow(workflow)
4567
+ print(f" 工作流完成: {len(results)}/{len(workflow)} 步骤成功")
4568
+
4569
+ # 4. 健康检查
4570
+ print("\n4. 系统健康检查:")
4571
+ health = scheduler.health_check()
4572
+ print(f" 健康状态: {health['status']}")
4573
+
4574
+ # 5. 指标查看
4575
+ print("\n5. 性能指标:")
4576
+ metrics = scheduler.get_metrics()
4577
+ print(f" 总作业数: {metrics['jobs_total']}")
4578
+ print(f" 成功率: {metrics.get('success_rate', 0):.1%}")
4579
+
4580
+ scheduler.shutdown()
4581
+ print("\n✅ 快速演示完成!")
4582
+
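The quick demo's workflow is a list of {"cmd", "job_id", "dependencies"} dicts executed in order, so every dependency should name a job_id defined earlier in the list. A small sanity check in that spirit (a sketch, not part of the package):

from typing import Dict, List

def check_workflow(steps: List[Dict]) -> List[str]:
    """Return human-readable problems; an empty list means the workflow looks sane."""
    problems: List[str] = []
    seen = set()
    for step in steps:
        for dep in step.get("dependencies", []):
            if dep not in seen:
                problems.append(f"{step['job_id']} depends on '{dep}', which is not defined earlier")
        seen.add(step["job_id"])
    return problems

if __name__ == "__main__":
    workflow = [
        {"cmd": "echo 'Step 1: Data extraction'", "job_id": "step1"},
        {"cmd": "echo 'Step 2: Processing'", "job_id": "step2", "dependencies": ["step1"]},
        {"cmd": "echo 'Step 3: Analysis'", "job_id": "step3", "dependencies": ["step9"]},  # typo
    ]
    for problem in check_workflow(workflow):
        print("problem:", problem)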
4583
+ # ============================================================================
4584
+ # 主入口
4585
+ # ============================================================================
4586
+
4587
+ if __name__ == "__main__":
4588
+ import argparse
4589
+
4590
+ parser = argparse.ArgumentParser(description="通用容器调度器示例")
4591
+ parser.add_argument("--demo", action="store_true", help="运行快速演示")
4592
+ parser.add_argument("--all", action="store_true", help="运行所有示例")
4593
+ parser.add_argument("--example", type=int, choices=range(1, 9),
4594
+ help="运行特定示例 (1-8)")
4595
+
4596
+ args = parser.parse_args()
4597
+
4598
+ if args.demo:
4599
+ quick_demo()
4600
+ elif args.all:
4601
+ run_all_examples()
4602
+ elif args.example:
4603
+ examples = [
4604
+ example_data_processing_pipeline,
4605
+ example_machine_learning_training,
4606
+ example_bioinformatics_workflow,
4607
+ example_cloud_batch_processing,
4608
+ example_real_time_monitoring,
4609
+ example_custom_workflow_orchestrator,
4610
+ example_disaster_recovery_drills,
4611
+ example_edge_computing_scenario,
4612
+ ]
4613
+ if 1 <= args.example <= len(examples):
4614
+ examples[args.example - 1]()
4615
+ else:
4616
+ print(f"示例编号 {args.example} 无效,可用范围: 1-{len(examples)}")
4617
+ else:
4618
+ print("通用容器调度器 - 使用示例")
4619
+ print("\n用法:")
4620
+ print(" python examples.py --demo # 快速演示")
4621
+ print(" python examples.py --all # 运行所有示例")
4622
+ print(" python examples.py --example N # 运行特定示例")
4623
+ print("\n示例列表:")
4624
+ examples = [
4625
+ "1. 数据处理流水线",
4626
+ "2. 机器学习训练",
4627
+ "3. 生物信息学工作流",
4628
+ "4. 云批量处理",
4629
+ "5. 实时监控系统",
4630
+ "6. 自定义工作流编排",
4631
+ "7. 灾难恢复演练",
4632
+ "8. 边缘计算场景",
4633
+ ]
4634
+ for example in examples:
4635
+ print(f" {example}")