celestialflow 3.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- celestialflow/__init__.py +39 -0
- celestialflow/task_graph.py +665 -0
- celestialflow/task_logging.py +154 -0
- celestialflow/task_manage.py +1070 -0
- celestialflow/task_nodes.py +160 -0
- celestialflow/task_progress.py +57 -0
- celestialflow/task_report.py +162 -0
- celestialflow/task_structure.py +151 -0
- celestialflow/task_tools.py +501 -0
- celestialflow/task_types.py +61 -0
- celestialflow/task_web.py +170 -0
- celestialflow-3.0.1.dist-info/METADATA +301 -0
- celestialflow-3.0.1.dist-info/RECORD +15 -0
- celestialflow-3.0.1.dist-info/WHEEL +5 -0
- celestialflow-3.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# 版本 3.0.1
|
|
3
|
+
# 作者:Mr-xiaotian, GPT-4o, GPT-5
|
|
4
|
+
# 时间:11/5/2025
|
|
5
|
+
# Github: https://github.com/Mr-xiaotian/CelestialFlow
|
|
6
|
+
|
|
7
|
+
from .task_graph import TaskGraph
|
|
8
|
+
from .task_manage import TaskManager
|
|
9
|
+
from .task_nodes import TaskSplitter, TaskRedisTransfer
|
|
10
|
+
from .task_structure import (
|
|
11
|
+
TaskChain,
|
|
12
|
+
TaskLoop,
|
|
13
|
+
TaskCross,
|
|
14
|
+
TaskComplete,
|
|
15
|
+
TaskWheel,
|
|
16
|
+
TaskGrid,
|
|
17
|
+
)
|
|
18
|
+
from .task_types import TerminationSignal
|
|
19
|
+
from .task_tools import load_task_by_stage, load_task_by_error, make_hashable, format_table
|
|
20
|
+
from .task_web import TaskWebServer
|
|
21
|
+
|
|
22
|
+
# Public names of the package; every entry is imported at the top of this
# module. Comments name the submodule each group is imported from.
__all__ = [
    # task_graph
    "TaskGraph",
    # task_structure
    "TaskChain",
    "TaskLoop",
    "TaskCross",
    "TaskComplete",
    "TaskWheel",
    "TaskGrid",
    # task_manage
    "TaskManager",
    # task_nodes
    "TaskSplitter",
    "TaskRedisTransfer",
    # task_types
    "TerminationSignal",
    # task_web
    "TaskWebServer",
    # task_tools
    "load_task_by_stage",
    "load_task_by_error",
    "make_hashable",
    "format_table",
]
|
|
@@ -0,0 +1,665 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import multiprocessing
|
|
3
|
+
from collections import defaultdict, deque
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from multiprocessing import Value as MPValue, Lock as MPLock
|
|
6
|
+
from multiprocessing import Queue as MPQueue
|
|
7
|
+
from typing import Any, Dict, List, Tuple
|
|
8
|
+
|
|
9
|
+
from .task_manage import TaskManager
|
|
10
|
+
from .task_nodes import TaskSplitter
|
|
11
|
+
from .task_report import TaskReporter
|
|
12
|
+
from .task_logging import LogListener, TaskLogger
|
|
13
|
+
from .task_types import (
|
|
14
|
+
StageStatus,
|
|
15
|
+
ValueWrapper,
|
|
16
|
+
SumCounter,
|
|
17
|
+
TerminationSignal,
|
|
18
|
+
TERMINATION_SIGNAL
|
|
19
|
+
)
|
|
20
|
+
from .task_tools import (
|
|
21
|
+
format_duration,
|
|
22
|
+
format_timestamp,
|
|
23
|
+
cleanup_mpqueue,
|
|
24
|
+
make_hashable,
|
|
25
|
+
build_structure_graph,
|
|
26
|
+
format_structure_list_from_graph,
|
|
27
|
+
append_jsonl_log,
|
|
28
|
+
format_networkx_graph,
|
|
29
|
+
is_directed_acyclic_graph,
|
|
30
|
+
compute_node_levels,
|
|
31
|
+
cluster_by_value_sorted,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class TaskGraph:
|
|
36
|
+
def __init__(self, root_stages: List[TaskManager], layout_mode: str = "process"):
    """
    Build a TaskGraph from its entry stages.

    A TaskGraph groups the TaskManager nodes reachable from ``root_stages``
    into one task graph and runs them according to ``layout_mode``.

    Parameters
    ----------
    root_stages : List[TaskManager]
        Entry nodes of the graph. Several roots (a forest) are accepted.

    layout_mode : str, optional, default = 'process'
        Scheduling layout of the graph:

        - 'process':
            Default. Every node is launched in one pass; ordering between
            nodes is left to the queues that connect them.

        - 'serial':
            Nodes are launched layer by layer, the next layer starting only
            after the processes of the current one have been joined. Only
            honoured when the graph is a directed acyclic graph (DAG).

        Any other value, or 'serial' on a graph that is not a DAG, is
        replaced by 'process' without raising (see ``set_layout_mode``).
    """
    self.set_root_stages(root_stages)

    # Order matters: analyze_graph reads the structure graph,
    # set_layout_mode reads the isDAG flag computed by analyze_graph, and
    # set_reporter needs the log listener created by init_env.
    for setup_step in (self.init_env, self.init_structure_graph, self.analyze_graph):
        setup_step()

    self.set_layout_mode(layout_mode)
    self.set_reporter()
|
|
74
|
+
|
|
75
|
+
def init_env(self):
    """
    Reset the run-time environment of the graph.

    Clears the list of launched processes, then rebuilds the bookkeeping
    dictionaries, the per-stage resources and the logging objects, in that
    order.
    """
    self.processes: List[multiprocessing.Process] = list()

    for initializer in (self.init_dict, self.init_resources, self.init_log):
        initializer()
|
|
84
|
+
|
|
85
|
+
def init_dict(self):
    """
    Create (or reset) the bookkeeping containers used during a run.
    """
    # Per-stage status record, keyed by stage tag.
    self.stages_status_dict: Dict[str, dict] = defaultdict(dict)
    # Extra per-stage statistics, keyed by stage tag.
    self.stage_extra_stats = defaultdict(dict)
    # Status snapshot produced by the previous get_status_dict() call.
    self.last_status_dict = {}

    # Queue of every edge, keyed by (previous tag, stage tag).
    self.edge_queue_map: Dict[Tuple[str, str], MPQueue] = {}

    # One independent dict per attribute, all keyed by stage tag:
    #   stage_locks             - lock guarding a stage's success counter
    #   stage_task_counter      - number of tasks a stage has to process
    #   stage_success_counter   - number of tasks processed successfully
    #   stage_error_counter     - number of failed tasks
    #   stage_duplicate_counter - number of duplicated tasks
    for attribute in (
        "stage_locks",
        "stage_task_counter",
        "stage_success_counter",
        "stage_error_counter",
        "stage_duplicate_counter",
    ):
        setattr(self, attribute, {})

    # Error -> tasks that hit it. handle_fail_queue uses
    # (error_info, stage_tag) tuples as keys.
    self.error_timeline_dict: Dict[str, list] = defaultdict(list)
    # Stage tag -> failed tasks of that stage.
    self.all_stage_error_dict: Dict[str, dict] = defaultdict(dict)
|
|
111
|
+
|
|
112
|
+
def init_resources(self):
    """
    Allocate the shared queues, counters and locks of every reachable stage.

    Stages are discovered breadth-first from ``self.root_stages`` through
    ``next_stages``; each stage tag is set up only once.
    """
    self.fail_queue = MPQueue()

    def shared_int():
        # Shared integer ("i") starting at 0.
        return MPValue("i", 0)

    seen_tags = set()
    pending = deque(self.root_stages)  # BFS frontier, used instead of recursion

    while pending:
        stage = pending.popleft()
        stage_tag = stage.get_stage_tag()
        if stage_tag in seen_tags:
            continue
        seen_tags.add(stage_tag)

        # Register the node.
        self.stages_status_dict[stage_tag]["stage"] = stage

        # Counters and lock of this stage. The success counter may already
        # exist (created below while wiring a downstream stage), in which
        # case it is kept.
        task_counter = SumCounter()
        self.stage_task_counter[stage_tag] = task_counter
        self.stage_success_counter.setdefault(stage_tag, shared_int())
        self.stage_error_counter[stage_tag] = shared_int()
        self.stage_duplicate_counter[stage_tag] = shared_int()
        self.stage_locks[stage_tag] = MPLock()

        own_stats = self.stage_extra_stats.setdefault(stage_tag, {})
        if isinstance(stage, TaskSplitter):
            own_stats.setdefault("split_output_count", shared_int())

        # One queue per incoming edge (prev -> stage). The task counter of
        # this stage sums the output counter of each upstream stage: the
        # split output count of a TaskSplitter, the success counter otherwise.
        for prev_stage in stage.prev_stages:
            prev_tag = prev_stage.get_stage_tag() if prev_stage else None
            self.edge_queue_map[(prev_tag, stage_tag)] = MPQueue()

            if isinstance(prev_stage, TaskSplitter):
                prev_stats = self.stage_extra_stats.setdefault(prev_tag, {})
                upstream_output = prev_stats.setdefault(
                    "split_output_count", shared_int()
                )
            else:
                upstream_output = self.stage_success_counter.setdefault(
                    prev_tag, shared_int()
                )
            task_counter.add_counter(upstream_output)

        if not stage.prev_stages:
            # Entry node without any registered predecessor.
            self.edge_queue_map[(None, stage_tag)] = MPQueue()

        pending.extend(stage.next_stages)
|
|
179
|
+
|
|
180
|
+
def init_log(self):
    """
    Create the log listener (level "INFO") and a task logger bound to the
    listener's queue.
    """
    listener = LogListener(level="INFO")
    self.log_listener = listener
    self.task_logger = TaskLogger(listener.get_queue())
|
|
186
|
+
|
|
187
|
+
def init_structure_graph(self):
    """
    Rebuild ``self.structure_graph`` from the current root stages.
    """
    roots = self.root_stages
    self.structure_graph = build_structure_graph(roots)
|
|
192
|
+
|
|
193
|
+
def set_root_stages(self, root_stages: List[TaskManager]):
    """
    Store the entry stages of the graph.

    A root that has no predecessor yet receives ``None`` as its predecessor,
    so every root ends up with at least one entry in ``prev_stages``.

    :param root_stages: list of entry stages
    """
    self.root_stages = root_stages

    for root in root_stages:
        if root.prev_stages:
            continue
        root.add_prev_stages(None)
|
|
202
|
+
|
|
203
|
+
def set_layout_mode(self, layout_mode: str):
    """
    Choose the scheduling layout of the graph.

    'serial' is only accepted when the graph is a DAG (``self.isDAG``);
    every other request, including unknown values, results in 'process'.

    :param layout_mode: requested layout, 'serial' or 'process'
    """
    serial_allowed = layout_mode == "serial" and self.isDAG
    self.layout_mode = "serial" if serial_allowed else "process"
|
|
212
|
+
|
|
213
|
+
def set_reporter(self, is_report=False, host="127.0.0.1", port=5000):
    """
    Configure the reporter of the graph.

    :param is_report: whether start_graph should start the reporter
    :param host: host passed to the TaskReporter
    :param port: port passed to the TaskReporter
    """
    self.is_report = is_report

    log_queue = self.log_listener.get_queue()
    self.reporter = TaskReporter(self, log_queue, host, port)
|
|
219
|
+
|
|
220
|
+
def set_graph_mode(self, stage_mode: str, execution_mode: str):
    """
    Apply one stage mode and one execution mode to every reachable stage.

    Stages are walked depth-first from each root through ``next_stages``;
    afterwards the structure graph is rebuilt.

    :param stage_mode: how each stage is run, 'serial' or 'process'
    :param execution_mode: how a stage runs internally, 'serial' or 'thread'
    """
    configured = set()
    exhausted = object()  # marks the end of a successor iterator

    def configure(stage):
        stage.set_stage_mode(stage_mode)
        stage.set_execution_mode(execution_mode)
        configured.add(stage)

    for root_stage in self.root_stages:
        # A root is always configured, even if an earlier root reached it.
        configure(root_stage)

        # Explicit stack of successor iterators: pre-order walk without
        # recursion.
        trail = [iter(root_stage.next_stages)]
        while trail:
            next_stage = next(trail[-1], exhausted)
            if next_stage is exhausted:
                trail.pop()
            elif next_stage not in configured:
                configure(next_stage)
                trail.append(iter(next_stage.next_stages))

    self.init_structure_graph()
|
|
241
|
+
|
|
242
|
+
def put_stage_queue(self, tasks_dict: dict, put_termination_signal=True):
    """
    Put initial tasks on the input queue of the given stages.

    Each task is put on the edge that leads from the stage's first
    predecessor to the stage. Every task that is not a TerminationSignal
    also adds 1 to the initial value of the stage's task counter.

    :param tasks_dict: mapping of stage tag -> iterable of tasks
    :param put_termination_signal: when true, TERMINATION_SIGNAL is put on
        the first input edge of every root stage after all tasks
    :raises KeyError: if a tag of ``tasks_dict`` is not a known stage
    """
    for tag, tasks in tasks_dict.items():
        # stages_status_dict is a defaultdict: indexing it with an unknown
        # tag would insert an empty record, and that record later breaks
        # every loop that expects a "stage" entry (e.g. release_resources).
        # Reject unknown tags before touching the mapping.
        if tag not in self.stages_status_dict:
            raise KeyError(f"unknown stage tag: {tag!r}")

        target_stage: TaskManager = self.stages_status_dict[tag]["stage"]
        prev_stage: TaskManager = target_stage.prev_stages[0]
        prev_tag = prev_stage.get_stage_tag() if prev_stage else None
        edge_key = (prev_tag, tag)

        for task in tasks:
            self.edge_queue_map[edge_key].put(make_hashable(task))
            if isinstance(task, TerminationSignal):
                # Termination signals are forwarded but not counted.
                self.task_logger._log(
                    "TRACE", f"TERMINATION_SIGNAL put into {edge_key}"
                )
                continue
            self.task_logger._log("TRACE", f"{task} put into {edge_key}")

            counter = self.stage_task_counter.get(tag)
            if counter is None:
                counter = self.stage_task_counter[tag] = SumCounter()
            counter.add_init_value(1)

    if put_termination_signal:
        for root_stage in self.root_stages:
            first_prev = root_stage.prev_stages[0]
            pre_stage_tag = first_prev.get_stage_tag() if first_prev else None
            edge_key = (pre_stage_tag, root_stage.get_stage_tag())
            self.edge_queue_map[edge_key].put(TERMINATION_SIGNAL)
            self.task_logger._log(
                "TRACE", f"TERMINATION_SIGNAL put into {edge_key}"
            )
|
|
278
|
+
|
|
279
|
+
def start_graph(self, init_tasks_dict: dict, put_termination_signal: bool = True):
    """
    Run the task graph from start to finish.

    Starts the log listener (and the reporter when ``self.is_report`` is
    set), feeds the initial tasks, executes the stages, and always runs the
    clean-up sequence afterwards.

    :param init_tasks_dict: mapping of stage tag -> initial tasks, passed to
        put_stage_queue
    :param put_termination_signal: passed to put_stage_queue
    """
    # Recorded before the try block so the clean-up below can always compute
    # the elapsed time, even when start-up fails on its first statement.
    self.start_time = time.time()
    try:
        self.log_listener.start()
        self.task_logger.start_graph(self.get_structure_list())
        self._persist_structure_metadata()
        if self.is_report:
            self.reporter.start()

        self.put_stage_queue(init_tasks_dict, put_termination_signal)
        self._excute_stages()

    finally:
        try:
            self.finalize_nodes()
            self.reporter.stop()
            self.handle_fail_queue()
            self.release_resources()

            self.task_logger.end_graph(time.time() - self.start_time)
        finally:
            # Stop the listener even if one of the clean-up steps raised;
            # the exception still propagates to the caller.
            self.log_listener.stop()
|
|
301
|
+
|
|
302
|
+
def _excute_stages(self):
    """
    Launch the stages according to ``self.layout_mode``.

    'process' layout: every registered stage is started in one pass, then
    all launched processes are joined. Otherwise the layers of
    ``self.layers_dict`` are run one after another, joining the processes of
    a layer before the next layer starts.
    """

    def wait_for(procs):
        # Join each process, mark its stage as stopped and log its exit code.
        for p in procs:
            p.join()
            self.stages_status_dict[p.name]["status"] = StageStatus.STOPPED
            self.task_logger._log("DEBUG", f"{p.name} exitcode: {p.exitcode}")

    if self.layout_mode == "process":
        # Default layout: start every node at once.
        for stage_record in self.stages_status_dict.values():
            self._execute_stage(stage_record["stage"])
        wait_for(self.processes)
        return

    # Serial layout: one layer at a time.
    for layer_level, layer in self.layers_dict.items():
        self.task_logger.start_layer(layer, layer_level)
        layer_started = time.time()

        launched = []
        for stage_tag in layer:
            stage: TaskManager = self.stages_status_dict[stage_tag]["stage"]
            self._execute_stage(stage)
            if stage.stage_mode == "process":
                # _execute_stage appended the new process last.
                launched.append(self.processes[-1])

        wait_for(launched)

        self.task_logger.end_layer(layer, time.time() - layer_started)
|
|
332
|
+
|
|
333
|
+
def _execute_stage(self, stage: TaskManager):
    """
    Start a single stage.

    Collects the stage's input and output queues, marks it as running,
    hands it its counters, and then either launches it in a new process
    (``stage_mode == "process"``) or runs it inline and marks it stopped.

    :param stage: the stage to start
    """
    stage_tag = stage.get_stage_tag()

    # Input queues: one per incoming edge.
    input_queues = []
    for prev in stage.prev_stages:
        prev_tag = prev.get_stage_tag() if prev else None
        input_queues.append(self.edge_queue_map[(prev_tag, stage_tag)])

    # Output queues: one per outgoing edge, empty list when there is none.
    output_queues = []
    if stage.next_stages:
        for next_stage in stage.next_stages:
            output_queues.append(
                self.edge_queue_map[(stage_tag, next_stage.get_stage_tag())]
            )

    logger_queue = self.log_listener.get_queue()

    stage_record = self.stages_status_dict[stage_tag]
    stage_record["status"] = StageStatus.RUNNING
    stage_record["start_time"] = time.time()

    # All counters were created by init_resources; they are only handed over.
    stage.init_counter(
        self.stage_task_counter[stage_tag],
        self.stage_success_counter[stage_tag],
        self.stage_error_counter[stage_tag],
        self.stage_duplicate_counter[stage_tag],
        self.stage_locks[stage_tag],
        self.stage_extra_stats[stage_tag],
    )

    run_args = (input_queues, output_queues, self.fail_queue, logger_queue)

    if stage.stage_mode != "process":
        # Inline run: returns once the stage is done.
        stage.start_stage(*run_args)
        self.stages_status_dict[stage_tag]["status"] = StageStatus.STOPPED
        return

    p = multiprocessing.Process(
        target=stage.start_stage,
        args=run_args,
        name=stage_tag,
    )
    p.start()
    self.processes.append(p)
|
|
378
|
+
|
|
379
|
+
def finalize_nodes(self):
    """
    Make sure every child process has exited and mark all stages stopped.

    A process that is still alive is terminated and joined with a 5 second
    timeout. Nothing is returned.
    """
    # 1. Force remaining processes to end.
    for p in self.processes:
        if not p.is_alive():
            continue
        self.task_logger._log(
            "WARNING", f"检测到进程 {p.name} 仍在运行, 尝试终止"
        )
        p.terminate()
        p.join(timeout=5)
        if p.is_alive():
            self.task_logger._log("WARNING", f"进程 {p.name} 仍未完全退出")
        self.task_logger._log("DEBUG", f"{p.name} exitcode: {p.exitcode}")

    # 2. Mark every stage as stopped.
    for stage_status in self.stages_status_dict.values():
        stage_status["status"] = StageStatus.STOPPED

    # 3. NOTE: collecting and persisting the tasks left unconsumed in each
    #    stage queue (through _persist_unconsumed_task) is currently disabled.
|
|
411
|
+
|
|
412
|
+
def release_resources(self):
    """
    Release the queues of every stage, then clean up the failure queue.
    """
    for stage_record in self.stages_status_dict.values():
        stage = stage_record["stage"]
        stage.release_queue()

    cleanup_mpqueue(self.fail_queue)
|
|
420
|
+
|
|
421
|
+
def handle_fail_queue(self):
    """
    Drain ``self.fail_queue`` and build the failure dictionaries.

    Every item is a dict with the keys "stage_tag", "task", "error_info" and
    "timestamp". For each item:

    - ``error_timeline_dict[(error_info, stage_tag)]`` receives
      ``(task, timestamp)`` unless that task is already recorded for the key;
    - ``all_stage_error_dict[stage_tag][task]`` is set to the error key unless
      the task is already recorded for the stage;
    - the failure is always handed to ``_persist_single_failure``.
    """
    from queue import Empty  # raised by get_nowait() on an empty queue

    while not self.fail_queue.empty():
        try:
            item: dict = self.fail_queue.get_nowait()
        except Empty:
            # empty() of a multiprocessing queue is not reliable, so the
            # queue may turn out to be drained after all.
            break

        stage_tag = item["stage_tag"]
        task_str = item["task"]
        error_info = item["error_info"]
        timestamp = item["timestamp"]
        error_key = (error_info, stage_tag)

        # The timeline stores (task, timestamp) pairs, so the duplicate check
        # has to compare against the task part of each pair.
        timeline = self.error_timeline_dict[error_key]
        if all(recorded_task != task_str for recorded_task, _ in timeline):
            timeline.append((task_str, timestamp))

        stage_errors = self.all_stage_error_dict[stage_tag]
        if task_str not in stage_errors:
            stage_errors[task_str] = error_key

        self._persist_single_failure(task_str, error_info, stage_tag, timestamp)
|
|
440
|
+
|
|
441
|
+
def _persist_structure_metadata(self):
    """
    Write the graph structure to the run's JSONL log when a run starts.

    The record holds the current time (ISO format) and the structure graph;
    it is appended through append_jsonl_log under "./fallback" with the
    "realtime_errors" name.
    """
    written_at = datetime.now().isoformat()
    structure = self.get_structure_json()
    log_item = {"timestamp": written_at, "structure": structure}

    append_jsonl_log(
        log_item,
        self.start_time,
        "./fallback",
        "realtime_errors",
        self.task_logger,
    )
|
|
452
|
+
|
|
453
|
+
def _persist_single_failure(self, task_str, error_info, stage_tag, timestamp):
    """
    Append one failure record to the run's JSONL error log.

    :param task_str: the failed task
    :param error_info: description of the error
    :param stage_tag: tag of the stage in which the task failed
    :param timestamp: POSIX timestamp of the failure, stored in ISO format
    """
    failed_at = datetime.fromtimestamp(timestamp).isoformat()
    log_item = {
        "timestamp": failed_at,
        "stage": stage_tag,
        "error": error_info,
        "task": task_str,
    }

    append_jsonl_log(
        log_item,
        self.start_time,
        "./fallback",
        "realtime_errors",
        self.task_logger,
    )
|
|
466
|
+
|
|
467
|
+
def _persist_unconsumed_task(self, stage_tag, task):
    """
    Append one unconsumed task to the "leftover_tasks" JSONL log.

    :param stage_tag: tag of the stage whose queue still held the task
    :param task: the task; it is stored as ``str(task)``
    """
    recorded_at = datetime.now().isoformat()
    log_item = {
        "timestamp": recorded_at,
        "stage": stage_tag,
        "task": str(task),
    }

    append_jsonl_log(
        log_item,
        self.start_time,
        "./fallback",
        "leftover_tasks",
        self.task_logger,
    )
|
|
479
|
+
|
|
480
|
+
def get_error_timeline_dict(self):
    """
    Return a plain-dict shallow copy of the error timeline.
    """
    timeline_copy = {}
    timeline_copy.update(self.error_timeline_dict)
    return timeline_copy
|
|
485
|
+
|
|
486
|
+
def get_all_stage_error_dict(self):
    """
    Return a plain-dict shallow copy of the per-stage failure mapping.
    """
    failures_copy = {}
    failures_copy.update(self.all_stage_error_dict)
    return failures_copy
|
|
491
|
+
|
|
492
|
+
def get_fail_by_error_dict(self):
    """
    Return, for every error key, the list of tasks that hit it.

    The timestamps stored in the error timeline are dropped.
    """
    tasks_by_error = {}
    for error_key, timeline in self.get_error_timeline_dict().items():
        tasks_by_error[error_key] = [task for task, _timestamp in timeline]
    return tasks_by_error
|
|
497
|
+
|
|
498
|
+
def get_fail_by_stage_dict(self):
    """
    Return, for every stage tag, the list of its failed tasks.
    """
    tasks_by_stage = {}
    for stage_tag, failures in self.get_all_stage_error_dict().items():
        tasks_by_stage[stage_tag] = list(failures.keys())
    return tasks_by_stage
|
|
503
|
+
|
|
504
|
+
def get_status_dict(self) -> Dict[str, dict]:
    """
    Build a status snapshot of every stage.

    For each stage the snapshot holds the stage summary, the current
    status, the task counts (successed / failed / duplicated / processed /
    pending), their change since the previous snapshot, timing information
    and a history of at most 20 ``tasks_processed`` samples. The snapshot is
    also stored in ``self.last_status_dict`` for the next call.

    :return: mapping of stage tag -> status dictionary
    """
    snapshot = {}
    now = time.time()
    interval = self.reporter.interval

    for tag, stage_record in self.stages_status_dict.items():
        stage: TaskManager = stage_record["stage"]
        previous: dict = self.last_status_dict.get(tag, {})

        def delta(key, current):
            # Change of a count since the previous snapshot.
            return current - previous.get(key, 0)

        status = stage_record.get("status", StageStatus.NOT_STARTED)

        total_input = self.stage_task_counter.get(tag, ValueWrapper()).value
        successed = self.stage_success_counter.get(tag, ValueWrapper()).value
        failed = self.stage_error_counter.get(tag, ValueWrapper()).value
        duplicated = self.stage_duplicate_counter.get(tag, ValueWrapper()).value
        processed = successed + failed + duplicated
        pending = max(0, total_input - processed)

        # Elapsed time only grows while the stage was pending in the
        # previous snapshot; each such call adds one reporter interval.
        start_time = stage_record.get("start_time", 0)
        if not start_time:
            elapsed = 0
        else:
            elapsed = stage_record.get("elapsed_time", 0)
            if previous.get("tasks_pending", 0):
                elapsed += interval
        stage_record["elapsed_time"] = elapsed

        # Estimated remaining time.
        if processed and pending:
            remaining = pending / processed * elapsed
        else:
            remaining = 0

        # Average speed: "X.XXs/it" when a task takes at least one second,
        # otherwise the inverse rate "X.XXit/s".
        if not processed:
            avg_time_str = "N/A"
        else:
            avg_time = elapsed / processed
            if avg_time >= 1.0:
                avg_time_str = f"{avg_time:.2f}s/it"
            else:
                its_per_sec = processed / elapsed if elapsed else 0
                avg_time_str = f"{its_per_sec:.2f}it/s"

        history: list = stage_record.get("history", [])
        history.append({"timestamp": now, "tasks_processed": processed})
        if len(history) > 20:
            history.pop(0)
        stage_record["history"] = history

        snapshot[tag] = {
            **stage.get_stage_summary(),
            "status": status,
            "tasks_successed": successed,
            "tasks_failed": failed,
            "tasks_duplicated": duplicated,
            "tasks_processed": processed,
            "tasks_pending": pending,
            "add_tasks_successed": delta("tasks_successed", successed),
            "add_tasks_failed": delta("tasks_failed", failed),
            "add_tasks_duplicated": delta("tasks_duplicated", duplicated),
            "add_tasks_processed": delta("tasks_processed", processed),
            "add_tasks_pending": delta("tasks_pending", pending),
            "start_time": format_timestamp(start_time),
            "elapsed_time": format_duration(elapsed),
            "remaining_time": format_duration(remaining),
            "task_avg_time": avg_time_str,
            "history": history,
        }

    self.last_status_dict = snapshot

    return snapshot
|
|
595
|
+
|
|
596
|
+
def get_graph_topology(self):
    """
    Return the topology summary of the graph: the DAG flag, the layout
    mode, the class name and the layer dictionary.
    """
    topology = {}
    topology["isDAG"] = self.isDAG
    topology["layout_mode"] = self.layout_mode
    topology["class_name"] = self.__class__.__name__
    topology["layers_dict"] = self.layers_dict
    return topology
|
|
603
|
+
|
|
604
|
+
def get_structure_json(self):
    """
    Return the structure graph built by init_structure_graph (not a copy).
    """
    structure = self.structure_graph
    return structure
|
|
606
|
+
|
|
607
|
+
def get_structure_list(self):
    """
    Return the structure graph formatted by format_structure_list_from_graph.
    """
    structure = self.structure_graph
    return format_structure_list_from_graph(structure)
|
|
609
|
+
|
|
610
|
+
def get_networkx_graph(self):
    """
    Return the structure graph converted by format_networkx_graph.
    """
    structure = self.structure_graph
    return format_networkx_graph(structure)
|
|
612
|
+
|
|
613
|
+
def analyze_graph(self):
    """
    Analyse the graph structure.

    Sets ``self.isDAG``. For a DAG it also computes the level of every
    stage (``self.stage_level_dict``) and groups the stages by level into
    ``self.layers_dict``; otherwise ``self.layers_dict`` stays empty.
    """
    graph = self.get_networkx_graph()
    self.layers_dict = {}

    self.isDAG = is_directed_acyclic_graph(graph)
    if not self.isDAG:
        return

    self.stage_level_dict = compute_node_levels(graph)
    self.layers_dict = cluster_by_value_sorted(self.stage_level_dict)
|
|
621
|
+
|
|
622
|
+
def test_methods(
    self,
    init_tasks_dict: Dict[str, List],
    stage_modes: list = None,
    execution_modes: list = None,
) -> Dict[str, Any]:
    """
    Time the graph under every combination of stage mode and execution mode.

    For each combination the environment is re-initialised, the modes are
    applied to all stages, and the graph is run with ``init_tasks_dict``.

    :param init_tasks_dict: initial tasks, passed to start_graph
    :param stage_modes: stage modes to try, defaults to ['serial', 'process']
    :param execution_modes: execution modes to try, defaults to
        ['serial', 'thread']
    :return: dict with "Time table" (timings as one row per stage mode, the
        row labels, the column labels and the corner label), "Fail error
        dict" and "Fail stage dict" (failures merged over all runs)
    """
    stage_modes = stage_modes or ["serial", "process"]
    execution_modes = execution_modes or ["serial", "thread"]

    timing_rows = []
    failures_by_error = {}
    failures_by_stage = {}

    for stage_mode in stage_modes:
        row = []
        for execution_mode in execution_modes:
            began = time.time()
            self.init_env()
            self.set_graph_mode(stage_mode, execution_mode)
            self.start_graph(init_tasks_dict)

            row.append(time.time() - began)
            failures_by_error.update(self.get_fail_by_error_dict())
            failures_by_stage.update(self.get_fail_by_stage_dict())

        timing_rows.append(row)

    return {
        "Time table": (
            timing_rows,
            stage_modes,
            execution_modes,
            r"stage\execution",
        ),
        "Fail error dict": failures_by_error,
        "Fail stage dict": failures_by_stage,
    }
|