celestialflow 3.0.3__tar.gz → 3.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {celestialflow-3.0.3 → celestialflow-3.0.4}/PKG-INFO +3 -3
- {celestialflow-3.0.3 → celestialflow-3.0.4}/README.md +2 -2
- {celestialflow-3.0.3 → celestialflow-3.0.4}/pyproject.toml +1 -1
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/__init__.py +6 -1
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/task_graph.py +106 -187
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/task_logging.py +35 -7
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/task_manage.py +114 -247
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/task_nodes.py +3 -3
- celestialflow-3.0.4/src/celestialflow/task_queue.py +227 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/task_report.py +5 -18
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/task_structure.py +1 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/task_tools.py +4 -6
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/task_types.py +8 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/task_web.py +2 -2
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow.egg-info/PKG-INFO +3 -3
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow.egg-info/SOURCES.txt +1 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/tests/test_graph.py +8 -6
- {celestialflow-3.0.3 → celestialflow-3.0.4}/tests/test_nodes.py +16 -48
- {celestialflow-3.0.3 → celestialflow-3.0.4}/tests/test_structure.py +39 -8
- {celestialflow-3.0.3 → celestialflow-3.0.4}/setup.cfg +0 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/static/css/base.css +0 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/static/css/dashboard.css +0 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/static/css/errors.css +0 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/static/css/inject.css +0 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/static/favicon.ico +0 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/static/js/main.js +0 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/static/js/task_errors.js +0 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/static/js/task_injection.js +0 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/static/js/task_statuses.js +0 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/static/js/task_structure.js +0 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/static/js/task_topology.js +0 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/static/js/utils.js +0 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/task_progress.py +0 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow/templates/index.html +0 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow.egg-info/dependency_links.txt +0 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow.egg-info/entry_points.txt +0 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow.egg-info/requires.txt +0 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/src/celestialflow.egg-info/top_level.txt +0 -0
- {celestialflow-3.0.3 → celestialflow-3.0.4}/tests/test_manage.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: celestialflow
|
|
3
|
-
Version: 3.0.3
|
|
3
|
+
Version: 3.0.4
|
|
4
4
|
Summary: A flexible GRAPH-based task orchestration framework.
|
|
5
5
|
Author-email: Mr-xiaotian <mingxiaomingtian@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -26,7 +26,7 @@ Requires-Dist: jinja2
|
|
|
26
26
|
# CelestialFlow ——一个轻量级、可并行、基于图结构的 Python 任务调度框架
|
|
27
27
|
|
|
28
28
|
<p align="center">
|
|
29
|
-
<img src="img/logo.png" width="1080" alt="CelestialFlow Logo">
|
|
29
|
+
<img src="https://raw.githubusercontent.com/Mr-xiaotian/CelestialFlow/main/img/logo.png" width="1080" alt="CelestialFlow Logo">
|
|
30
30
|
</p>
|
|
31
31
|
|
|
32
32
|
<p align="center">
|
|
@@ -111,7 +111,7 @@ python src/celestialflow/task_web.py 5005
|
|
|
111
111
|
|
|
112
112
|
可查看任务结构、执行状态、错误日志、以及实时注入任务等功能。
|
|
113
113
|
|
|
114
|
-

|
|
114
|
+

|
|
115
115
|
|
|
116
116
|
### 运行测试示例
|
|
117
117
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# CelestialFlow ——一个轻量级、可并行、基于图结构的 Python 任务调度框架
|
|
2
2
|
|
|
3
3
|
<p align="center">
|
|
4
|
-
<img src="img/logo.png" width="1080" alt="CelestialFlow Logo">
|
|
4
|
+
<img src="https://raw.githubusercontent.com/Mr-xiaotian/CelestialFlow/main/img/logo.png" width="1080" alt="CelestialFlow Logo">
|
|
5
5
|
</p>
|
|
6
6
|
|
|
7
7
|
<p align="center">
|
|
@@ -86,7 +86,7 @@ python src/celestialflow/task_web.py 5005
|
|
|
86
86
|
|
|
87
87
|
可查看任务结构、执行状态、错误日志、以及实时注入任务等功能。
|
|
88
88
|
|
|
89
|
-

|
|
89
|
+

|
|
90
90
|
|
|
91
91
|
### 运行测试示例
|
|
92
92
|
|
|
@@ -10,7 +10,12 @@ from .task_structure import (
|
|
|
10
10
|
TaskGrid,
|
|
11
11
|
)
|
|
12
12
|
from .task_types import TerminationSignal
|
|
13
|
-
from .task_tools import
|
|
13
|
+
from .task_tools import (
|
|
14
|
+
load_task_by_stage,
|
|
15
|
+
load_task_by_error,
|
|
16
|
+
make_hashable,
|
|
17
|
+
format_table,
|
|
18
|
+
)
|
|
14
19
|
from .task_web import TaskWebServer
|
|
15
20
|
|
|
16
21
|
__all__ = [
|
|
@@ -2,21 +2,14 @@ import time
|
|
|
2
2
|
import multiprocessing
|
|
3
3
|
from collections import defaultdict, deque
|
|
4
4
|
from datetime import datetime
|
|
5
|
-
from multiprocessing import Value as MPValue, Lock as MPLock
|
|
6
5
|
from multiprocessing import Queue as MPQueue
|
|
7
|
-
from typing import Any, Dict, List
|
|
6
|
+
from typing import Any, Dict, List
|
|
8
7
|
|
|
9
8
|
from .task_manage import TaskManager
|
|
10
|
-
from .task_nodes import TaskSplitter
|
|
11
9
|
from .task_report import TaskReporter
|
|
12
10
|
from .task_logging import LogListener, TaskLogger
|
|
13
|
-
from .
|
|
14
|
-
|
|
15
|
-
ValueWrapper,
|
|
16
|
-
SumCounter,
|
|
17
|
-
TerminationSignal,
|
|
18
|
-
TERMINATION_SIGNAL
|
|
19
|
-
)
|
|
11
|
+
from .task_queue import TaskQueue
|
|
12
|
+
from .task_types import StageStatus, TerminationSignal, TERMINATION_SIGNAL
|
|
20
13
|
from .task_tools import (
|
|
21
14
|
format_duration,
|
|
22
15
|
format_timestamp,
|
|
@@ -29,6 +22,8 @@ from .task_tools import (
|
|
|
29
22
|
is_directed_acyclic_graph,
|
|
30
23
|
compute_node_levels,
|
|
31
24
|
cluster_by_value_sorted,
|
|
25
|
+
load_task_by_stage,
|
|
26
|
+
load_task_by_error
|
|
32
27
|
)
|
|
33
28
|
|
|
34
29
|
|
|
@@ -73,8 +68,8 @@ class TaskGraph:
|
|
|
73
68
|
self.processes: List[multiprocessing.Process] = []
|
|
74
69
|
|
|
75
70
|
self.init_dict()
|
|
76
|
-
self.init_resources()
|
|
77
71
|
self.init_log()
|
|
72
|
+
self.init_resources()
|
|
78
73
|
|
|
79
74
|
def init_dict(self):
|
|
80
75
|
"""
|
|
@@ -83,25 +78,11 @@ class TaskGraph:
|
|
|
83
78
|
self.stages_status_dict: Dict[str, dict] = defaultdict(
|
|
84
79
|
dict
|
|
85
80
|
) # 用于保存每个节点的状态信息
|
|
86
|
-
self.
|
|
87
|
-
self.last_status_dict = {} # 用于保存每个节点的最后状态信息
|
|
88
|
-
|
|
89
|
-
self.edge_queue_map: Dict[Tuple[str, str], MPQueue] = (
|
|
90
|
-
{}
|
|
91
|
-
) # 用于保存每个节点到下一个节点的队列
|
|
92
|
-
|
|
93
|
-
self.stage_locks = {} # 锁,用于控制每个阶段success_counter的并发
|
|
94
|
-
self.stage_task_counter = {} # 用于保存每个阶段处理的任务数
|
|
95
|
-
self.stage_success_counter = {} # 用于保存每个阶段成功处理的任务数
|
|
96
|
-
self.stage_error_counter = {} # 用于保存每个阶段失败处理的任务数
|
|
97
|
-
self.stage_duplicate_counter = {} # 用于保存每个阶段重复处理的任务数
|
|
98
|
-
|
|
99
|
-
self.error_timeline_dict: Dict[str, list] = defaultdict(
|
|
100
|
-
list
|
|
101
|
-
) # 用于保存错误到出现该错误任务的映射
|
|
102
|
-
self.all_stage_error_dict: Dict[str, dict] = defaultdict(
|
|
81
|
+
self.last_status_dict: Dict[str, dict] = defaultdict(
|
|
103
82
|
dict
|
|
104
|
-
) #
|
|
83
|
+
) # 用于保存每个节点的上一次状态信息
|
|
84
|
+
|
|
85
|
+
self.error_data: List[dict] = []
|
|
105
86
|
|
|
106
87
|
def init_resources(self):
|
|
107
88
|
"""
|
|
@@ -110,66 +91,56 @@ class TaskGraph:
|
|
|
110
91
|
self.fail_queue = MPQueue()
|
|
111
92
|
|
|
112
93
|
visited_stages = set()
|
|
113
|
-
queue = deque(self.root_stages)
|
|
94
|
+
queue = deque(self.root_stages)
|
|
114
95
|
|
|
96
|
+
# BFS 连接
|
|
115
97
|
while queue:
|
|
116
98
|
stage = queue.popleft()
|
|
117
99
|
stage_tag = stage.get_stage_tag()
|
|
118
100
|
if stage_tag in visited_stages:
|
|
119
101
|
continue
|
|
120
102
|
|
|
103
|
+
# 刷新所有 counter
|
|
104
|
+
stage.reset_counter()
|
|
105
|
+
|
|
121
106
|
# 记录节点
|
|
122
107
|
self.stages_status_dict[stage_tag]["stage"] = stage
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
stage_tag,
|
|
108
|
+
self.stages_status_dict[stage_tag]["in_queue"] = TaskQueue(
|
|
109
|
+
queue_list=[],
|
|
110
|
+
queue_tag=[],
|
|
111
|
+
logger_queue=self.log_listener.get_queue(),
|
|
112
|
+
stage_tag=stage_tag,
|
|
113
|
+
direction="in",
|
|
128
114
|
)
|
|
129
|
-
self.stage_error_counter[stage_tag] = MPValue("i", 0)
|
|
130
|
-
self.stage_duplicate_counter[stage_tag] = MPValue("i", 0)
|
|
131
|
-
self.stage_locks[stage_tag] = MPLock()
|
|
132
115
|
|
|
133
|
-
self.
|
|
134
|
-
|
|
116
|
+
self.stages_status_dict[stage_tag]["out_queue"] = TaskQueue(
|
|
117
|
+
queue_list=[],
|
|
118
|
+
queue_tag=[],
|
|
119
|
+
logger_queue=self.log_listener.get_queue(),
|
|
120
|
+
stage_tag=stage_tag,
|
|
121
|
+
direction="out",
|
|
135
122
|
)
|
|
136
|
-
|
|
137
|
-
self.stage_extra_stats[stage_tag].setdefault(
|
|
138
|
-
"split_output_count", MPValue("i", 0)
|
|
139
|
-
)
|
|
123
|
+
visited_stages.add(stage_tag)
|
|
140
124
|
|
|
141
|
-
|
|
142
|
-
for prev_stage in stage.prev_stages:
|
|
143
|
-
prev_tag = prev_stage.get_stage_tag() if prev_stage else None
|
|
144
|
-
self.edge_queue_map[(prev_tag, stage_tag)] = MPQueue()
|
|
125
|
+
queue.extend(stage.next_stages)
|
|
145
126
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
)
|
|
150
|
-
self.stage_extra_stats[prev_tag].setdefault(
|
|
151
|
-
"split_output_count", MPValue("i", 0)
|
|
152
|
-
)
|
|
153
|
-
self.stage_task_counter[stage_tag].add_counter(
|
|
154
|
-
self.stage_extra_stats[prev_tag]["split_output_count"]
|
|
155
|
-
)
|
|
156
|
-
else:
|
|
157
|
-
# 确保上游 success_counter 已存在
|
|
158
|
-
self.stage_success_counter[prev_tag] = (
|
|
159
|
-
self.stage_success_counter.get(prev_tag, MPValue("i", 0))
|
|
160
|
-
)
|
|
161
|
-
self.stage_task_counter[stage_tag].add_counter(
|
|
162
|
-
self.stage_success_counter[prev_tag]
|
|
163
|
-
)
|
|
127
|
+
for stage_tag in self.stages_status_dict:
|
|
128
|
+
stage: TaskManager = self.stages_status_dict[stage_tag]["stage"]
|
|
129
|
+
in_queue: TaskQueue = self.stages_status_dict[stage_tag]["in_queue"]
|
|
164
130
|
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
131
|
+
# 遍历每个前驱,创建边队列
|
|
132
|
+
for prev_stage in stage.prev_stages:
|
|
133
|
+
prev_stage_tag = prev_stage.get_stage_tag() if prev_stage else None
|
|
134
|
+
q = MPQueue()
|
|
168
135
|
|
|
169
|
-
|
|
136
|
+
# sink side
|
|
137
|
+
in_queue.add_queue(q, prev_stage_tag)
|
|
170
138
|
|
|
171
|
-
|
|
172
|
-
|
|
139
|
+
# source side
|
|
140
|
+
if prev_stage is not None:
|
|
141
|
+
self.stages_status_dict[prev_stage_tag]["out_queue"].add_queue(
|
|
142
|
+
q, stage_tag
|
|
143
|
+
)
|
|
173
144
|
|
|
174
145
|
def init_log(self, level="INFO"):
|
|
175
146
|
"""
|
|
@@ -184,7 +155,7 @@ class TaskGraph:
|
|
|
184
155
|
"""
|
|
185
156
|
初始化任务图结构
|
|
186
157
|
"""
|
|
187
|
-
self.
|
|
158
|
+
self.structure_json = build_structure_graph(self.root_stages)
|
|
188
159
|
|
|
189
160
|
def set_root_stages(self, root_stages: List[TaskManager]):
|
|
190
161
|
"""
|
|
@@ -242,19 +213,6 @@ class TaskGraph:
|
|
|
242
213
|
set_subsequent_stage_mode(root_stage)
|
|
243
214
|
self.init_structure_graph()
|
|
244
215
|
|
|
245
|
-
def put_termination(self, tag):
|
|
246
|
-
"""
|
|
247
|
-
放入终止信号
|
|
248
|
-
|
|
249
|
-
:param tag: 阶段标签
|
|
250
|
-
"""
|
|
251
|
-
preg_stages: List[TaskManager] = self.stages_status_dict[tag]["stage"].prev_stages
|
|
252
|
-
|
|
253
|
-
for prev_stage in preg_stages:
|
|
254
|
-
prev_tag = prev_stage.get_stage_tag() if prev_stage else None
|
|
255
|
-
self.edge_queue_map[(prev_tag, tag)].put(TERMINATION_SIGNAL)
|
|
256
|
-
self.task_logger._log("TRACE", f"TERMINATION_SIGNAL put into {(prev_tag, tag)}")
|
|
257
|
-
|
|
258
216
|
def put_stage_queue(self, tasks_dict: dict, put_termination_signal=True):
|
|
259
217
|
"""
|
|
260
218
|
将任务放入队列
|
|
@@ -263,26 +221,24 @@ class TaskGraph:
|
|
|
263
221
|
:param put_termination_signal: 是否放入终止信号
|
|
264
222
|
"""
|
|
265
223
|
for tag, tasks in tasks_dict.items():
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
prev_tag = prev_stage.get_stage_tag() if prev_stage else None
|
|
224
|
+
stage: TaskManager = self.stages_status_dict[tag]["stage"]
|
|
225
|
+
in_queue: TaskQueue = self.stages_status_dict[tag]["in_queue"]
|
|
226
|
+
|
|
270
227
|
for task in tasks:
|
|
271
228
|
if isinstance(task, TerminationSignal):
|
|
272
|
-
|
|
229
|
+
in_queue.put(TERMINATION_SIGNAL)
|
|
273
230
|
continue
|
|
274
231
|
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
self.stage_task_counter[tag] = self.stage_task_counter.get(
|
|
278
|
-
tag, SumCounter()
|
|
279
|
-
)
|
|
280
|
-
self.stage_task_counter[tag].add_init_value(1)
|
|
232
|
+
in_queue.put_first(make_hashable(task))
|
|
233
|
+
stage.task_counter.add_init_value(1)
|
|
281
234
|
|
|
282
235
|
if put_termination_signal:
|
|
283
236
|
for root_stage in self.root_stages:
|
|
284
237
|
root_stage_tag = root_stage.get_stage_tag()
|
|
285
|
-
self.
|
|
238
|
+
root_in_queue: TaskQueue = self.stages_status_dict[root_stage_tag][
|
|
239
|
+
"in_queue"
|
|
240
|
+
]
|
|
241
|
+
root_in_queue.put(TERMINATION_SIGNAL)
|
|
286
242
|
|
|
287
243
|
def start_graph(self, init_tasks_dict: dict, put_termination_signal: bool = True):
|
|
288
244
|
"""
|
|
@@ -347,40 +303,20 @@ class TaskGraph:
|
|
|
347
303
|
def _execute_stage(self, stage: TaskManager):
|
|
348
304
|
"""
|
|
349
305
|
执行单个节点
|
|
350
|
-
|
|
306
|
+
|
|
351
307
|
:param stage: 节点
|
|
352
308
|
"""
|
|
353
309
|
stage_tag = stage.get_stage_tag()
|
|
354
310
|
|
|
355
|
-
# 输入输出队列
|
|
356
|
-
input_queues = [
|
|
357
|
-
self.edge_queue_map[(prev.get_stage_tag() if prev else None, stage_tag)]
|
|
358
|
-
for prev in stage.prev_stages
|
|
359
|
-
]
|
|
360
|
-
output_queues = (
|
|
361
|
-
[
|
|
362
|
-
self.edge_queue_map[(stage_tag, next_stage.get_stage_tag())]
|
|
363
|
-
for next_stage in stage.next_stages
|
|
364
|
-
]
|
|
365
|
-
if stage.next_stages
|
|
366
|
-
else []
|
|
367
|
-
)
|
|
368
|
-
|
|
369
311
|
logger_queue = self.log_listener.get_queue()
|
|
370
312
|
|
|
313
|
+
# 输入输出队列
|
|
314
|
+
input_queues = self.stages_status_dict[stage_tag]["in_queue"]
|
|
315
|
+
output_queues = self.stages_status_dict[stage_tag]["out_queue"]
|
|
316
|
+
|
|
371
317
|
self.stages_status_dict[stage_tag]["status"] = StageStatus.RUNNING
|
|
372
318
|
self.stages_status_dict[stage_tag]["start_time"] = time.time()
|
|
373
319
|
|
|
374
|
-
# counter 都在 init_resources 里初始化完了,这里直接用
|
|
375
|
-
stage.init_counter(
|
|
376
|
-
self.stage_task_counter[stage_tag],
|
|
377
|
-
self.stage_success_counter[stage_tag],
|
|
378
|
-
self.stage_error_counter[stage_tag],
|
|
379
|
-
self.stage_duplicate_counter[stage_tag],
|
|
380
|
-
self.stage_locks[stage_tag],
|
|
381
|
-
self.stage_extra_stats[stage_tag],
|
|
382
|
-
)
|
|
383
|
-
|
|
384
320
|
if stage.stage_mode == "process":
|
|
385
321
|
p = multiprocessing.Process(
|
|
386
322
|
target=stage.start_stage,
|
|
@@ -416,16 +352,23 @@ class TaskGraph:
|
|
|
416
352
|
stage_status["status"] = StageStatus.STOPPED # 已停止
|
|
417
353
|
|
|
418
354
|
# 3️⃣ 收集并持久化每个 stage 中未消费的任务
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
355
|
+
for stage_tag, stage_status in self.stages_status_dict.items():
|
|
356
|
+
in_queue: TaskQueue = stage_status["in_queue"]
|
|
357
|
+
|
|
358
|
+
# 用你刚才统一的 drain() 提取当前剩余任务
|
|
359
|
+
remaining_sources = in_queue.drain()
|
|
360
|
+
|
|
361
|
+
# 如无剩余,跳过
|
|
362
|
+
if not remaining_sources:
|
|
363
|
+
continue
|
|
364
|
+
|
|
365
|
+
# 持久化逻辑(写日志 / 存储到全局 structure)
|
|
366
|
+
for source in remaining_sources:
|
|
367
|
+
task_str = str(source)
|
|
368
|
+
error_info = f"(UnconsumeError)"
|
|
369
|
+
timestamp = time.time()
|
|
425
370
|
|
|
426
|
-
|
|
427
|
-
# except Exception as e:
|
|
428
|
-
# self.task_logger._log("WARNING", f"获取 {stage_tag} 剩余任务失败: {e}")
|
|
371
|
+
self._persist_single_failure(task_str, error_info, stage_tag, timestamp)
|
|
429
372
|
|
|
430
373
|
def release_resources(self):
|
|
431
374
|
"""
|
|
@@ -446,13 +389,13 @@ class TaskGraph:
|
|
|
446
389
|
task_str = item["task"]
|
|
447
390
|
error_info = item["error_info"]
|
|
448
391
|
timestamp = item["timestamp"]
|
|
449
|
-
error_key = (error_info, stage_tag)
|
|
450
392
|
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
393
|
+
self.error_data.append({
|
|
394
|
+
"timestamp": timestamp,
|
|
395
|
+
"node": stage_tag,
|
|
396
|
+
"error": error_info,
|
|
397
|
+
"task_id": task_str if len(task_str) < 100 else task_str[:100] + "...",
|
|
398
|
+
})
|
|
456
399
|
|
|
457
400
|
self._persist_single_failure(task_str, error_info, stage_tag, timestamp)
|
|
458
401
|
|
|
@@ -460,17 +403,21 @@ class TaskGraph:
|
|
|
460
403
|
"""
|
|
461
404
|
在运行开始时写入任务结构元信息到 jsonl 文件
|
|
462
405
|
"""
|
|
406
|
+
date_str = datetime.fromtimestamp(self.start_time).strftime("%Y-%m-%d")
|
|
407
|
+
time_str = datetime.fromtimestamp(self.start_time).strftime("%H-%M-%S-%f")[:-3]
|
|
408
|
+
self.error_jsonl_path = f"./fallback/{date_str}/realtime_errors({time_str}).jsonl"
|
|
409
|
+
|
|
463
410
|
log_item = {
|
|
464
411
|
"timestamp": datetime.now().isoformat(),
|
|
465
412
|
"structure": self.get_structure_json(),
|
|
466
413
|
}
|
|
467
414
|
append_jsonl_log(
|
|
468
|
-
log_item, self.
|
|
415
|
+
log_item, self.error_jsonl_path, self.task_logger
|
|
469
416
|
)
|
|
470
417
|
|
|
471
418
|
def _persist_single_failure(self, task_str, error_info, stage_tag, timestamp):
|
|
472
419
|
"""
|
|
473
|
-
|
|
420
|
+
增量写入单条错误日志到 jsonl 文件中
|
|
474
421
|
|
|
475
422
|
:param task_str: 任务字符串
|
|
476
423
|
:param error_info: 错误信息
|
|
@@ -484,48 +431,20 @@ class TaskGraph:
|
|
|
484
431
|
"task": task_str,
|
|
485
432
|
}
|
|
486
433
|
append_jsonl_log(
|
|
487
|
-
log_item, self.
|
|
434
|
+
log_item, self.error_jsonl_path, self.task_logger
|
|
488
435
|
)
|
|
489
436
|
|
|
490
|
-
def
|
|
437
|
+
def get_error_data(self):
|
|
491
438
|
"""
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
:param stage_tag: 阶段标签
|
|
495
|
-
:param task: 任务对象
|
|
439
|
+
返回错误数据
|
|
496
440
|
"""
|
|
497
|
-
|
|
498
|
-
"timestamp": datetime.now().isoformat(),
|
|
499
|
-
"stage": stage_tag,
|
|
500
|
-
"task": str(task),
|
|
501
|
-
}
|
|
502
|
-
append_jsonl_log(
|
|
503
|
-
log_item, self.start_time, "./fallback", "leftover_tasks", self.task_logger
|
|
504
|
-
)
|
|
441
|
+
return self.error_data
|
|
505
442
|
|
|
506
|
-
def
|
|
507
|
-
|
|
508
|
-
返回最终错误字典
|
|
509
|
-
"""
|
|
510
|
-
return dict(self.error_timeline_dict)
|
|
511
|
-
|
|
512
|
-
def get_all_stage_error_dict(self):
|
|
513
|
-
"""
|
|
514
|
-
返回最终失败字典
|
|
515
|
-
"""
|
|
516
|
-
return dict(self.all_stage_error_dict)
|
|
443
|
+
def get_fail_by_stage_dict(self):
|
|
444
|
+
return load_task_by_stage(self.error_jsonl_path)
|
|
517
445
|
|
|
518
446
|
def get_fail_by_error_dict(self):
|
|
519
|
-
return
|
|
520
|
-
key: [a for a, _ in tuple_list]
|
|
521
|
-
for key, tuple_list in self.get_error_timeline_dict().items()
|
|
522
|
-
}
|
|
523
|
-
|
|
524
|
-
def get_fail_by_stage_dict(self):
|
|
525
|
-
return {
|
|
526
|
-
stage: list(inner_dict.keys())
|
|
527
|
-
for stage, inner_dict in self.get_all_stage_error_dict().items()
|
|
528
|
-
}
|
|
447
|
+
return load_task_by_error(self.error_jsonl_path)
|
|
529
448
|
|
|
530
449
|
def get_status_dict(self) -> Dict[str, dict]:
|
|
531
450
|
"""
|
|
@@ -543,10 +462,10 @@ class TaskGraph:
|
|
|
543
462
|
|
|
544
463
|
status = stage_status_dict.get("status", StageStatus.NOT_STARTED)
|
|
545
464
|
|
|
546
|
-
input =
|
|
547
|
-
successed =
|
|
548
|
-
failed =
|
|
549
|
-
duplicated =
|
|
465
|
+
input = stage.task_counter.value
|
|
466
|
+
successed = stage.success_counter.value
|
|
467
|
+
failed = stage.error_counter.value
|
|
468
|
+
duplicated = stage.duplicate_counter.value
|
|
550
469
|
processed = successed + failed + duplicated
|
|
551
470
|
pending = max(0, input - processed)
|
|
552
471
|
|
|
@@ -633,13 +552,13 @@ class TaskGraph:
|
|
|
633
552
|
}
|
|
634
553
|
|
|
635
554
|
def get_structure_json(self):
|
|
636
|
-
return self.
|
|
555
|
+
return self.structure_json
|
|
637
556
|
|
|
638
557
|
def get_structure_list(self):
|
|
639
|
-
return format_structure_list_from_graph(self.
|
|
558
|
+
return format_structure_list_from_graph(self.structure_json)
|
|
640
559
|
|
|
641
560
|
def get_networkx_graph(self):
|
|
642
|
-
return format_networkx_graph(self.
|
|
561
|
+
return format_networkx_graph(self.structure_json)
|
|
643
562
|
|
|
644
563
|
def analyze_graph(self):
|
|
645
564
|
"""
|
|
@@ -650,8 +569,8 @@ class TaskGraph:
|
|
|
650
569
|
|
|
651
570
|
self.isDAG = is_directed_acyclic_graph(networkx_graph)
|
|
652
571
|
if self.isDAG:
|
|
653
|
-
|
|
654
|
-
self.layers_dict = cluster_by_value_sorted(
|
|
572
|
+
stage_level_dict = compute_node_levels(networkx_graph)
|
|
573
|
+
self.layers_dict = cluster_by_value_sorted(stage_level_dict)
|
|
655
574
|
|
|
656
575
|
def test_methods(
|
|
657
576
|
self,
|
|
@@ -681,10 +600,10 @@ class TaskGraph:
|
|
|
681
600
|
self.init_env()
|
|
682
601
|
self.set_graph_mode(stage_mode, execution_mode)
|
|
683
602
|
self.start_graph(init_tasks_dict)
|
|
603
|
+
fail_by_stage_dict.update(self.get_fail_by_stage_dict())
|
|
604
|
+
fail_by_error_dict.update(self.get_fail_by_error_dict())
|
|
684
605
|
|
|
685
606
|
time_list.append(time.time() - start_time)
|
|
686
|
-
fail_by_error_dict.update(self.get_fail_by_error_dict())
|
|
687
|
-
fail_by_stage_dict.update(self.get_fail_by_stage_dict())
|
|
688
607
|
|
|
689
608
|
test_table_list.append(time_list)
|
|
690
609
|
|
|
@@ -694,6 +613,6 @@ class TaskGraph:
|
|
|
694
613
|
execution_modes,
|
|
695
614
|
r"stage\execution",
|
|
696
615
|
)
|
|
697
|
-
results["Fail error dict"] = fail_by_error_dict
|
|
698
616
|
results["Fail stage dict"] = fail_by_stage_dict
|
|
617
|
+
results["Fail error dict"] = fail_by_error_dict
|
|
699
618
|
return results
|
|
@@ -13,6 +13,7 @@ class LogListener:
|
|
|
13
13
|
"""
|
|
14
14
|
日志监听进程,用于将日志写入文件
|
|
15
15
|
"""
|
|
16
|
+
|
|
16
17
|
def __init__(self, level="INFO"):
|
|
17
18
|
now = strftime("%Y-%m-%d", localtime())
|
|
18
19
|
self.log_path = f"logs/task_logger({now}).log"
|
|
@@ -66,7 +67,7 @@ class TaskLogger:
|
|
|
66
67
|
|
|
67
68
|
# ==== manager ====
|
|
68
69
|
def start_manager(self, func_name, task_num, execution_mode, worker_limit):
|
|
69
|
-
text = f"'{func_name}' start {task_num} tasks by {execution_mode}"
|
|
70
|
+
text = f"'Manager[{func_name}]' start {task_num} tasks by {execution_mode}"
|
|
70
71
|
text += f"({worker_limit} workers)." if execution_mode != "serial" else "."
|
|
71
72
|
self._log("INFO", text)
|
|
72
73
|
|
|
@@ -81,20 +82,19 @@ class TaskLogger:
|
|
|
81
82
|
):
|
|
82
83
|
self._log(
|
|
83
84
|
"INFO",
|
|
84
|
-
f"'{func_name}' end tasks by {execution_mode}. Use {use_time:.2f} second. "
|
|
85
|
+
f"'Manager[{func_name}]' end tasks by {execution_mode}. Use {use_time:.2f} second. "
|
|
85
86
|
f"{success_num} tasks successed, {failed_num} tasks failed, {duplicated_num} tasks duplicated.",
|
|
86
87
|
)
|
|
87
88
|
|
|
88
89
|
# ==== stage ====
|
|
89
|
-
def start_stage(self,
|
|
90
|
-
text = f"
|
|
90
|
+
def start_stage(self, stage_tag, execution_mode, worker_limit):
|
|
91
|
+
text = f"'{stage_tag}' start tasks by {execution_mode}"
|
|
91
92
|
text += f"({worker_limit} workers)." if execution_mode != "serial" else "."
|
|
92
93
|
self._log("INFO", text)
|
|
93
94
|
|
|
94
95
|
def end_stage(
|
|
95
96
|
self,
|
|
96
|
-
|
|
97
|
-
func_name,
|
|
97
|
+
stage_tag,
|
|
98
98
|
execution_mode,
|
|
99
99
|
use_time,
|
|
100
100
|
success_num,
|
|
@@ -103,7 +103,7 @@ class TaskLogger:
|
|
|
103
103
|
):
|
|
104
104
|
self._log(
|
|
105
105
|
"INFO",
|
|
106
|
-
f"
|
|
106
|
+
f"'{stage_tag}' end tasks by {execution_mode}. Use {use_time:.2f} second. "
|
|
107
107
|
f"{success_num} tasks successed, {failed_num} tasks failed, {duplicated_num} tasks duplicated.",
|
|
108
108
|
)
|
|
109
109
|
|
|
@@ -152,3 +152,31 @@ class TaskLogger:
|
|
|
152
152
|
"SUCCESS",
|
|
153
153
|
f"In '{func_name}', Task {task_info} has split into {split_count} parts. Used {use_time:.2f} seconds.",
|
|
154
154
|
)
|
|
155
|
+
|
|
156
|
+
# ==== queue ====
|
|
157
|
+
def put_source(self, source, queue_tag, stage_tag, direction):
|
|
158
|
+
if isinstance(source, TerminationSignal):
|
|
159
|
+
source = "TerminationSignal"
|
|
160
|
+
|
|
161
|
+
edge = f"'{queue_tag}' -> '{stage_tag}'" if direction == "in" else f"'{stage_tag}' -> '{queue_tag}'"
|
|
162
|
+
self._log(
|
|
163
|
+
"TRACE",
|
|
164
|
+
f"Put {source} into Edge({edge})."
|
|
165
|
+
)
|
|
166
|
+
|
|
167
|
+
def get_source(self, source, queue_tag, stage_tag):
|
|
168
|
+
if isinstance(source, TerminationSignal):
|
|
169
|
+
source = "TerminationSignal"
|
|
170
|
+
|
|
171
|
+
edge = f"'{queue_tag}' -> '{stage_tag}'"
|
|
172
|
+
self._log(
|
|
173
|
+
"TRACE",
|
|
174
|
+
f"Get {source} from Edge({edge})"
|
|
175
|
+
)
|
|
176
|
+
|
|
177
|
+
def get_source_error(self, queue_tag, stage_tag, exception):
|
|
178
|
+
exception_text = str(exception).replace("\n", " ")
|
|
179
|
+
self._log(
|
|
180
|
+
"WARNING",
|
|
181
|
+
f"Error get from Edge({queue_tag} -> {stage_tag}): ({type(exception).__name__}){exception_text}",
|
|
182
|
+
)
|