celestialflow 3.1.2__tar.gz → 3.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {celestialflow-3.1.2 → celestialflow-3.1.3}/PKG-INFO +14 -7
- {celestialflow-3.1.2 → celestialflow-3.1.3}/README.md +12 -4
- {celestialflow-3.1.2 → celestialflow-3.1.3}/pyproject.toml +2 -3
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/__init__.py +8 -5
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/graph/analysis.py +2 -2
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/graph/graph.py +30 -62
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/graph/serialize.py +1 -1
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/persistence/__init__.py +3 -0
- celestialflow-3.1.3/src/celestialflow/persistence/base.py +61 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/persistence/fail.py +32 -46
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/persistence/jsonl.py +0 -47
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/persistence/log.py +77 -87
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/runtime/__init__.py +2 -0
- celestialflow-3.1.3/src/celestialflow/runtime/envelope.py +28 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/runtime/metrics.py +86 -35
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/runtime/queue.py +2 -1
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/runtime/types.py +1 -32
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/stage/__init__.py +2 -2
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/stage/executor.py +130 -220
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/stage/nodes.py +16 -20
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/stage/stage.py +15 -6
- celestialflow-3.1.3/src/celestialflow/utils/benchmark.py +103 -0
- celestialflow-3.1.3/src/celestialflow/utils/clone.py +118 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow.egg-info/PKG-INFO +14 -7
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow.egg-info/SOURCES.txt +4 -2
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow.egg-info/requires.txt +1 -2
- {celestialflow-3.1.2 → celestialflow-3.1.3}/tests/test_nodes.py +12 -6
- celestialflow-3.1.2/tests/test_executor.py +0 -121
- celestialflow-3.1.2/tests/test_graph.py +0 -219
- {celestialflow-3.1.2 → celestialflow-3.1.3}/setup.cfg +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/graph/__init__.py +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/graph/structure.py +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/observability/__init__.py +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/observability/report.py +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/persistence/constant.py +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/runtime/errors.py +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/runtime/estimators.py +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/runtime/factories.py +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/runtime/hash.py +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/runtime/progress.py +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/runtime/tools.py +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/utils/__init__.py +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/utils/collections.py +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/utils/debug.py +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/utils/format.py +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow/web/server.py +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow.egg-info/dependency_links.txt +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow.egg-info/entry_points.txt +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/src/celestialflow.egg-info/top_level.txt +0 -0
- {celestialflow-3.1.2 → celestialflow-3.1.3}/tests/test_structure.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: celestialflow
|
|
3
|
-
Version: 3.1.2
|
|
3
|
+
Version: 3.1.3
|
|
4
4
|
Summary: A flexible GRAPH-based task orchestration framework.
|
|
5
5
|
Author-email: Mr-xiaotian <mingxiaomingtian@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -15,14 +15,13 @@ Classifier: Topic :: Software Development :: Libraries
|
|
|
15
15
|
Requires-Python: >=3.10
|
|
16
16
|
Description-Content-Type: text/markdown
|
|
17
17
|
Requires-Dist: tqdm
|
|
18
|
-
Requires-Dist: loguru
|
|
19
18
|
Requires-Dist: fastapi
|
|
20
19
|
Requires-Dist: uvicorn
|
|
21
20
|
Requires-Dist: requests
|
|
22
21
|
Requires-Dist: networkx
|
|
23
22
|
Requires-Dist: redis
|
|
24
23
|
Requires-Dist: jinja2
|
|
25
|
-
Requires-Dist: celestialtree
|
|
24
|
+
Requires-Dist: celestialtree>=0.1.2
|
|
26
25
|
|
|
27
26
|
# CelestialFlow ——一个轻量级、可并行、基于图结构的 Python 任务调度框架
|
|
28
27
|
|
|
@@ -240,9 +239,8 @@ flowchart TD
|
|
|
240
239
|
|
|
241
240
|
| 依赖包 | 说明 |
|
|
242
241
|
| ----------------- | ---- |
|
|
243
|
-
| **Python ≥ 3.
|
|
242
|
+
| **Python ≥ 3.10** | 运行环境,建议使用 3.10 及以上版本 |
|
|
244
243
|
| **tqdm** | 控制台进度条显示,用于任务执行可视化 |
|
|
245
|
-
| **loguru** | 高性能日志系统,支持多进程安全输出 |
|
|
246
244
|
| **fastapi** | Web 服务接口框架(用于任务可视化与远程控制) |
|
|
247
245
|
| **uvicorn** | FastAPI 的高性能 ASGI 服务器 |
|
|
248
246
|
| **requests** | HTTP 客户端库,用于任务状态上报与远程调用 |
|
|
@@ -256,7 +254,7 @@ flowchart TD
|
|
|
256
254
|
<p align="center">
|
|
257
255
|
<img src="https://raw.githubusercontent.com/Mr-xiaotian/CelestialFlow/main/img/file_structure.svg" alt="FileStructure" />
|
|
258
256
|
<br/>
|
|
259
|
-
<em>celestial-flow 3.1.
|
|
257
|
+
<em>celestial-flow 3.1.3</em>
|
|
260
258
|
</p>
|
|
261
259
|
|
|
262
260
|
(该视图由我的另一个项目[CelestialVault](https://github.com/Mr-xiaotian/CelestialVault)中inst_file.FileTree.print_tree()生成。转换为图片则借助[Carbon](https://carbon.now.sh)。)
|
|
@@ -276,7 +274,7 @@ flowchart TD
|
|
|
276
274
|
- 6/16/2025: 多轮评测后, 当前框架已支持完整有向图结构, 将TaskTree改名为TaskGraph
|
|
277
275
|
- 3.0.1: 上线Pypi, 可喜可贺
|
|
278
276
|
- 3.0.4: 新增一个抽象结构TaskQueue, 用于表示节点的所有"入边"与"出边"; 恢复未消费任务的保存功能
|
|
279
|
-
- 3.0.5: 删除原有的TaskRedisTransfer节点, 并增添三种新的redis交互节点
|
|
277
|
+
- 3.0.5: 删除原有的TaskRedisTransfer节点, 并增添三种新的redis交互节点TaskRedisTransport TaskRedisSource TaskRedisAck, 用于跨语言 跨进程 跨设备处理任务; 并在Web页面添加展示拓扑信息的卡片
|
|
280
278
|
- 3.0.6: 添加对[CelestialTree](https://github.com/Mr-xiaotian/CelestialTree)系统的支持, 现在可以追踪单个任务的流向
|
|
281
279
|
- 3.0.7: 将TaskStage从TaskExecutor中单独抽出来作为一个子类; 增加新节点TaskRouter, 可以将传入的任务有选择地传给不同的下游节点, 而不是进行广播
|
|
282
280
|
- 3.0.8: 在ctree逻辑上将"任务重试"事件后的"任务成功/失败/重试"事件视为因果关系, 而非之前的并行关系; 重构错误搜集部分逻辑; 修复大量3.0.6与3.0.7版本引入的bug; 优化部分log表现
|
|
@@ -319,6 +317,15 @@ flowchart TD
|
|
|
319
317
|
- fix
|
|
320
318
|
- 修复前端renderNodeList中参数设置错误;
|
|
321
319
|
- 修复其他微小bug;
|
|
320
|
+
- 3.1.3
|
|
321
|
+
- feat:
|
|
322
|
+
- 抽象出BaseListener与BaseSinker;
|
|
323
|
+
- 移除loguru, 完全由LogListener和LogSinker实现log记录;
|
|
324
|
+
- 将bench相关代码从TaskExecutor和TaskGraph中抽离, 不再作为方法, 而是单独bench函数;
|
|
325
|
+
- 重构TaskExecutor部分代码, 以尽量瘦身;
|
|
326
|
+
- 优化log处理代码中对时间戳的处理, 现在更加准确;
|
|
327
|
+
- fix:
|
|
328
|
+
- 修复一些影响性能的小问题;
|
|
322
329
|
|
|
323
330
|
## Star 历史趋势(Star History)
|
|
324
331
|
|
|
@@ -214,9 +214,8 @@ flowchart TD
|
|
|
214
214
|
|
|
215
215
|
| 依赖包 | 说明 |
|
|
216
216
|
| ----------------- | ---- |
|
|
217
|
-
| **Python ≥ 3.
|
|
217
|
+
| **Python ≥ 3.10** | 运行环境,建议使用 3.10 及以上版本 |
|
|
218
218
|
| **tqdm** | 控制台进度条显示,用于任务执行可视化 |
|
|
219
|
-
| **loguru** | 高性能日志系统,支持多进程安全输出 |
|
|
220
219
|
| **fastapi** | Web 服务接口框架(用于任务可视化与远程控制) |
|
|
221
220
|
| **uvicorn** | FastAPI 的高性能 ASGI 服务器 |
|
|
222
221
|
| **requests** | HTTP 客户端库,用于任务状态上报与远程调用 |
|
|
@@ -230,7 +229,7 @@ flowchart TD
|
|
|
230
229
|
<p align="center">
|
|
231
230
|
<img src="https://raw.githubusercontent.com/Mr-xiaotian/CelestialFlow/main/img/file_structure.svg" alt="FileStructure" />
|
|
232
231
|
<br/>
|
|
233
|
-
<em>celestial-flow 3.1.
|
|
232
|
+
<em>celestial-flow 3.1.3</em>
|
|
234
233
|
</p>
|
|
235
234
|
|
|
236
235
|
(该视图由我的另一个项目[CelestialVault](https://github.com/Mr-xiaotian/CelestialVault)中inst_file.FileTree.print_tree()生成。转换为图片则借助[Carbon](https://carbon.now.sh)。)
|
|
@@ -250,7 +249,7 @@ flowchart TD
|
|
|
250
249
|
- 6/16/2025: 多轮评测后, 当前框架已支持完整有向图结构, 将TaskTree改名为TaskGraph
|
|
251
250
|
- 3.0.1: 上线Pypi, 可喜可贺
|
|
252
251
|
- 3.0.4: 新增一个抽象结构TaskQueue, 用于表示节点的所有"入边"与"出边"; 恢复未消费任务的保存功能
|
|
253
|
-
- 3.0.5: 删除原有的TaskRedisTransfer节点, 并增添三种新的redis交互节点
|
|
252
|
+
- 3.0.5: 删除原有的TaskRedisTransfer节点, 并增添三种新的redis交互节点TaskRedisTransport TaskRedisSource TaskRedisAck, 用于跨语言 跨进程 跨设备处理任务; 并在Web页面添加展示拓扑信息的卡片
|
|
254
253
|
- 3.0.6: 添加对[CelestialTree](https://github.com/Mr-xiaotian/CelestialTree)系统的支持, 现在可以追踪单个任务的流向
|
|
255
254
|
- 3.0.7: 将TaskStage从TaskExecutor中单独抽出来作为一个子类; 增加新节点TaskRouter, 可以将传入的任务有选择地传给不同的下游节点, 而不是进行广播
|
|
256
255
|
- 3.0.8: 在ctree逻辑上将"任务重试"事件后的"任务成功/失败/重试"事件视为因果关系, 而非之前的并行关系; 重构错误搜集部分逻辑; 修复大量3.0.6与3.0.7版本引入的bug; 优化部分log表现
|
|
@@ -293,6 +292,15 @@ flowchart TD
|
|
|
293
292
|
- fix
|
|
294
293
|
- 修复前端renderNodeList中参数设置错误;
|
|
295
294
|
- 修复其他微小bug;
|
|
295
|
+
- 3.1.3
|
|
296
|
+
- feat:
|
|
297
|
+
- 抽象出BaseListener与BaseSinker;
|
|
298
|
+
- 移除loguru, 完全由LogListener和LogSinker实现log记录;
|
|
299
|
+
- 将bench相关代码从TaskExecutor和TaskGraph中抽离, 不再作为方法, 而是单独bench函数;
|
|
300
|
+
- 重构TaskExecutor部分代码, 以尽量瘦身;
|
|
301
|
+
- 优化log处理代码中对时间戳的处理, 现在更加准确;
|
|
302
|
+
- fix:
|
|
303
|
+
- 修复一些影响性能的小问题;
|
|
296
304
|
|
|
297
305
|
## Star 历史趋势(Star History)
|
|
298
306
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "celestialflow"
|
|
7
|
-
version = "3.1.2"
|
|
7
|
+
version = "3.1.3"
|
|
8
8
|
description = "A flexible GRAPH-based task orchestration framework."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "MIT" }
|
|
@@ -14,14 +14,13 @@ requires-python = ">=3.10"
|
|
|
14
14
|
|
|
15
15
|
dependencies = [
|
|
16
16
|
"tqdm",
|
|
17
|
-
"loguru",
|
|
18
17
|
"fastapi",
|
|
19
18
|
"uvicorn",
|
|
20
19
|
"requests",
|
|
21
20
|
"networkx",
|
|
22
21
|
"redis",
|
|
23
22
|
"jinja2",
|
|
24
|
-
"celestialtree",
|
|
23
|
+
"celestialtree>=0.1.2",
|
|
25
24
|
]
|
|
26
25
|
|
|
27
26
|
classifiers = [
|
|
@@ -3,7 +3,7 @@ from .stage import (
|
|
|
3
3
|
TaskExecutor,
|
|
4
4
|
TaskStage,
|
|
5
5
|
TaskSplitter,
|
|
6
|
-
|
|
6
|
+
TaskRedisTransport,
|
|
7
7
|
TaskRedisSource,
|
|
8
8
|
TaskRedisAck,
|
|
9
9
|
TaskRouter,
|
|
@@ -17,14 +17,15 @@ from .graph import (
|
|
|
17
17
|
TaskWheel,
|
|
18
18
|
TaskGrid,
|
|
19
19
|
)
|
|
20
|
-
from .runtime.types import TerminationSignal
|
|
21
|
-
from .runtime.hash import make_hashable
|
|
22
|
-
from .utils.format import format_table
|
|
23
20
|
from .persistence.jsonl import (
|
|
24
21
|
load_jsonl_logs,
|
|
25
22
|
load_task_by_stage,
|
|
26
23
|
load_task_by_error,
|
|
27
24
|
)
|
|
25
|
+
from .runtime.types import TerminationSignal
|
|
26
|
+
from .runtime.hash import make_hashable
|
|
27
|
+
from .utils.format import format_table
|
|
28
|
+
from .utils.benchmark import benchmark_graph, benchmark_executor
|
|
28
29
|
from .web.server import TaskWebServer
|
|
29
30
|
|
|
30
31
|
__all__ = [
|
|
@@ -38,7 +39,7 @@ __all__ = [
|
|
|
38
39
|
"TaskExecutor",
|
|
39
40
|
"TaskStage",
|
|
40
41
|
"TaskSplitter",
|
|
41
|
-
"
|
|
42
|
+
"TaskRedisTransport",
|
|
42
43
|
"TaskRedisSource",
|
|
43
44
|
"TaskRedisAck",
|
|
44
45
|
"TaskRouter",
|
|
@@ -49,4 +50,6 @@ __all__ = [
|
|
|
49
50
|
"load_task_by_error",
|
|
50
51
|
"make_hashable",
|
|
51
52
|
"format_table",
|
|
53
|
+
"benchmark_graph",
|
|
54
|
+
"benchmark_executor",
|
|
52
55
|
]
|
|
@@ -15,11 +15,11 @@ def format_networkx_graph(structure_graph: List[Dict[str, Any]]) -> nx.DiGraph:
|
|
|
15
15
|
G = nx.DiGraph()
|
|
16
16
|
|
|
17
17
|
def add_node_and_edges(node: Dict[str, Any]):
|
|
18
|
-
node_id = f'{node["
|
|
18
|
+
node_id = f'{node["name"]}[{node["func_name"]}]'
|
|
19
19
|
G.add_node(node_id, **{"mode": node.get("stage_mode")})
|
|
20
20
|
|
|
21
21
|
for child in node.get("next_stages", []):
|
|
22
|
-
child_id = f'{child["
|
|
22
|
+
child_id = f'{child["name"]}[{child["func_name"]}]'
|
|
23
23
|
G.add_edge(node_id, child_id)
|
|
24
24
|
# 递归添加子节点
|
|
25
25
|
add_node_and_edges(child)
|
|
@@ -4,7 +4,7 @@ import warnings
|
|
|
4
4
|
import multiprocessing
|
|
5
5
|
from collections import defaultdict, deque
|
|
6
6
|
from multiprocessing import Queue as MPQueue
|
|
7
|
-
from typing import
|
|
7
|
+
from typing import Dict, List
|
|
8
8
|
|
|
9
9
|
from celestialtree import (
|
|
10
10
|
Client as CelestialTreeClient,
|
|
@@ -13,7 +13,7 @@ from celestialtree import (
|
|
|
13
13
|
format_provenance_forest,
|
|
14
14
|
)
|
|
15
15
|
|
|
16
|
-
from ..runtime import TaskQueue
|
|
16
|
+
from ..runtime import TaskQueue, TaskEnvelope
|
|
17
17
|
from ..runtime.estimators import (
|
|
18
18
|
calc_elapsed,
|
|
19
19
|
calc_remaining,
|
|
@@ -21,7 +21,6 @@ from ..runtime.estimators import (
|
|
|
21
21
|
)
|
|
22
22
|
from ..runtime.errors import UnconsumedError
|
|
23
23
|
from ..runtime.types import (
|
|
24
|
-
TaskEnvelope,
|
|
25
24
|
StageStatus,
|
|
26
25
|
TerminationSignal,
|
|
27
26
|
STAGE_STYLE,
|
|
@@ -228,6 +227,9 @@ class TaskGraph:
|
|
|
228
227
|
:param host: 报告器主机地址
|
|
229
228
|
:param port: 报告器端口
|
|
230
229
|
"""
|
|
230
|
+
self._is_report = is_report
|
|
231
|
+
self._report_host = host
|
|
232
|
+
self._report_port = port
|
|
231
233
|
if is_report:
|
|
232
234
|
self.reporter = TaskReporter(
|
|
233
235
|
host=host,
|
|
@@ -239,7 +241,12 @@ class TaskGraph:
|
|
|
239
241
|
self.reporter = NullTaskReporter()
|
|
240
242
|
|
|
241
243
|
def set_ctree(
|
|
242
|
-
self,
|
|
244
|
+
self,
|
|
245
|
+
use_ctree=False,
|
|
246
|
+
host="127.0.0.1",
|
|
247
|
+
http_port=7777,
|
|
248
|
+
grpc_port=7778,
|
|
249
|
+
transport="grpc",
|
|
243
250
|
):
|
|
244
251
|
"""
|
|
245
252
|
设定事件树客户端
|
|
@@ -252,10 +259,11 @@ class TaskGraph:
|
|
|
252
259
|
self._ctree_host = host
|
|
253
260
|
self._CTREE_HTTP_PORT = http_port
|
|
254
261
|
self._ctree_grpc_port = grpc_port
|
|
262
|
+
self._ctree_transport = transport
|
|
255
263
|
|
|
256
264
|
if use_ctree:
|
|
257
265
|
self.ctree_client = CelestialTreeClient(
|
|
258
|
-
host=host, http_port=http_port, grpc_port=grpc_port, transport=
|
|
266
|
+
host=host, http_port=http_port, grpc_port=grpc_port, transport=transport
|
|
259
267
|
)
|
|
260
268
|
if not self.ctree_client.health():
|
|
261
269
|
raise Exception("CelestialTreeClient is not available")
|
|
@@ -362,7 +370,7 @@ class TaskGraph:
|
|
|
362
370
|
)
|
|
363
371
|
|
|
364
372
|
try:
|
|
365
|
-
start_time = time.
|
|
373
|
+
start_time = time.perf_counter()
|
|
366
374
|
self.fail_listener.start()
|
|
367
375
|
self.log_listener.start()
|
|
368
376
|
self.log_sinker.start_graph(self.get_structure_list())
|
|
@@ -377,7 +385,7 @@ class TaskGraph:
|
|
|
377
385
|
|
|
378
386
|
self.reporter.stop()
|
|
379
387
|
self.release_resources()
|
|
380
|
-
self.log_sinker.end_graph(time.
|
|
388
|
+
self.log_sinker.end_graph(time.perf_counter() - start_time)
|
|
381
389
|
self.fail_listener.stop()
|
|
382
390
|
self.log_listener.stop()
|
|
383
391
|
|
|
@@ -393,11 +401,11 @@ class TaskGraph:
|
|
|
393
401
|
for p in self.processes:
|
|
394
402
|
p.join()
|
|
395
403
|
self.log_sinker.process_exit(p.name, p.exitcode)
|
|
396
|
-
|
|
404
|
+
elif self.schedule_mode == "staged":
|
|
397
405
|
# staged schedule_mode:一层层地顺序执行
|
|
398
406
|
for layer_level, layer in self.layers_dict.items():
|
|
399
407
|
self.log_sinker.start_layer(layer, layer_level)
|
|
400
|
-
start_time = time.
|
|
408
|
+
start_time = time.perf_counter()
|
|
401
409
|
|
|
402
410
|
processes = []
|
|
403
411
|
for stage_tag in layer:
|
|
@@ -411,7 +419,7 @@ class TaskGraph:
|
|
|
411
419
|
p.join()
|
|
412
420
|
self.log_sinker.process_exit(p.name, p.exitcode)
|
|
413
421
|
|
|
414
|
-
self.log_sinker.end_layer(layer, time.
|
|
422
|
+
self.log_sinker.end_layer(layer, time.perf_counter() - start_time)
|
|
415
423
|
|
|
416
424
|
def _execute_stage(self, stage: TaskStage):
|
|
417
425
|
"""
|
|
@@ -603,9 +611,11 @@ class TaskGraph:
|
|
|
603
611
|
if not self.isDAG:
|
|
604
612
|
totals["total_remain"] = max(running_remaining_map.values(), default=0.0)
|
|
605
613
|
else:
|
|
606
|
-
G = self.get_networkx_graph()
|
|
607
614
|
expected_pending_map = calc_global_remain_equal_pred(
|
|
608
|
-
|
|
615
|
+
self.networkx_graph,
|
|
616
|
+
running_processed_map,
|
|
617
|
+
running_pending_map,
|
|
618
|
+
running_elapsed_map,
|
|
609
619
|
)
|
|
610
620
|
totals["total_remain"] = max(expected_pending_map.values(), default=0.0)
|
|
611
621
|
|
|
@@ -613,10 +623,10 @@ class TaskGraph:
|
|
|
613
623
|
self.graph_summary = dict(totals)
|
|
614
624
|
|
|
615
625
|
def get_fail_by_stage_dict(self):
|
|
616
|
-
return load_task_by_stage(self.fail_listener.
|
|
626
|
+
return load_task_by_stage(self.fail_listener.jsonl_path)
|
|
617
627
|
|
|
618
628
|
def get_fail_by_error_dict(self):
|
|
619
|
-
return load_task_by_error(self.fail_listener.
|
|
629
|
+
return load_task_by_error(self.fail_listener.jsonl_path)
|
|
620
630
|
|
|
621
631
|
def get_status_dict(self) -> Dict[str, dict]:
|
|
622
632
|
"""
|
|
@@ -647,6 +657,9 @@ class TaskGraph:
|
|
|
647
657
|
return format_structure_list_from_graph(self.structure_json)
|
|
648
658
|
|
|
649
659
|
def get_networkx_graph(self):
|
|
660
|
+
"""
|
|
661
|
+
获取任务图的 networkx 有向图(DiGraph)
|
|
662
|
+
"""
|
|
650
663
|
return format_networkx_graph(self.structure_json)
|
|
651
664
|
|
|
652
665
|
def get_fallback_path(self) -> str:
|
|
@@ -668,55 +681,10 @@ class TaskGraph:
|
|
|
668
681
|
"""
|
|
669
682
|
分析任务图,计算 DAG 属性和层级信息
|
|
670
683
|
"""
|
|
671
|
-
networkx_graph = self.get_networkx_graph()
|
|
684
|
+
self.networkx_graph = self.get_networkx_graph()
|
|
672
685
|
self.layers_dict = {}
|
|
673
686
|
|
|
674
|
-
self.isDAG = is_directed_acyclic_graph(networkx_graph)
|
|
687
|
+
self.isDAG = is_directed_acyclic_graph(self.networkx_graph)
|
|
675
688
|
if self.isDAG:
|
|
676
|
-
stage_level_dict = compute_node_levels(networkx_graph)
|
|
689
|
+
stage_level_dict = compute_node_levels(self.networkx_graph)
|
|
677
690
|
self.layers_dict = cluster_by_value_sorted(stage_level_dict)
|
|
678
|
-
|
|
679
|
-
def test_methods(
|
|
680
|
-
self,
|
|
681
|
-
init_tasks_dict: Dict[str, List],
|
|
682
|
-
stage_modes: list = None,
|
|
683
|
-
execution_modes: list = None,
|
|
684
|
-
) -> Dict[str, Any]:
|
|
685
|
-
"""
|
|
686
|
-
测试 TaskGraph 在 'serial' 和 'process' 模式下的执行时间。
|
|
687
|
-
|
|
688
|
-
:param init_tasks_dict: 初始化任务字典
|
|
689
|
-
:param stage_modes: 阶段模式列表,默认为 ['serial', 'process']
|
|
690
|
-
:param execution_modes: 执行模式列表,默认为 ['serial', 'thread']
|
|
691
|
-
:return: 包含两种执行模式下的执行时间的字典
|
|
692
|
-
"""
|
|
693
|
-
results = {}
|
|
694
|
-
test_table_list = []
|
|
695
|
-
fail_by_error_dict = {}
|
|
696
|
-
fail_by_stage_dict = {}
|
|
697
|
-
|
|
698
|
-
stage_modes = stage_modes or ["serial", "process"]
|
|
699
|
-
execution_modes = execution_modes or ["serial", "thread"]
|
|
700
|
-
for stage_mode in stage_modes:
|
|
701
|
-
time_list = []
|
|
702
|
-
for execution_mode in execution_modes:
|
|
703
|
-
start_time = time.time()
|
|
704
|
-
self.init_env()
|
|
705
|
-
self.set_graph_mode(stage_mode, execution_mode)
|
|
706
|
-
self.start_graph(init_tasks_dict)
|
|
707
|
-
fail_by_stage_dict.update(self.get_fail_by_stage_dict())
|
|
708
|
-
fail_by_error_dict.update(self.get_fail_by_error_dict())
|
|
709
|
-
|
|
710
|
-
time_list.append(time.time() - start_time)
|
|
711
|
-
|
|
712
|
-
test_table_list.append(time_list)
|
|
713
|
-
|
|
714
|
-
results["Time table"] = (
|
|
715
|
-
test_table_list,
|
|
716
|
-
stage_modes,
|
|
717
|
-
execution_modes,
|
|
718
|
-
r"stage\execution",
|
|
719
|
-
)
|
|
720
|
-
results["Fail stage dict"] = fail_by_stage_dict
|
|
721
|
-
results["Fail error dict"] = fail_by_error_dict
|
|
722
|
-
return results
|
|
@@ -59,7 +59,7 @@ def format_structure_list_from_graph(root_roots: List[Dict] = None) -> List[str]
|
|
|
59
59
|
|
|
60
60
|
def node_label(node: Dict) -> str:
|
|
61
61
|
visited_note = " [Ref]" if node.get("is_ref") else ""
|
|
62
|
-
N = node.get("
|
|
62
|
+
N = node.get("name", "?") # N
|
|
63
63
|
F = node.get("func_name", "?") # F
|
|
64
64
|
S = node.get("stage_mode", "?") # S
|
|
65
65
|
E = node.get("execution_mode", "?") # E
|
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
# persistence/__init__.py
|
|
2
|
+
from .base import BaseListener, BaseSinker
|
|
2
3
|
from .fail import FailListener, FailSinker
|
|
3
4
|
from .log import LogListener, LogSinker
|
|
4
5
|
|
|
5
6
|
__all__ = [
|
|
7
|
+
"BaseListener",
|
|
8
|
+
"BaseSinker",
|
|
6
9
|
"FailListener",
|
|
7
10
|
"FailSinker",
|
|
8
11
|
"LogListener",
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# persistence/base.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from multiprocessing import Queue as MPQueue
|
|
5
|
+
from queue import Empty
|
|
6
|
+
from threading import Thread
|
|
7
|
+
|
|
8
|
+
from ..runtime.tools import cleanup_mpqueue
|
|
9
|
+
from ..runtime.types import TerminationSignal, TERMINATION_SIGNAL
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BaseListener:
|
|
13
|
+
def __init__(self):
|
|
14
|
+
self.queue = MPQueue()
|
|
15
|
+
self._thread: Thread | None = None
|
|
16
|
+
|
|
17
|
+
def _before_start(self):
|
|
18
|
+
return None
|
|
19
|
+
|
|
20
|
+
def _handle_record(self, record):
|
|
21
|
+
raise NotImplementedError
|
|
22
|
+
|
|
23
|
+
def _after_stop(self):
|
|
24
|
+
return None
|
|
25
|
+
|
|
26
|
+
def start(self):
|
|
27
|
+
self._before_start()
|
|
28
|
+
if self._thread is None or not self._thread.is_alive():
|
|
29
|
+
self._thread = Thread(target=self._listen, daemon=True)
|
|
30
|
+
self._thread.start()
|
|
31
|
+
|
|
32
|
+
def _listen(self):
|
|
33
|
+
while True:
|
|
34
|
+
try:
|
|
35
|
+
record = self.queue.get(timeout=0.5)
|
|
36
|
+
if isinstance(record, TerminationSignal):
|
|
37
|
+
break
|
|
38
|
+
self._handle_record(record)
|
|
39
|
+
except Empty:
|
|
40
|
+
continue
|
|
41
|
+
|
|
42
|
+
def get_queue(self):
|
|
43
|
+
return self.queue
|
|
44
|
+
|
|
45
|
+
def stop(self):
|
|
46
|
+
if self._thread is None:
|
|
47
|
+
return
|
|
48
|
+
|
|
49
|
+
self.queue.put(TERMINATION_SIGNAL)
|
|
50
|
+
self._thread.join()
|
|
51
|
+
self._thread = None
|
|
52
|
+
cleanup_mpqueue(self.queue)
|
|
53
|
+
self._after_stop()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class BaseSinker:
|
|
57
|
+
def __init__(self, queue):
|
|
58
|
+
self.queue: MPQueue = queue
|
|
59
|
+
|
|
60
|
+
def _sink(self, record):
|
|
61
|
+
self.queue.put(record)
|
|
@@ -1,77 +1,63 @@
|
|
|
1
1
|
# persistence/fail.py
|
|
2
|
-
import
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
3
4
|
from datetime import datetime
|
|
4
|
-
from multiprocessing import Queue as MPQueue
|
|
5
|
-
from queue import Empty
|
|
6
|
-
from threading import Lock, Thread
|
|
7
5
|
|
|
8
|
-
from ..runtime.tools import cleanup_mpqueue
|
|
9
|
-
from ..runtime.types import TerminationSignal, TERMINATION_SIGNAL
|
|
10
6
|
from ..utils.format import format_repr
|
|
11
|
-
from .
|
|
7
|
+
from .base import BaseListener, BaseSinker
|
|
12
8
|
|
|
13
9
|
|
|
14
|
-
class FailListener:
|
|
10
|
+
class FailListener(BaseListener):
|
|
15
11
|
def __init__(self, error_source: str):
|
|
12
|
+
super().__init__()
|
|
13
|
+
|
|
16
14
|
self.error_source = error_source
|
|
17
|
-
self.fail_queue =
|
|
18
|
-
self.
|
|
19
|
-
|
|
15
|
+
self.fail_queue = self.queue
|
|
16
|
+
self.jsonl_path: Path | None = None
|
|
17
|
+
|
|
20
18
|
self.total_error_num = 0
|
|
21
|
-
self.
|
|
19
|
+
self._file = None
|
|
22
20
|
|
|
23
|
-
def
|
|
21
|
+
def _before_start(self):
|
|
22
|
+
# 创建 fallback 目录
|
|
24
23
|
now = datetime.now()
|
|
25
24
|
date_str = now.strftime("%Y-%m-%d")
|
|
26
25
|
time_str = now.strftime("%H-%M-%S-%f")[:-3]
|
|
27
|
-
self.
|
|
26
|
+
self.jsonl_path = Path(
|
|
28
27
|
f"./fallback/{date_str}/{self.error_source}({time_str}).jsonl"
|
|
29
28
|
)
|
|
30
|
-
self.
|
|
29
|
+
self.jsonl_path.parent.mkdir(parents=True, exist_ok=True)
|
|
31
30
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
self._thread.start()
|
|
31
|
+
# 打开失败记录文件
|
|
32
|
+
self._file = self.jsonl_path.open("a", encoding="utf-8")
|
|
35
33
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
try:
|
|
39
|
-
record = self.fail_queue.get(timeout=0.5)
|
|
40
|
-
if isinstance(record, TerminationSignal):
|
|
41
|
-
break
|
|
42
|
-
append_jsonl_log(record, self.fallback_path)
|
|
43
|
-
if isinstance(record, dict) and record.get("error_id") is not None:
|
|
44
|
-
with self._counter_lock:
|
|
45
|
-
self.total_error_num += 1
|
|
46
|
-
except Empty:
|
|
47
|
-
continue
|
|
34
|
+
# 初始化错误计数器
|
|
35
|
+
self.total_error_num = 0
|
|
48
36
|
|
|
49
|
-
def
|
|
50
|
-
|
|
37
|
+
def _handle_record(self, record):
|
|
38
|
+
jsonl_record = json.dumps(record, ensure_ascii=False)
|
|
51
39
|
|
|
52
|
-
|
|
53
|
-
|
|
40
|
+
self._file.write(f"{jsonl_record}\n")
|
|
41
|
+
self._file.flush()
|
|
54
42
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
return
|
|
43
|
+
if isinstance(record, dict) and record.get("error_id") is not None:
|
|
44
|
+
self.total_error_num += 1
|
|
58
45
|
|
|
59
|
-
|
|
60
|
-
self.
|
|
61
|
-
|
|
62
|
-
|
|
46
|
+
def _after_stop(self):
|
|
47
|
+
if self._file:
|
|
48
|
+
self._file.flush()
|
|
49
|
+
self._file.close()
|
|
50
|
+
self._file = None
|
|
63
51
|
|
|
64
52
|
|
|
65
|
-
class FailSinker:
|
|
53
|
+
class FailSinker(BaseSinker):
|
|
66
54
|
"""
|
|
67
55
|
多进程安全失败记录包装类,所有失败记录通过队列发送到监听进程写入
|
|
68
56
|
"""
|
|
69
57
|
|
|
70
58
|
def __init__(self, fail_queue):
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
def _sink(self, record: dict):
|
|
74
|
-
self.fail_queue.put(record)
|
|
59
|
+
super().__init__(fail_queue)
|
|
60
|
+
self.fail_queue = self.queue
|
|
75
61
|
|
|
76
62
|
def start_graph(self, structure_json):
|
|
77
63
|
"""
|
|
@@ -1,57 +1,10 @@
|
|
|
1
1
|
# persistence/jsonl.py
|
|
2
2
|
import json
|
|
3
3
|
import ast
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
from collections.abc import Iterable
|
|
6
4
|
from collections import defaultdict
|
|
7
5
|
from typing import Dict, Any, List, Optional
|
|
8
6
|
|
|
9
|
-
|
|
10
7
|
# ======== jsonl文件处理 ========
|
|
11
|
-
def append_jsonl_log(log_data: dict, file_path: str, logger=None):
|
|
12
|
-
"""
|
|
13
|
-
将日志字典写入指定目录下的 JSONL 文件。
|
|
14
|
-
|
|
15
|
-
:param log_data: 要写入的日志项(字典)
|
|
16
|
-
:param start_time: 运行开始时间,用于构造路径
|
|
17
|
-
:param base_path: 基础路径,例如 './fallback'
|
|
18
|
-
:param prefix: 文件名前缀,例如 'realtime_errors'
|
|
19
|
-
:param logger: 可选的日志对象用于记录失败信息
|
|
20
|
-
"""
|
|
21
|
-
try:
|
|
22
|
-
file_path: Path = Path(file_path)
|
|
23
|
-
file_path.parent.mkdir(parents=True, exist_ok=True)
|
|
24
|
-
|
|
25
|
-
with open(file_path, "a", encoding="utf-8") as f:
|
|
26
|
-
f.write(json.dumps(log_data, ensure_ascii=False) + "\n")
|
|
27
|
-
except Exception as e:
|
|
28
|
-
if logger:
|
|
29
|
-
logger._log("WARNING", f"[Persist] 写入日志失败: {e}")
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def append_jsonl_logs(log_items: Iterable[dict], file_path: str, logger=None):
|
|
33
|
-
"""
|
|
34
|
-
将多条日志一次性写入 JSONL 文件(batch 追加)。
|
|
35
|
-
|
|
36
|
-
:param log_items: Iterable[dict],每个元素写成一行 JSON
|
|
37
|
-
:param file_path: JSONL 文件路径
|
|
38
|
-
:param logger: 可选日志对象
|
|
39
|
-
"""
|
|
40
|
-
try:
|
|
41
|
-
if not isinstance(log_items, Iterable):
|
|
42
|
-
raise TypeError("log_items must be an iterable of dict")
|
|
43
|
-
|
|
44
|
-
file_path: Path = Path(file_path)
|
|
45
|
-
file_path.parent.mkdir(parents=True, exist_ok=True)
|
|
46
|
-
|
|
47
|
-
with open(file_path, "a", encoding="utf-8") as f:
|
|
48
|
-
for item in log_items:
|
|
49
|
-
if not isinstance(item, dict):
|
|
50
|
-
raise TypeError(f"each log item must be dict, got {type(item)}")
|
|
51
|
-
f.write(json.dumps(item, ensure_ascii=False) + "\n")
|
|
52
|
-
except Exception as e:
|
|
53
|
-
if logger:
|
|
54
|
-
logger._log("WARNING", f"[Persist] 批量写入日志失败: {e}")
|
|
55
8
|
|
|
56
9
|
|
|
57
10
|
def load_jsonl_logs(
|