dtflow 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dtflow/__init__.py +17 -1
- dtflow/__main__.py +292 -239
- dtflow/cli/__init__.py +8 -2
- dtflow/cli/commands.py +1030 -92
- dtflow/core.py +96 -31
- dtflow/lineage.py +407 -0
- dtflow/mcp/cli.py +14 -14
- dtflow/pipeline.py +450 -0
- dtflow/storage/io.py +376 -370
- dtflow/streaming.py +661 -0
- dtflow/tokenizers.py +188 -51
- dtflow/utils/display.py +5 -4
- {dtflow-0.3.0.dist-info → dtflow-0.3.1.dist-info}/METADATA +153 -7
- dtflow-0.3.1.dist-info/RECORD +24 -0
- dtflow-0.3.0.dist-info/RECORD +0 -21
- {dtflow-0.3.0.dist-info → dtflow-0.3.1.dist-info}/WHEEL +0 -0
- {dtflow-0.3.0.dist-info → dtflow-0.3.1.dist-info}/entry_points.txt +0 -0
dtflow/core.py
CHANGED
|
@@ -6,29 +6,16 @@ DataTransformer 核心模块
|
|
|
6
6
|
from typing import List, Dict, Any, Optional, Callable, Union, Tuple, Literal
|
|
7
7
|
from copy import deepcopy
|
|
8
8
|
from dataclasses import dataclass
|
|
9
|
-
import json
|
|
10
9
|
|
|
11
|
-
|
|
10
|
+
import orjson
|
|
12
11
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
import orjson
|
|
16
|
-
_HAS_ORJSON = True
|
|
17
|
-
except ImportError:
|
|
18
|
-
_HAS_ORJSON = False
|
|
12
|
+
from .storage.io import save_data, load_data
|
|
13
|
+
from .lineage import LineageTracker
|
|
19
14
|
|
|
20
15
|
|
|
21
16
|
def _fast_json_dumps(obj: Any) -> str:
|
|
22
|
-
"""
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
orjson 比标准 json 快约 10 倍,特别适合大量数据的序列化场景。
|
|
26
|
-
"""
|
|
27
|
-
if _HAS_ORJSON:
|
|
28
|
-
# orjson.dumps 返回 bytes,需要 decode
|
|
29
|
-
return orjson.dumps(obj, option=orjson.OPT_SORT_KEYS).decode('utf-8')
|
|
30
|
-
else:
|
|
31
|
-
return json.dumps(obj, sort_keys=True, ensure_ascii=False)
|
|
17
|
+
"""快速 JSON 序列化(使用 orjson,比标准 json 快约 10 倍)"""
|
|
18
|
+
return orjson.dumps(obj, option=orjson.OPT_SORT_KEYS).decode('utf-8')
|
|
32
19
|
|
|
33
20
|
|
|
34
21
|
# ============ 错误处理 ============
|
|
@@ -102,8 +89,15 @@ class DataTransformer:
|
|
|
102
89
|
- fields/stats: 数据信息
|
|
103
90
|
"""
|
|
104
91
|
|
|
105
|
-
def __init__(
|
|
92
|
+
def __init__(
|
|
93
|
+
self,
|
|
94
|
+
data: Optional[List[Dict[str, Any]]] = None,
|
|
95
|
+
_source_path: Optional[str] = None,
|
|
96
|
+
_lineage_tracker: Optional[LineageTracker] = None,
|
|
97
|
+
):
|
|
106
98
|
self._data = data if data is not None else []
|
|
99
|
+
self._source_path = _source_path
|
|
100
|
+
self._lineage_tracker = _lineage_tracker
|
|
107
101
|
|
|
108
102
|
@property
|
|
109
103
|
def data(self) -> List[Dict[str, Any]]:
|
|
@@ -122,23 +116,38 @@ class DataTransformer:
|
|
|
122
116
|
# ============ 加载/保存 ============
|
|
123
117
|
|
|
124
118
|
@classmethod
|
|
125
|
-
def load(cls, filepath: str) -> 'DataTransformer':
|
|
119
|
+
def load(cls, filepath: str, track_lineage: bool = False) -> 'DataTransformer':
|
|
126
120
|
"""
|
|
127
121
|
从文件加载数据。
|
|
128
122
|
|
|
129
123
|
支持格式: jsonl, json, csv, parquet(自动检测)
|
|
124
|
+
|
|
125
|
+
Args:
|
|
126
|
+
filepath: 文件路径
|
|
127
|
+
track_lineage: 是否追踪血缘(默认 False)
|
|
130
128
|
"""
|
|
131
129
|
data = load_data(filepath)
|
|
132
|
-
|
|
130
|
+
tracker = LineageTracker(filepath) if track_lineage else None
|
|
131
|
+
return cls(data, _source_path=filepath, _lineage_tracker=tracker)
|
|
133
132
|
|
|
134
|
-
def save(self, filepath: str) -> None:
|
|
133
|
+
def save(self, filepath: str, lineage: bool = False) -> None:
|
|
135
134
|
"""
|
|
136
135
|
保存数据到文件。
|
|
137
136
|
|
|
138
137
|
支持格式: jsonl, json, csv, parquet(根据扩展名)
|
|
138
|
+
|
|
139
|
+
Args:
|
|
140
|
+
filepath: 文件路径
|
|
141
|
+
lineage: 是否保存血缘元数据(默认 False)
|
|
139
142
|
"""
|
|
140
143
|
save_data(self._data, filepath)
|
|
141
144
|
|
|
145
|
+
# 保存血缘记录
|
|
146
|
+
if lineage and self._lineage_tracker:
|
|
147
|
+
lineage_path = self._lineage_tracker.save(filepath, len(self._data))
|
|
148
|
+
import sys
|
|
149
|
+
print(f"📜 血缘记录已保存: {lineage_path}", file=sys.stderr)
|
|
150
|
+
|
|
142
151
|
# ============ 核心转换 ============
|
|
143
152
|
|
|
144
153
|
def to(
|
|
@@ -230,7 +239,16 @@ class DataTransformer:
|
|
|
230
239
|
>>> # 原始模式(大数据集推荐)
|
|
231
240
|
>>> dt.transform(lambda x: {"q": x["q"]}, raw=True).save("output.jsonl")
|
|
232
241
|
"""
|
|
233
|
-
|
|
242
|
+
input_count = len(self._data)
|
|
243
|
+
result = self.to(func, on_error=on_error, raw=raw)
|
|
244
|
+
output_count = len(result)
|
|
245
|
+
|
|
246
|
+
# 传递血缘追踪器并记录操作
|
|
247
|
+
tracker = self._lineage_tracker
|
|
248
|
+
if tracker:
|
|
249
|
+
tracker.record("transform", {"func": func}, input_count, output_count)
|
|
250
|
+
|
|
251
|
+
return DataTransformer(result, _lineage_tracker=tracker)
|
|
234
252
|
|
|
235
253
|
# ============ 数据筛选 ============
|
|
236
254
|
|
|
@@ -281,7 +299,12 @@ class DataTransformer:
|
|
|
281
299
|
if errors:
|
|
282
300
|
_print_error_summary(errors, len(self._data))
|
|
283
301
|
|
|
284
|
-
|
|
302
|
+
# 传递血缘追踪器并记录操作
|
|
303
|
+
tracker = self._lineage_tracker
|
|
304
|
+
if tracker:
|
|
305
|
+
tracker.record("filter", {"func": func}, len(self._data), len(filtered))
|
|
306
|
+
|
|
307
|
+
return DataTransformer(filtered, _lineage_tracker=tracker)
|
|
285
308
|
|
|
286
309
|
def sample(self, n: int, seed: Optional[int] = None) -> 'DataTransformer':
|
|
287
310
|
"""
|
|
@@ -295,16 +318,30 @@ class DataTransformer:
|
|
|
295
318
|
if seed is not None:
|
|
296
319
|
random.seed(seed)
|
|
297
320
|
|
|
321
|
+
input_count = len(self._data)
|
|
298
322
|
data = self._data[:] if n >= len(self._data) else random.sample(self._data, n)
|
|
299
|
-
|
|
323
|
+
|
|
324
|
+
tracker = self._lineage_tracker
|
|
325
|
+
if tracker:
|
|
326
|
+
tracker.record("sample", {"n": n, "seed": seed}, input_count, len(data))
|
|
327
|
+
|
|
328
|
+
return DataTransformer(data, _lineage_tracker=tracker)
|
|
300
329
|
|
|
301
330
|
def head(self, n: int = 10) -> 'DataTransformer':
|
|
302
331
|
"""取前 n 条"""
|
|
303
|
-
|
|
332
|
+
data = self._data[:n]
|
|
333
|
+
tracker = self._lineage_tracker
|
|
334
|
+
if tracker:
|
|
335
|
+
tracker.record("head", {"n": n}, len(self._data), len(data))
|
|
336
|
+
return DataTransformer(data, _lineage_tracker=tracker)
|
|
304
337
|
|
|
305
338
|
def tail(self, n: int = 10) -> 'DataTransformer':
|
|
306
339
|
"""取后 n 条"""
|
|
307
|
-
|
|
340
|
+
data = self._data[-n:]
|
|
341
|
+
tracker = self._lineage_tracker
|
|
342
|
+
if tracker:
|
|
343
|
+
tracker.record("tail", {"n": n}, len(self._data), len(data))
|
|
344
|
+
return DataTransformer(data, _lineage_tracker=tracker)
|
|
308
345
|
|
|
309
346
|
def dedupe(
|
|
310
347
|
self,
|
|
@@ -338,7 +375,11 @@ class DataTransformer:
|
|
|
338
375
|
seen.add(k)
|
|
339
376
|
result.append(item)
|
|
340
377
|
|
|
341
|
-
|
|
378
|
+
tracker = self._lineage_tracker
|
|
379
|
+
if tracker:
|
|
380
|
+
tracker.record("dedupe", {"key": key}, len(self._data), len(result))
|
|
381
|
+
|
|
382
|
+
return DataTransformer(result, _lineage_tracker=tracker)
|
|
342
383
|
|
|
343
384
|
def _get_dedupe_key(
|
|
344
385
|
self,
|
|
@@ -442,7 +483,17 @@ class DataTransformer:
|
|
|
442
483
|
|
|
443
484
|
# 按原顺序保留数据
|
|
444
485
|
result = [self._data[i] for i in sorted(keep_indices)]
|
|
445
|
-
|
|
486
|
+
|
|
487
|
+
tracker = self._lineage_tracker
|
|
488
|
+
if tracker:
|
|
489
|
+
tracker.record(
|
|
490
|
+
"dedupe_similar",
|
|
491
|
+
{"key": key, "threshold": threshold, "num_perm": num_perm, "ngram": ngram},
|
|
492
|
+
len(self._data),
|
|
493
|
+
len(result),
|
|
494
|
+
)
|
|
495
|
+
|
|
496
|
+
return DataTransformer(result, _lineage_tracker=tracker)
|
|
446
497
|
|
|
447
498
|
def _get_text_for_similarity(
|
|
448
499
|
self,
|
|
@@ -581,7 +632,12 @@ class DataTransformer:
|
|
|
581
632
|
if seed is not None:
|
|
582
633
|
random.seed(seed)
|
|
583
634
|
random.shuffle(data)
|
|
584
|
-
|
|
635
|
+
|
|
636
|
+
tracker = self._lineage_tracker
|
|
637
|
+
if tracker:
|
|
638
|
+
tracker.record("shuffle", {"seed": seed}, len(self._data), len(data))
|
|
639
|
+
|
|
640
|
+
return DataTransformer(data, _lineage_tracker=tracker)
|
|
585
641
|
|
|
586
642
|
def split(self, ratio: float = 0.8, seed: Optional[int] = None) -> tuple:
|
|
587
643
|
"""
|
|
@@ -596,7 +652,16 @@ class DataTransformer:
|
|
|
596
652
|
"""
|
|
597
653
|
data = self.shuffle(seed).data
|
|
598
654
|
split_idx = int(len(data) * ratio)
|
|
599
|
-
|
|
655
|
+
|
|
656
|
+
# 分割后血缘追踪器各自独立
|
|
657
|
+
tracker = self._lineage_tracker
|
|
658
|
+
if tracker:
|
|
659
|
+
tracker.record("split", {"ratio": ratio, "seed": seed}, len(self._data), len(data))
|
|
660
|
+
|
|
661
|
+
return (
|
|
662
|
+
DataTransformer(data[:split_idx], _lineage_tracker=tracker),
|
|
663
|
+
DataTransformer(data[split_idx:], _lineage_tracker=tracker),
|
|
664
|
+
)
|
|
600
665
|
|
|
601
666
|
# ============ 并行处理 ============
|
|
602
667
|
|
dtflow/lineage.py
ADDED
|
@@ -0,0 +1,407 @@
|
|
|
1
|
+
"""
|
|
2
|
+
数据血缘模块
|
|
3
|
+
|
|
4
|
+
记录数据处理的完整历史,支持数据溯源和版本对比。
|
|
5
|
+
"""
|
|
6
|
+
import hashlib
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
import orjson
|
|
10
|
+
import platform
|
|
11
|
+
import time
|
|
12
|
+
from datetime import datetime
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any, Dict, List, Optional, Union
|
|
15
|
+
|
|
16
|
+
# 血缘元数据版本
|
|
17
|
+
LINEAGE_VERSION = "1.0"
|
|
18
|
+
|
|
19
|
+
# 元数据文件后缀
|
|
20
|
+
LINEAGE_SUFFIX = ".lineage.json"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _get_file_hash(filepath: str, sample_size: int = 10000) -> str:
    """
    Compute a short content fingerprint of a file.

    Large files are sampled (head, middle and tail) so that hashing stays
    cheap; files of at most ``3 * sample_size`` bytes are hashed in full, so
    that every byte of a small file contributes to the fingerprint.

    Args:
        filepath: Path of the file to fingerprint.
        sample_size: Number of bytes read per sample.

    Returns:
        The first 16 hex characters of the SHA-256 digest.

    Raises:
        OSError: If the file cannot be stat'ed or opened.
    """
    hasher = hashlib.sha256()
    file_size = os.path.getsize(filepath)

    with open(filepath, "rb") as f:
        if file_size <= sample_size * 3:
            # Small file: the three samples would overlap or miss bytes, so
            # hash everything (bounded by 3 * sample_size bytes).
            hasher.update(f.read())
        else:
            # Head sample.
            hasher.update(f.read(sample_size))
            # Middle sample.
            f.seek(file_size // 2)
            hasher.update(f.read(sample_size))
            # Tail sample.
            f.seek(-sample_size, os.SEEK_END)
            hasher.update(f.read(sample_size))

    return hasher.hexdigest()[:16]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _get_lineage_path(data_path: str) -> str:
    """Return the lineage sidecar path for *data_path* (the path plus LINEAGE_SUFFIX)."""
    base = str(data_path)
    return "".join((base, LINEAGE_SUFFIX))
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _get_environment_info() -> Dict[str, str]:
    """Collect a small description of the runtime environment.

    Returns:
        A dict with the Python version, the OS name, the host name and the
        current user (taken from ``USER``, then ``USERNAME``, else "unknown").
    """
    env = os.environ
    if "USER" in env:
        user = env["USER"]
    else:
        user = env.get("USERNAME", "unknown")

    info: Dict[str, str] = {}
    info["python_version"] = platform.python_version()
    info["platform"] = platform.system()
    info["hostname"] = platform.node()
    info["user"] = user
    return info
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class LineageRecord:
    """A single lineage record: where data came from and what was done to it."""

    def __init__(
        self,
        source: Optional[str] = None,
        operations: Optional[List[Dict[str, Any]]] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        # Format version and creation time are stamped at construction.
        self.version = LINEAGE_VERSION
        self.created_at = datetime.now().isoformat()
        # Note: LineageTracker.build_record passes a dict (path/hash/...) here.
        self.source = source
        self.operations = operations if operations else []
        self.metadata = metadata if metadata else {}
        self.environment = _get_environment_info()

    def add_operation(
        self,
        op_type: str,
        params: Optional[Dict[str, Any]] = None,
        input_count: Optional[int] = None,
        output_count: Optional[int] = None,
    ) -> "LineageRecord":
        """Append one operation entry and return self for chaining.

        ``params`` is stored only when non-empty; the counts are stored
        whenever they are not None.
        """
        entry = {"type": op_type, "timestamp": datetime.now().isoformat()}
        if params:
            entry["params"] = params
        for field, count in (("input_count", input_count), ("output_count", output_count)):
            if count is not None:
                entry[field] = count

        self.operations.append(entry)
        return self

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain dict."""
        fields = ("version", "created_at", "source", "operations", "metadata", "environment")
        return {name: getattr(self, name) for name in fields}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "LineageRecord":
        """Build a record from a dict, filling defaults for missing keys."""
        record = cls(
            data.get("source"),
            data.get("operations", []),
            data.get("metadata", {}),
        )
        # Override the values stamped by __init__ with the stored ones.
        record.version = data.get("version", LINEAGE_VERSION)
        record.created_at = data.get("created_at", datetime.now().isoformat())
        record.environment = data.get("environment", {})
        return record
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class LineageTracker:
    """
    Lineage tracker.

    Accumulates the operations applied to a dataset and can turn them into a
    LineageRecord that is written next to the output file.
    """

    def __init__(self, source_path: Optional[str] = None):
        """
        Initialize the tracker.

        Args:
            source_path: Path of the source data file.
        """
        self.source_path = source_path
        self.source_lineage = None
        self.operations: List[Dict[str, Any]] = []

        # If the source file already has a lineage record, load it.
        if source_path:
            self.source_lineage = load_lineage(source_path)

    def record(
        self,
        op_type: str,
        params: Optional[Dict[str, Any]] = None,
        input_count: Optional[int] = None,
        output_count: Optional[int] = None,
    ) -> "LineageTracker":
        """
        Record one operation.

        Args:
            op_type: Operation type (filter, transform, dedupe, sample, etc.)
            params: Operation parameters (stored only when non-empty).
            input_count: Number of input items.
            output_count: Number of output items.

        Returns:
            self, to allow chained calls.
        """
        op = {
            "type": op_type,
            "timestamp": datetime.now().isoformat(),
        }
        if params:
            # Strip anything that cannot be serialized (e.g. functions).
            op["params"] = _sanitize_params(params)
        if input_count is not None:
            op["input_count"] = input_count
        if output_count is not None:
            op["output_count"] = output_count

        self.operations.append(op)
        return self

    def build_record(self, output_path: str, output_count: int) -> "LineageRecord":
        """
        Build the final lineage record.

        Args:
            output_path: Path of the output file.
            output_count: Number of output items.

        Returns:
            A LineageRecord holding a snapshot of the operations recorded so far.
        """
        source_info = None
        if self.source_path:
            source_info = {
                "path": str(self.source_path),
                # Only regular files can be hashed; anything else gets None.
                "hash": (
                    _get_file_hash(self.source_path)
                    if os.path.isfile(self.source_path)
                    else None
                ),
            }
            # If the source itself has lineage, reference it to form a chain.
            if self.source_lineage:
                source_info["lineage_ref"] = _get_lineage_path(self.source_path)

        return LineageRecord(
            source=source_info,
            # Copy the list so later record() calls on this tracker do not
            # leak into a record that has already been built.
            operations=list(self.operations),
            metadata={
                "output_path": str(output_path),
                "output_count": output_count,
            },
        )

    def save(self, output_path: str, output_count: int) -> str:
        """
        Save the lineage record to a file.

        Args:
            output_path: Path of the output data file.
            output_count: Number of output items.

        Returns:
            Path of the lineage file that was written.
        """
        record = self.build_record(output_path, output_count)
        lineage_path = _get_lineage_path(output_path)

        with open(lineage_path, "wb") as f:
            f.write(orjson.dumps(record.to_dict(), option=orjson.OPT_INDENT_2))

        return lineage_path
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def _sanitize_params(params: Dict[str, Any]) -> Dict[str, Any]:
    """
    Sanitize a parameter dict so it can be serialized to JSON.

    Every value is passed through ``_sanitize_value``. Keys that are not
    strings are converted with ``str()`` because the JSON encoder only
    accepts string keys by default.

    Args:
        params: Operation parameters, possibly containing callables or
            other non-serializable objects.

    Returns:
        A new dict; the input is not modified.
    """
    return {
        key if isinstance(key, str) else str(key): _sanitize_value(value)
        for key, value in params.items()
    }


def _sanitize_value(value: Any) -> Any:
    """
    Sanitize a single value.

    Callables become ``"<function:NAME>"``, primitives are kept, lists and
    tuples are sanitized element by element (at any nesting depth), dicts
    are sanitized recursively, and anything else is replaced by ``str(value)``.
    """
    if callable(value):
        # Functions: record only the name.
        return f"<function:{getattr(value, '__name__', 'anonymous')}>"
    if isinstance(value, (str, int, float, bool, type(None))):
        return value
    if isinstance(value, (list, tuple)):
        return [_sanitize_value(item) for item in value]
    if isinstance(value, dict):
        return _sanitize_params(value)
    return str(value)
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
# ============ 公共 API ============
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def load_lineage(data_path: str) -> Optional["LineageRecord"]:
    """
    Load the lineage record that belongs to a data file.

    Args:
        data_path: Path of the data file.

    Returns:
        The LineageRecord, or None when the sidecar file is missing,
        unreadable, not valid JSON, or not a JSON object.
    """
    lineage_path = _get_lineage_path(data_path)

    try:
        with open(lineage_path, "rb") as f:
            data = orjson.loads(f.read())
    except (orjson.JSONDecodeError, OSError):
        # A missing file raises FileNotFoundError, which is an OSError.
        return None

    # A sidecar holding valid JSON that is not an object (e.g. a list) cannot
    # be turned into a record; treat it like a corrupt file.
    if not isinstance(data, dict):
        return None

    return LineageRecord.from_dict(data)
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def get_lineage_chain(data_path: str, max_depth: int = 10) -> List["LineageRecord"]:
    """
    Collect the full lineage chain of a data file.

    Starting at ``data_path``, load its lineage record, then follow the
    record's source path to the next record, and so on.

    Args:
        data_path: Path of the data file.
        max_depth: Maximum number of records to follow.

    Returns:
        Lineage records ordered from newest to oldest.
    """
    chain: List["LineageRecord"] = []
    current_path = data_path
    visited = set()

    for _ in range(max_depth):
        # Compare canonical paths so that different spellings of the same
        # file ("a.jsonl" vs "./a.jsonl") are recognised as a cycle.
        canonical = os.path.realpath(current_path)
        if canonical in visited:
            break  # avoid circular references
        visited.add(canonical)

        record = load_lineage(current_path)
        if not record:
            break

        chain.append(record)

        # Walk back to the source file, if the record names one that exists.
        source = record.source
        if not (source and isinstance(source, dict)):
            break
        source_path = source.get("path")
        if not (source_path and os.path.exists(source_path)):
            break
        current_path = source_path

    return chain
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def format_lineage_report(data_path: str) -> str:
    """
    Format a human-readable lineage report.

    Args:
        data_path: Path of the data file.

    Returns:
        The report as a single string, or a one-line message when the file
        has no lineage record.
    """
    chain = get_lineage_chain(data_path)

    if not chain:
        return f"文件 {data_path} 没有血缘记录"

    lines = []
    lines.append(f"📊 数据血缘报告: {data_path}")
    lines.append("=" * 60)

    for i, record in enumerate(chain):
        prefix = "└─" if i == len(chain) - 1 else "├─"
        indent = " " * i

        # Basic information.
        lines.append(f"{indent}{prefix} 版本 {i + 1}")
        lines.append(f"{indent} 创建时间: {record.created_at}")

        # Source information.
        if record.source:
            if isinstance(record.source, dict):
                lines.append(f"{indent} 来源: {record.source.get('path', 'unknown')}")
                if record.source.get("hash"):
                    lines.append(f"{indent} 哈希: {record.source['hash']}")
            else:
                lines.append(f"{indent} 来源: {record.source}")

        # Operation list.
        if record.operations:
            lines.append(f"{indent} 操作链:")
            for j, op in enumerate(record.operations):
                op_prefix = "└─" if j == len(record.operations) - 1 else "├─"
                op_type = op.get("type", "unknown")
                input_count = op.get("input_count", "?")
                output_count = op.get("output_count", "?")
                lines.append(f"{indent} {op_prefix} {op_type}: {input_count} → {output_count}")

                # Show parameters.
                if op.get("params"):
                    for key, value in op["params"].items():
                        lines.append(f"{indent} {key}: {value}")

        # Metadata.
        if record.metadata:
            output_count = record.metadata.get("output_count")
            # Compare against None so that an output of 0 items is still shown.
            if output_count is not None:
                lines.append(f"{indent} 输出数量: {output_count}")

        # Blank separator line after each record.
        lines.append("")

    return "\n".join(lines)
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def has_lineage(data_path: str) -> bool:
    """Return True when a lineage sidecar file exists for *data_path*."""
    sidecar = _get_lineage_path(data_path)
    return os.path.exists(sidecar)
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def delete_lineage(data_path: str) -> bool:
    """Delete the lineage sidecar of *data_path*.

    Returns:
        True when a sidecar existed and was removed, False when there was none.
    """
    sidecar = _get_lineage_path(data_path)
    if not os.path.exists(sidecar):
        return False
    os.remove(sidecar)
    return True
|
dtflow/mcp/cli.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
提供 MCP 服务的安装和管理命令。
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
|
-
import
|
|
6
|
+
import orjson
|
|
7
7
|
import os
|
|
8
8
|
import platform
|
|
9
9
|
from pathlib import Path
|
|
@@ -63,9 +63,9 @@ def _install_to_config(config_path: Path, name: str, target_name: str) -> bool:
|
|
|
63
63
|
config = {}
|
|
64
64
|
if config_path.exists():
|
|
65
65
|
try:
|
|
66
|
-
with open(config_path, "
|
|
67
|
-
config =
|
|
68
|
-
except
|
|
66
|
+
with open(config_path, "rb") as f:
|
|
67
|
+
config = orjson.loads(f.read())
|
|
68
|
+
except orjson.JSONDecodeError:
|
|
69
69
|
if console:
|
|
70
70
|
console.print(f"[yellow]警告:[/yellow] {target_name} 配置文件格式错误,将创建新配置")
|
|
71
71
|
else:
|
|
@@ -87,8 +87,8 @@ def _install_to_config(config_path: Path, name: str, target_name: str) -> bool:
|
|
|
87
87
|
|
|
88
88
|
# 写入配置
|
|
89
89
|
try:
|
|
90
|
-
with open(config_path, "
|
|
91
|
-
|
|
90
|
+
with open(config_path, "wb") as f:
|
|
91
|
+
f.write(orjson.dumps(config, option=orjson.OPT_INDENT_2))
|
|
92
92
|
return True
|
|
93
93
|
except Exception as e:
|
|
94
94
|
if console:
|
|
@@ -108,9 +108,9 @@ def _uninstall_from_config(config_path: Path, name: str, target_name: str) -> bo
|
|
|
108
108
|
return False
|
|
109
109
|
|
|
110
110
|
try:
|
|
111
|
-
with open(config_path, "
|
|
112
|
-
config =
|
|
113
|
-
except
|
|
111
|
+
with open(config_path, "rb") as f:
|
|
112
|
+
config = orjson.loads(f.read())
|
|
113
|
+
except orjson.JSONDecodeError:
|
|
114
114
|
return False
|
|
115
115
|
|
|
116
116
|
if "mcpServers" not in config or name not in config["mcpServers"]:
|
|
@@ -119,8 +119,8 @@ def _uninstall_from_config(config_path: Path, name: str, target_name: str) -> bo
|
|
|
119
119
|
del config["mcpServers"][name]
|
|
120
120
|
|
|
121
121
|
try:
|
|
122
|
-
with open(config_path, "
|
|
123
|
-
|
|
122
|
+
with open(config_path, "wb") as f:
|
|
123
|
+
f.write(orjson.dumps(config, option=orjson.OPT_INDENT_2))
|
|
124
124
|
return True
|
|
125
125
|
except Exception:
|
|
126
126
|
return False
|
|
@@ -141,9 +141,9 @@ def _show_config_status(config_path: Path, target_name: str):
|
|
|
141
141
|
return
|
|
142
142
|
|
|
143
143
|
try:
|
|
144
|
-
with open(config_path, "
|
|
145
|
-
config =
|
|
146
|
-
except
|
|
144
|
+
with open(config_path, "rb") as f:
|
|
145
|
+
config = orjson.loads(f.read())
|
|
146
|
+
except orjson.JSONDecodeError:
|
|
147
147
|
if console:
|
|
148
148
|
console.print(" [red]配置文件格式错误[/red]")
|
|
149
149
|
else:
|