dtflow 0.4.3__tar.gz → 0.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. {dtflow-0.4.3 → dtflow-0.5.2}/PKG-INFO +117 -2
  2. {dtflow-0.4.3 → dtflow-0.5.2}/README.md +106 -1
  3. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/__init__.py +34 -1
  4. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/__main__.py +22 -0
  5. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/cli/commands.py +5 -0
  6. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/cli/common.py +13 -9
  7. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/cli/stats.py +114 -36
  8. dtflow-0.5.2/dtflow/cli/validate.py +152 -0
  9. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/core.py +220 -10
  10. dtflow-0.5.2/dtflow/framework.py +610 -0
  11. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/lineage.py +17 -0
  12. dtflow-0.5.2/dtflow/schema.py +508 -0
  13. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/streaming.py +93 -35
  14. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/tokenizers.py +84 -29
  15. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/utils/field_path.py +6 -2
  16. {dtflow-0.4.3 → dtflow-0.5.2}/pyproject.toml +11 -0
  17. dtflow-0.5.2/tests/README.md +88 -0
  18. dtflow-0.5.2/tests/benchmark_sharegpt.py +392 -0
  19. dtflow-0.5.2/tests/test_cli_benchmark.py +565 -0
  20. dtflow-0.5.2/tests/test_framework.py +204 -0
  21. dtflow-0.5.2/tests/test_schema.py +547 -0
  22. {dtflow-0.4.3 → dtflow-0.5.2}/tests/test_streaming.py +80 -0
  23. {dtflow-0.4.3 → dtflow-0.5.2}/tests/test_transformer.py +77 -0
  24. {dtflow-0.4.3 → dtflow-0.5.2}/.gitignore +0 -0
  25. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/cli/__init__.py +0 -0
  26. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/cli/clean.py +0 -0
  27. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/cli/io_ops.py +0 -0
  28. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/cli/lineage.py +0 -0
  29. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/cli/pipeline.py +0 -0
  30. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/cli/sample.py +0 -0
  31. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/cli/transform.py +0 -0
  32. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/converters.py +0 -0
  33. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/mcp/__init__.py +0 -0
  34. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/mcp/__main__.py +0 -0
  35. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/mcp/cli.py +0 -0
  36. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/mcp/docs.py +0 -0
  37. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/mcp/server.py +0 -0
  38. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/pipeline.py +0 -0
  39. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/presets.py +0 -0
  40. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/storage/__init__.py +0 -0
  41. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/storage/io.py +0 -0
  42. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/utils/__init__.py +0 -0
  43. {dtflow-0.4.3 → dtflow-0.5.2}/dtflow/utils/display.py +0 -0
  44. {dtflow-0.4.3 → dtflow-0.5.2}/tests/benchmark_io.py +0 -0
  45. {dtflow-0.4.3 → dtflow-0.5.2}/tests/test_converters.py +0 -0
  46. {dtflow-0.4.3 → dtflow-0.5.2}/tests/test_field_path.py +0 -0
  47. {dtflow-0.4.3 → dtflow-0.5.2}/tests/test_io.py +0 -0
  48. {dtflow-0.4.3 → dtflow-0.5.2}/tests/test_lineage.py +0 -0
  49. {dtflow-0.4.3 → dtflow-0.5.2}/tests/test_pipeline.py +0 -0
  50. {dtflow-0.4.3 → dtflow-0.5.2}/tests/test_tokenizers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dtflow
3
- Version: 0.4.3
3
+ Version: 0.5.2
4
4
  Summary: A flexible data transformation tool for ML training formats (SFT, RLHF, Pretrain)
5
5
  Project-URL: Homepage, https://github.com/yourusername/DataTransformer
6
6
  Project-URL: Documentation, https://github.com/yourusername/DataTransformer#readme
@@ -32,16 +32,26 @@ Requires-Dist: orjson>=3.9.0
32
32
  Requires-Dist: polars>=0.20.0
33
33
  Requires-Dist: pyyaml>=5.4.0
34
34
  Requires-Dist: rich>=10.0.0
35
+ Requires-Dist: tiktoken>=0.5.0
35
36
  Requires-Dist: typer>=0.9.0
36
37
  Provides-Extra: converters
37
38
  Requires-Dist: datasets>=2.0.0; extra == 'converters'
38
39
  Provides-Extra: dev
39
40
  Requires-Dist: black>=21.0; extra == 'dev'
41
+ Requires-Dist: datasets>=2.0.0; extra == 'dev'
42
+ Requires-Dist: datasketch>=1.5.0; extra == 'dev'
40
43
  Requires-Dist: flake8>=3.9.0; extra == 'dev'
44
+ Requires-Dist: huggingface-hub>=0.20.0; extra == 'dev'
41
45
  Requires-Dist: isort>=5.9.0; extra == 'dev'
42
46
  Requires-Dist: mypy>=0.910; extra == 'dev'
47
+ Requires-Dist: pyarrow; extra == 'dev'
43
48
  Requires-Dist: pytest-cov>=2.12.0; extra == 'dev'
44
49
  Requires-Dist: pytest>=6.0.0; extra == 'dev'
50
+ Requires-Dist: rich>=10.0.0; extra == 'dev'
51
+ Requires-Dist: scikit-learn>=0.24.0; extra == 'dev'
52
+ Requires-Dist: tiktoken>=0.5.0; extra == 'dev'
53
+ Requires-Dist: tokenizers>=0.15.0; extra == 'dev'
54
+ Requires-Dist: toolong>=1.5.0; extra == 'dev'
45
55
  Provides-Extra: display
46
56
  Provides-Extra: docs
47
57
  Requires-Dist: myst-parser>=0.15.0; extra == 'docs'
@@ -129,7 +139,7 @@ dt.filter(lambda x: x.language == "zh")
129
139
  ### 数据验证
130
140
 
131
141
  ```python
132
- # 验证数据,返回不通过的记录列表
142
+ # 简单验证,返回不通过的记录列表
133
143
  errors = dt.validate(lambda x: len(x.messages) >= 2)
134
144
 
135
145
  if errors:
@@ -137,6 +147,53 @@ if errors:
137
147
  print(f"第 {e.index} 行: {e.error}")
138
148
  ```
139
149
 
150
+ ### Schema 验证
151
+
152
+ 使用 Schema 进行结构化数据验证:
153
+
154
+ ```python
155
+ from dtflow import Schema, Field, openai_chat_schema
156
+
157
+ # 使用预设 Schema
158
+ result = dt.validate_schema(openai_chat_schema)
159
+ print(result) # ValidationResult(valid=950, invalid=50, errors=[...])
160
+
161
+ # 自定义 Schema
162
+ schema = Schema({
163
+ "messages": Field(type="list", required=True, min_length=1),
164
+ "messages[*].role": Field(type="str", choices=["user", "assistant", "system"]),
165
+ "messages[*].content": Field(type="str", min_length=1),
166
+ "score": Field(type="float", min=0, max=1),
167
+ })
168
+
169
+ result = dt.validate_schema(schema)
170
+
171
+ # 过滤出有效数据
172
+ valid_dt = dt.validate_schema(schema, filter_invalid=True)
173
+ valid_dt.save("valid.jsonl")
174
+ ```
175
+
176
+ **预设 Schema**:
177
+
178
+ | Schema 名称 | 用途 |
179
+ |------------|------|
180
+ | `openai_chat_schema` | OpenAI messages 格式验证 |
181
+ | `alpaca_schema` | Alpaca instruction/output 格式 |
182
+ | `sharegpt_schema` | ShareGPT conversations 格式 |
183
+ | `dpo_schema` | DPO prompt/chosen/rejected 格式 |
184
+
185
+ **Field 参数**:
186
+
187
+ | 参数 | 说明 | 示例 |
188
+ |------|------|------|
189
+ | `type` | 类型验证 | `"str"`, `"int"`, `"float"`, `"bool"`, `"list"`, `"dict"` |
190
+ | `required` | 是否必填 | `True` / `False` |
191
+ | `min` / `max` | 数值范围 | `min=0, max=1` |
192
+ | `min_length` / `max_length` | 长度范围 | `min_length=1` |
193
+ | `choices` | 枚举值 | `choices=["user", "assistant"]` |
194
+ | `pattern` | 正则匹配 | `pattern=r"^\d{4}-\d{2}-\d{2}$"` |
195
+ | `custom` | 自定义验证 | `custom=lambda x: x > 0` |
196
+
140
197
  ### 数据转换
141
198
 
142
199
  ```python
@@ -286,6 +343,58 @@ dt.transform(to_swift_vlm(images_field="images")).save("swift_vlm.jsonl")
286
343
  # 输出: {"messages": [...], "images": ["/path/to/img.jpg"]}
287
344
  ```
288
345
 
346
+ ### 训练框架一键导出
347
+
348
+ 将数据导出为目标训练框架可直接使用的格式,自动生成配置文件:
349
+
350
+ ```python
351
+ from dtflow import DataTransformer
352
+
353
+ dt = DataTransformer.load("data.jsonl")
354
+
355
+ # 1. 检查框架兼容性
356
+ result = dt.check_compatibility("llama-factory")
357
+ print(result)
358
+ # ✅ 兼容 - LLaMA-Factory (openai_chat)
359
+ # 或
360
+ # ❌ 不兼容 - 错误: xxx
361
+
362
+ # 2. 一键导出到 LLaMA-Factory
363
+ files = dt.export_for("llama-factory", "./llama_ready/")
364
+ # 生成文件:
365
+ # - ./llama_ready/custom_dataset.json # 数据文件
366
+ # - ./llama_ready/dataset_info.json # 数据集配置
367
+ # - ./llama_ready/train_args.yaml # 训练参数模板
368
+
369
+ # 3. 导出到 ms-swift
370
+ files = dt.export_for("swift", "./swift_ready/")
371
+ # 生成: data.jsonl + train_swift.sh
372
+
373
+ # 4. 导出到 Axolotl
374
+ files = dt.export_for("axolotl", "./axolotl_ready/")
375
+ # 生成: data.jsonl + config.yaml
376
+
377
+ # 指定数据集名称
378
+ dt.export_for("llama-factory", "./output/", dataset_name="my_sft_data")
379
+ ```
380
+
381
+ **支持的框架**:
382
+
383
+ | 框架 | 导出内容 | 使用方式 |
384
+ |------|---------|---------|
385
+ | `llama-factory` | custom_dataset.json + dataset_info.json + train_args.yaml | `llamafactory-cli train train_args.yaml` |
386
+ | `swift` | data.jsonl + train_swift.sh | `bash train_swift.sh` |
387
+ | `axolotl` | data.jsonl + config.yaml | `accelerate launch -m axolotl.cli.train config.yaml` |
388
+
389
+ **自动格式检测**:
390
+
391
+ | 检测到的格式 | 数据结构 |
392
+ |------------|---------|
393
+ | `openai_chat` | `{"messages": [{"role": "user", ...}]}` |
394
+ | `alpaca` | `{"instruction": ..., "output": ...}` |
395
+ | `sharegpt` | `{"conversations": [{"from": "human", ...}]}` |
396
+ | `dpo` | `{"prompt": ..., "chosen": ..., "rejected": ...}` |
397
+
289
398
  ### 其他操作
290
399
 
291
400
  ```python
@@ -361,6 +470,12 @@ dt concat a.jsonl b.jsonl -o merged.jsonl
361
470
 
362
471
  # 数据统计
363
472
  dt stats data.jsonl
473
+
474
+ # 数据验证
475
+ dt validate data.jsonl --preset=openai_chat # 使用预设 schema 验证
476
+ dt validate data.jsonl --preset=alpaca --verbose # 详细输出
477
+ dt validate data.jsonl --preset=sharegpt --filter -o valid.jsonl # 过滤出有效数据
478
+ dt validate data.jsonl --preset=dpo --max-errors=100 # 限制错误输出数量
364
479
  ```
365
480
 
366
481
  ### 字段路径语法
@@ -53,7 +53,7 @@ dt.filter(lambda x: x.language == "zh")
53
53
  ### 数据验证
54
54
 
55
55
  ```python
56
- # 验证数据,返回不通过的记录列表
56
+ # 简单验证,返回不通过的记录列表
57
57
  errors = dt.validate(lambda x: len(x.messages) >= 2)
58
58
 
59
59
  if errors:
@@ -61,6 +61,53 @@ if errors:
61
61
  print(f"第 {e.index} 行: {e.error}")
62
62
  ```
63
63
 
64
+ ### Schema 验证
65
+
66
+ 使用 Schema 进行结构化数据验证:
67
+
68
+ ```python
69
+ from dtflow import Schema, Field, openai_chat_schema
70
+
71
+ # 使用预设 Schema
72
+ result = dt.validate_schema(openai_chat_schema)
73
+ print(result) # ValidationResult(valid=950, invalid=50, errors=[...])
74
+
75
+ # 自定义 Schema
76
+ schema = Schema({
77
+ "messages": Field(type="list", required=True, min_length=1),
78
+ "messages[*].role": Field(type="str", choices=["user", "assistant", "system"]),
79
+ "messages[*].content": Field(type="str", min_length=1),
80
+ "score": Field(type="float", min=0, max=1),
81
+ })
82
+
83
+ result = dt.validate_schema(schema)
84
+
85
+ # 过滤出有效数据
86
+ valid_dt = dt.validate_schema(schema, filter_invalid=True)
87
+ valid_dt.save("valid.jsonl")
88
+ ```
89
+
90
+ **预设 Schema**:
91
+
92
+ | Schema 名称 | 用途 |
93
+ |------------|------|
94
+ | `openai_chat_schema` | OpenAI messages 格式验证 |
95
+ | `alpaca_schema` | Alpaca instruction/output 格式 |
96
+ | `sharegpt_schema` | ShareGPT conversations 格式 |
97
+ | `dpo_schema` | DPO prompt/chosen/rejected 格式 |
98
+
99
+ **Field 参数**:
100
+
101
+ | 参数 | 说明 | 示例 |
102
+ |------|------|------|
103
+ | `type` | 类型验证 | `"str"`, `"int"`, `"float"`, `"bool"`, `"list"`, `"dict"` |
104
+ | `required` | 是否必填 | `True` / `False` |
105
+ | `min` / `max` | 数值范围 | `min=0, max=1` |
106
+ | `min_length` / `max_length` | 长度范围 | `min_length=1` |
107
+ | `choices` | 枚举值 | `choices=["user", "assistant"]` |
108
+ | `pattern` | 正则匹配 | `pattern=r"^\d{4}-\d{2}-\d{2}$"` |
109
+ | `custom` | 自定义验证 | `custom=lambda x: x > 0` |
110
+
64
111
  ### 数据转换
65
112
 
66
113
  ```python
@@ -210,6 +257,58 @@ dt.transform(to_swift_vlm(images_field="images")).save("swift_vlm.jsonl")
210
257
  # 输出: {"messages": [...], "images": ["/path/to/img.jpg"]}
211
258
  ```
212
259
 
260
+ ### 训练框架一键导出
261
+
262
+ 将数据导出为目标训练框架可直接使用的格式,自动生成配置文件:
263
+
264
+ ```python
265
+ from dtflow import DataTransformer
266
+
267
+ dt = DataTransformer.load("data.jsonl")
268
+
269
+ # 1. 检查框架兼容性
270
+ result = dt.check_compatibility("llama-factory")
271
+ print(result)
272
+ # ✅ 兼容 - LLaMA-Factory (openai_chat)
273
+ # 或
274
+ # ❌ 不兼容 - 错误: xxx
275
+
276
+ # 2. 一键导出到 LLaMA-Factory
277
+ files = dt.export_for("llama-factory", "./llama_ready/")
278
+ # 生成文件:
279
+ # - ./llama_ready/custom_dataset.json # 数据文件
280
+ # - ./llama_ready/dataset_info.json # 数据集配置
281
+ # - ./llama_ready/train_args.yaml # 训练参数模板
282
+
283
+ # 3. 导出到 ms-swift
284
+ files = dt.export_for("swift", "./swift_ready/")
285
+ # 生成: data.jsonl + train_swift.sh
286
+
287
+ # 4. 导出到 Axolotl
288
+ files = dt.export_for("axolotl", "./axolotl_ready/")
289
+ # 生成: data.jsonl + config.yaml
290
+
291
+ # 指定数据集名称
292
+ dt.export_for("llama-factory", "./output/", dataset_name="my_sft_data")
293
+ ```
294
+
295
+ **支持的框架**:
296
+
297
+ | 框架 | 导出内容 | 使用方式 |
298
+ |------|---------|---------|
299
+ | `llama-factory` | custom_dataset.json + dataset_info.json + train_args.yaml | `llamafactory-cli train train_args.yaml` |
300
+ | `swift` | data.jsonl + train_swift.sh | `bash train_swift.sh` |
301
+ | `axolotl` | data.jsonl + config.yaml | `accelerate launch -m axolotl.cli.train config.yaml` |
302
+
303
+ **自动格式检测**:
304
+
305
+ | 检测到的格式 | 数据结构 |
306
+ |------------|---------|
307
+ | `openai_chat` | `{"messages": [{"role": "user", ...}]}` |
308
+ | `alpaca` | `{"instruction": ..., "output": ...}` |
309
+ | `sharegpt` | `{"conversations": [{"from": "human", ...}]}` |
310
+ | `dpo` | `{"prompt": ..., "chosen": ..., "rejected": ...}` |
311
+
213
312
  ### 其他操作
214
313
 
215
314
  ```python
@@ -285,6 +384,12 @@ dt concat a.jsonl b.jsonl -o merged.jsonl
285
384
 
286
385
  # 数据统计
287
386
  dt stats data.jsonl
387
+
388
+ # 数据验证
389
+ dt validate data.jsonl --preset=openai_chat # 使用预设 schema 验证
390
+ dt validate data.jsonl --preset=alpaca --verbose # 详细输出
391
+ dt validate data.jsonl --preset=sharegpt --filter -o valid.jsonl # 过滤出有效数据
392
+ dt validate data.jsonl --preset=dpo --max-errors=100 # 限制错误输出数量
288
393
  ```
289
394
 
290
395
  ### 字段路径语法
@@ -4,6 +4,7 @@ DataTransformer: 简洁的数据格式转换工具
4
4
  核心功能:
5
5
  - DataTransformer: 数据加载、转换、保存
6
6
  - presets: 预设转换模板 (openai_chat, alpaca, sharegpt, dpo_pair, simple_qa)
7
+ - schema: 数据结构验证 (Schema, Field)
7
8
  - tokenizers: Token 统计和过滤
8
9
  - converters: HuggingFace/OpenAI 等格式转换
9
10
  """
@@ -26,6 +27,23 @@ from .converters import ( # LLaMA-Factory 扩展; ms-swift
26
27
  )
27
28
  from .core import DataTransformer, DictWrapper, TransformError, TransformErrors
28
29
  from .presets import get_preset, list_presets
30
+ from .schema import (
31
+ Field,
32
+ Schema,
33
+ ValidationError,
34
+ ValidationResult,
35
+ alpaca_schema,
36
+ dpo_schema,
37
+ openai_chat_schema,
38
+ sharegpt_schema,
39
+ validate_data,
40
+ )
41
+ from .framework import (
42
+ CompatibilityResult,
43
+ check_compatibility,
44
+ detect_format,
45
+ export_for,
46
+ )
29
47
  from .storage import load_data, sample_file, save_data
30
48
  from .streaming import StreamingTransformer, load_sharded, load_stream, process_shards
31
49
  from .tokenizers import (
@@ -42,7 +60,7 @@ from .tokenizers import (
42
60
  token_stats,
43
61
  )
44
62
 
45
- __version__ = "0.4.3"
63
+ __version__ = "0.5.2"
46
64
 
47
65
  __all__ = [
48
66
  # core
@@ -53,6 +71,21 @@ __all__ = [
53
71
  # presets
54
72
  "get_preset",
55
73
  "list_presets",
74
+ # schema
75
+ "Schema",
76
+ "Field",
77
+ "ValidationResult",
78
+ "ValidationError",
79
+ "validate_data",
80
+ "openai_chat_schema",
81
+ "alpaca_schema",
82
+ "dpo_schema",
83
+ "sharegpt_schema",
84
+ # framework
85
+ "CompatibilityResult",
86
+ "check_compatibility",
87
+ "detect_format",
88
+ "export_for",
56
89
  # storage
57
90
  "save_data",
58
91
  "load_data",
@@ -18,6 +18,7 @@ Commands:
18
18
  clean 数据清洗
19
19
  run 执行 Pipeline 配置文件
20
20
  history 显示数据血缘历史
21
+ validate 使用 Schema 验证数据格式
21
22
  mcp MCP 服务管理(install/uninstall/status)
22
23
  logs 日志查看工具使用说明
23
24
  """
@@ -40,6 +41,7 @@ from .cli.commands import stats as _stats
40
41
  from .cli.commands import tail as _tail
41
42
  from .cli.commands import token_stats as _token_stats
42
43
  from .cli.commands import transform as _transform
44
+ from .cli.commands import validate as _validate
43
45
 
44
46
  # 创建主应用
45
47
  app = typer.Typer(
@@ -211,6 +213,26 @@ def history(
211
213
  _history(filename, json)
212
214
 
213
215
 
216
+ # ============ 验证命令 ============
217
+
218
+
219
+ @app.command()
220
+ def validate(
221
+ filename: str = typer.Argument(..., help="输入文件路径"),
222
+ preset: Optional[str] = typer.Option(
223
+ None, "--preset", "-p", help="预设 Schema: openai_chat, alpaca, dpo, sharegpt"
224
+ ),
225
+ output: Optional[str] = typer.Option(None, "--output", "-o", help="输出有效数据的文件路径"),
226
+ filter: bool = typer.Option(
227
+ False, "--filter", "-f", help="过滤无效数据并保存"
228
+ ),
229
+ max_errors: int = typer.Option(20, "--max-errors", help="最多显示的错误数量"),
230
+ verbose: bool = typer.Option(False, "--verbose", "-v", help="显示详细信息"),
231
+ ):
232
+ """使用预设 Schema 验证数据格式"""
233
+ _validate(filename, preset, output, filter, max_errors, verbose)
234
+
235
+
214
236
  # ============ 工具命令 ============
215
237
 
216
238
 
@@ -33,6 +33,9 @@ from .pipeline import run
33
33
  # 血缘追踪命令
34
34
  from .lineage import history
35
35
 
36
+ # 验证命令
37
+ from .validate import validate
38
+
36
39
  __all__ = [
37
40
  # 采样
38
41
  "sample",
@@ -53,4 +56,6 @@ __all__ = [
53
56
  "run",
54
57
  # 血缘
55
58
  "history",
59
+ # 验证
60
+ "validate",
56
61
  ]
@@ -57,7 +57,7 @@ def _get_file_row_count(filepath: Path) -> Optional[int]:
57
57
  return None
58
58
 
59
59
 
60
- def _format_value(value: Any, max_len: int = 80) -> str:
60
+ def _format_value(value: Any, max_len: int = 120) -> str:
61
61
  """格式化单个值,长文本截断。"""
62
62
  if value is None:
63
63
  return "[dim]null[/dim]"
@@ -66,18 +66,22 @@ def _format_value(value: Any, max_len: int = 80) -> str:
66
66
  if isinstance(value, (int, float)):
67
67
  return f"[cyan]{value}[/cyan]"
68
68
  if isinstance(value, str):
69
+ half_len = max_len // 2
69
70
  # 处理多行文本
70
71
  if "\n" in value:
71
72
  lines = value.split("\n")
72
- if len(lines) > 3:
73
- preview = lines[0][:max_len] + f"... [dim]({len(lines)} 行)[/dim]"
74
- else:
75
- preview = value.replace("\n", "\\n")
76
- if len(preview) > max_len:
77
- preview = preview[:max_len] + "..."
73
+ preview = value.replace("\n", "\\n")
74
+ if len(preview) > max_len:
75
+ # 前半 + 省略标记 + 后半
76
+ head = preview[:half_len]
77
+ tail = preview[-half_len:]
78
+ return f'"{head} [yellow]<<<{len(lines)}行>>>[/yellow] {tail}"'
78
79
  return f'"{preview}"'
79
80
  if len(value) > max_len:
80
- return f'"{value[:max_len]}..." [dim]({len(value)} 字符)[/dim]'
81
+ # 前半 + 省略标记 + 后半
82
+ head = value[:half_len]
83
+ tail = value[-half_len:]
84
+ return f'"{head} [yellow]<<<{len(value)}字符>>>[/yellow] {tail}"'
81
85
  return f'"{value}"'
82
86
  return str(value)
83
87
 
@@ -86,7 +90,7 @@ def _format_nested(
86
90
  value: Any,
87
91
  indent: str = "",
88
92
  is_last: bool = True,
89
- max_len: int = 80,
93
+ max_len: int = 120,
90
94
  ) -> List[str]:
91
95
  """
92
96
  递归格式化嵌套结构,返回行列表。