dtflow 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dtflow/cli/common.py ADDED
@@ -0,0 +1,384 @@
1
+ """
2
+ CLI 通用工具函数
3
+ """
4
+
5
+ from pathlib import Path
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ import orjson
9
+
10
# Supported file formats (matched against the lowercased file suffix).
SUPPORTED_FORMATS = {".csv", ".jsonl", ".json", ".xlsx", ".xls", ".parquet", ".arrow", ".feather"}

# Formats that support streaming processing (keep in sync with streaming.py).
STREAMING_FORMATS = {".jsonl", ".csv", ".parquet", ".arrow", ".feather"}
15
+
16
+
17
def _is_streaming_supported(filepath: Path) -> bool:
    """Return True when the file's suffix is listed in STREAMING_FORMATS."""
    # Lowercase the suffix so ".JSONL" is treated the same as ".jsonl".
    extension = filepath.suffix.lower()
    return extension in STREAMING_FORMATS
20
+
21
+
22
def _check_file_format(filepath: Path) -> bool:
    """Validate the file's suffix against SUPPORTED_FORMATS.

    Returns:
        True when the lowercased suffix is supported. Otherwise prints an
        error message plus the sorted list of supported suffixes and
        returns False.
    """
    suffix = filepath.suffix.lower()
    if suffix in SUPPORTED_FORMATS:
        return True

    print(f"错误: 不支持的文件格式 - {suffix}")
    print(f"支持的格式: {', '.join(sorted(SUPPORTED_FORMATS))}")
    return False
30
+
31
+
32
def _get_file_row_count(filepath: Path) -> Optional[int]:
    """Return the number of rows in a data file, or None when it cannot be determined.

    The fast counter from the streaming module is tried first. When it
    returns None and the suffix is .json/.xlsx/.xls, the file is loaded in
    full and the loaded object's length is returned.

    Args:
        filepath: Path of the data file.

    Returns:
        The row count, or None if the format is not handled or loading failed.
    """
    from ..streaming import _count_rows_fast

    # Cheap path: no full load required.
    fast_count = _count_rows_fast(str(filepath))
    if fast_count is not None:
        return fast_count

    # Only these formats get the load-everything fallback.
    if filepath.suffix.lower() not in (".json", ".xlsx", ".xls"):
        return None

    try:
        from ..storage.io import load_data

        return len(load_data(str(filepath)))
    except Exception:
        # Best effort: a row count is informational, so any failure maps to "unknown".
        return None
58
+
59
+
60
+ def _format_value(value: Any, max_len: int = 80) -> str:
61
+ """格式化单个值,长文本截断。"""
62
+ if value is None:
63
+ return "[dim]null[/dim]"
64
+ if isinstance(value, bool):
65
+ return "[cyan]true[/cyan]" if value else "[cyan]false[/cyan]"
66
+ if isinstance(value, (int, float)):
67
+ return f"[cyan]{value}[/cyan]"
68
+ if isinstance(value, str):
69
+ # 处理多行文本
70
+ if "\n" in value:
71
+ lines = value.split("\n")
72
+ if len(lines) > 3:
73
+ preview = lines[0][:max_len] + f"... [dim]({len(lines)} 行)[/dim]"
74
+ else:
75
+ preview = value.replace("\n", "\\n")
76
+ if len(preview) > max_len:
77
+ preview = preview[:max_len] + "..."
78
+ return f'"{preview}"'
79
+ if len(value) > max_len:
80
+ return f'"{value[:max_len]}..." [dim]({len(value)} 字符)[/dim]'
81
+ return f'"{value}"'
82
+ return str(value)
83
+
84
+
85
+ def _format_nested(
86
+ value: Any,
87
+ indent: str = "",
88
+ is_last: bool = True,
89
+ max_len: int = 80,
90
+ ) -> List[str]:
91
+ """
92
+ 递归格式化嵌套结构,返回行列表。
93
+
94
+ 使用树形符号展示结构:
95
+ ├─ 中间项
96
+ └─ 最后一项
97
+ """
98
+ lines = []
99
+ branch = "└─ " if is_last else "├─ "
100
+ cont = " " if is_last else "│ "
101
+
102
+ if isinstance(value, dict):
103
+ items = list(value.items())
104
+ for i, (k, v) in enumerate(items):
105
+ is_last_item = i == len(items) - 1
106
+ b = "└─ " if is_last_item else "├─ "
107
+ c = " " if is_last_item else "│ "
108
+
109
+ if isinstance(v, (dict, list)) and v:
110
+ # 嵌套结构
111
+ if isinstance(v, list):
112
+ # 检测是否为 messages 格式
113
+ is_messages = (
114
+ v and isinstance(v[0], dict) and "role" in v[0] and "content" in v[0]
115
+ )
116
+ if is_messages:
117
+ lines.append(
118
+ f"{indent}{b}[green]{k}[/green]: ({len(v)} items) [dim]→ \\[role]: content[/dim]"
119
+ )
120
+ else:
121
+ lines.append(f"{indent}{b}[green]{k}[/green]: ({len(v)} items)")
122
+ else:
123
+ lines.append(f"{indent}{b}[green]{k}[/green]:")
124
+ lines.extend(_format_nested(v, indent + c, True, max_len))
125
+ else:
126
+ # 简单值
127
+ lines.append(f"{indent}{b}[green]{k}[/green]: {_format_value(v, max_len)}")
128
+
129
+ elif isinstance(value, list):
130
+ for i, item in enumerate(value):
131
+ is_last_item = i == len(value) - 1
132
+ b = "└─ " if is_last_item else "├─ "
133
+ c = " " if is_last_item else "│ "
134
+
135
+ if isinstance(item, dict):
136
+ # 列表中的字典项 - 检测是否为 messages 格式
137
+ if "role" in item and "content" in item:
138
+ role = item.get("role", "")
139
+ content = item.get("content", "")
140
+ # 截断长内容
141
+ if len(content) > max_len:
142
+ content = content[:max_len].replace("\n", "\\n") + "..."
143
+ else:
144
+ content = content.replace("\n", "\\n")
145
+ # 使用 \[ 转义避免被 rich 解析为样式
146
+ lines.append(f"{indent}{b}[yellow]\\[{role}]:[/yellow] {content}")
147
+ else:
148
+ # 普通字典
149
+ lines.append(f"{indent}{b}[dim]{{...}}[/dim]")
150
+ lines.extend(_format_nested(item, indent + c, True, max_len))
151
+ elif isinstance(item, list):
152
+ lines.append(f"{indent}{b}[dim][{len(item)} items][/dim]")
153
+ lines.extend(_format_nested(item, indent + c, True, max_len))
154
+ else:
155
+ lines.append(f"{indent}{b}{_format_value(item, max_len)}")
156
+
157
+ return lines
158
+
159
+
160
+ def _is_simple_data(samples: List[Dict]) -> bool:
161
+ """判断数据是否适合表格展示(无嵌套结构)。"""
162
+ if not samples or not isinstance(samples[0], dict):
163
+ return False
164
+ keys = list(samples[0].keys())
165
+ if len(keys) > 6:
166
+ return False
167
+ for s in samples[:3]:
168
+ for k in keys:
169
+ v = s.get(k)
170
+ if isinstance(v, (dict, list)):
171
+ return False
172
+ if isinstance(v, str) and len(v) > 80:
173
+ return False
174
+ return True
175
+
176
+
177
+ def _print_samples(
178
+ samples: list,
179
+ filename: Optional[str] = None,
180
+ total_count: Optional[int] = None,
181
+ fields: Optional[List[str]] = None,
182
+ ) -> None:
183
+ """
184
+ 打印采样结果。
185
+
186
+ Args:
187
+ samples: 采样数据列表
188
+ filename: 文件名(用于显示概览)
189
+ total_count: 文件总行数(用于显示概览)
190
+ fields: 只显示指定字段
191
+ """
192
+ if not samples:
193
+ print("没有数据")
194
+ return
195
+
196
+ # 过滤字段
197
+ if fields and isinstance(samples[0], dict):
198
+ field_set = set(fields)
199
+ samples = [{k: v for k, v in item.items() if k in field_set} for item in samples]
200
+
201
+ try:
202
+ from rich.console import Console
203
+ from rich.panel import Panel
204
+ from rich.table import Table
205
+
206
+ console = Console()
207
+
208
+ # 显示数据概览头部
209
+ if filename:
210
+ all_fields = set()
211
+ for item in samples:
212
+ if isinstance(item, dict):
213
+ all_fields.update(item.keys())
214
+ field_names = ", ".join(sorted(all_fields))
215
+
216
+ if total_count is not None:
217
+ info = f"总行数: {total_count:,} | 采样: {len(samples)} 条 | 字段: {len(all_fields)} 个"
218
+ else:
219
+ info = f"采样: {len(samples)} 条 | 字段: {len(all_fields)} 个"
220
+
221
+ console.print(
222
+ Panel(
223
+ f"[dim]{info}[/dim]\n[dim]字段: {field_names}[/dim]",
224
+ title=f"[bold]📊 {filename}[/bold]",
225
+ expand=False,
226
+ border_style="dim",
227
+ )
228
+ )
229
+ console.print()
230
+
231
+ # 简单数据用表格展示
232
+ if _is_simple_data(samples):
233
+ keys = list(samples[0].keys())
234
+ table = Table(show_header=True, header_style="bold cyan")
235
+ for key in keys:
236
+ table.add_column(key, overflow="fold")
237
+ for item in samples:
238
+ table.add_row(*[str(item.get(k, "")) for k in keys])
239
+ console.print(table)
240
+ return
241
+
242
+ # 嵌套数据用树形结构展示
243
+ for i, item in enumerate(samples, 1):
244
+ console.print(f"[bold cyan]--- 第 {i} 条 ---[/bold cyan]")
245
+ if isinstance(item, dict):
246
+ for line in _format_nested(item):
247
+ console.print(line)
248
+ else:
249
+ console.print(_format_value(item))
250
+ console.print()
251
+
252
+ except ImportError:
253
+ # 没有 rich,使用普通打印
254
+ if filename:
255
+ all_fields = set()
256
+ for item in samples:
257
+ if isinstance(item, dict):
258
+ all_fields.update(item.keys())
259
+
260
+ print(f"\n📊 {filename}")
261
+ if total_count is not None:
262
+ print(
263
+ f" 总行数: {total_count:,} | 采样: {len(samples)} 条 | 字段: {len(all_fields)} 个"
264
+ )
265
+ else:
266
+ print(f" 采样: {len(samples)} 条 | 字段: {len(all_fields)} 个")
267
+ print(f" 字段: {', '.join(sorted(all_fields))}")
268
+ print()
269
+
270
+ for i, item in enumerate(samples, 1):
271
+ print(f"--- 第 {i} 条 ---")
272
+ print(orjson.dumps(item, option=orjson.OPT_INDENT_2).decode("utf-8"))
273
+ print()
274
+
275
+
276
+ def _parse_field_list(value: Any) -> List[str]:
277
+ """解析字段列表参数(处理 fire 将逗号分隔的值解析为元组的情况)"""
278
+ if isinstance(value, (list, tuple)):
279
+ return [str(f).strip() for f in value]
280
+ elif isinstance(value, str):
281
+ return [f.strip() for f in value.split(",")]
282
+ else:
283
+ return [str(value)]
284
+
285
+
286
+ def _is_empty_value(v: Any) -> bool:
287
+ """判断值是否为空"""
288
+ if v is None:
289
+ return True
290
+ if isinstance(v, str) and v.strip() == "":
291
+ return True
292
+ if isinstance(v, (list, dict)) and len(v) == 0:
293
+ return True
294
+ return False
295
+
296
+
297
+ def _get_value_len(value: Any) -> int:
298
+ """
299
+ 获取值的长度。
300
+
301
+ - str/list/dict: 返回 len()
302
+ - int/float: 直接返回该数值(用于 messages.# 这种返回数量的场景)
303
+ - None: 返回 0
304
+ - 其他: 转为字符串后返回长度
305
+ """
306
+ if value is None:
307
+ return 0
308
+ if isinstance(value, (int, float)):
309
+ return int(value)
310
+ if isinstance(value, (str, list, dict)):
311
+ return len(value)
312
+ return len(str(value))
313
+
314
+
315
+ def _infer_type(values: List[Any]) -> str:
316
+ """推断字段类型"""
317
+ if not values:
318
+ return "unknown"
319
+
320
+ sample = values[0]
321
+ if isinstance(sample, bool):
322
+ return "bool"
323
+ if isinstance(sample, int):
324
+ return "int"
325
+ if isinstance(sample, float):
326
+ return "float"
327
+ if isinstance(sample, list):
328
+ return "list"
329
+ if isinstance(sample, dict):
330
+ return "dict"
331
+ return "str"
332
+
333
+
334
+ def _is_numeric(v: Any) -> bool:
335
+ """检查值是否为数值"""
336
+ if isinstance(v, (int, float)) and not isinstance(v, bool):
337
+ return True
338
+ return False
339
+
340
+
341
+ def _truncate(v: Any, max_width: int) -> str:
342
+ """按显示宽度截断值(中文字符算 2 宽度)"""
343
+ s = str(v)
344
+ width = 0
345
+ result = []
346
+ for char in s:
347
+ # CJK 字符范围
348
+ if (
349
+ "\u4e00" <= char <= "\u9fff"
350
+ or "\u3000" <= char <= "\u303f"
351
+ or "\uff00" <= char <= "\uffef"
352
+ ):
353
+ char_width = 2
354
+ else:
355
+ char_width = 1
356
+ if width + char_width > max_width - 3: # 预留 ... 的宽度
357
+ return "".join(result) + "..."
358
+ result.append(char)
359
+ width += char_width
360
+ return s
361
+
362
+
363
+ def _display_width(s: str) -> int:
364
+ """计算字符串的显示宽度(中文字符算 2,ASCII 字符算 1)"""
365
+ width = 0
366
+ for char in s:
367
+ # CJK 字符范围
368
+ if (
369
+ "\u4e00" <= char <= "\u9fff"
370
+ or "\u3000" <= char <= "\u303f"
371
+ or "\uff00" <= char <= "\uffef"
372
+ ):
373
+ width += 2
374
+ else:
375
+ width += 1
376
+ return width
377
+
378
+
379
def _pad_to_width(s: str, target_width: int) -> str:
    """Right-pad a string with spaces up to the given display width.

    Strings that already reach or exceed ``target_width`` are returned unchanged.
    """
    missing = target_width - _display_width(s)
    return s + " " * missing if missing > 0 else s