maque 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maque/__init__.py +30 -0
- maque/__main__.py +926 -0
- maque/ai_platform/__init__.py +0 -0
- maque/ai_platform/crawl.py +45 -0
- maque/ai_platform/metrics.py +258 -0
- maque/ai_platform/nlp_preprocess.py +67 -0
- maque/ai_platform/webpage_screen_shot.py +195 -0
- maque/algorithms/__init__.py +78 -0
- maque/algorithms/bezier.py +15 -0
- maque/algorithms/bktree.py +117 -0
- maque/algorithms/core.py +104 -0
- maque/algorithms/hilbert.py +16 -0
- maque/algorithms/rate_function.py +92 -0
- maque/algorithms/transform.py +27 -0
- maque/algorithms/trie.py +272 -0
- maque/algorithms/utils.py +63 -0
- maque/algorithms/video.py +587 -0
- maque/api/__init__.py +1 -0
- maque/api/common.py +110 -0
- maque/api/fetch.py +26 -0
- maque/api/static/icon.png +0 -0
- maque/api/static/redoc.standalone.js +1782 -0
- maque/api/static/swagger-ui-bundle.js +3 -0
- maque/api/static/swagger-ui.css +3 -0
- maque/cli/__init__.py +1 -0
- maque/cli/clean_invisible_chars.py +324 -0
- maque/cli/core.py +34 -0
- maque/cli/groups/__init__.py +26 -0
- maque/cli/groups/config.py +205 -0
- maque/cli/groups/data.py +615 -0
- maque/cli/groups/doctor.py +259 -0
- maque/cli/groups/embedding.py +222 -0
- maque/cli/groups/git.py +29 -0
- maque/cli/groups/help.py +410 -0
- maque/cli/groups/llm.py +223 -0
- maque/cli/groups/mcp.py +241 -0
- maque/cli/groups/mllm.py +1795 -0
- maque/cli/groups/mllm_simple.py +60 -0
- maque/cli/groups/quant.py +210 -0
- maque/cli/groups/service.py +490 -0
- maque/cli/groups/system.py +570 -0
- maque/cli/mllm_run.py +1451 -0
- maque/cli/script.py +52 -0
- maque/cli/tree.py +49 -0
- maque/clustering/__init__.py +52 -0
- maque/clustering/analyzer.py +347 -0
- maque/clustering/clusterers.py +464 -0
- maque/clustering/sampler.py +134 -0
- maque/clustering/visualizer.py +205 -0
- maque/constant.py +13 -0
- maque/core.py +133 -0
- maque/cv/__init__.py +1 -0
- maque/cv/image.py +219 -0
- maque/cv/utils.py +68 -0
- maque/cv/video/__init__.py +3 -0
- maque/cv/video/keyframe_extractor.py +368 -0
- maque/embedding/__init__.py +43 -0
- maque/embedding/base.py +56 -0
- maque/embedding/multimodal.py +308 -0
- maque/embedding/server.py +523 -0
- maque/embedding/text.py +311 -0
- maque/git/__init__.py +24 -0
- maque/git/pure_git.py +912 -0
- maque/io/__init__.py +29 -0
- maque/io/core.py +38 -0
- maque/io/ops.py +194 -0
- maque/llm/__init__.py +111 -0
- maque/llm/backend.py +416 -0
- maque/llm/base.py +411 -0
- maque/llm/server.py +366 -0
- maque/mcp_server.py +1096 -0
- maque/mllm_data_processor_pipeline/__init__.py +17 -0
- maque/mllm_data_processor_pipeline/core.py +341 -0
- maque/mllm_data_processor_pipeline/example.py +291 -0
- maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
- maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
- maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
- maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
- maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
- maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
- maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
- maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
- maque/mllm_data_processor_pipeline/web_app.py +317 -0
- maque/nlp/__init__.py +14 -0
- maque/nlp/ngram.py +9 -0
- maque/nlp/parser.py +63 -0
- maque/nlp/risk_matcher.py +543 -0
- maque/nlp/sentence_splitter.py +202 -0
- maque/nlp/simple_tradition_cvt.py +31 -0
- maque/performance/__init__.py +21 -0
- maque/performance/_measure_time.py +70 -0
- maque/performance/_profiler.py +367 -0
- maque/performance/_stat_memory.py +51 -0
- maque/pipelines/__init__.py +15 -0
- maque/pipelines/clustering.py +252 -0
- maque/quantization/__init__.py +42 -0
- maque/quantization/auto_round.py +120 -0
- maque/quantization/base.py +145 -0
- maque/quantization/bitsandbytes.py +127 -0
- maque/quantization/llm_compressor.py +102 -0
- maque/retriever/__init__.py +35 -0
- maque/retriever/chroma.py +654 -0
- maque/retriever/document.py +140 -0
- maque/retriever/milvus.py +1140 -0
- maque/table_ops/__init__.py +1 -0
- maque/table_ops/core.py +133 -0
- maque/table_viewer/__init__.py +4 -0
- maque/table_viewer/download_assets.py +57 -0
- maque/table_viewer/server.py +698 -0
- maque/table_viewer/static/element-plus-icons.js +5791 -0
- maque/table_viewer/static/element-plus.css +1 -0
- maque/table_viewer/static/element-plus.js +65236 -0
- maque/table_viewer/static/main.css +268 -0
- maque/table_viewer/static/main.js +669 -0
- maque/table_viewer/static/vue.global.js +18227 -0
- maque/table_viewer/templates/index.html +401 -0
- maque/utils/__init__.py +56 -0
- maque/utils/color.py +68 -0
- maque/utils/color_string.py +45 -0
- maque/utils/compress.py +66 -0
- maque/utils/constant.py +183 -0
- maque/utils/core.py +261 -0
- maque/utils/cursor.py +143 -0
- maque/utils/distance.py +58 -0
- maque/utils/docker.py +96 -0
- maque/utils/downloads.py +51 -0
- maque/utils/excel_helper.py +542 -0
- maque/utils/helper_metrics.py +121 -0
- maque/utils/helper_parser.py +168 -0
- maque/utils/net.py +64 -0
- maque/utils/nvidia_stat.py +140 -0
- maque/utils/ops.py +53 -0
- maque/utils/packages.py +31 -0
- maque/utils/path.py +57 -0
- maque/utils/tar.py +260 -0
- maque/utils/untar.py +129 -0
- maque/web/__init__.py +0 -0
- maque/web/image_downloader.py +1410 -0
- maque-0.2.1.dist-info/METADATA +450 -0
- maque-0.2.1.dist-info/RECORD +143 -0
- maque-0.2.1.dist-info/WHEEL +4 -0
- maque-0.2.1.dist-info/entry_points.txt +3 -0
- maque-0.2.1.dist-info/licenses/LICENSE +21 -0
maque/cli/groups/data.py
ADDED
|
@@ -0,0 +1,615 @@
|
|
|
1
|
+
"""数据处理命令组"""
|
|
2
|
+
import json
|
|
3
|
+
import csv
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from maque.io import jsonl_load
|
|
6
|
+
from typing import Union, Optional, List
|
|
7
|
+
from rich import print
|
|
8
|
+
from rich.table import Table
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DataGroup:
    """Data-processing command group.

    Each public method implements one ``maque data ...`` subcommand
    (table-viewer, convert, stats, validate, sample).
    """

    def __init__(self, cli_instance):
        # Back-reference to the owning CLI object; kept for the command
        # dispatcher, not read by the methods in this group themselves.
        self.cli = cli_instance
        # Rich console used to render the tables produced by stats/validate.
        self.console = Console()
|
|
18
|
+
|
|
19
|
+
def table_viewer(
|
|
20
|
+
self,
|
|
21
|
+
file_path: str = None,
|
|
22
|
+
port: int = 8080,
|
|
23
|
+
host: str = "127.0.0.1",
|
|
24
|
+
sheet_name: Union[str, int] = 0,
|
|
25
|
+
image_columns: str = None,
|
|
26
|
+
auto_detect_images: bool = True,
|
|
27
|
+
auto_open: bool = True,
|
|
28
|
+
**kwargs
|
|
29
|
+
):
|
|
30
|
+
"""启动交互式表格查看器
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
file_path: 表格文件路径(支持.xlsx, .xls, .csv格式)
|
|
34
|
+
port: 服务器端口,默认8080
|
|
35
|
+
host: 服务器主机地址,默认127.0.0.1
|
|
36
|
+
sheet_name: Excel文件的sheet名称或索引,默认为0
|
|
37
|
+
image_columns: 指定图片列名,用逗号分隔
|
|
38
|
+
auto_detect_images: 是否自动检测图片列,默认True
|
|
39
|
+
auto_open: 是否自动打开浏览器,默认True
|
|
40
|
+
|
|
41
|
+
Examples:
|
|
42
|
+
maque data table-viewer data.xlsx
|
|
43
|
+
maque data table-viewer "products.csv" --port=9090
|
|
44
|
+
"""
|
|
45
|
+
# 直接调用 table_viewer 实现,避免循环引用
|
|
46
|
+
from maque.table_viewer import start_table_viewer
|
|
47
|
+
|
|
48
|
+
# 处理 image_columns 参数
|
|
49
|
+
if image_columns:
|
|
50
|
+
image_columns = [col.strip() for col in image_columns.split(',')]
|
|
51
|
+
|
|
52
|
+
return start_table_viewer(
|
|
53
|
+
file_path=file_path,
|
|
54
|
+
port=port,
|
|
55
|
+
host=host,
|
|
56
|
+
sheet_name=sheet_name,
|
|
57
|
+
image_columns=image_columns,
|
|
58
|
+
auto_detect_images=auto_detect_images,
|
|
59
|
+
auto_open=auto_open
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
def convert(
|
|
63
|
+
self,
|
|
64
|
+
input_file: str,
|
|
65
|
+
output_file: str = None,
|
|
66
|
+
sheet_name: Union[str, int] = 0,
|
|
67
|
+
encoding: str = "utf-8",
|
|
68
|
+
delimiter: str = ",",
|
|
69
|
+
**kwargs
|
|
70
|
+
):
|
|
71
|
+
"""数据格式转换
|
|
72
|
+
|
|
73
|
+
Args:
|
|
74
|
+
input_file: 输入文件路径
|
|
75
|
+
output_file: 输出文件路径,不指定则自动生成
|
|
76
|
+
sheet_name: Excel sheet名称或索引
|
|
77
|
+
encoding: 文件编码,默认utf-8
|
|
78
|
+
delimiter: CSV分隔符,默认逗号
|
|
79
|
+
**kwargs: 其他pandas读取参数
|
|
80
|
+
|
|
81
|
+
Examples:
|
|
82
|
+
maque data convert input.xlsx output.csv
|
|
83
|
+
maque data convert data.csv data.xlsx
|
|
84
|
+
maque data convert file.xlsx --sheet_name="Sheet2"
|
|
85
|
+
"""
|
|
86
|
+
import pandas as pd
|
|
87
|
+
|
|
88
|
+
input_path = Path(input_file)
|
|
89
|
+
if not input_path.exists():
|
|
90
|
+
print(f"[red]输入文件不存在: {input_file}[/red]")
|
|
91
|
+
return False
|
|
92
|
+
|
|
93
|
+
# 自动生成输出文件名
|
|
94
|
+
if not output_file:
|
|
95
|
+
if input_path.suffix.lower() == '.csv':
|
|
96
|
+
output_file = str(input_path.with_suffix('.xlsx'))
|
|
97
|
+
else:
|
|
98
|
+
output_file = str(input_path.with_suffix('.csv'))
|
|
99
|
+
|
|
100
|
+
output_path = Path(output_file)
|
|
101
|
+
|
|
102
|
+
try:
|
|
103
|
+
# 读取输入文件
|
|
104
|
+
input_ext = input_path.suffix.lower()
|
|
105
|
+
if input_ext in ['.xlsx', '.xls']:
|
|
106
|
+
df = pd.read_excel(input_file, sheet_name=sheet_name, **kwargs)
|
|
107
|
+
print(f"[green]✓[/green] 读取Excel文件: {input_file} (sheet: {sheet_name})")
|
|
108
|
+
elif input_ext == '.csv':
|
|
109
|
+
df = pd.read_csv(input_file, encoding=encoding, delimiter=delimiter, **kwargs)
|
|
110
|
+
print(f"[green]✓[/green] 读取CSV文件: {input_file}")
|
|
111
|
+
else:
|
|
112
|
+
print(f"[red]不支持的输入格式: {input_ext}[/red]")
|
|
113
|
+
return False
|
|
114
|
+
|
|
115
|
+
# 写入输出文件
|
|
116
|
+
output_ext = output_path.suffix.lower()
|
|
117
|
+
if output_ext in ['.xlsx', '.xls']:
|
|
118
|
+
df.to_excel(output_file, index=False)
|
|
119
|
+
print(f"[green]✓[/green] 保存为Excel文件: {output_file}")
|
|
120
|
+
elif output_ext == '.csv':
|
|
121
|
+
df.to_csv(output_file, index=False, encoding=encoding)
|
|
122
|
+
print(f"[green]✓[/green] 保存为CSV文件: {output_file}")
|
|
123
|
+
elif output_ext == '.json':
|
|
124
|
+
df.to_json(output_file, orient='records', ensure_ascii=False, indent=2)
|
|
125
|
+
print(f"[green]✓[/green] 保存为JSON文件: {output_file}")
|
|
126
|
+
else:
|
|
127
|
+
print(f"[red]不支持的输出格式: {output_ext}[/red]")
|
|
128
|
+
return False
|
|
129
|
+
|
|
130
|
+
print(f"数据形状: {df.shape[0]} 行 × {df.shape[1]} 列")
|
|
131
|
+
return True
|
|
132
|
+
|
|
133
|
+
except Exception as e:
|
|
134
|
+
print(f"[red]转换失败: {e}[/red]")
|
|
135
|
+
return False
|
|
136
|
+
|
|
137
|
+
    def stats(
        self,
        file_path: str,
        sheet_name: Union[str, int] = 0,
        columns: str = None,
        output_file: str = None,
        **kwargs
    ):
        """Statistical analysis of a table file, rendered as rich tables.

        Args:
            file_path: File path (.xlsx, .xls or .csv).
            sheet_name: Excel sheet name or index.
            columns: Comma-separated column names to analyze; when omitted,
                all numeric columns are analyzed.
            output_file: Optional file to save the statistics to
                (.json, or .csv for the numeric summary only).
            **kwargs: Extra pandas read parameters.

        Examples:
            maque data stats data.csv
            maque data stats data.xlsx --columns="age,price"
            maque data stats file.csv --output_file="stats.json"
        """
        import pandas as pd
        import numpy as np

        file_path_obj = Path(file_path)
        if not file_path_obj.exists():
            print(f"[red]文件不存在: {file_path}[/red]")
            return

        try:
            # Read the file, dispatching on extension.
            file_ext = file_path_obj.suffix.lower()
            if file_ext in ['.xlsx', '.xls']:
                df = pd.read_excel(file_path, sheet_name=sheet_name, **kwargs)
            elif file_ext == '.csv':
                df = pd.read_csv(file_path, **kwargs)
            else:
                print(f"[red]不支持的文件格式: {file_ext}[/red]")
                return

            print(f"[blue]数据统计分析: {file_path}[/blue]")
            print(f"数据形状: {df.shape[0]} 行 × {df.shape[1]} 列\n")

            # Choose which columns to analyze: the explicit list (warning for
            # any missing names, which are dropped), otherwise all numeric
            # columns, falling back to every column when none are numeric.
            if columns:
                col_list = [col.strip() for col in columns.split(',')]
                missing_cols = [col for col in col_list if col not in df.columns]
                if missing_cols:
                    print(f"[yellow]警告: 以下列不存在: {missing_cols}[/yellow]")
                    col_list = [col for col in col_list if col in df.columns]
                df_analyze = df[col_list]
            else:
                # Auto-select numeric columns.
                numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
                if not numeric_columns:
                    print("[yellow]未找到数值列,显示所有列的基本信息[/yellow]")
                    df_analyze = df
                else:
                    df_analyze = df[numeric_columns]

            # Accumulates everything shown, for optional saving at the end.
            stats_dict = {}

            # --- Basic whole-frame information -------------------------------
            print("[bold cyan]基本信息[/bold cyan]")
            info_table = Table(show_header=True, header_style="bold magenta")
            info_table.add_column("指标", style="cyan")
            info_table.add_column("值", style="green")

            info_table.add_row("总行数", str(df.shape[0]))
            info_table.add_row("总列数", str(df.shape[1]))
            info_table.add_row("缺失值总数", str(df.isnull().sum().sum()))
            info_table.add_row("内存使用", f"{df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

            self.console.print(info_table)

            stats_dict['basic_info'] = {
                'rows': df.shape[0],
                'columns': df.shape[1],
                'missing_values': int(df.isnull().sum().sum()),
                'memory_usage_mb': round(df.memory_usage(deep=True).sum() / 1024**2, 2)
            }

            # --- Numeric-column statistics (describe) ------------------------
            numeric_cols = df_analyze.select_dtypes(include=[np.number]).columns.tolist()
            if numeric_cols:
                print(f"\n[bold cyan]数值列统计 ({len(numeric_cols)} 列)[/bold cyan]")
                desc = df_analyze[numeric_cols].describe()

                # Build the display table; only the first 5 columns are shown
                # to keep the terminal output readable.
                stats_table = Table(show_header=True, header_style="bold magenta")
                stats_table.add_column("统计量", style="cyan")
                for col in numeric_cols[:5]:  # cap displayed columns
                    stats_table.add_column(col, style="green")

                for stat in ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']:
                    row_data = [stat]
                    for col in numeric_cols[:5]:
                        value = desc.loc[stat, col]
                        if stat == 'count':
                            row_data.append(f"{int(value)}")
                        else:
                            row_data.append(f"{value:.2f}")
                    stats_table.add_row(*row_data)

                self.console.print(stats_table)

                if len(numeric_cols) > 5:
                    print(f"[dim]... 还有 {len(numeric_cols) - 5} 列未显示[/dim]")

                # The saved dict keeps ALL numeric columns, not just the 5 shown.
                stats_dict['numeric_stats'] = desc.to_dict()

            # --- Text (object-dtype) column information ----------------------
            text_cols = df.select_dtypes(include=['object']).columns.tolist()
            if text_cols:
                print(f"\n[bold cyan]文本列信息 ({len(text_cols)} 列)[/bold cyan]")

                text_table = Table(show_header=True, header_style="bold magenta")
                text_table.add_column("列名", style="cyan")
                text_table.add_column("唯一值数量", style="green")
                text_table.add_column("最常见值", style="yellow")
                text_table.add_column("缺失值", style="red")

                text_stats = {}
                for col in text_cols[:10]:  # cap displayed columns
                    unique_count = df[col].nunique()
                    # mode() is empty for an all-NaN column; fall back to "N/A".
                    most_common = df[col].mode().iloc[0] if not df[col].mode().empty else "N/A"
                    missing_count = df[col].isnull().sum()

                    text_table.add_row(
                        col,
                        str(unique_count),
                        # Truncate long values for display only.
                        str(most_common)[:20] + "..." if len(str(most_common)) > 20 else str(most_common),
                        str(missing_count)
                    )

                    text_stats[col] = {
                        'unique_count': int(unique_count),
                        'most_common': str(most_common),
                        'missing_count': int(missing_count)
                    }

                self.console.print(text_table)

                if len(text_cols) > 10:
                    print(f"[dim]... 还有 {len(text_cols) - 10} 列未显示[/dim]")

                stats_dict['text_stats'] = text_stats

            # --- Missing-value analysis (only columns with missing data) -----
            missing_data = df.isnull().sum()
            missing_data = missing_data[missing_data > 0]

            if len(missing_data) > 0:
                print(f"\n[bold cyan]缺失值分析[/bold cyan]")
                missing_table = Table(show_header=True, header_style="bold magenta")
                missing_table.add_column("列名", style="cyan")
                missing_table.add_column("缺失数量", style="red")
                missing_table.add_column("缺失比例", style="yellow")

                missing_stats = {}
                for col, count in missing_data.items():
                    percentage = (count / len(df)) * 100
                    missing_table.add_row(col, str(count), f"{percentage:.2f}%")
                    missing_stats[col] = {
                        'missing_count': int(count),
                        'missing_percentage': round(percentage, 2)
                    }

                self.console.print(missing_table)
                stats_dict['missing_data'] = missing_stats

            # --- Optional save of the collected statistics -------------------
            if output_file:
                output_path = Path(output_file)
                if output_path.suffix.lower() == '.json':
                    with open(output_file, 'w', encoding='utf-8') as f:
                        json.dump(stats_dict, f, ensure_ascii=False, indent=2)
                elif output_path.suffix.lower() == '.csv':
                    # CSV output only covers the numeric describe() summary.
                    # NOTE(review): when there are no numeric stats, nothing is
                    # written but the "saved" message below still prints — confirm
                    # whether that is intended.
                    if 'numeric_stats' in stats_dict:
                        pd.DataFrame(stats_dict['numeric_stats']).to_csv(output_file)
                else:
                    # Unknown extension: fall back to JSON next to the request.
                    print(f"[yellow]不支持的输出格式,使用JSON格式[/yellow]")
                    output_file = output_path.with_suffix('.json')
                    with open(output_file, 'w', encoding='utf-8') as f:
                        json.dump(stats_dict, f, ensure_ascii=False, indent=2)

                print(f"\n[green]✓ 统计结果已保存到: {output_file}[/green]")

        except Exception as e:
            print(f"[red]分析失败: {e}[/red]")
|
|
329
|
+
|
|
330
|
+
    def validate(
        self,
        file_path: str,
        schema_file: str = None,
        rules: str = None,
        output_file: str = None,
        **kwargs
    ):
        """Validate a data file against simple per-column rules.

        Args:
            file_path: Data file path (.xlsx, .xls or .csv).
            schema_file: Validation schema file (JSON).
                NOTE(review): accepted but never read anywhere in this method —
                schema-based validation appears unimplemented; confirm intent.
            rules: Semicolon-separated rules, each "column:rule". Supported
                rules: ">N", "<N", "!=value", "not_null".
            output_file: File to save the validation report to (JSON).
            **kwargs: Extra pandas read parameters.

        Returns:
            True when every rule passed (zero violating rows), False otherwise
            (including read errors and a missing input file).

        Examples:
            maque data validate data.csv --rules="age:>0;price:>0"
            maque data validate data.xlsx --schema_file=schema.json
        """
        import pandas as pd

        file_path_obj = Path(file_path)
        if not file_path_obj.exists():
            print(f"[red]文件不存在: {file_path}[/red]")
            return False

        try:
            # Read the data, dispatching on extension.
            file_ext = file_path_obj.suffix.lower()
            if file_ext in ['.xlsx', '.xls']:
                df = pd.read_excel(file_path, **kwargs)
            elif file_ext == '.csv':
                df = pd.read_csv(file_path, **kwargs)
            else:
                print(f"[red]不支持的文件格式: {file_ext}[/red]")
                return False

            print(f"[blue]数据验证: {file_path}[/blue]")
            print(f"数据形状: {df.shape[0]} 行 × {df.shape[1]} 列\n")

            validation_results = []  # one dict per rule evaluated
            total_errors = 0         # violating rows summed across all rules

            # Rule-based validation.
            if rules:
                rule_list = rules.split(';')
                for rule in rule_list:
                    if ':' not in rule:
                        continue

                    # Split only on the first ':' so the condition may contain one.
                    column, condition = rule.split(':', 1)
                    column = column.strip()
                    condition = condition.strip()

                    if column not in df.columns:
                        validation_results.append({
                            'rule': rule,
                            'column': column,
                            'status': 'error',
                            'message': f'列 "{column}" 不存在',
                            'failed_rows': []
                        })
                        total_errors += 1
                        continue

                    try:
                        # Simple rule parsing. In each branch `mask` selects the
                        # rows that VIOLATE the rule (condition inverted).
                        # NOTE(review): ">=" / "<=" are matched by the ">" / "<"
                        # prefixes and then fail float() on "=N" — they end up in
                        # the except branch as errors; confirm if that's acceptable.
                        if condition.startswith('>'):
                            threshold = float(condition[1:])
                            mask = df[column] <= threshold
                        elif condition.startswith('<'):
                            threshold = float(condition[1:])
                            mask = df[column] >= threshold
                        elif condition.startswith('!='):
                            # NOTE(review): compares against the raw string, so a
                            # numeric column never matches — TODO confirm.
                            value = condition[2:].strip()
                            mask = df[column] == value
                        elif condition == 'not_null':
                            mask = df[column].isnull()
                        else:
                            validation_results.append({
                                'rule': rule,
                                'column': column,
                                'status': 'error',
                                'message': f'不支持的验证规则: {condition}',
                                'failed_rows': []
                            })
                            continue

                        failed_indices = df[mask].index.tolist()
                        failed_count = len(failed_indices)

                        validation_results.append({
                            'rule': rule,
                            'column': column,
                            'status': 'pass' if failed_count == 0 else 'fail',
                            'message': f'{failed_count} 行违反规则' if failed_count > 0 else '通过验证',
                            'failed_rows': failed_indices[:10]  # keep only the first 10 failing rows
                        })

                        total_errors += failed_count

                    except Exception as e:
                        validation_results.append({
                            'rule': rule,
                            'column': column,
                            'status': 'error',
                            'message': f'验证出错: {e}',
                            'failed_rows': []
                        })

            # Render the per-rule results.
            print("[bold cyan]验证结果[/bold cyan]")

            result_table = Table(show_header=True, header_style="bold magenta")
            result_table.add_column("规则", style="cyan")
            result_table.add_column("列名", style="blue")
            result_table.add_column("状态", style="green")
            result_table.add_column("消息", style="yellow")

            for result in validation_results:
                status_color = {
                    'pass': '[green]✓ 通过[/green]',
                    'fail': '[red]✗ 失败[/red]',
                    'error': '[red]✗ 错误[/red]'
                }.get(result['status'], result['status'])

                result_table.add_row(
                    result['rule'],
                    result['column'],
                    status_color,
                    result['message']
                )

            self.console.print(result_table)

            # Summary counters.
            passed = sum(1 for r in validation_results if r['status'] == 'pass')
            failed = sum(1 for r in validation_results if r['status'] in ['fail', 'error'])

            print(f"\n[bold]验证总结[/bold]")
            print(f"通过: [green]{passed}[/green]")
            print(f"失败: [red]{failed}[/red]")
            print(f"错误行数: [red]{total_errors}[/red]")

            # Optional JSON report.
            if output_file:
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump({
                        'file_path': file_path,
                        'validation_time': str(pd.Timestamp.now()),
                        'data_shape': df.shape,
                        'summary': {
                            'passed': passed,
                            'failed': failed,
                            'total_errors': total_errors
                        },
                        'results': validation_results
                    }, f, ensure_ascii=False, indent=2)

                print(f"[green]✓ 验证结果已保存到: {output_file}[/green]")

            return total_errors == 0

        except Exception as e:
            print(f"[red]验证失败: {e}[/red]")
            return False
|
|
498
|
+
|
|
499
|
+
def sample(
|
|
500
|
+
self,
|
|
501
|
+
file_path: str,
|
|
502
|
+
n: int = 100,
|
|
503
|
+
method: str = "random",
|
|
504
|
+
output_file: str = None,
|
|
505
|
+
seed: int = None,
|
|
506
|
+
**kwargs
|
|
507
|
+
):
|
|
508
|
+
"""数据采样
|
|
509
|
+
|
|
510
|
+
Args:
|
|
511
|
+
file_path: 数据文件路径(支持 .csv, .xlsx, .xls, .jsonl, .json)
|
|
512
|
+
n: 采样数量
|
|
513
|
+
method: 采样方法 (random, head, tail)
|
|
514
|
+
output_file: 输出文件路径
|
|
515
|
+
seed: 随机种子
|
|
516
|
+
**kwargs: 其他参数
|
|
517
|
+
|
|
518
|
+
Examples:
|
|
519
|
+
maque data sample large_data.csv --n=1000
|
|
520
|
+
maque data sample data.xlsx --method=head --n=50
|
|
521
|
+
maque data sample train.jsonl --n=500 --seed=42
|
|
522
|
+
"""
|
|
523
|
+
import pandas as pd
|
|
524
|
+
import random
|
|
525
|
+
|
|
526
|
+
file_path_obj = Path(file_path)
|
|
527
|
+
if not file_path_obj.exists():
|
|
528
|
+
print(f"[red]文件不存在: {file_path}[/red]")
|
|
529
|
+
return False
|
|
530
|
+
|
|
531
|
+
try:
|
|
532
|
+
# 读取数据
|
|
533
|
+
file_ext = file_path_obj.suffix.lower()
|
|
534
|
+
is_jsonl = file_ext == '.jsonl'
|
|
535
|
+
|
|
536
|
+
if file_ext in ['.xlsx', '.xls']:
|
|
537
|
+
df = pd.read_excel(file_path, **kwargs)
|
|
538
|
+
elif file_ext == '.csv':
|
|
539
|
+
df = pd.read_csv(file_path, **kwargs)
|
|
540
|
+
elif file_ext == '.jsonl':
|
|
541
|
+
data = jsonl_load(file_path)
|
|
542
|
+
df = pd.DataFrame(data)
|
|
543
|
+
elif file_ext == '.json':
|
|
544
|
+
with open(file_path, 'r', encoding='utf-8') as f:
|
|
545
|
+
data = json.load(f)
|
|
546
|
+
if isinstance(data, list):
|
|
547
|
+
df = pd.DataFrame(data)
|
|
548
|
+
else:
|
|
549
|
+
print(f"[red]JSON文件必须是数组格式[/red]")
|
|
550
|
+
return False
|
|
551
|
+
else:
|
|
552
|
+
print(f"[red]不支持的文件格式: {file_ext}[/red]")
|
|
553
|
+
print("支持的格式: .csv, .xlsx, .xls, .jsonl, .json")
|
|
554
|
+
return False
|
|
555
|
+
|
|
556
|
+
print(f"[blue]数据采样: {file_path}[/blue]")
|
|
557
|
+
print(f"原始数据: {df.shape[0]} 行 × {df.shape[1]} 列")
|
|
558
|
+
|
|
559
|
+
# 采样
|
|
560
|
+
if n >= len(df):
|
|
561
|
+
print(f"[yellow]警告: 采样数量({n})大于等于数据行数({len(df)}),返回全部数据[/yellow]")
|
|
562
|
+
sampled_df = df
|
|
563
|
+
else:
|
|
564
|
+
if method == "random":
|
|
565
|
+
if seed is not None:
|
|
566
|
+
sampled_df = df.sample(n=n, random_state=seed)
|
|
567
|
+
else:
|
|
568
|
+
sampled_df = df.sample(n=n)
|
|
569
|
+
elif method == "head":
|
|
570
|
+
sampled_df = df.head(n)
|
|
571
|
+
elif method == "tail":
|
|
572
|
+
sampled_df = df.tail(n)
|
|
573
|
+
else:
|
|
574
|
+
print(f"[red]不支持的采样方法: {method}[/red]")
|
|
575
|
+
print("支持的方法: random, head, tail")
|
|
576
|
+
return False
|
|
577
|
+
|
|
578
|
+
print(f"采样结果: {sampled_df.shape[0]} 行 × {sampled_df.shape[1]} 列")
|
|
579
|
+
|
|
580
|
+
# 保存结果
|
|
581
|
+
if not output_file:
|
|
582
|
+
output_file = file_path_obj.stem + f"_sample_{n}" + file_path_obj.suffix
|
|
583
|
+
|
|
584
|
+
output_path = Path(output_file)
|
|
585
|
+
output_ext = output_path.suffix.lower()
|
|
586
|
+
|
|
587
|
+
if output_ext in ['.xlsx', '.xls']:
|
|
588
|
+
sampled_df.to_excel(output_file, index=False)
|
|
589
|
+
elif output_ext == '.csv':
|
|
590
|
+
sampled_df.to_csv(output_file, index=False)
|
|
591
|
+
elif output_ext == '.jsonl':
|
|
592
|
+
# 保存为 JSONL 格式
|
|
593
|
+
with open(output_file, 'w', encoding='utf-8') as f:
|
|
594
|
+
for _, row in sampled_df.iterrows():
|
|
595
|
+
f.write(json.dumps(row.to_dict(), ensure_ascii=False) + '\n')
|
|
596
|
+
elif output_ext == '.json':
|
|
597
|
+
# 保存为 JSON 格式
|
|
598
|
+
sampled_df.to_json(output_file, orient='records', ensure_ascii=False, indent=2)
|
|
599
|
+
else:
|
|
600
|
+
# 默认保持原格式,如果无法识别则用 CSV
|
|
601
|
+
if is_jsonl:
|
|
602
|
+
output_file = str(output_path.with_suffix('.jsonl'))
|
|
603
|
+
with open(output_file, 'w', encoding='utf-8') as f:
|
|
604
|
+
for _, row in sampled_df.iterrows():
|
|
605
|
+
f.write(json.dumps(row.to_dict(), ensure_ascii=False) + '\n')
|
|
606
|
+
else:
|
|
607
|
+
output_file = str(output_path.with_suffix('.csv'))
|
|
608
|
+
sampled_df.to_csv(output_file, index=False)
|
|
609
|
+
|
|
610
|
+
print(f"[green]✓ 采样结果已保存到: {output_file}[/green]")
|
|
611
|
+
return True
|
|
612
|
+
|
|
613
|
+
except Exception as e:
|
|
614
|
+
print(f"[red]采样失败: {e}[/red]")
|
|
615
|
+
return False
|