maque 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. maque/__init__.py +30 -0
  2. maque/__main__.py +926 -0
  3. maque/ai_platform/__init__.py +0 -0
  4. maque/ai_platform/crawl.py +45 -0
  5. maque/ai_platform/metrics.py +258 -0
  6. maque/ai_platform/nlp_preprocess.py +67 -0
  7. maque/ai_platform/webpage_screen_shot.py +195 -0
  8. maque/algorithms/__init__.py +78 -0
  9. maque/algorithms/bezier.py +15 -0
  10. maque/algorithms/bktree.py +117 -0
  11. maque/algorithms/core.py +104 -0
  12. maque/algorithms/hilbert.py +16 -0
  13. maque/algorithms/rate_function.py +92 -0
  14. maque/algorithms/transform.py +27 -0
  15. maque/algorithms/trie.py +272 -0
  16. maque/algorithms/utils.py +63 -0
  17. maque/algorithms/video.py +587 -0
  18. maque/api/__init__.py +1 -0
  19. maque/api/common.py +110 -0
  20. maque/api/fetch.py +26 -0
  21. maque/api/static/icon.png +0 -0
  22. maque/api/static/redoc.standalone.js +1782 -0
  23. maque/api/static/swagger-ui-bundle.js +3 -0
  24. maque/api/static/swagger-ui.css +3 -0
  25. maque/cli/__init__.py +1 -0
  26. maque/cli/clean_invisible_chars.py +324 -0
  27. maque/cli/core.py +34 -0
  28. maque/cli/groups/__init__.py +26 -0
  29. maque/cli/groups/config.py +205 -0
  30. maque/cli/groups/data.py +615 -0
  31. maque/cli/groups/doctor.py +259 -0
  32. maque/cli/groups/embedding.py +222 -0
  33. maque/cli/groups/git.py +29 -0
  34. maque/cli/groups/help.py +410 -0
  35. maque/cli/groups/llm.py +223 -0
  36. maque/cli/groups/mcp.py +241 -0
  37. maque/cli/groups/mllm.py +1795 -0
  38. maque/cli/groups/mllm_simple.py +60 -0
  39. maque/cli/groups/quant.py +210 -0
  40. maque/cli/groups/service.py +490 -0
  41. maque/cli/groups/system.py +570 -0
  42. maque/cli/mllm_run.py +1451 -0
  43. maque/cli/script.py +52 -0
  44. maque/cli/tree.py +49 -0
  45. maque/clustering/__init__.py +52 -0
  46. maque/clustering/analyzer.py +347 -0
  47. maque/clustering/clusterers.py +464 -0
  48. maque/clustering/sampler.py +134 -0
  49. maque/clustering/visualizer.py +205 -0
  50. maque/constant.py +13 -0
  51. maque/core.py +133 -0
  52. maque/cv/__init__.py +1 -0
  53. maque/cv/image.py +219 -0
  54. maque/cv/utils.py +68 -0
  55. maque/cv/video/__init__.py +3 -0
  56. maque/cv/video/keyframe_extractor.py +368 -0
  57. maque/embedding/__init__.py +43 -0
  58. maque/embedding/base.py +56 -0
  59. maque/embedding/multimodal.py +308 -0
  60. maque/embedding/server.py +523 -0
  61. maque/embedding/text.py +311 -0
  62. maque/git/__init__.py +24 -0
  63. maque/git/pure_git.py +912 -0
  64. maque/io/__init__.py +29 -0
  65. maque/io/core.py +38 -0
  66. maque/io/ops.py +194 -0
  67. maque/llm/__init__.py +111 -0
  68. maque/llm/backend.py +416 -0
  69. maque/llm/base.py +411 -0
  70. maque/llm/server.py +366 -0
  71. maque/mcp_server.py +1096 -0
  72. maque/mllm_data_processor_pipeline/__init__.py +17 -0
  73. maque/mllm_data_processor_pipeline/core.py +341 -0
  74. maque/mllm_data_processor_pipeline/example.py +291 -0
  75. maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
  76. maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
  77. maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
  78. maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
  79. maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
  80. maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
  81. maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
  82. maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
  83. maque/mllm_data_processor_pipeline/web_app.py +317 -0
  84. maque/nlp/__init__.py +14 -0
  85. maque/nlp/ngram.py +9 -0
  86. maque/nlp/parser.py +63 -0
  87. maque/nlp/risk_matcher.py +543 -0
  88. maque/nlp/sentence_splitter.py +202 -0
  89. maque/nlp/simple_tradition_cvt.py +31 -0
  90. maque/performance/__init__.py +21 -0
  91. maque/performance/_measure_time.py +70 -0
  92. maque/performance/_profiler.py +367 -0
  93. maque/performance/_stat_memory.py +51 -0
  94. maque/pipelines/__init__.py +15 -0
  95. maque/pipelines/clustering.py +252 -0
  96. maque/quantization/__init__.py +42 -0
  97. maque/quantization/auto_round.py +120 -0
  98. maque/quantization/base.py +145 -0
  99. maque/quantization/bitsandbytes.py +127 -0
  100. maque/quantization/llm_compressor.py +102 -0
  101. maque/retriever/__init__.py +35 -0
  102. maque/retriever/chroma.py +654 -0
  103. maque/retriever/document.py +140 -0
  104. maque/retriever/milvus.py +1140 -0
  105. maque/table_ops/__init__.py +1 -0
  106. maque/table_ops/core.py +133 -0
  107. maque/table_viewer/__init__.py +4 -0
  108. maque/table_viewer/download_assets.py +57 -0
  109. maque/table_viewer/server.py +698 -0
  110. maque/table_viewer/static/element-plus-icons.js +5791 -0
  111. maque/table_viewer/static/element-plus.css +1 -0
  112. maque/table_viewer/static/element-plus.js +65236 -0
  113. maque/table_viewer/static/main.css +268 -0
  114. maque/table_viewer/static/main.js +669 -0
  115. maque/table_viewer/static/vue.global.js +18227 -0
  116. maque/table_viewer/templates/index.html +401 -0
  117. maque/utils/__init__.py +56 -0
  118. maque/utils/color.py +68 -0
  119. maque/utils/color_string.py +45 -0
  120. maque/utils/compress.py +66 -0
  121. maque/utils/constant.py +183 -0
  122. maque/utils/core.py +261 -0
  123. maque/utils/cursor.py +143 -0
  124. maque/utils/distance.py +58 -0
  125. maque/utils/docker.py +96 -0
  126. maque/utils/downloads.py +51 -0
  127. maque/utils/excel_helper.py +542 -0
  128. maque/utils/helper_metrics.py +121 -0
  129. maque/utils/helper_parser.py +168 -0
  130. maque/utils/net.py +64 -0
  131. maque/utils/nvidia_stat.py +140 -0
  132. maque/utils/ops.py +53 -0
  133. maque/utils/packages.py +31 -0
  134. maque/utils/path.py +57 -0
  135. maque/utils/tar.py +260 -0
  136. maque/utils/untar.py +129 -0
  137. maque/web/__init__.py +0 -0
  138. maque/web/image_downloader.py +1410 -0
  139. maque-0.2.1.dist-info/METADATA +450 -0
  140. maque-0.2.1.dist-info/RECORD +143 -0
  141. maque-0.2.1.dist-info/WHEEL +4 -0
  142. maque-0.2.1.dist-info/entry_points.txt +3 -0
  143. maque-0.2.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,615 @@
1
+ """数据处理命令组"""
2
+ import json
3
+ import csv
4
+ from pathlib import Path
5
+ from maque.io import jsonl_load
6
+ from typing import Union, Optional, List
7
+ from rich import print
8
+ from rich.table import Table
9
+ from rich.console import Console
10
+
11
+
12
class DataGroup:
    """Data-processing command group for the maque CLI.

    Bundles the tabular-data subcommands visible below: an interactive
    table viewer, format conversion, statistics, rule-based validation
    and row sampling. Instantiated by the CLI dispatcher, which passes
    itself in as ``cli_instance``.
    """

    def __init__(self, cli_instance):
        # Parent CLI object; stored for access to shared CLI state.
        self.cli = cli_instance
        # Rich console used by stats/validate to render result tables.
        self.console = Console()
+
19
def table_viewer(
    self,
    file_path: str = None,
    port: int = 8080,
    host: str = "127.0.0.1",
    sheet_name: Union[str, int] = 0,
    image_columns: str = None,
    auto_detect_images: bool = True,
    auto_open: bool = True,
    **kwargs
):
    """Start the interactive table viewer web UI.

    Args:
        file_path: Path to the table file (.xlsx, .xls or .csv).
        port: Server port, default 8080.
        host: Server host address, default 127.0.0.1.
        sheet_name: Excel sheet name or index, default 0.
        image_columns: Comma-separated names of columns that hold images.
        auto_detect_images: Whether to auto-detect image columns, default True.
        auto_open: Whether to open the browser automatically, default True.

    Returns:
        Whatever ``start_table_viewer`` returns (server handle / blocking call
        — defined in maque.table_viewer).

    Examples:
        maque data table-viewer data.xlsx
        maque data table-viewer "products.csv" --port=9090
    """
    # Imported lazily to avoid a circular import with maque.table_viewer.
    from maque.table_viewer import start_table_viewer

    # Turn the comma-separated spec into a list of column names.
    # BUGFIX: drop empty entries produced by stray/trailing commas
    # (e.g. "a,,b," previously yielded ['a', '', 'b', ''] and passed
    # bogus empty column names downstream).
    if image_columns:
        image_columns = [
            col.strip() for col in image_columns.split(',') if col.strip()
        ]

    return start_table_viewer(
        file_path=file_path,
        port=port,
        host=host,
        sheet_name=sheet_name,
        image_columns=image_columns,
        auto_detect_images=auto_detect_images,
        auto_open=auto_open
    )
61
+
62
def convert(
    self,
    input_file: str,
    output_file: str = None,
    sheet_name: Union[str, int] = 0,
    encoding: str = "utf-8",
    delimiter: str = ",",
    **kwargs
):
    """Convert a table file between formats.

    Reads .xlsx/.xls/.csv and writes .xlsx/.xls/.csv/.json. When
    ``output_file`` is omitted, a CSV input becomes ``<stem>.xlsx`` and any
    other input becomes ``<stem>.csv``.

    Args:
        input_file: Input file path.
        output_file: Output file path; auto-generated when omitted.
        sheet_name: Excel sheet name or index.
        encoding: Text encoding for CSV I/O, default utf-8.
        delimiter: CSV delimiter, default ",".
        **kwargs: Extra arguments forwarded to the pandas reader.

    Returns:
        True on success, False on any failure (missing input, unsupported
        format, or read/write error).

    Examples:
        maque data convert input.xlsx output.csv
        maque data convert data.csv data.xlsx
        maque data convert file.xlsx --sheet_name="Sheet2"
    """
    import pandas as pd

    input_path = Path(input_file)
    if not input_path.exists():
        print(f"[red]输入文件不存在: {input_file}[/red]")
        return False

    # Auto-generate the output name: csv -> xlsx, everything else -> csv.
    if not output_file:
        if input_path.suffix.lower() == '.csv':
            output_file = str(input_path.with_suffix('.xlsx'))
        else:
            output_file = str(input_path.with_suffix('.csv'))

    output_path = Path(output_file)

    try:
        # --- read input ---
        input_ext = input_path.suffix.lower()
        if input_ext in ['.xlsx', '.xls']:
            df = pd.read_excel(input_file, sheet_name=sheet_name, **kwargs)
            print(f"[green]✓[/green] 读取Excel文件: {input_file} (sheet: {sheet_name})")
        elif input_ext == '.csv':
            df = pd.read_csv(input_file, encoding=encoding, delimiter=delimiter, **kwargs)
            print(f"[green]✓[/green] 读取CSV文件: {input_file}")
        else:
            print(f"[red]不支持的输入格式: {input_ext}[/red]")
            return False

        # --- write output ---
        output_ext = output_path.suffix.lower()
        if output_ext in ['.xlsx', '.xls']:
            df.to_excel(output_file, index=False)
            print(f"[green]✓[/green] 保存为Excel文件: {output_file}")
        elif output_ext == '.csv':
            df.to_csv(output_file, index=False, encoding=encoding)
            print(f"[green]✓[/green] 保存为CSV文件: {output_file}")
        elif output_ext == '.json':
            # BUGFIX: DataFrame.to_json has no ``ensure_ascii`` keyword
            # (that name belongs to json.dump); pandas calls it
            # ``force_ascii``. The old call raised TypeError on every
            # JSON conversion.
            df.to_json(output_file, orient='records', force_ascii=False, indent=2)
            print(f"[green]✓[/green] 保存为JSON文件: {output_file}")
        else:
            print(f"[red]不支持的输出格式: {output_ext}[/red]")
            return False

        print(f"数据形状: {df.shape[0]} 行 × {df.shape[1]} 列")
        return True

    except Exception as e:
        # Best-effort CLI command: report and signal failure rather than crash.
        print(f"[red]转换失败: {e}[/red]")
        return False
136
+
137
def stats(
    self,
    file_path: str,
    sheet_name: Union[str, int] = 0,
    columns: str = None,
    output_file: str = None,
    **kwargs
):
    """Print a statistical summary of a table file to the console.

    Sections printed: basic info (rows/columns/missing/memory), numeric
    column describe() stats, text ("object") column info, and missing-value
    analysis. Optionally persists the collected numbers to JSON or CSV.

    Args:
        file_path: Path to the data file (.xlsx, .xls or .csv).
        sheet_name: Excel sheet name or index.
        columns: Comma-separated column names to analyze; when omitted,
            all numeric columns are analyzed (or all columns if none are
            numeric).
        output_file: Optional path to save the collected statistics
            (.json, or .csv for the numeric describe table only).
        **kwargs: Extra arguments forwarded to the pandas reader.

    Returns:
        None. Results are printed; failures are reported, not raised.
    """
    import pandas as pd
    import numpy as np

    file_path_obj = Path(file_path)
    if not file_path_obj.exists():
        print(f"[red]文件不存在: {file_path}[/red]")
        return

    try:
        # Load the file based on its extension.
        file_ext = file_path_obj.suffix.lower()
        if file_ext in ['.xlsx', '.xls']:
            df = pd.read_excel(file_path, sheet_name=sheet_name, **kwargs)
        elif file_ext == '.csv':
            df = pd.read_csv(file_path, **kwargs)
        else:
            print(f"[red]不支持的文件格式: {file_ext}[/red]")
            return

        print(f"[blue]数据统计分析: {file_path}[/blue]")
        print(f"数据形状: {df.shape[0]} 行 × {df.shape[1]} 列\n")

        # Choose the columns to analyze: the user's explicit list (warning
        # on names that don't exist), otherwise all numeric columns,
        # otherwise the whole frame.
        if columns:
            col_list = [col.strip() for col in columns.split(',')]
            missing_cols = [col for col in col_list if col not in df.columns]
            if missing_cols:
                print(f"[yellow]警告: 以下列不存在: {missing_cols}[/yellow]")
                col_list = [col for col in col_list if col in df.columns]
            df_analyze = df[col_list]
        else:
            # Fall back to auto-selecting numeric columns.
            numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
            if not numeric_columns:
                print("[yellow]未找到数值列,显示所有列的基本信息[/yellow]")
                df_analyze = df
            else:
                df_analyze = df[numeric_columns]

        # Accumulates everything shown, for the optional output_file dump.
        stats_dict = {}

        # --- basic info section ---
        print("[bold cyan]基本信息[/bold cyan]")
        info_table = Table(show_header=True, header_style="bold magenta")
        info_table.add_column("指标", style="cyan")
        info_table.add_column("值", style="green")

        info_table.add_row("总行数", str(df.shape[0]))
        info_table.add_row("总列数", str(df.shape[1]))
        info_table.add_row("缺失值总数", str(df.isnull().sum().sum()))
        info_table.add_row("内存使用", f"{df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

        self.console.print(info_table)

        stats_dict['basic_info'] = {
            'rows': df.shape[0],
            'columns': df.shape[1],
            'missing_values': int(df.isnull().sum().sum()),
            'memory_usage_mb': round(df.memory_usage(deep=True).sum() / 1024**2, 2)
        }

        # --- numeric column statistics (describe) ---
        numeric_cols = df_analyze.select_dtypes(include=[np.number]).columns.tolist()
        if numeric_cols:
            print(f"\n[bold cyan]数值列统计 ({len(numeric_cols)} 列)[/bold cyan]")
            desc = df_analyze[numeric_cols].describe()

            # Build the stats table; only the first 5 columns are rendered
            # to keep the console output readable.
            stats_table = Table(show_header=True, header_style="bold magenta")
            stats_table.add_column("统计量", style="cyan")
            for col in numeric_cols[:5]:  # limit displayed columns
                stats_table.add_column(col, style="green")

            for stat in ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']:
                row_data = [stat]
                for col in numeric_cols[:5]:
                    value = desc.loc[stat, col]
                    if stat == 'count':
                        # counts render as integers, other stats as 2-dp floats
                        row_data.append(f"{int(value)}")
                    else:
                        row_data.append(f"{value:.2f}")
                stats_table.add_row(*row_data)

            self.console.print(stats_table)

            if len(numeric_cols) > 5:
                print(f"[dim]... 还有 {len(numeric_cols) - 5} 列未显示[/dim]")

            # The full describe() (all columns, not just the displayed 5)
            # is kept for the saved report.
            stats_dict['numeric_stats'] = desc.to_dict()

        # --- text (object-dtype) column info ---
        text_cols = df.select_dtypes(include=['object']).columns.tolist()
        if text_cols:
            print(f"\n[bold cyan]文本列信息 ({len(text_cols)} 列)[/bold cyan]")

            text_table = Table(show_header=True, header_style="bold magenta")
            text_table.add_column("列名", style="cyan")
            text_table.add_column("唯一值数量", style="green")
            text_table.add_column("最常见值", style="yellow")
            text_table.add_column("缺失值", style="red")

            text_stats = {}
            for col in text_cols[:10]:  # limit displayed columns
                unique_count = df[col].nunique()
                # mode() is empty for an all-NaN column; show "N/A" then.
                most_common = df[col].mode().iloc[0] if not df[col].mode().empty else "N/A"
                missing_count = df[col].isnull().sum()

                text_table.add_row(
                    col,
                    str(unique_count),
                    # Truncate long values to 20 chars for display.
                    str(most_common)[:20] + "..." if len(str(most_common)) > 20 else str(most_common),
                    str(missing_count)
                )

                text_stats[col] = {
                    'unique_count': int(unique_count),
                    'most_common': str(most_common),
                    'missing_count': int(missing_count)
                }

            self.console.print(text_table)

            if len(text_cols) > 10:
                print(f"[dim]... 还有 {len(text_cols) - 10} 列未显示[/dim]")

            stats_dict['text_stats'] = text_stats

        # --- missing-value analysis (only columns that have any) ---
        missing_data = df.isnull().sum()
        missing_data = missing_data[missing_data > 0]

        if len(missing_data) > 0:
            print(f"\n[bold cyan]缺失值分析[/bold cyan]")
            missing_table = Table(show_header=True, header_style="bold magenta")
            missing_table.add_column("列名", style="cyan")
            missing_table.add_column("缺失数量", style="red")
            missing_table.add_column("缺失比例", style="yellow")

            missing_stats = {}
            for col, count in missing_data.items():
                percentage = (count / len(df)) * 100
                missing_table.add_row(col, str(count), f"{percentage:.2f}%")
                missing_stats[col] = {
                    'missing_count': int(count),
                    'missing_percentage': round(percentage, 2)
                }

            self.console.print(missing_table)
            stats_dict['missing_data'] = missing_stats

        # --- optionally persist the collected statistics ---
        if output_file:
            output_path = Path(output_file)
            if output_path.suffix.lower() == '.json':
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(stats_dict, f, ensure_ascii=False, indent=2)
            elif output_path.suffix.lower() == '.csv':
                # CSV output only covers the numeric describe() table.
                # NOTE(review): if there are no numeric columns, nothing is
                # written here yet the "saved" message below still prints.
                if 'numeric_stats' in stats_dict:
                    pd.DataFrame(stats_dict['numeric_stats']).to_csv(output_file)
            else:
                # Unknown extension: fall back to JSON alongside a warning.
                print(f"[yellow]不支持的输出格式,使用JSON格式[/yellow]")
                output_file = output_path.with_suffix('.json')
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(stats_dict, f, ensure_ascii=False, indent=2)

            print(f"\n[green]✓ 统计结果已保存到: {output_file}[/green]")

    except Exception as e:
        # Best-effort CLI command: report the failure instead of raising.
        print(f"[red]分析失败: {e}[/red]")
329
+
330
def validate(
    self,
    file_path: str,
    schema_file: str = None,
    rules: str = None,
    output_file: str = None,
    **kwargs
):
    """Validate a data file against simple per-column rules.

    Rules are given as ``"column:condition"`` pairs separated by ``;``.
    Supported conditions: ``>N`` / ``<N`` (strict numeric bounds),
    ``!=value`` and ``not_null``. Violations are counted per rule and a
    summary table is printed.

    NOTE(review): ``schema_file`` is accepted but never consulted in this
    implementation — TODO implement schema validation or drop the parameter.
    NOTE(review): ``>=`` / ``<=`` are not supported; ``">=5"`` parses the
    threshold as ``float("=5")`` and is reported as a rule error.

    Args:
        file_path: Data file path (.xlsx, .xls or .csv).
        schema_file: JSON schema file path (currently unused).
        rules: Validation rules, ``;``-separated, each ``"column:rule"``.
        output_file: Optional JSON path to save the validation report.
        **kwargs: Extra arguments forwarded to the pandas reader.

    Returns:
        True when every rule passed (zero violating rows), False otherwise
        or on any error.

    Examples:
        maque data validate data.csv --rules="age:>0;price:>0"
        maque data validate data.xlsx --schema_file=schema.json
    """
    import pandas as pd

    file_path_obj = Path(file_path)
    if not file_path_obj.exists():
        print(f"[red]文件不存在: {file_path}[/red]")
        return False

    try:
        # Load the data based on the file extension.
        file_ext = file_path_obj.suffix.lower()
        if file_ext in ['.xlsx', '.xls']:
            df = pd.read_excel(file_path, **kwargs)
        elif file_ext == '.csv':
            df = pd.read_csv(file_path, **kwargs)
        else:
            print(f"[red]不支持的文件格式: {file_ext}[/red]")
            return False

        print(f"[blue]数据验证: {file_path}[/blue]")
        print(f"数据形状: {df.shape[0]} 行 × {df.shape[1]} 列\n")

        validation_results = []   # one result dict per rule
        total_errors = 0          # total violating rows (plus 1 per bad rule)

        # --- evaluate each rule ---
        if rules:
            rule_list = rules.split(';')
            for rule in rule_list:
                # Silently skip malformed entries with no "column:condition".
                if ':' not in rule:
                    continue

                column, condition = rule.split(':', 1)
                column = column.strip()
                condition = condition.strip()

                if column not in df.columns:
                    validation_results.append({
                        'rule': rule,
                        'column': column,
                        'status': 'error',
                        'message': f'列 "{column}" 不存在',
                        'failed_rows': []
                    })
                    total_errors += 1
                    continue

                try:
                    # Each branch builds ``mask`` = rows that VIOLATE the rule
                    # (the inverse of the stated condition).
                    if condition.startswith('>'):
                        threshold = float(condition[1:])
                        mask = df[column] <= threshold
                    elif condition.startswith('<'):
                        threshold = float(condition[1:])
                        mask = df[column] >= threshold
                    elif condition.startswith('!='):
                        value = condition[2:].strip()
                        # NOTE(review): compares against the raw string; on a
                        # numeric column this never matches, so the rule
                        # always passes — confirm intended semantics.
                        mask = df[column] == value
                    elif condition == 'not_null':
                        mask = df[column].isnull()
                    else:
                        validation_results.append({
                            'rule': rule,
                            'column': column,
                            'status': 'error',
                            'message': f'不支持的验证规则: {condition}',
                            'failed_rows': []
                        })
                        continue

                    failed_indices = df[mask].index.tolist()
                    failed_count = len(failed_indices)

                    validation_results.append({
                        'rule': rule,
                        'column': column,
                        'status': 'pass' if failed_count == 0 else 'fail',
                        'message': f'{failed_count} 行违反规则' if failed_count > 0 else '通过验证',
                        'failed_rows': failed_indices[:10]  # keep at most 10 examples
                    })

                    total_errors += failed_count

                except Exception as e:
                    # E.g. float() failing on a malformed threshold, or a
                    # dtype that does not support the comparison.
                    validation_results.append({
                        'rule': rule,
                        'column': column,
                        'status': 'error',
                        'message': f'验证出错: {e}',
                        'failed_rows': []
                    })

        # --- render the per-rule result table ---
        print("[bold cyan]验证结果[/bold cyan]")

        result_table = Table(show_header=True, header_style="bold magenta")
        result_table.add_column("规则", style="cyan")
        result_table.add_column("列名", style="blue")
        result_table.add_column("状态", style="green")
        result_table.add_column("消息", style="yellow")

        for result in validation_results:
            # Map status -> colored label; unknown statuses shown verbatim.
            status_color = {
                'pass': '[green]✓ 通过[/green]',
                'fail': '[red]✗ 失败[/red]',
                'error': '[red]✗ 错误[/red]'
            }.get(result['status'], result['status'])

            result_table.add_row(
                result['rule'],
                result['column'],
                status_color,
                result['message']
            )

        self.console.print(result_table)

        # --- summary ---
        passed = sum(1 for r in validation_results if r['status'] == 'pass')
        failed = sum(1 for r in validation_results if r['status'] in ['fail', 'error'])

        print(f"\n[bold]验证总结[/bold]")
        print(f"通过: [green]{passed}[/green]")
        print(f"失败: [red]{failed}[/red]")
        print(f"错误行数: [red]{total_errors}[/red]")

        # --- optionally save the full report as JSON ---
        if output_file:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump({
                    'file_path': file_path,
                    'validation_time': str(pd.Timestamp.now()),
                    'data_shape': df.shape,
                    'summary': {
                        'passed': passed,
                        'failed': failed,
                        'total_errors': total_errors
                    },
                    'results': validation_results
                }, f, ensure_ascii=False, indent=2)

            print(f"[green]✓ 验证结果已保存到: {output_file}[/green]")

        return total_errors == 0

    except Exception as e:
        # Best-effort CLI command: report and signal failure rather than raise.
        print(f"[red]验证失败: {e}[/red]")
        return False
498
+
499
def sample(
    self,
    file_path: str,
    n: int = 100,
    method: str = "random",
    output_file: str = None,
    seed: int = None,
    **kwargs
):
    """Sample rows from a tabular or JSON-lines data file.

    Args:
        file_path: Data file path (.csv, .xlsx, .xls, .jsonl, .json).
        n: Number of rows to sample.
        method: Sampling method: "random", "head" or "tail".
        output_file: Output path; defaults to "<stem>_sample_<n><ext>"
            next to the input file.
        seed: Random seed for reproducible random sampling.
        **kwargs: Extra arguments forwarded to the pandas reader.

    Returns:
        True on success, False on any failure.

    Examples:
        maque data sample large_data.csv --n=1000
        maque data sample data.xlsx --method=head --n=50
        maque data sample train.jsonl --n=500 --seed=42
    """
    import pandas as pd

    file_path_obj = Path(file_path)
    if not file_path_obj.exists():
        print(f"[red]文件不存在: {file_path}[/red]")
        return False

    try:
        # --- load the data by extension ---
        file_ext = file_path_obj.suffix.lower()
        is_jsonl = file_ext == '.jsonl'

        if file_ext in ['.xlsx', '.xls']:
            df = pd.read_excel(file_path, **kwargs)
        elif file_ext == '.csv':
            df = pd.read_csv(file_path, **kwargs)
        elif file_ext == '.jsonl':
            data = jsonl_load(file_path)
            df = pd.DataFrame(data)
        elif file_ext == '.json':
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if isinstance(data, list):
                df = pd.DataFrame(data)
            else:
                print(f"[red]JSON文件必须是数组格式[/red]")
                return False
        else:
            print(f"[red]不支持的文件格式: {file_ext}[/red]")
            print("支持的格式: .csv, .xlsx, .xls, .jsonl, .json")
            return False

        print(f"[blue]数据采样: {file_path}[/blue]")
        print(f"原始数据: {df.shape[0]} 行 × {df.shape[1]} 列")

        # --- draw the sample ---
        if n >= len(df):
            print(f"[yellow]警告: 采样数量({n})大于等于数据行数({len(df)}),返回全部数据[/yellow]")
            sampled_df = df
        elif method == "random":
            # random_state=None lets pandas seed freshly, matching the
            # behavior when no seed is given.
            sampled_df = df.sample(n=n, random_state=seed)
        elif method == "head":
            sampled_df = df.head(n)
        elif method == "tail":
            sampled_df = df.tail(n)
        else:
            print(f"[red]不支持的采样方法: {method}[/red]")
            print("支持的方法: random, head, tail")
            return False

        print(f"采样结果: {sampled_df.shape[0]} 行 × {sampled_df.shape[1]} 列")

        # --- save the result ---
        if not output_file:
            # BUGFIX: build the default name NEXT TO the input file. The old
            # code concatenated stem + suffix only, which dropped the input's
            # directory and silently wrote into the current working directory.
            output_file = str(file_path_obj.with_name(
                f"{file_path_obj.stem}_sample_{n}{file_path_obj.suffix}"
            ))

        output_path = Path(output_file)
        output_ext = output_path.suffix.lower()

        if output_ext in ['.xlsx', '.xls']:
            sampled_df.to_excel(output_file, index=False)
        elif output_ext == '.csv':
            sampled_df.to_csv(output_file, index=False)
        elif output_ext == '.jsonl':
            self._write_jsonl(sampled_df, output_file)
        elif output_ext == '.json':
            # BUGFIX: pandas names this flag force_ascii, not ensure_ascii;
            # the old keyword raised TypeError for every JSON output.
            sampled_df.to_json(output_file, orient='records', force_ascii=False, indent=2)
        else:
            # Unknown extension: keep the input family — JSONL stays JSONL,
            # everything else falls back to CSV.
            if is_jsonl:
                output_file = str(output_path.with_suffix('.jsonl'))
                self._write_jsonl(sampled_df, output_file)
            else:
                output_file = str(output_path.with_suffix('.csv'))
                sampled_df.to_csv(output_file, index=False)

        print(f"[green]✓ 采样结果已保存到: {output_file}[/green]")
        return True

    except Exception as e:
        # Best-effort CLI command: report and signal failure rather than raise.
        print(f"[red]采样失败: {e}[/red]")
        return False

@staticmethod
def _write_jsonl(df, path):
    """Write *df* to *path* as UTF-8 JSON Lines (one object per row)."""
    with open(path, 'w', encoding='utf-8') as f:
        for _, row in df.iterrows():
            f.write(json.dumps(row.to_dict(), ensure_ascii=False) + '\n')