maque 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maque/__init__.py +30 -0
- maque/__main__.py +926 -0
- maque/ai_platform/__init__.py +0 -0
- maque/ai_platform/crawl.py +45 -0
- maque/ai_platform/metrics.py +258 -0
- maque/ai_platform/nlp_preprocess.py +67 -0
- maque/ai_platform/webpage_screen_shot.py +195 -0
- maque/algorithms/__init__.py +78 -0
- maque/algorithms/bezier.py +15 -0
- maque/algorithms/bktree.py +117 -0
- maque/algorithms/core.py +104 -0
- maque/algorithms/hilbert.py +16 -0
- maque/algorithms/rate_function.py +92 -0
- maque/algorithms/transform.py +27 -0
- maque/algorithms/trie.py +272 -0
- maque/algorithms/utils.py +63 -0
- maque/algorithms/video.py +587 -0
- maque/api/__init__.py +1 -0
- maque/api/common.py +110 -0
- maque/api/fetch.py +26 -0
- maque/api/static/icon.png +0 -0
- maque/api/static/redoc.standalone.js +1782 -0
- maque/api/static/swagger-ui-bundle.js +3 -0
- maque/api/static/swagger-ui.css +3 -0
- maque/cli/__init__.py +1 -0
- maque/cli/clean_invisible_chars.py +324 -0
- maque/cli/core.py +34 -0
- maque/cli/groups/__init__.py +26 -0
- maque/cli/groups/config.py +205 -0
- maque/cli/groups/data.py +615 -0
- maque/cli/groups/doctor.py +259 -0
- maque/cli/groups/embedding.py +222 -0
- maque/cli/groups/git.py +29 -0
- maque/cli/groups/help.py +410 -0
- maque/cli/groups/llm.py +223 -0
- maque/cli/groups/mcp.py +241 -0
- maque/cli/groups/mllm.py +1795 -0
- maque/cli/groups/mllm_simple.py +60 -0
- maque/cli/groups/quant.py +210 -0
- maque/cli/groups/service.py +490 -0
- maque/cli/groups/system.py +570 -0
- maque/cli/mllm_run.py +1451 -0
- maque/cli/script.py +52 -0
- maque/cli/tree.py +49 -0
- maque/clustering/__init__.py +52 -0
- maque/clustering/analyzer.py +347 -0
- maque/clustering/clusterers.py +464 -0
- maque/clustering/sampler.py +134 -0
- maque/clustering/visualizer.py +205 -0
- maque/constant.py +13 -0
- maque/core.py +133 -0
- maque/cv/__init__.py +1 -0
- maque/cv/image.py +219 -0
- maque/cv/utils.py +68 -0
- maque/cv/video/__init__.py +3 -0
- maque/cv/video/keyframe_extractor.py +368 -0
- maque/embedding/__init__.py +43 -0
- maque/embedding/base.py +56 -0
- maque/embedding/multimodal.py +308 -0
- maque/embedding/server.py +523 -0
- maque/embedding/text.py +311 -0
- maque/git/__init__.py +24 -0
- maque/git/pure_git.py +912 -0
- maque/io/__init__.py +29 -0
- maque/io/core.py +38 -0
- maque/io/ops.py +194 -0
- maque/llm/__init__.py +111 -0
- maque/llm/backend.py +416 -0
- maque/llm/base.py +411 -0
- maque/llm/server.py +366 -0
- maque/mcp_server.py +1096 -0
- maque/mllm_data_processor_pipeline/__init__.py +17 -0
- maque/mllm_data_processor_pipeline/core.py +341 -0
- maque/mllm_data_processor_pipeline/example.py +291 -0
- maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
- maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
- maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
- maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
- maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
- maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
- maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
- maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
- maque/mllm_data_processor_pipeline/web_app.py +317 -0
- maque/nlp/__init__.py +14 -0
- maque/nlp/ngram.py +9 -0
- maque/nlp/parser.py +63 -0
- maque/nlp/risk_matcher.py +543 -0
- maque/nlp/sentence_splitter.py +202 -0
- maque/nlp/simple_tradition_cvt.py +31 -0
- maque/performance/__init__.py +21 -0
- maque/performance/_measure_time.py +70 -0
- maque/performance/_profiler.py +367 -0
- maque/performance/_stat_memory.py +51 -0
- maque/pipelines/__init__.py +15 -0
- maque/pipelines/clustering.py +252 -0
- maque/quantization/__init__.py +42 -0
- maque/quantization/auto_round.py +120 -0
- maque/quantization/base.py +145 -0
- maque/quantization/bitsandbytes.py +127 -0
- maque/quantization/llm_compressor.py +102 -0
- maque/retriever/__init__.py +35 -0
- maque/retriever/chroma.py +654 -0
- maque/retriever/document.py +140 -0
- maque/retriever/milvus.py +1140 -0
- maque/table_ops/__init__.py +1 -0
- maque/table_ops/core.py +133 -0
- maque/table_viewer/__init__.py +4 -0
- maque/table_viewer/download_assets.py +57 -0
- maque/table_viewer/server.py +698 -0
- maque/table_viewer/static/element-plus-icons.js +5791 -0
- maque/table_viewer/static/element-plus.css +1 -0
- maque/table_viewer/static/element-plus.js +65236 -0
- maque/table_viewer/static/main.css +268 -0
- maque/table_viewer/static/main.js +669 -0
- maque/table_viewer/static/vue.global.js +18227 -0
- maque/table_viewer/templates/index.html +401 -0
- maque/utils/__init__.py +56 -0
- maque/utils/color.py +68 -0
- maque/utils/color_string.py +45 -0
- maque/utils/compress.py +66 -0
- maque/utils/constant.py +183 -0
- maque/utils/core.py +261 -0
- maque/utils/cursor.py +143 -0
- maque/utils/distance.py +58 -0
- maque/utils/docker.py +96 -0
- maque/utils/downloads.py +51 -0
- maque/utils/excel_helper.py +542 -0
- maque/utils/helper_metrics.py +121 -0
- maque/utils/helper_parser.py +168 -0
- maque/utils/net.py +64 -0
- maque/utils/nvidia_stat.py +140 -0
- maque/utils/ops.py +53 -0
- maque/utils/packages.py +31 -0
- maque/utils/path.py +57 -0
- maque/utils/tar.py +260 -0
- maque/utils/untar.py +129 -0
- maque/web/__init__.py +0 -0
- maque/web/image_downloader.py +1410 -0
- maque-0.2.1.dist-info/METADATA +450 -0
- maque-0.2.1.dist-info/RECORD +143 -0
- maque-0.2.1.dist-info/WHEEL +4 -0
- maque-0.2.1.dist-info/entry_points.txt +3 -0
- maque-0.2.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,501 @@
|
|
|
1
|
+
"""
|
|
2
|
+
第7步:结果校验步骤
|
|
3
|
+
对最终结果进行质量检查和验证
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import json
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import List, Dict, Any, Tuple
|
|
10
|
+
from ..core import PipelineStep, StepResult, PipelineConfig
|
|
11
|
+
from maque.performance import MeasureTime
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ResultValidationStep(PipelineStep):
    """Step 7 of the pipeline: quality-check and validate the final results.

    The step runs a configurable set of checks (data integrity, duplicates,
    file references, format consistency, descriptive statistics), folds the
    outcomes into a weighted quality score, applies light remediation
    (whitespace trimming, flagging of problematic rows), writes JSON reports
    under ``<output_dir>/validation_reports`` and returns a ``StepResult``
    whose ``success`` reflects the overall verdict.
    """

    CONFIG_SCHEMA = {
        "type": "object",
        "properties": {
            "validation_criteria": {
                "type": "object",
                "properties": {
                    "min_samples": {"type": "integer", "default": 10},
                    "min_success_rate": {"type": "number", "default": 0.8},
                    "required_fields": {
                        "type": "array",
                        "items": {"type": "string"},
                        "default": ["text", "images", "labels"]
                    },
                    "quality_thresholds": {
                        "type": "object",
                        "properties": {
                            "min_text_length": {"type": "integer", "default": 10},
                            "max_empty_fields": {"type": "integer", "default": 2}
                        }
                    }
                }
            },
            "validation_tests": {
                "type": "object",
                "properties": {
                    "check_duplicates": {"type": "boolean", "default": True},
                    "check_data_integrity": {"type": "boolean", "default": True},
                    "check_file_references": {"type": "boolean", "default": True},
                    "check_format_consistency": {"type": "boolean", "default": True},
                    "run_statistical_analysis": {"type": "boolean", "default": True}
                }
            },
            "output_reports": {
                "type": "object",
                "properties": {
                    "generate_summary_report": {"type": "boolean", "default": True},
                    "generate_detailed_report": {"type": "boolean", "default": True},
                    "generate_quality_metrics": {"type": "boolean", "default": True},
                    "save_failed_samples": {"type": "boolean", "default": True}
                }
            },
            "remediation": {
                "type": "object",
                "properties": {
                    "auto_fix_minor_issues": {"type": "boolean", "default": True},
                    "remove_invalid_samples": {"type": "boolean", "default": False},
                    "flag_problematic_samples": {"type": "boolean", "default": True}
                }
            }
        }
    }

    def __init__(self, name: str = "result_validation", config: Dict[str, Any] = None):
        """Create the step; ``name`` and ``config`` are forwarded to ``PipelineStep``."""
        super().__init__(name, config)

    @staticmethod
    def _is_missing(value: Any) -> bool:
        """Return True when ``value`` is a scalar missing marker (None, NaN, pd.NA, NaT).

        Non-scalar cells (lists, arrays, ...) are never reported as missing;
        ``pd.isna`` would return an array for them, whose truth value is
        ambiguous.
        """
        return bool(pd.api.types.is_scalar(value) and pd.isna(value))

    async def execute(self, data: pd.DataFrame, config: PipelineConfig) -> StepResult:
        """Run validation, reporting and remediation on ``data``.

        Returns a ``StepResult`` carrying the remediated copy of ``data`` and
        a metadata dict (verdict, sample count, raw validation results,
        quality score, report file paths). Any exception raised along the way
        is logged and converted into a failed ``StepResult`` that carries the
        untouched input frame and the error text.
        """
        with MeasureTime(f"步骤 {self.name}"):
            try:
                step_config = self.get_step_config(config)

                # Run the individual validation tests.
                validation_results = await self._run_validation_tests(data, config, step_config)

                # Build the quality report from the test outcomes.
                quality_report = await self._generate_quality_report(data, validation_results, step_config)

                # Apply remediation on a copy of the data.
                corrected_data = await self._apply_remediation(data, validation_results, step_config)

                # Persist the reports.
                report_files = await self._save_reports(quality_report, config, step_config)

                # Stamp the processing status on every row.
                corrected_data['__processing_status'] = 'validated_final'
                corrected_data['__final_validated_at'] = pd.Timestamp.now()

                # Decide whether the run as a whole passes.
                overall_passed = self._assess_overall_quality(validation_results, step_config)

                quality_score = quality_report.get("overall_quality_score", 0.0)
                metadata = {
                    "validation_passed": overall_passed,
                    "total_samples": len(corrected_data),
                    "validation_results": validation_results,
                    "quality_score": quality_score,
                    "report_files": report_files,
                    "corrected_issues": quality_report.get("corrected_issues", 0)
                }

                self.logger.info(f"结果校验完成,整体质量评分: {quality_score:.2f}")

                return StepResult(
                    step_name=self.name,
                    success=overall_passed,
                    data=corrected_data,
                    metadata=metadata
                )

            except Exception as e:
                # Step boundary: report the failure instead of propagating it.
                self.logger.error(f"结果校验失败: {e}")
                return StepResult(
                    step_name=self.name,
                    success=False,
                    data=data,
                    metadata={},
                    error=str(e)
                )

    async def _run_validation_tests(self, data: pd.DataFrame, config: PipelineConfig, step_config: Dict[str, Any]) -> Dict[str, Any]:
        """Run every test enabled under ``validation_tests`` (all default to on).

        Returns a dict keyed by test name (``data_integrity``, ``duplicates``,
        ``file_references``, ``format_consistency``, ``statistical_analysis``);
        disabled tests are simply absent.
        """
        validation_tests = step_config.get("validation_tests", {})
        results = {}

        # Basic data integrity.
        if validation_tests.get("check_data_integrity", True):
            results["data_integrity"] = await self._check_data_integrity(data, step_config)

        # Duplicate rows.
        if validation_tests.get("check_duplicates", True):
            results["duplicates"] = await self._check_duplicates(data)

        # Referenced image files.
        if validation_tests.get("check_file_references", True):
            results["file_references"] = await self._check_file_references(data)

        # Format consistency.
        if validation_tests.get("check_format_consistency", True):
            results["format_consistency"] = await self._check_format_consistency(data)

        # Descriptive statistics.
        if validation_tests.get("run_statistical_analysis", True):
            results["statistical_analysis"] = await self._run_statistical_analysis(data)

        return results

    async def _check_data_integrity(self, data: pd.DataFrame, step_config: Dict[str, Any]) -> Dict[str, Any]:
        """Check sample count, required columns and per-row quality.

        ``passed`` only reflects the dataset-level issues (too few samples,
        missing required columns). Per-row quality is reported through
        ``passed_samples`` / ``pass_rate``: a row passes when its ``text`` has
        at least ``min_text_length`` non-blank characters and no more than
        ``max_empty_fields`` of the required columns are empty. Missing values
        (None/NaN) count as empty text rather than as the string ``"nan"``.
        """
        criteria = step_config.get("validation_criteria", {})
        required_fields = criteria.get("required_fields", ["text", "images", "labels"])
        min_samples = criteria.get("min_samples", 10)
        quality_thresholds = criteria.get("quality_thresholds", {})
        # Thresholds do not change per row, so read them once.
        min_text_length = quality_thresholds.get("min_text_length", 10)
        max_empty_fields = quality_thresholds.get("max_empty_fields", 2)

        issues = []
        total_samples = len(data)

        # Dataset-level check: sample count.
        if total_samples < min_samples:
            issues.append(f"样本数量不足: {total_samples} < {min_samples}")

        # Dataset-level check: required columns.
        missing_fields = [field for field in required_fields if field not in data.columns]
        if missing_fields:
            issues.append(f"缺少必需字段: {missing_fields}")

        # Only required columns that actually exist can be inspected per row.
        present_fields = [field for field in required_fields if field in data.columns]

        passed_samples = 0
        for _, row in data.iterrows():
            raw_text = row.get('text', '')
            text_content = '' if self._is_missing(raw_text) else str(raw_text)
            text_ok = len(text_content.strip()) >= min_text_length

            empty_fields = sum(
                1 for field in present_fields
                if self._is_missing(row[field]) or str(row[field]).strip() == ""
            )
            fields_ok = empty_fields <= max_empty_fields

            if text_ok and fields_ok:
                passed_samples += 1

        return {
            "passed": len(issues) == 0,
            "issues": issues,
            "passed_samples": passed_samples,
            "total_samples": total_samples,
            "pass_rate": passed_samples / total_samples if total_samples > 0 else 0
        }

    async def _check_duplicates(self, data: pd.DataFrame) -> Dict[str, Any]:
        """Detect rows that share the same ``text`` / ``images`` content.

        Returns ``passed``, the number of rows involved in any duplicate group
        (``duplicates``) and one entry per group in ``duplicate_pairs`` with
        the shared content and the row index labels. Counts and flags are
        plain Python values so the result can be dumped to JSON.
        """
        # Duplicates are judged on text and image content only.
        content_columns = ['text', 'images']
        available_columns = [col for col in content_columns if col in data.columns]

        if not available_columns:
            return {"passed": True, "duplicates": 0, "duplicate_pairs": []}

        # keep=False marks every member of a duplicate group, not just the repeats.
        duplicates = data.duplicated(subset=available_columns, keep=False)
        # Series.sum() yields a numpy integer, which json.dump rejects.
        duplicate_count = int(duplicates.sum())

        duplicate_pairs = []
        if duplicate_count > 0:
            # dropna=False keeps groups whose key contains a missing value, so
            # the groups agree with duplicated(), which treats NaNs as equal.
            groups = data[duplicates].groupby(available_columns, dropna=False)
            for key, group in groups:
                if len(group) <= 1:
                    continue
                # Depending on the pandas version, a single grouping column
                # yields either a scalar key or a 1-tuple; normalise to a tuple.
                if not isinstance(key, tuple):
                    key = (key,)
                duplicate_pairs.append({
                    "content": {
                        col: (None if self._is_missing(val) else val)
                        for col, val in zip(available_columns, key)
                    },
                    "indices": list(group.index)
                })

        return {
            "passed": duplicate_count == 0,
            "duplicates": duplicate_count,
            "duplicate_pairs": duplicate_pairs
        }

    async def _check_file_references(self, data: pd.DataFrame) -> Dict[str, Any]:
        """Verify that every path listed in the ``images`` column exists.

        The cell is treated as a ``|``-separated list of paths. Paths that do
        not exist go to ``missing_files``; paths that cannot even be probed go
        to ``invalid_paths``. Rows with a missing or empty cell are skipped.
        """
        if 'images' not in data.columns:
            return {"passed": True, "missing_files": [], "invalid_paths": []}

        missing_files = []
        invalid_paths = []

        for idx, row in data.iterrows():
            raw_images = row.get('images', '')
            if self._is_missing(raw_images):
                continue
            images_str = str(raw_images)
            # 'nan' also covers cells that were stringified upstream.
            if not images_str or images_str == 'nan':
                continue

            # Split the cell into individual image paths.
            image_paths = [p.strip() for p in images_str.split('|') if p.strip()]

            for image_path in image_paths:
                try:
                    exists = Path(image_path).exists()
                except (OSError, ValueError):
                    # E.g. over-long names or embedded NUL bytes.
                    invalid_paths.append({
                        "row_index": idx,
                        "file_path": image_path
                    })
                    continue
                if not exists:
                    missing_files.append({
                        "row_index": idx,
                        "file_path": image_path
                    })

        return {
            "passed": len(missing_files) == 0 and len(invalid_paths) == 0,
            "missing_files": missing_files,
            "invalid_paths": invalid_paths
        }

    async def _check_format_consistency(self, data: pd.DataFrame) -> Dict[str, Any]:
        """Check separator usage in ``images`` and emptiness of label columns.

        A row is inconsistent when its ``images`` cell contains a comma but no
        ``|`` separator. A label column (name containing ``mllm_`` or
        ``refined_``) is reported when more than half of its values are
        missing or empty strings.
        """
        issues = []

        # Image path separator check.
        if 'images' in data.columns:
            inconsistent_formats = []
            for idx, row in data.iterrows():
                raw_images = row.get('images', '')
                if self._is_missing(raw_images):
                    continue
                images_str = str(raw_images)
                if not images_str or images_str == 'nan':
                    continue
                # A comma without any '|' suggests the wrong separator was used.
                if '|' not in images_str and ',' in images_str:
                    inconsistent_formats.append(idx)

            if inconsistent_formats:
                issues.append(f"图像路径分隔符不一致: {len(inconsistent_formats)} 个样本")

        # Label column emptiness check.
        label_columns = [col for col in data.columns if 'mllm_' in col or 'refined_' in col]
        for col in label_columns:
            empty_count = data[col].isna().sum() + (data[col] == '').sum()
            if empty_count > len(data) * 0.5:  # more than 50% empty
                issues.append(f"标签列 {col} 空值过多: {empty_count}/{len(data)}")

        return {
            "passed": len(issues) == 0,
            "issues": issues
        }

    async def _run_statistical_analysis(self, data: pd.DataFrame) -> Dict[str, Any]:
        """Collect descriptive statistics; this test never fails.

        Reports text-length statistics, per-row image counts and, when the
        corresponding columns exist, the mean of the annotation / refinement
        success flags. Text statistics are skipped for an empty frame, where
        min/max are undefined.
        """
        stats = {}

        # Text length statistics (missing text counts as length 0).
        if 'text' in data.columns and len(data) > 0:
            text_lengths = data['text'].map(
                lambda value: 0 if self._is_missing(value) else len(str(value))
            )
            stats["text_length"] = {
                "mean": float(text_lengths.mean()),
                "std": float(text_lengths.std()),
                "min": int(text_lengths.min()),
                "max": int(text_lengths.max()),
                "median": float(text_lengths.median())
            }

        # Image count statistics.
        if 'images' in data.columns:
            image_counts = []
            for images_value in data['images']:
                if not self._is_missing(images_value) and str(images_value).strip():
                    image_counts.append(
                        len([p for p in str(images_value).split('|') if p.strip()])
                    )
                else:
                    image_counts.append(0)

            if image_counts:
                stats["image_count"] = {
                    "mean": sum(image_counts) / len(image_counts),
                    "min": min(image_counts),
                    "max": max(image_counts),
                    "samples_with_images": sum(1 for c in image_counts if c > 0)
                }

        # Annotation / refinement success rates.
        if '__mllm_annotation_success' in data.columns:
            stats["annotation_success_rate"] = float(data['__mllm_annotation_success'].mean())

        if '__mllm_refinement_success' in data.columns:
            stats["refinement_success_rate"] = float(data['__mllm_refinement_success'].mean())

        return {
            "passed": True,
            "statistics": stats
        }

    async def _generate_quality_report(self, data: pd.DataFrame, validation_results: Dict[str, Any], step_config: Dict[str, Any]) -> Dict[str, Any]:
        """Build the quality report dict from the validation results.

        The overall score is the weighted average of the tests that ran:
        data integrity 0.4, no duplicates 0.2, file references 0.2, format
        consistency 0.2 (weights are renormalised over the tests present).
        ``corrected_issues`` is always 0 here; nothing in this class updates it.
        """
        report = {
            "timestamp": pd.Timestamp.now().isoformat(),
            "total_samples": len(data),
            "validation_results": validation_results,
            "overall_quality_score": 0.0,
            "issues_summary": [],
            "recommendations": [],
            "corrected_issues": 0
        }

        # (name, score, weight) for every test that contributes to the score.
        quality_factors = []

        # Data integrity: 40%.
        if "data_integrity" in validation_results:
            integrity_score = validation_results["data_integrity"].get("pass_rate", 0.0)
            quality_factors.append(("data_integrity", integrity_score, 0.4))

        # Absence of duplicates: 20%.
        if "duplicates" in validation_results:
            dup_result = validation_results["duplicates"]
            # A failed check implies at least one row, so the division is safe.
            no_dup_score = 1.0 if dup_result["passed"] else max(0.0, 1.0 - dup_result["duplicates"] / len(data))
            quality_factors.append(("no_duplicates", no_dup_score, 0.2))

        # File reference validity: 20%.
        if "file_references" in validation_results:
            file_score = 1.0 if validation_results["file_references"]["passed"] else 0.5
            quality_factors.append(("file_references", file_score, 0.2))

        # Format consistency: 20%.
        if "format_consistency" in validation_results:
            format_score = 1.0 if validation_results["format_consistency"]["passed"] else 0.7
            quality_factors.append(("format_consistency", format_score, 0.2))

        # Weighted average over the tests that ran.
        if quality_factors:
            weighted_sum = sum(score * weight for _, score, weight in quality_factors)
            total_weight = sum(weight for _, _, weight in quality_factors)
            report["overall_quality_score"] = weighted_sum / total_weight if total_weight > 0 else 0.0

        # Collect a human-readable summary of the failed tests.
        for result in validation_results.values():
            if result.get("passed", True):
                continue
            if "issues" in result:
                report["issues_summary"].extend(result["issues"])
            elif "duplicates" in result:
                report["issues_summary"].append(f"发现 {result['duplicates']} 个重复样本")
            elif "missing_files" in result:
                report["issues_summary"].append(f"发现 {len(result['missing_files'])} 个缺失文件")
                # Unprobeable paths fail the check too, so surface them as well.
                invalid_paths = result.get("invalid_paths", [])
                if invalid_paths:
                    report["issues_summary"].append(f"发现 {len(invalid_paths)} 个无效路径")

        # Recommendations.
        if report["overall_quality_score"] < 0.8:
            report["recommendations"].append("整体质量分数较低,建议检查数据源和处理流程")

        if "data_integrity" in validation_results:
            if validation_results["data_integrity"].get("pass_rate", 1.0) < 0.9:
                report["recommendations"].append("数据完整性不足,建议增强数据验证和清洗步骤")

        return report

    async def _apply_remediation(self, data: pd.DataFrame, validation_results: Dict[str, Any], step_config: Dict[str, Any]) -> pd.DataFrame:
        """Return a remediated copy of ``data``; the input frame is not modified.

        Depending on the ``remediation`` config this trims whitespace in the
        text columns and adds a boolean ``__has_validation_issues`` column
        marking rows that are duplicates or reference missing / invalid files.
        Removal of invalid samples is not implemented yet.
        """
        remediation = step_config.get("remediation", {})
        data_copy = data.copy()

        # Automatically fix minor issues.
        if remediation.get("auto_fix_minor_issues", True):
            # Trim surrounding whitespace in the text columns.
            for col in ('text', 'mllm_summary', 'refined_summary'):
                if col in data_copy.columns:
                    column = data_copy[col]
                    # Keep missing values missing: a plain astype(str) would
                    # turn them into the literal string "nan".
                    data_copy[col] = column.where(column.isna(), column.astype(str).str.strip())

        # Flag problematic samples.
        if remediation.get("flag_problematic_samples", True):
            data_copy['__has_validation_issues'] = False

            # Rows that belong to a duplicate group.
            for pair in validation_results.get("duplicates", {}).get("duplicate_pairs", []):
                for idx in pair["indices"]:
                    if idx in data_copy.index:
                        data_copy.at[idx, '__has_validation_issues'] = True

            # Rows that reference missing or unprobeable files.
            file_result = validation_results.get("file_references", {})
            bad_references = [
                *file_result.get("missing_files", []),
                *file_result.get("invalid_paths", []),
            ]
            for entry in bad_references:
                idx = entry["row_index"]
                if idx in data_copy.index:
                    data_copy.at[idx, '__has_validation_issues'] = True

        # Remove invalid samples.
        if remediation.get("remove_invalid_samples", False):
            initial_count = len(data_copy)
            # Placeholder: no invalidity criterion is defined yet, so every
            # sample is kept and removed_count stays 0.
            removed_count = initial_count - len(data_copy)
            if removed_count > 0:
                self.logger.info(f"移除了 {removed_count} 个无效样本")

        return data_copy

    async def _save_reports(self, quality_report: Dict[str, Any], config: PipelineConfig, step_config: Dict[str, Any]) -> List[str]:
        """Write the enabled JSON reports and return their paths as strings.

        Files are written to ``<config.output_dir>/validation_reports``, which
        is created if needed: ``validation_summary.json`` (the report itself)
        and ``validation_detailed.json`` (the report plus the raw validation
        results).
        """
        output_reports = step_config.get("output_reports", {})
        report_files = []

        output_dir = Path(config.output_dir) / "validation_reports"
        output_dir.mkdir(parents=True, exist_ok=True)

        # Summary report.
        if output_reports.get("generate_summary_report", True):
            summary_file = output_dir / "validation_summary.json"
            with open(summary_file, 'w', encoding='utf-8') as f:
                json.dump(quality_report, f, ensure_ascii=False, indent=2)
            report_files.append(str(summary_file))

        # Detailed report.
        if output_reports.get("generate_detailed_report", True):
            detailed_file = output_dir / "validation_detailed.json"
            detailed_report = {
                "summary": quality_report,
                "detailed_validation_results": quality_report["validation_results"]
            }
            with open(detailed_file, 'w', encoding='utf-8') as f:
                json.dump(detailed_report, f, ensure_ascii=False, indent=2)
            report_files.append(str(detailed_file))

        return report_files

    def _assess_overall_quality(self, validation_results: Dict[str, Any], step_config: Dict[str, Any]) -> bool:
        """Decide whether the validation run as a whole passes.

        Fails when the data-integrity test failed or its pass rate is below
        ``min_success_rate``, or when the number of missing files exceeds 10%
        of the sample count. A failed duplicate check is tolerated here; it is
        only reported and flagged on the rows.
        """
        criteria = step_config.get("validation_criteria", {})
        min_success_rate = criteria.get("min_success_rate", 0.8)

        # Data integrity is mandatory.
        if "data_integrity" in validation_results:
            integrity_result = validation_results["data_integrity"]
            if not integrity_result["passed"] or integrity_result.get("pass_rate", 0.0) < min_success_rate:
                return False

        # Missing files are tolerated up to 10% of the samples.
        file_result = validation_results.get("file_references")
        if file_result is not None and not file_result["passed"]:
            missing_count = len(file_result.get("missing_files", []))
            # total_samples is an integer count; it falls back to 1 when the
            # integrity test did not run.
            total_samples = validation_results.get("data_integrity", {}).get("total_samples", 1)
            if missing_count > total_samples * 0.1:
                return False

        return True
|