maque 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maque/__init__.py +30 -0
- maque/__main__.py +926 -0
- maque/ai_platform/__init__.py +0 -0
- maque/ai_platform/crawl.py +45 -0
- maque/ai_platform/metrics.py +258 -0
- maque/ai_platform/nlp_preprocess.py +67 -0
- maque/ai_platform/webpage_screen_shot.py +195 -0
- maque/algorithms/__init__.py +78 -0
- maque/algorithms/bezier.py +15 -0
- maque/algorithms/bktree.py +117 -0
- maque/algorithms/core.py +104 -0
- maque/algorithms/hilbert.py +16 -0
- maque/algorithms/rate_function.py +92 -0
- maque/algorithms/transform.py +27 -0
- maque/algorithms/trie.py +272 -0
- maque/algorithms/utils.py +63 -0
- maque/algorithms/video.py +587 -0
- maque/api/__init__.py +1 -0
- maque/api/common.py +110 -0
- maque/api/fetch.py +26 -0
- maque/api/static/icon.png +0 -0
- maque/api/static/redoc.standalone.js +1782 -0
- maque/api/static/swagger-ui-bundle.js +3 -0
- maque/api/static/swagger-ui.css +3 -0
- maque/cli/__init__.py +1 -0
- maque/cli/clean_invisible_chars.py +324 -0
- maque/cli/core.py +34 -0
- maque/cli/groups/__init__.py +26 -0
- maque/cli/groups/config.py +205 -0
- maque/cli/groups/data.py +615 -0
- maque/cli/groups/doctor.py +259 -0
- maque/cli/groups/embedding.py +222 -0
- maque/cli/groups/git.py +29 -0
- maque/cli/groups/help.py +410 -0
- maque/cli/groups/llm.py +223 -0
- maque/cli/groups/mcp.py +241 -0
- maque/cli/groups/mllm.py +1795 -0
- maque/cli/groups/mllm_simple.py +60 -0
- maque/cli/groups/quant.py +210 -0
- maque/cli/groups/service.py +490 -0
- maque/cli/groups/system.py +570 -0
- maque/cli/mllm_run.py +1451 -0
- maque/cli/script.py +52 -0
- maque/cli/tree.py +49 -0
- maque/clustering/__init__.py +52 -0
- maque/clustering/analyzer.py +347 -0
- maque/clustering/clusterers.py +464 -0
- maque/clustering/sampler.py +134 -0
- maque/clustering/visualizer.py +205 -0
- maque/constant.py +13 -0
- maque/core.py +133 -0
- maque/cv/__init__.py +1 -0
- maque/cv/image.py +219 -0
- maque/cv/utils.py +68 -0
- maque/cv/video/__init__.py +3 -0
- maque/cv/video/keyframe_extractor.py +368 -0
- maque/embedding/__init__.py +43 -0
- maque/embedding/base.py +56 -0
- maque/embedding/multimodal.py +308 -0
- maque/embedding/server.py +523 -0
- maque/embedding/text.py +311 -0
- maque/git/__init__.py +24 -0
- maque/git/pure_git.py +912 -0
- maque/io/__init__.py +29 -0
- maque/io/core.py +38 -0
- maque/io/ops.py +194 -0
- maque/llm/__init__.py +111 -0
- maque/llm/backend.py +416 -0
- maque/llm/base.py +411 -0
- maque/llm/server.py +366 -0
- maque/mcp_server.py +1096 -0
- maque/mllm_data_processor_pipeline/__init__.py +17 -0
- maque/mllm_data_processor_pipeline/core.py +341 -0
- maque/mllm_data_processor_pipeline/example.py +291 -0
- maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
- maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
- maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
- maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
- maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
- maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
- maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
- maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
- maque/mllm_data_processor_pipeline/web_app.py +317 -0
- maque/nlp/__init__.py +14 -0
- maque/nlp/ngram.py +9 -0
- maque/nlp/parser.py +63 -0
- maque/nlp/risk_matcher.py +543 -0
- maque/nlp/sentence_splitter.py +202 -0
- maque/nlp/simple_tradition_cvt.py +31 -0
- maque/performance/__init__.py +21 -0
- maque/performance/_measure_time.py +70 -0
- maque/performance/_profiler.py +367 -0
- maque/performance/_stat_memory.py +51 -0
- maque/pipelines/__init__.py +15 -0
- maque/pipelines/clustering.py +252 -0
- maque/quantization/__init__.py +42 -0
- maque/quantization/auto_round.py +120 -0
- maque/quantization/base.py +145 -0
- maque/quantization/bitsandbytes.py +127 -0
- maque/quantization/llm_compressor.py +102 -0
- maque/retriever/__init__.py +35 -0
- maque/retriever/chroma.py +654 -0
- maque/retriever/document.py +140 -0
- maque/retriever/milvus.py +1140 -0
- maque/table_ops/__init__.py +1 -0
- maque/table_ops/core.py +133 -0
- maque/table_viewer/__init__.py +4 -0
- maque/table_viewer/download_assets.py +57 -0
- maque/table_viewer/server.py +698 -0
- maque/table_viewer/static/element-plus-icons.js +5791 -0
- maque/table_viewer/static/element-plus.css +1 -0
- maque/table_viewer/static/element-plus.js +65236 -0
- maque/table_viewer/static/main.css +268 -0
- maque/table_viewer/static/main.js +669 -0
- maque/table_viewer/static/vue.global.js +18227 -0
- maque/table_viewer/templates/index.html +401 -0
- maque/utils/__init__.py +56 -0
- maque/utils/color.py +68 -0
- maque/utils/color_string.py +45 -0
- maque/utils/compress.py +66 -0
- maque/utils/constant.py +183 -0
- maque/utils/core.py +261 -0
- maque/utils/cursor.py +143 -0
- maque/utils/distance.py +58 -0
- maque/utils/docker.py +96 -0
- maque/utils/downloads.py +51 -0
- maque/utils/excel_helper.py +542 -0
- maque/utils/helper_metrics.py +121 -0
- maque/utils/helper_parser.py +168 -0
- maque/utils/net.py +64 -0
- maque/utils/nvidia_stat.py +140 -0
- maque/utils/ops.py +53 -0
- maque/utils/packages.py +31 -0
- maque/utils/path.py +57 -0
- maque/utils/tar.py +260 -0
- maque/utils/untar.py +129 -0
- maque/web/__init__.py +0 -0
- maque/web/image_downloader.py +1410 -0
- maque-0.2.1.dist-info/METADATA +450 -0
- maque-0.2.1.dist-info/RECORD +143 -0
- maque-0.2.1.dist-info/WHEEL +4 -0
- maque-0.2.1.dist-info/entry_points.txt +3 -0
- maque-0.2.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,446 @@
|
|
|
1
|
+
"""
|
|
2
|
+
第5步:第二轮大模型精标润色
|
|
3
|
+
对第一轮标注结果进行精细化和润色
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import asyncio
|
|
8
|
+
from typing import List, Dict, Any, Optional
|
|
9
|
+
from ..core import PipelineStep, StepResult, PipelineConfig
|
|
10
|
+
from maque.performance import MeasureTime
|
|
11
|
+
from flexllm.mllm_client import MllmClient
|
|
12
|
+
from flexllm.async_api import ConcurrentExecutor
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class MllmRefinementStep(PipelineStep):
|
|
16
|
+
"""MLLM精标润色步骤"""
|
|
17
|
+
|
|
18
|
+
# JSON-Schema-style description of the options this step accepts.
# The methods of this class read the same keys from the step config and
# carry their own fallback values, which mostly mirror the defaults below.
CONFIG_SCHEMA = {
    "type": "object",
    "properties": {
        # Model/connection settings; read by _initialize_mllm_client.
        "mllm_config": {
            "type": "object",
            "properties": {
                "model_name": {"type": "string", "default": "gpt-4o"},
                "base_url": {"type": "string"},
                "api_key": {"type": "string"},
                "temperature": {"type": "number", "default": 0.3},
                "max_tokens": {"type": "integer", "default": 3072}
            },
            "required": ["model_name"]
        },
        # Prompt texts; read by _prepare_refinement_tasks.
        # NOTE(review): the fallback strings hard-coded in
        # _prepare_refinement_tasks are shorter than the defaults declared
        # here -- confirm which one is meant to be authoritative.
        "refinement_prompts": {
            "type": "object",
            "properties": {
                "system_prompt": {
                    "type": "string",
                    "default": "你是一个专业的数据质量专家,负责对已有的标注结果进行精细化和质量提升。"
                },
                # Placeholders filled with str.format: {text}, {summary},
                # {tags}, {sentiment}, {quality}.
                "user_prompt_template": {
                    "type": "string",
                    "default": "请对以下标注结果进行精细化处理:\n\n原始内容:\n文本:{text}\n\n第一轮标注结果:\n摘要:{summary}\n标签:{tags}\n情感:{sentiment}\n质量:{quality}\n\n请提供:\n1. 优化后的摘要\n2. 更准确的标签\n3. 精确的情感分析\n4. 详细的质量评估\n5. 改进建议"
                }
            }
        },
        # Row-selection rules; read by _select_refinement_candidates.
        "refinement_criteria": {
            "type": "object",
            "properties": {
                # NOTE(review): focus_areas is not read by any method
                # visible in this class.
                "focus_areas": {
                    "type": "array",
                    "items": {"type": "string"},
                    "default": ["accuracy", "completeness", "consistency", "clarity"],
                    "description": "重点改进的方面"
                },
                "quality_threshold": {
                    "type": "number",
                    "default": 0.7,
                    "description": "质量阈值,低于此值的数据会被优先精标"
                },
                "skip_high_quality": {
                    "type": "boolean",
                    "default": True,
                    "description": "是否跳过高质量数据的精标"
                }
            }
        },
        # Concurrency settings; _execute_concurrent_refinement reads
        # max_workers and rate_limit.
        # NOTE(review): batch_size is not read by any method visible in
        # this class.
        "concurrent_config": {
            "type": "object",
            "properties": {
                "max_workers": {"type": "integer", "default": 3},
                "batch_size": {"type": "integer", "default": 5},
                "rate_limit": {"type": "number", "default": 0.5}
            }
        },
        # Names of the DataFrame columns written by
        # _process_refinement_results.
        "output_columns": {
            "type": "object",
            "properties": {
                "refined_summary": {"type": "string", "default": "refined_summary"},
                "refined_tags": {"type": "string", "default": "refined_tags"},
                "refined_sentiment": {"type": "string", "default": "refined_sentiment"},
                "refined_quality": {"type": "string", "default": "refined_quality"},
                "improvement_suggestions": {"type": "string", "default": "improvement_suggestions"},
                "refinement_raw_response": {"type": "string", "default": "refinement_raw_response"}
            }
        },
        # Per-item retry settings; read by _refine_single_item.
        "retry_config": {
            "type": "object",
            "properties": {
                "max_retries": {"type": "integer", "default": 2},
                "retry_delay": {"type": "number", "default": 2.0}
            }
        }
    }
}
|
|
94
|
+
|
|
95
|
+
def __init__(self, name: str = "mllm_refinement", config: Optional[Dict[str, Any]] = None):
    """Create the refinement step.

    Args:
        name: Step name forwarded to the ``PipelineStep`` base class.
        config: Optional step configuration forwarded to the base class.
    """
    super().__init__(name, config)
    # Stays None until _initialize_mllm_client() builds the client.
    self.mllm_client: Optional[MllmClient] = None
|
|
98
|
+
|
|
99
|
+
async def execute(self, data: pd.DataFrame, config: PipelineConfig) -> StepResult:
    """Run the second-round (refinement) MLLM pass over ``data``.

    Works on a copy of ``data``: selects the rows to refine, builds one
    task per row, runs the tasks concurrently and writes the results back
    into the copy. Any exception is caught and reported as a failed
    ``StepResult`` that carries the original, unmodified ``data``.

    Args:
        data: Input frame from the previous pipeline step.
        config: Pipeline configuration from which the step config is read.

    Returns:
        A ``StepResult`` with the processed copy and summary metadata, or
        a "skipped" result when no row qualifies for refinement.
    """
    with MeasureTime(f"步骤 {self.name}"):
        try:
            step_config = self.get_step_config(config)
            working = data.copy()

            # The client must exist before any task is executed.
            await self._initialize_mllm_client(step_config)

            selected = await self._select_refinement_candidates(working, step_config)
            if not selected:
                self.logger.info("没有需要精标的数据")
                skipped_meta = {"skipped": True, "reason": "no_candidates"}
                return StepResult(
                    step_name=self.name,
                    success=True,
                    data=working,
                    metadata=skipped_meta
                )

            tasks = await self._prepare_refinement_tasks(working, selected, step_config)
            results = await self._execute_concurrent_refinement(tasks, step_config)
            await self._process_refinement_results(working, results, step_config)

            # Status columns are set on every row of the copy.
            working['__processing_status'] = 'refined'
            working['__refined_at'] = pd.Timestamp.now()

            ok_count = sum(1 for item in results if item.get('success', False))
            failed_count = len(results) - ok_count
            model_name = step_config.get("mllm_config", {}).get("model_name", "unknown")

            metadata = {
                "total_rows": len(working),
                "refinement_candidates": len(selected),
                "successful_refinements": ok_count,
                "failed_refinements": failed_count,
                "success_rate": ok_count / len(results) if results else 0,
                "mllm_model": model_name
            }

            self.logger.info(f"MLLM精标完成,成功率: {metadata['success_rate']:.2%}")

            return StepResult(
                step_name=self.name,
                success=True,
                data=working,
                metadata=metadata
            )

        except Exception as e:
            self.logger.error(f"MLLM精标失败: {e}")
            return StepResult(
                step_name=self.name,
                success=False,
                data=data,
                metadata={},
                error=str(e)
            )
|
|
169
|
+
|
|
170
|
+
async def _initialize_mllm_client(self, step_config: Dict[str, Any]):
    """Build ``self.mllm_client`` from the ``mllm_config`` section.

    Args:
        step_config: Step configuration; missing keys fall back to the
            values hard-coded below (``base_url`` / ``api_key`` to None).
    """
    mllm_config = step_config.get("mllm_config", {})

    client_kwargs = {
        "model_name": mllm_config.get("model_name", "gpt-4o"),
        "base_url": mllm_config.get("base_url"),
        "api_key": mllm_config.get("api_key"),
        "temperature": mllm_config.get("temperature", 0.3),
        "max_tokens": mllm_config.get("max_tokens", 3072),
    }
    self.mllm_client = MllmClient(**client_kwargs)

    self.logger.info(f"初始化精标MLLM客户端: {mllm_config.get('model_name', 'gpt-4o')}")
|
|
183
|
+
|
|
184
|
+
async def _select_refinement_candidates(self, data: pd.DataFrame, step_config: Dict[str, Any]) -> List[int]:
|
|
185
|
+
"""选择需要精标的数据"""
|
|
186
|
+
criteria = step_config.get("refinement_criteria", {})
|
|
187
|
+
quality_threshold = criteria.get("quality_threshold", 0.7)
|
|
188
|
+
skip_high_quality = criteria.get("skip_high_quality", True)
|
|
189
|
+
|
|
190
|
+
candidates = []
|
|
191
|
+
|
|
192
|
+
# 检查是否有第一轮标注结果
|
|
193
|
+
if '__mllm_annotation_success' not in data.columns:
|
|
194
|
+
self.logger.warning("未找到第一轮标注结果,将对所有数据进行精标")
|
|
195
|
+
return list(data.index)
|
|
196
|
+
|
|
197
|
+
for idx, row in data.iterrows():
|
|
198
|
+
# 只对第一轮标注成功的数据进行精标
|
|
199
|
+
if not row.get('__mllm_annotation_success', False):
|
|
200
|
+
continue
|
|
201
|
+
|
|
202
|
+
should_refine = True
|
|
203
|
+
|
|
204
|
+
if skip_high_quality:
|
|
205
|
+
# 评估质量,决定是否需要精标
|
|
206
|
+
quality_score = self._assess_annotation_quality(row)
|
|
207
|
+
if quality_score >= quality_threshold:
|
|
208
|
+
should_refine = False
|
|
209
|
+
|
|
210
|
+
if should_refine:
|
|
211
|
+
candidates.append(idx)
|
|
212
|
+
|
|
213
|
+
self.logger.info(f"选择了 {len(candidates)} 条数据进行精标")
|
|
214
|
+
return candidates
|
|
215
|
+
|
|
216
|
+
def _assess_annotation_quality(self, row: pd.Series) -> float:
|
|
217
|
+
"""评估标注质量"""
|
|
218
|
+
quality_score = 0.0
|
|
219
|
+
factors = 0
|
|
220
|
+
|
|
221
|
+
# 检查摘要质量
|
|
222
|
+
summary = str(row.get('mllm_summary', ''))
|
|
223
|
+
if summary and len(summary.strip()) >= 20:
|
|
224
|
+
quality_score += 0.25
|
|
225
|
+
factors += 1
|
|
226
|
+
|
|
227
|
+
# 检查标签质量
|
|
228
|
+
tags = str(row.get('mllm_tags', ''))
|
|
229
|
+
if tags and len(tags.strip()) >= 5:
|
|
230
|
+
quality_score += 0.25
|
|
231
|
+
factors += 1
|
|
232
|
+
|
|
233
|
+
# 检查情感分析
|
|
234
|
+
sentiment = str(row.get('mllm_sentiment', ''))
|
|
235
|
+
if sentiment and sentiment.strip():
|
|
236
|
+
quality_score += 0.25
|
|
237
|
+
factors += 1
|
|
238
|
+
|
|
239
|
+
# 检查质量评估
|
|
240
|
+
quality = str(row.get('mllm_quality', ''))
|
|
241
|
+
if quality and quality.strip():
|
|
242
|
+
quality_score += 0.25
|
|
243
|
+
factors += 1
|
|
244
|
+
|
|
245
|
+
return quality_score
|
|
246
|
+
|
|
247
|
+
async def _prepare_refinement_tasks(self, data: pd.DataFrame, candidates: List[int], step_config: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
248
|
+
"""准备精标任务"""
|
|
249
|
+
tasks = []
|
|
250
|
+
|
|
251
|
+
prompts = step_config.get("refinement_prompts", {})
|
|
252
|
+
system_prompt = prompts.get("system_prompt", "你是一个专业的数据质量专家。")
|
|
253
|
+
user_prompt_template = prompts.get("user_prompt_template", "请对以下标注结果进行精细化处理:\n\n原始内容:\n文本:{text}\n\n第一轮标注结果:\n摘要:{summary}\n标签:{tags}\n情感:{sentiment}\n质量:{quality}")
|
|
254
|
+
|
|
255
|
+
for idx in candidates:
|
|
256
|
+
row = data.loc[idx]
|
|
257
|
+
|
|
258
|
+
text_content = str(row.get('text', ''))
|
|
259
|
+
current_summary = str(row.get('mllm_summary', ''))
|
|
260
|
+
current_tags = str(row.get('mllm_tags', ''))
|
|
261
|
+
current_sentiment = str(row.get('mllm_sentiment', ''))
|
|
262
|
+
current_quality = str(row.get('mllm_quality', ''))
|
|
263
|
+
|
|
264
|
+
# 构建用户提示
|
|
265
|
+
user_prompt = user_prompt_template.format(
|
|
266
|
+
text=text_content,
|
|
267
|
+
summary=current_summary,
|
|
268
|
+
tags=current_tags,
|
|
269
|
+
sentiment=current_sentiment,
|
|
270
|
+
quality=current_quality
|
|
271
|
+
)
|
|
272
|
+
|
|
273
|
+
# 准备图像路径(如果有)
|
|
274
|
+
image_list = []
|
|
275
|
+
images = str(row.get('images', ''))
|
|
276
|
+
if images:
|
|
277
|
+
separator = "|" # 使用固定分隔符
|
|
278
|
+
image_list = [p.strip() for p in images.split(separator) if p.strip()]
|
|
279
|
+
|
|
280
|
+
task = {
|
|
281
|
+
"row_index": idx,
|
|
282
|
+
"text": text_content,
|
|
283
|
+
"images": image_list,
|
|
284
|
+
"system_prompt": system_prompt,
|
|
285
|
+
"user_prompt": user_prompt,
|
|
286
|
+
"current_annotations": {
|
|
287
|
+
"summary": current_summary,
|
|
288
|
+
"tags": current_tags,
|
|
289
|
+
"sentiment": current_sentiment,
|
|
290
|
+
"quality": current_quality
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
tasks.append(task)
|
|
295
|
+
|
|
296
|
+
self.logger.info(f"准备了 {len(tasks)} 个精标任务")
|
|
297
|
+
return tasks
|
|
298
|
+
|
|
299
|
+
async def _execute_concurrent_refinement(self, tasks: List[Dict[str, Any]], step_config: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Run every refinement task through a ``ConcurrentExecutor``.

    Args:
        tasks: Task dicts produced by ``_prepare_refinement_tasks``.
        step_config: Step configuration; ``concurrent_config.max_workers``
            and ``concurrent_config.rate_limit`` are forwarded to the
            executor.

    Returns:
        Whatever ``ConcurrentExecutor.execute_all`` returns for the
        submitted coroutines.
    """
    settings = step_config.get("concurrent_config", {})

    # The executor is created before the coroutines so that a constructor
    # failure does not leave un-awaited coroutine objects behind.
    executor = ConcurrentExecutor(
        max_concurrent=settings.get("max_workers", 3),
        rate_limit=settings.get("rate_limit", 0.5)
    )

    pending = [self._refine_single_item(task, step_config) for task in tasks]

    return await executor.execute_all(pending)
|
|
321
|
+
|
|
322
|
+
async def _refine_single_item(self, task: Dict[str, Any], step_config: Dict[str, Any]) -> Dict[str, Any]:
|
|
323
|
+
"""精标单个数据项"""
|
|
324
|
+
retry_config = step_config.get("retry_config", {})
|
|
325
|
+
max_retries = retry_config.get("max_retries", 2)
|
|
326
|
+
retry_delay = retry_config.get("retry_delay", 2.0)
|
|
327
|
+
|
|
328
|
+
for attempt in range(max_retries + 1):
|
|
329
|
+
try:
|
|
330
|
+
# 调用MLLM
|
|
331
|
+
response = await self.mllm_client.chat_async(
|
|
332
|
+
messages=[
|
|
333
|
+
{"role": "system", "content": task["system_prompt"]},
|
|
334
|
+
{"role": "user", "content": task["user_prompt"]}
|
|
335
|
+
],
|
|
336
|
+
images=task["images"]
|
|
337
|
+
)
|
|
338
|
+
|
|
339
|
+
# 解析响应
|
|
340
|
+
parsed_result = self._parse_refinement_response(response)
|
|
341
|
+
|
|
342
|
+
return {
|
|
343
|
+
"row_index": task["row_index"],
|
|
344
|
+
"success": True,
|
|
345
|
+
"raw_response": response,
|
|
346
|
+
**parsed_result
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
except Exception as e:
|
|
350
|
+
self.logger.warning(f"第 {attempt + 1} 次精标失败 (行 {task['row_index']}): {e}")
|
|
351
|
+
|
|
352
|
+
if attempt < max_retries:
|
|
353
|
+
await asyncio.sleep(retry_delay * (attempt + 1))
|
|
354
|
+
else:
|
|
355
|
+
return {
|
|
356
|
+
"row_index": task["row_index"],
|
|
357
|
+
"success": False,
|
|
358
|
+
"error": str(e),
|
|
359
|
+
"raw_response": "",
|
|
360
|
+
"refined_summary": "",
|
|
361
|
+
"refined_tags": "",
|
|
362
|
+
"refined_sentiment": "",
|
|
363
|
+
"refined_quality": "",
|
|
364
|
+
"improvement_suggestions": ""
|
|
365
|
+
}
|
|
366
|
+
|
|
367
|
+
def _parse_refinement_response(self, response: str) -> Dict[str, str]:
|
|
368
|
+
"""解析精标响应"""
|
|
369
|
+
result = {
|
|
370
|
+
"refined_summary": "",
|
|
371
|
+
"refined_tags": "",
|
|
372
|
+
"refined_sentiment": "",
|
|
373
|
+
"refined_quality": "",
|
|
374
|
+
"improvement_suggestions": ""
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
try:
|
|
378
|
+
lines = response.strip().split('\n')
|
|
379
|
+
current_section = None
|
|
380
|
+
|
|
381
|
+
for line in lines:
|
|
382
|
+
line = line.strip()
|
|
383
|
+
if not line:
|
|
384
|
+
continue
|
|
385
|
+
|
|
386
|
+
# 检查是否是新的部分标题
|
|
387
|
+
if "优化后的摘要" in line or "refined summary" in line.lower():
|
|
388
|
+
current_section = "refined_summary"
|
|
389
|
+
elif "更准确的标签" in line or "refined tag" in line.lower():
|
|
390
|
+
current_section = "refined_tags"
|
|
391
|
+
elif "精确的情感" in line or "refined sentiment" in line.lower():
|
|
392
|
+
current_section = "refined_sentiment"
|
|
393
|
+
elif "详细的质量" in line or "refined quality" in line.lower():
|
|
394
|
+
current_section = "refined_quality"
|
|
395
|
+
elif "改进建议" in line or "improvement" in line.lower():
|
|
396
|
+
current_section = "improvement_suggestions"
|
|
397
|
+
elif current_section and line:
|
|
398
|
+
# 移除常见的前缀
|
|
399
|
+
content = line.replace(":", "").replace(":", "").strip()
|
|
400
|
+
if content and not content.startswith(("1.", "2.", "3.", "4.", "5.")):
|
|
401
|
+
if result[current_section]:
|
|
402
|
+
result[current_section] += " " + content
|
|
403
|
+
else:
|
|
404
|
+
result[current_section] = content
|
|
405
|
+
|
|
406
|
+
# 如果解析失败,将整个响应作为改进建议
|
|
407
|
+
if not any(result.values()):
|
|
408
|
+
result["improvement_suggestions"] = response[:800]
|
|
409
|
+
|
|
410
|
+
except Exception as e:
|
|
411
|
+
self.logger.warning(f"解析精标响应失败: {e}")
|
|
412
|
+
result["improvement_suggestions"] = response[:800] if response else ""
|
|
413
|
+
|
|
414
|
+
return result
|
|
415
|
+
|
|
416
|
+
async def _process_refinement_results(self, data: pd.DataFrame, results: List[Dict[str, Any]], step_config: Dict[str, Any]):
|
|
417
|
+
"""处理精标结果"""
|
|
418
|
+
output_columns = step_config.get("output_columns", {})
|
|
419
|
+
|
|
420
|
+
refined_summary_col = output_columns.get("refined_summary", "refined_summary")
|
|
421
|
+
refined_tags_col = output_columns.get("refined_tags", "refined_tags")
|
|
422
|
+
refined_sentiment_col = output_columns.get("refined_sentiment", "refined_sentiment")
|
|
423
|
+
refined_quality_col = output_columns.get("refined_quality", "refined_quality")
|
|
424
|
+
improvement_suggestions_col = output_columns.get("improvement_suggestions", "improvement_suggestions")
|
|
425
|
+
refinement_raw_response_col = output_columns.get("refinement_raw_response", "refinement_raw_response")
|
|
426
|
+
|
|
427
|
+
# 初始化新列
|
|
428
|
+
data[refined_summary_col] = ""
|
|
429
|
+
data[refined_tags_col] = ""
|
|
430
|
+
data[refined_sentiment_col] = ""
|
|
431
|
+
data[refined_quality_col] = ""
|
|
432
|
+
data[improvement_suggestions_col] = ""
|
|
433
|
+
data[refinement_raw_response_col] = ""
|
|
434
|
+
data['__mllm_refinement_success'] = False
|
|
435
|
+
|
|
436
|
+
# 填充结果
|
|
437
|
+
for result in results:
|
|
438
|
+
row_idx = result["row_index"]
|
|
439
|
+
|
|
440
|
+
data.at[row_idx, refined_summary_col] = result.get("refined_summary", "")
|
|
441
|
+
data.at[row_idx, refined_tags_col] = result.get("refined_tags", "")
|
|
442
|
+
data.at[row_idx, refined_sentiment_col] = result.get("refined_sentiment", "")
|
|
443
|
+
data.at[row_idx, refined_quality_col] = result.get("refined_quality", "")
|
|
444
|
+
data.at[row_idx, improvement_suggestions_col] = result.get("improvement_suggestions", "")
|
|
445
|
+
data.at[row_idx, refinement_raw_response_col] = result.get("raw_response", "")
|
|
446
|
+
data.at[row_idx, '__mllm_refinement_success'] = result.get("success", False)
|