maque 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maque/__init__.py +30 -0
- maque/__main__.py +926 -0
- maque/ai_platform/__init__.py +0 -0
- maque/ai_platform/crawl.py +45 -0
- maque/ai_platform/metrics.py +258 -0
- maque/ai_platform/nlp_preprocess.py +67 -0
- maque/ai_platform/webpage_screen_shot.py +195 -0
- maque/algorithms/__init__.py +78 -0
- maque/algorithms/bezier.py +15 -0
- maque/algorithms/bktree.py +117 -0
- maque/algorithms/core.py +104 -0
- maque/algorithms/hilbert.py +16 -0
- maque/algorithms/rate_function.py +92 -0
- maque/algorithms/transform.py +27 -0
- maque/algorithms/trie.py +272 -0
- maque/algorithms/utils.py +63 -0
- maque/algorithms/video.py +587 -0
- maque/api/__init__.py +1 -0
- maque/api/common.py +110 -0
- maque/api/fetch.py +26 -0
- maque/api/static/icon.png +0 -0
- maque/api/static/redoc.standalone.js +1782 -0
- maque/api/static/swagger-ui-bundle.js +3 -0
- maque/api/static/swagger-ui.css +3 -0
- maque/cli/__init__.py +1 -0
- maque/cli/clean_invisible_chars.py +324 -0
- maque/cli/core.py +34 -0
- maque/cli/groups/__init__.py +26 -0
- maque/cli/groups/config.py +205 -0
- maque/cli/groups/data.py +615 -0
- maque/cli/groups/doctor.py +259 -0
- maque/cli/groups/embedding.py +222 -0
- maque/cli/groups/git.py +29 -0
- maque/cli/groups/help.py +410 -0
- maque/cli/groups/llm.py +223 -0
- maque/cli/groups/mcp.py +241 -0
- maque/cli/groups/mllm.py +1795 -0
- maque/cli/groups/mllm_simple.py +60 -0
- maque/cli/groups/quant.py +210 -0
- maque/cli/groups/service.py +490 -0
- maque/cli/groups/system.py +570 -0
- maque/cli/mllm_run.py +1451 -0
- maque/cli/script.py +52 -0
- maque/cli/tree.py +49 -0
- maque/clustering/__init__.py +52 -0
- maque/clustering/analyzer.py +347 -0
- maque/clustering/clusterers.py +464 -0
- maque/clustering/sampler.py +134 -0
- maque/clustering/visualizer.py +205 -0
- maque/constant.py +13 -0
- maque/core.py +133 -0
- maque/cv/__init__.py +1 -0
- maque/cv/image.py +219 -0
- maque/cv/utils.py +68 -0
- maque/cv/video/__init__.py +3 -0
- maque/cv/video/keyframe_extractor.py +368 -0
- maque/embedding/__init__.py +43 -0
- maque/embedding/base.py +56 -0
- maque/embedding/multimodal.py +308 -0
- maque/embedding/server.py +523 -0
- maque/embedding/text.py +311 -0
- maque/git/__init__.py +24 -0
- maque/git/pure_git.py +912 -0
- maque/io/__init__.py +29 -0
- maque/io/core.py +38 -0
- maque/io/ops.py +194 -0
- maque/llm/__init__.py +111 -0
- maque/llm/backend.py +416 -0
- maque/llm/base.py +411 -0
- maque/llm/server.py +366 -0
- maque/mcp_server.py +1096 -0
- maque/mllm_data_processor_pipeline/__init__.py +17 -0
- maque/mllm_data_processor_pipeline/core.py +341 -0
- maque/mllm_data_processor_pipeline/example.py +291 -0
- maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
- maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
- maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
- maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
- maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
- maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
- maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
- maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
- maque/mllm_data_processor_pipeline/web_app.py +317 -0
- maque/nlp/__init__.py +14 -0
- maque/nlp/ngram.py +9 -0
- maque/nlp/parser.py +63 -0
- maque/nlp/risk_matcher.py +543 -0
- maque/nlp/sentence_splitter.py +202 -0
- maque/nlp/simple_tradition_cvt.py +31 -0
- maque/performance/__init__.py +21 -0
- maque/performance/_measure_time.py +70 -0
- maque/performance/_profiler.py +367 -0
- maque/performance/_stat_memory.py +51 -0
- maque/pipelines/__init__.py +15 -0
- maque/pipelines/clustering.py +252 -0
- maque/quantization/__init__.py +42 -0
- maque/quantization/auto_round.py +120 -0
- maque/quantization/base.py +145 -0
- maque/quantization/bitsandbytes.py +127 -0
- maque/quantization/llm_compressor.py +102 -0
- maque/retriever/__init__.py +35 -0
- maque/retriever/chroma.py +654 -0
- maque/retriever/document.py +140 -0
- maque/retriever/milvus.py +1140 -0
- maque/table_ops/__init__.py +1 -0
- maque/table_ops/core.py +133 -0
- maque/table_viewer/__init__.py +4 -0
- maque/table_viewer/download_assets.py +57 -0
- maque/table_viewer/server.py +698 -0
- maque/table_viewer/static/element-plus-icons.js +5791 -0
- maque/table_viewer/static/element-plus.css +1 -0
- maque/table_viewer/static/element-plus.js +65236 -0
- maque/table_viewer/static/main.css +268 -0
- maque/table_viewer/static/main.js +669 -0
- maque/table_viewer/static/vue.global.js +18227 -0
- maque/table_viewer/templates/index.html +401 -0
- maque/utils/__init__.py +56 -0
- maque/utils/color.py +68 -0
- maque/utils/color_string.py +45 -0
- maque/utils/compress.py +66 -0
- maque/utils/constant.py +183 -0
- maque/utils/core.py +261 -0
- maque/utils/cursor.py +143 -0
- maque/utils/distance.py +58 -0
- maque/utils/docker.py +96 -0
- maque/utils/downloads.py +51 -0
- maque/utils/excel_helper.py +542 -0
- maque/utils/helper_metrics.py +121 -0
- maque/utils/helper_parser.py +168 -0
- maque/utils/net.py +64 -0
- maque/utils/nvidia_stat.py +140 -0
- maque/utils/ops.py +53 -0
- maque/utils/packages.py +31 -0
- maque/utils/path.py +57 -0
- maque/utils/tar.py +260 -0
- maque/utils/untar.py +129 -0
- maque/web/__init__.py +0 -0
- maque/web/image_downloader.py +1410 -0
- maque-0.2.1.dist-info/METADATA +450 -0
- maque-0.2.1.dist-info/RECORD +143 -0
- maque-0.2.1.dist-info/WHEEL +4 -0
- maque-0.2.1.dist-info/entry_points.txt +3 -0
- maque-0.2.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""
|
|
2
|
+
多模态大模型训练数据处理pipeline
|
|
3
|
+
|
|
4
|
+
一个灵活的、模块化的数据处理流水线,支持Web界面交互和断点续传。
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .core import DataProcessorPipeline, PipelineStep, PipelineConfig
|
|
8
|
+
from .steps import *
|
|
9
|
+
from .web_app import WebApp
|
|
10
|
+
|
|
11
|
+
# Version string of this sub-package (separate from the wheel version).
__version__ = "0.1.0"
# Names exported by `from maque.mllm_data_processor_pipeline import *`.
__all__ = [
    "DataProcessorPipeline",
    "PipelineStep",
    "PipelineConfig",
    "WebApp"
]
|
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pipeline核心架构实现
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import asyncio
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from dataclasses import dataclass, asdict
|
|
9
|
+
from typing import Dict, List, Any, Optional, Union, Callable
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
import pandas as pd
|
|
12
|
+
from datetime import datetime
|
|
13
|
+
|
|
14
|
+
from loguru import logger
|
|
15
|
+
from maque.performance import MeasureTime
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class PipelineConfig:
    """Configuration shared by the pipeline and its steps.

    Container-typed fields default to ``None`` and are replaced by a fresh
    empty list/dict in ``__post_init__``, so instances never share a mutable
    default and an explicit ``None`` is normalised the same way.
    """

    # Input settings
    input_file: str = ""
    input_type: str = "csv"  # csv, excel
    sheet_name: Optional[str] = None

    # Column mapping settings
    image_columns: List[str] = None  # image column names
    harmful_image_columns: List[str] = None  # harmful-image columns
    harmful_text_columns: List[str] = None  # harmful-text columns
    text_columns: List[str] = None  # text columns

    # Output settings
    output_dir: str = "./output"
    checkpoint_dir: str = "./checkpoints"

    # Per-step settings, keyed by step name
    steps_config: Dict[str, Dict[str, Any]] = None

    # MLLM settings
    mllm_config: Dict[str, Any] = None

    def __post_init__(self):
        """Replace every ``None`` container field with an empty container."""
        list_fields = (
            "image_columns",
            "harmful_image_columns",
            "harmful_text_columns",
            "text_columns",
        )
        dict_fields = ("steps_config", "mllm_config")

        for field_name in list_fields:
            if getattr(self, field_name) is None:
                setattr(self, field_name, [])
        for field_name in dict_fields:
            if getattr(self, field_name) is None:
                setattr(self, field_name, {})

    def to_dict(self) -> Dict[str, Any]:
        """Return the configuration as a plain dictionary."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "PipelineConfig":
        """Build a configuration from a mapping of field names to values."""
        return cls(**data)

    def save(self, filepath: str):
        """Write the configuration to *filepath* as UTF-8 JSON."""
        serialized = json.dumps(self.to_dict(), ensure_ascii=False, indent=2)
        with open(filepath, "w", encoding="utf-8") as handle:
            handle.write(serialized)

    @classmethod
    def load(cls, filepath: str) -> "PipelineConfig":
        """Read a configuration previously written by ``save``."""
        with open(filepath, "r", encoding="utf-8") as handle:
            raw = json.loads(handle.read())
        return cls.from_dict(raw)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@dataclass
class StepResult:
    """Outcome of one pipeline step, with checkpoint save/load support.

    A checkpoint consists of ``<step_name>_metadata.json`` and, when the
    result carries a table, ``<step_name>_data.csv`` in the checkpoint
    directory.
    """

    step_name: str
    success: bool
    data: pd.DataFrame
    metadata: Dict[str, Any]
    error: Optional[str] = None
    execution_time: Optional[float] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-friendly summary of the result.

        The table itself is not included, only its shape. ``timestamp`` is
        the time of this call, not the time the step ran.
        """
        result = {
            "step_name": self.step_name,
            "success": self.success,
            "metadata": self.metadata,
            "error": self.error,
            "execution_time": self.execution_time,
            "data_shape": self.data.shape if self.data is not None else None,
            "timestamp": datetime.now().isoformat(),
        }
        return result

    def save_checkpoint(self, checkpoint_dir: str):
        """Write the table (if any) and the metadata into *checkpoint_dir*.

        The directory is created when missing. A result without a table
        writes only the metadata file; its ``data_shape`` is then ``null``.
        """
        checkpoint_path = Path(checkpoint_dir)
        checkpoint_path.mkdir(parents=True, exist_ok=True)

        # Table
        data_file = checkpoint_path / f"{self.step_name}_data.csv"
        if self.data is not None:
            self.data.to_csv(data_file, index=False, encoding="utf-8")

        # Metadata
        meta_file = checkpoint_path / f"{self.step_name}_metadata.json"
        with open(meta_file, "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, ensure_ascii=False, indent=2)

    @classmethod
    def load_checkpoint(
        cls, step_name: str, checkpoint_dir: str
    ) -> Optional["StepResult"]:
        """Load the checkpoint of *step_name*, or return ``None`` if absent.

        Returns ``None`` when the metadata file is missing, or when the
        metadata says a table was saved but the CSV file is missing. A
        checkpoint saved without a table (``data_shape`` is ``null``) is
        restored with ``data=None`` instead of being treated as missing.
        """
        checkpoint_path = Path(checkpoint_dir)
        data_file = checkpoint_path / f"{step_name}_data.csv"
        meta_file = checkpoint_path / f"{step_name}_metadata.json"

        if not meta_file.exists():
            return None

        with open(meta_file, "r", encoding="utf-8") as f:
            metadata_info = json.load(f)

        # Metadata without a "data_shape" key is treated as "table expected",
        # which matches the previous behaviour of always requiring the CSV.
        table_expected = (
            "data_shape" not in metadata_info
            or metadata_info["data_shape"] is not None
        )

        if table_expected:
            if not data_file.exists():
                return None
            try:
                data = pd.read_csv(data_file, encoding="utf-8")
            except pd.errors.EmptyDataError:
                # A frame without columns is written as a header-less file.
                data = pd.DataFrame()
        else:
            # Ignore any CSV left over from an earlier run of the same step.
            data = None

        return cls(
            step_name=metadata_info["step_name"],
            success=metadata_info["success"],
            data=data,
            metadata=metadata_info["metadata"],
            error=metadata_info.get("error"),
            execution_time=metadata_info.get("execution_time"),
        )
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class PipelineStep(ABC):
    """Abstract base class for a single pipeline step."""

    def __init__(self, name: str, config: Dict[str, Any] = None):
        """Remember the step name, its own config and the shared logger."""
        self.name = name
        self.config = config if config else {}
        self.logger = logger

    @abstractmethod
    async def execute(self, data: pd.DataFrame, config: PipelineConfig) -> StepResult:
        """Run the step on *data*; subclasses must implement this."""
        pass

    def validate_input(self, data: pd.DataFrame, config: PipelineConfig) -> bool:
        """Return True when *data* is present and has at least one element."""
        if data is None:
            return False
        return not data.empty

    def get_step_config(self, config: PipelineConfig) -> Dict[str, Any]:
        """Return this step's entry in ``config.steps_config`` (``{}`` if none)."""
        per_step_settings = config.steps_config
        return per_step_settings.get(self.name, {})
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class DataProcessorPipeline:
    """Runs an ordered list of steps over one input table.

    The input file named in the configuration is loaded, passed through each
    registered step in turn, and a checkpoint is written after every step.
    A run can resume from a previously checkpointed step.
    """

    def __init__(self, config: PipelineConfig):
        """Store *config* and create the output and checkpoint directories."""
        self.config = config
        self.steps: List[PipelineStep] = []
        self.results: List[StepResult] = []
        self.logger = logger
        self.status_callback: Optional[Callable] = None

        # Create both directories up front.
        Path(config.output_dir).mkdir(parents=True, exist_ok=True)
        Path(config.checkpoint_dir).mkdir(parents=True, exist_ok=True)

    def add_step(self, step: PipelineStep) -> "DataProcessorPipeline":
        """Append *step* and return the pipeline so calls can be chained."""
        self.steps.append(step)
        return self

    def set_status_callback(self, callback: Callable[[str, Dict[str, Any]], None]):
        """Register a ``callback(status, data)`` invoked on status changes."""
        self.status_callback = callback

    def _notify_status(self, status: str, data: Dict[str, Any] = None):
        """Call the status callback, if one is set, with *status* and *data*."""
        if self.status_callback:
            self.status_callback(status, data or {})

    async def load_data(self) -> pd.DataFrame:
        """Load the input table described by the configuration.

        Raises:
            ValueError: if ``input_type`` is neither ``"csv"`` nor ``"excel"``.
        """
        self._notify_status("loading_data", {"file": self.config.input_file})

        if self.config.input_type == "csv":
            data = pd.read_csv(self.config.input_file, encoding="utf-8")
        elif self.config.input_type == "excel":
            # With image columns configured, use maque's excel_helper.
            if self.config.image_columns:
                from maque.utils.excel_helper import extract_excel_with_images

                data = extract_excel_with_images(
                    excel_path=self.config.input_file,
                    image_column_names=self.config.image_columns,
                    sheet_name=self.config.sheet_name,
                    image_output_dir=Path(self.config.output_dir) / "images",
                )
            else:
                # pandas treats sheet_name=None as "read every sheet" and
                # returns a dict of frames. The config default None means
                # "default sheet", so map it to the first sheet (index 0).
                sheet = self.config.sheet_name
                if sheet is None:
                    sheet = 0
                data = pd.read_excel(self.config.input_file, sheet_name=sheet)
        else:
            raise ValueError(f"不支持的输入类型: {self.config.input_type}")

        self.logger.info(f"加载数据完成,共{len(data)}行")
        return data

    def _restore_checkpoint(self, resume_from: str, input_data: pd.DataFrame) -> tuple:
        """Work out where to restart and with which table.

        Returns ``(data, start_index)``. When the step name is unknown, no
        checkpoint exists, or a failed checkpoint carries no table, the run
        starts at step 0 with *input_data*.
        """
        step_idx = next(
            (i for i, step in enumerate(self.steps) if step.name == resume_from),
            None,
        )
        if step_idx is None:
            # Previously an unknown name silently skipped the first step.
            self.logger.warning(
                f"未在Pipeline中找到步骤 {resume_from},将从头开始执行"
            )
            return input_data, 0

        checkpoint_result = StepResult.load_checkpoint(
            resume_from, self.config.checkpoint_dir
        )
        if not checkpoint_result:
            self.logger.warning(f"未找到步骤 {resume_from} 的检查点,将从头开始执行")
            return input_data, 0

        if not checkpoint_result.success:
            # run() stores the failed step's *input* table in its checkpoint,
            # so the failed step is run again rather than skipped.
            # NOTE(review): assumes steps that return success=False
            # themselves also keep their input table - confirm.
            if checkpoint_result.data is None:
                self.logger.warning(
                    f"检查点 {resume_from} 没有可用数据,将从头开始执行"
                )
                return input_data, 0
            self.logger.info(f"检查点 {resume_from} 记录为失败,将重新执行该步骤")
            return checkpoint_result.data, step_idx

        self.results.append(checkpoint_result)
        self.logger.info(f"从检查点恢复: {resume_from}")
        return checkpoint_result.data, step_idx + 1

    async def run(self, resume_from: Optional[str] = None) -> List[StepResult]:
        """Execute the steps in order and return the accumulated results.

        Args:
            resume_from: name of a checkpointed step. A successful checkpoint
                restarts after that step; a failed one re-runs that step.

        Raises:
            Exception: any error from a step is re-raised after a failed
                result has been recorded and checkpointed.
        """
        with MeasureTime("Pipeline执行"):
            self._notify_status("starting", {"total_steps": len(self.steps)})

            # The input is always loaded first, as before.
            current_data = await self.load_data()

            start_idx = 0
            if resume_from:
                current_data, start_idx = self._restore_checkpoint(
                    resume_from, current_data
                )

            for i, step in enumerate(self.steps[start_idx:], start_idx):
                self._notify_status(
                    "executing_step",
                    {
                        "step_name": step.name,
                        "step_index": i,
                        "total_steps": len(self.steps),
                    },
                )

                try:
                    self.logger.info(f"执行步骤 {i + 1}/{len(self.steps)}: {step.name}")

                    if not step.validate_input(current_data, self.config):
                        raise ValueError(f"步骤 {step.name} 输入验证失败")

                    # NOTE(review): a result with success=False does not stop
                    # the run; only raised exceptions do - confirm intended.
                    result = await step.execute(current_data, self.config)

                    result.save_checkpoint(self.config.checkpoint_dir)

                    # The output of this step is the input of the next one.
                    current_data = result.data
                    self.results.append(result)

                    self.logger.info(f"步骤 {step.name} 执行完成")

                except Exception as e:
                    error_msg = f"步骤 {step.name} 执行失败: {str(e)}"
                    self.logger.error(error_msg)

                    # Record the failure together with the step's input table.
                    failed_result = StepResult(
                        step_name=step.name,
                        success=False,
                        data=current_data,
                        metadata={},
                        error=error_msg,
                    )
                    failed_result.save_checkpoint(self.config.checkpoint_dir)
                    self.results.append(failed_result)

                    self._notify_status(
                        "step_failed", {"step_name": step.name, "error": error_msg}
                    )

                    raise

            self._notify_status("completed", {"total_results": len(self.results)})
            self.logger.info("Pipeline执行完成")

            return self.results

    def get_status(self) -> Dict[str, Any]:
        """Return step counts, step configs and per-result summaries."""
        return {
            "total_steps": len(self.steps),
            "completed_steps": len([r for r in self.results if r.success]),
            "failed_steps": len([r for r in self.results if not r.success]),
            "current_step": len(self.results),
            "steps": [
                {"name": step.name, "config": step.config} for step in self.steps
            ],
            "results": [result.to_dict() for result in self.results],
        }

    def save_final_results(self):
        """Write ``final_output.csv`` and ``execution_summary.json``.

        Does nothing when no result exists. The CSV is written only when the
        last result succeeded and has a table; the summary is always written.
        """
        if not self.results:
            return

        final_result = self.results[-1]
        if final_result.success and final_result.data is not None:
            output_file = Path(self.config.output_dir) / "final_output.csv"
            final_result.data.to_csv(output_file, index=False, encoding="utf-8")
            self.logger.info(f"最终结果已保存至: {output_file}")

        # Execution summary
        summary = {
            "config": self.config.to_dict(),
            "execution_summary": self.get_status(),
            "timestamp": datetime.now().isoformat(),
        }

        summary_file = Path(self.config.output_dir) / "execution_summary.json"
        with open(summary_file, "w", encoding="utf-8") as f:
            json.dump(summary, f, ensure_ascii=False, indent=2)
|
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
MLLM Data Processor Pipeline 使用示例
|
|
4
|
+
|
|
5
|
+
这个示例展示了如何使用Pipeline进行多模态大模型训练数据处理。
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
import sys
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
# 添加项目根目录到Python路径
|
|
13
|
+
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
|
14
|
+
|
|
15
|
+
from maque.mllm_data_processor_pipeline import (
|
|
16
|
+
DataProcessorPipeline,
|
|
17
|
+
PipelineConfig,
|
|
18
|
+
WebApp
|
|
19
|
+
)
|
|
20
|
+
from maque.mllm_data_processor_pipeline.steps import *
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
async def example_basic_usage():
    """Basic example: configure and run the full seven-step pipeline."""
    print("=== 基本使用示例 ===")

    # 1. Build the configuration piece by piece.
    mllm_settings = {
        "model_name": "gpt-4o",
        "api_key": "your-api-key-here",
        "temperature": 0.7,
        "max_tokens": 2048
    }
    annotation_prompts = {
        "system_prompt": "你是一个专业的数据标注员,请根据给定的图像和文本内容进行标注。",
        "user_prompt_template": """请对以下内容进行标注:
文本:{text}

请提供:
1. 内容摘要(50字以内)
2. 主要标签(用逗号分隔)
3. 情感倾向(正面/负面/中性)
4. 质量评估(高/中/低)"""
    }
    per_step_settings = {
        "data_validation": {
            "skip_validation": False,
            "min_text_length": 10,
            "require_images": False
        },
        "mllm_annotation": {
            "concurrent_config": {
                "max_workers": 5,
                "rate_limit": 1.0
            },
            "annotation_prompts": annotation_prompts
        },
        "mllm_refinement": {
            "concurrent_config": {
                "max_workers": 3,
                "rate_limit": 0.5
            },
            "refinement_criteria": {
                "quality_threshold": 0.7,
                "skip_high_quality": True
            }
        },
        "format_conversion": {
            "output_formats": ["jsonl", "csv", "json"],
            "field_mapping": {
                "text_field": "text",
                "images_field": "images",
                "labels_field": "labels"
            }
        }
    }
    config = PipelineConfig(
        input_file="example_data.csv",
        input_type="csv",
        image_columns=["image_url"],
        text_columns=["text", "content"],
        harmful_image_columns=["harmful_img"],
        harmful_text_columns=["harmful_text"],
        output_dir="./output",
        checkpoint_dir="./checkpoints",
        mllm_config=mllm_settings,
        steps_config=per_step_settings
    )

    # 2. Create the pipeline.
    pipeline = DataProcessorPipeline(config)

    # 3. Register the steps in execution order.
    step_classes = (
        DataLoaderStep,
        DataAlignmentStep,
        DataValidationStep,
        MllmAnnotationStep,
        MllmRefinementStep,
        FormatConversionStep,
        ResultValidationStep,
    )
    for step_cls in step_classes:
        pipeline.add_step(step_cls())

    # 4. Optional: print every status update.
    pipeline.set_status_callback(
        lambda status, data: print(f"状态更新: {status} - {data}")
    )

    # 5. Run the pipeline.
    try:
        results = await pipeline.run()

        print("\n=== 执行结果 ===")
        for position, result in enumerate(results, 1):
            mark = "✓" if result.success else "✗"
            print(f"{position}. {result.step_name}: {mark}")
            if result.error:
                print(f" 错误: {result.error}")

        # 6. Save the final results.
        pipeline.save_final_results()
        print("\n处理完成!结果已保存到output目录。")

    except Exception as e:
        print(f"Pipeline执行失败: {e}")
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
async def example_resume_from_checkpoint():
    """Example: resume a run from the ``mllm_annotation`` checkpoint."""
    print("=== 从检查点恢复示例 ===")

    pipeline = DataProcessorPipeline(
        PipelineConfig(
            input_file="example_data.csv",
            input_type="csv",
            checkpoint_dir="./checkpoints"
        )
    )
    # Steps would be added here...

    # Resume from a specific step.
    try:
        results = await pipeline.run(resume_from="mllm_annotation")
    except Exception as e:
        print(f"从检查点恢复失败: {e}")
    else:
        print("从检查点恢复执行成功!")
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def example_web_interface():
    """Example: start the web interface on port 8000."""
    print("=== Web界面示例 ===")

    # Create the web application.
    web_app = WebApp(static_dir="./static")

    # Announce the address, then start the server.
    for message in ("启动Web界面...", "访问: http://localhost:8000"):
        print(message)
    web_app.run(host="0.0.0.0", port=8000)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def example_custom_step():
    """Example: define a custom step and add it to a pipeline.

    The pipeline is only assembled here, not run.
    """
    # This file's module-level imports do not bring PipelineStep or
    # StepResult into scope, so import them here to avoid a NameError.
    from maque.mllm_data_processor_pipeline.core import PipelineStep, StepResult

    print("=== 自定义步骤示例 ===")

    class CustomProcessingStep(PipelineStep):
        """Custom step that adds two constant columns to the table."""

        def __init__(self, name: str = "custom_processing"):
            super().__init__(name)

        async def execute(self, data, config):
            """Return a copy of *data* with the custom columns added."""
            try:
                # Custom processing logic
                processed_data = data.copy()

                # Add the custom columns
                processed_data['custom_score'] = 0.85
                processed_data['custom_tag'] = 'processed'

                return StepResult(
                    step_name=self.name,
                    success=True,
                    data=processed_data,
                    metadata={"custom_metric": 42}
                )

            except Exception as e:
                # Report the failure as a result instead of raising.
                return StepResult(
                    step_name=self.name,
                    success=False,
                    data=data,
                    metadata={},
                    error=str(e)
                )

    # Use the custom step
    config = PipelineConfig(input_file="example_data.csv")
    pipeline = DataProcessorPipeline(config)

    pipeline.add_step(DataLoaderStep()) \
        .add_step(CustomProcessingStep()) \
        .add_step(FormatConversionStep())

    print("自定义步骤已添加到Pipeline")
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
async def example_with_excel():
    """Example: process an Excel workbook that contains image columns."""
    print("=== Excel文件处理示例 ===")

    loader_settings = {
        "extract_images_from_excel": True,
        "image_output_dir": "extracted_images"
    }
    config = PipelineConfig(
        input_file="example_data.xlsx",
        input_type="excel",
        sheet_name=None,  # use the default worksheet
        image_columns=["image1", "image2"],  # Excel columns holding images
        text_columns=["title", "description"],
        output_dir="./excel_output",
        steps_config={"data_loader": loader_settings}
    )

    pipeline = DataProcessorPipeline(config)
    for step_cls in (DataLoaderStep, DataAlignmentStep, FormatConversionStep):
        pipeline.add_step(step_cls())

    try:
        results = await pipeline.run()
    except Exception as e:
        print(f"Excel文件处理失败: {e}")
    else:
        print("Excel文件处理完成!")
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def create_sample_data():
    """Write a five-row sample table to ``example_data.csv`` in the CWD."""
    import pandas as pd

    # One tuple per row: (text, image_url, harmful_img, harmful_text)
    rows = [
        ('这是一个关于猫的有趣故事。', 'images/cat1.jpg', 0, 0),
        ('人工智能技术正在快速发展。', 'images/ai_tech.png', 0, 0),
        ('今天的天气非常好,适合出游。', 'images/sunny_day.jpg', 0, 0),
        ('这款产品的质量令人担忧。', 'images/product.jpg', 1, 1),
        ('学习编程需要持续的练习和思考。', 'images/coding.png', 0, 0),
    ]
    column_names = ['text', 'image_url', 'harmful_img', 'harmful_text']

    table = pd.DataFrame(rows, columns=column_names)
    table.to_csv('example_data.csv', index=False, encoding='utf-8')
    print("示例数据文件 example_data.csv 已创建")
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def main():
    """Parse ``--mode`` and run the matching example."""
    import argparse

    # mode -> (function, whether it is a coroutine needing asyncio.run)
    handlers = {
        'create-sample': (create_sample_data, False),
        'basic': (example_basic_usage, True),
        'resume': (example_resume_from_checkpoint, True),
        'web': (example_web_interface, False),
        'custom': (example_custom_step, False),
        'excel': (example_with_excel, True),
    }

    parser = argparse.ArgumentParser(description='MLLM Data Processor Pipeline 示例')
    parser.add_argument(
        '--mode',
        choices=['basic', 'resume', 'web', 'custom', 'excel', 'create-sample'],
        default='basic',
        help='运行模式',
    )
    args = parser.parse_args()

    handler, is_coroutine = handlers[args.mode]
    if is_coroutine:
        asyncio.run(handler())
    else:
        handler()
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
# Run the examples only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pipeline处理步骤实现
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .data_loader import DataLoaderStep
|
|
6
|
+
from .data_alignment import DataAlignmentStep
|
|
7
|
+
from .data_validation import DataValidationStep
|
|
8
|
+
from .mllm_annotation import MllmAnnotationStep
|
|
9
|
+
from .mllm_refinement import MllmRefinementStep
|
|
10
|
+
from .format_conversion import FormatConversionStep
|
|
11
|
+
from .result_validation import ResultValidationStep
|
|
12
|
+
|
|
13
|
+
# All available step classes. create_step_from_config matches a config's
# "type" value against the __name__ of the classes in this list.
ALL_STEPS = [
    DataLoaderStep,
    DataAlignmentStep,
    DataValidationStep,
    MllmAnnotationStep,
    MllmRefinementStep,
    FormatConversionStep,
    ResultValidationStep
]
|
|
23
|
+
|
|
24
|
+
def get_all_steps():
    """Return the list of all available step classes.

    The module-level ``ALL_STEPS`` list itself is returned, not a copy.
    """
    return ALL_STEPS
|
|
27
|
+
|
|
28
|
+
def create_step_from_config(step_config: dict):
    """Create a step instance from a configuration mapping.

    Args:
        step_config: mapping with ``"type"`` (class name of a step listed in
            ``ALL_STEPS``), optional ``"name"`` and optional ``"params"``.

    Returns:
        An instance of the matching step class.

    Raises:
        ValueError: if ``"type"`` names no class in ``ALL_STEPS``.
    """
    step_name = step_config.get("name")
    step_type = step_config.get("type")
    # An explicit null for "params" is treated like a missing entry.
    step_params = step_config.get("params") or {}

    # Find the step class whose class name equals the requested type.
    step_class = next(
        (cls for cls in ALL_STEPS if cls.__name__ == step_type), None
    )
    if step_class is None:
        raise ValueError(f"未知的步骤类型: {step_type}")

    # Pass the name only when one was given, so a config without "name"
    # keeps the step's own default name instead of overriding it with None.
    init_kwargs = {"config": step_params}
    if step_name is not None:
        init_kwargs["name"] = step_name

    return step_class(**init_kwargs)
|
|
45
|
+
|
|
46
|
+
# Names exported by `from ...steps import *`: the seven step classes and
# the two helper functions defined in this module.
__all__ = [
    "DataLoaderStep",
    "DataAlignmentStep",
    "DataValidationStep",
    "MllmAnnotationStep",
    "MllmRefinementStep",
    "FormatConversionStep",
    "ResultValidationStep",
    "get_all_steps",
    "create_step_from_config"
]
|