maque 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. maque/__init__.py +30 -0
  2. maque/__main__.py +926 -0
  3. maque/ai_platform/__init__.py +0 -0
  4. maque/ai_platform/crawl.py +45 -0
  5. maque/ai_platform/metrics.py +258 -0
  6. maque/ai_platform/nlp_preprocess.py +67 -0
  7. maque/ai_platform/webpage_screen_shot.py +195 -0
  8. maque/algorithms/__init__.py +78 -0
  9. maque/algorithms/bezier.py +15 -0
  10. maque/algorithms/bktree.py +117 -0
  11. maque/algorithms/core.py +104 -0
  12. maque/algorithms/hilbert.py +16 -0
  13. maque/algorithms/rate_function.py +92 -0
  14. maque/algorithms/transform.py +27 -0
  15. maque/algorithms/trie.py +272 -0
  16. maque/algorithms/utils.py +63 -0
  17. maque/algorithms/video.py +587 -0
  18. maque/api/__init__.py +1 -0
  19. maque/api/common.py +110 -0
  20. maque/api/fetch.py +26 -0
  21. maque/api/static/icon.png +0 -0
  22. maque/api/static/redoc.standalone.js +1782 -0
  23. maque/api/static/swagger-ui-bundle.js +3 -0
  24. maque/api/static/swagger-ui.css +3 -0
  25. maque/cli/__init__.py +1 -0
  26. maque/cli/clean_invisible_chars.py +324 -0
  27. maque/cli/core.py +34 -0
  28. maque/cli/groups/__init__.py +26 -0
  29. maque/cli/groups/config.py +205 -0
  30. maque/cli/groups/data.py +615 -0
  31. maque/cli/groups/doctor.py +259 -0
  32. maque/cli/groups/embedding.py +222 -0
  33. maque/cli/groups/git.py +29 -0
  34. maque/cli/groups/help.py +410 -0
  35. maque/cli/groups/llm.py +223 -0
  36. maque/cli/groups/mcp.py +241 -0
  37. maque/cli/groups/mllm.py +1795 -0
  38. maque/cli/groups/mllm_simple.py +60 -0
  39. maque/cli/groups/quant.py +210 -0
  40. maque/cli/groups/service.py +490 -0
  41. maque/cli/groups/system.py +570 -0
  42. maque/cli/mllm_run.py +1451 -0
  43. maque/cli/script.py +52 -0
  44. maque/cli/tree.py +49 -0
  45. maque/clustering/__init__.py +52 -0
  46. maque/clustering/analyzer.py +347 -0
  47. maque/clustering/clusterers.py +464 -0
  48. maque/clustering/sampler.py +134 -0
  49. maque/clustering/visualizer.py +205 -0
  50. maque/constant.py +13 -0
  51. maque/core.py +133 -0
  52. maque/cv/__init__.py +1 -0
  53. maque/cv/image.py +219 -0
  54. maque/cv/utils.py +68 -0
  55. maque/cv/video/__init__.py +3 -0
  56. maque/cv/video/keyframe_extractor.py +368 -0
  57. maque/embedding/__init__.py +43 -0
  58. maque/embedding/base.py +56 -0
  59. maque/embedding/multimodal.py +308 -0
  60. maque/embedding/server.py +523 -0
  61. maque/embedding/text.py +311 -0
  62. maque/git/__init__.py +24 -0
  63. maque/git/pure_git.py +912 -0
  64. maque/io/__init__.py +29 -0
  65. maque/io/core.py +38 -0
  66. maque/io/ops.py +194 -0
  67. maque/llm/__init__.py +111 -0
  68. maque/llm/backend.py +416 -0
  69. maque/llm/base.py +411 -0
  70. maque/llm/server.py +366 -0
  71. maque/mcp_server.py +1096 -0
  72. maque/mllm_data_processor_pipeline/__init__.py +17 -0
  73. maque/mllm_data_processor_pipeline/core.py +341 -0
  74. maque/mllm_data_processor_pipeline/example.py +291 -0
  75. maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
  76. maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
  77. maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
  78. maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
  79. maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
  80. maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
  81. maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
  82. maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
  83. maque/mllm_data_processor_pipeline/web_app.py +317 -0
  84. maque/nlp/__init__.py +14 -0
  85. maque/nlp/ngram.py +9 -0
  86. maque/nlp/parser.py +63 -0
  87. maque/nlp/risk_matcher.py +543 -0
  88. maque/nlp/sentence_splitter.py +202 -0
  89. maque/nlp/simple_tradition_cvt.py +31 -0
  90. maque/performance/__init__.py +21 -0
  91. maque/performance/_measure_time.py +70 -0
  92. maque/performance/_profiler.py +367 -0
  93. maque/performance/_stat_memory.py +51 -0
  94. maque/pipelines/__init__.py +15 -0
  95. maque/pipelines/clustering.py +252 -0
  96. maque/quantization/__init__.py +42 -0
  97. maque/quantization/auto_round.py +120 -0
  98. maque/quantization/base.py +145 -0
  99. maque/quantization/bitsandbytes.py +127 -0
  100. maque/quantization/llm_compressor.py +102 -0
  101. maque/retriever/__init__.py +35 -0
  102. maque/retriever/chroma.py +654 -0
  103. maque/retriever/document.py +140 -0
  104. maque/retriever/milvus.py +1140 -0
  105. maque/table_ops/__init__.py +1 -0
  106. maque/table_ops/core.py +133 -0
  107. maque/table_viewer/__init__.py +4 -0
  108. maque/table_viewer/download_assets.py +57 -0
  109. maque/table_viewer/server.py +698 -0
  110. maque/table_viewer/static/element-plus-icons.js +5791 -0
  111. maque/table_viewer/static/element-plus.css +1 -0
  112. maque/table_viewer/static/element-plus.js +65236 -0
  113. maque/table_viewer/static/main.css +268 -0
  114. maque/table_viewer/static/main.js +669 -0
  115. maque/table_viewer/static/vue.global.js +18227 -0
  116. maque/table_viewer/templates/index.html +401 -0
  117. maque/utils/__init__.py +56 -0
  118. maque/utils/color.py +68 -0
  119. maque/utils/color_string.py +45 -0
  120. maque/utils/compress.py +66 -0
  121. maque/utils/constant.py +183 -0
  122. maque/utils/core.py +261 -0
  123. maque/utils/cursor.py +143 -0
  124. maque/utils/distance.py +58 -0
  125. maque/utils/docker.py +96 -0
  126. maque/utils/downloads.py +51 -0
  127. maque/utils/excel_helper.py +542 -0
  128. maque/utils/helper_metrics.py +121 -0
  129. maque/utils/helper_parser.py +168 -0
  130. maque/utils/net.py +64 -0
  131. maque/utils/nvidia_stat.py +140 -0
  132. maque/utils/ops.py +53 -0
  133. maque/utils/packages.py +31 -0
  134. maque/utils/path.py +57 -0
  135. maque/utils/tar.py +260 -0
  136. maque/utils/untar.py +129 -0
  137. maque/web/__init__.py +0 -0
  138. maque/web/image_downloader.py +1410 -0
  139. maque-0.2.1.dist-info/METADATA +450 -0
  140. maque-0.2.1.dist-info/RECORD +143 -0
  141. maque-0.2.1.dist-info/WHEEL +4 -0
  142. maque-0.2.1.dist-info/entry_points.txt +3 -0
  143. maque-0.2.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,17 @@
1
+ """
2
+ 多模态大模型训练数据处理pipeline
3
+
4
+ 一个灵活的、模块化的数据处理流水线,支持Web界面交互和断点续传。
5
+ """
6
+
7
+ from .core import DataProcessorPipeline, PipelineStep, PipelineConfig
8
+ from .steps import *
9
+ from .web_app import WebApp
10
+
11
+ __version__ = "0.1.0"
12
+ __all__ = [
13
+ "DataProcessorPipeline",
14
+ "PipelineStep",
15
+ "PipelineConfig",
16
+ "WebApp"
17
+ ]
@@ -0,0 +1,341 @@
1
+ """
2
+ Pipeline核心架构实现
3
+ """
4
+
5
+ import json
6
+ import asyncio
7
+ from abc import ABC, abstractmethod
8
+ from dataclasses import dataclass, asdict
9
+ from typing import Dict, List, Any, Optional, Union, Callable
10
+ from pathlib import Path
11
+ import pandas as pd
12
+ from datetime import datetime
13
+
14
+ from loguru import logger
15
+ from maque.performance import MeasureTime
16
+
17
+
18
@dataclass
class PipelineConfig:
    """Configuration for one pipeline run.

    Container fields default to ``None`` in the signature and are replaced
    with fresh empty containers in ``__post_init__``, so two instances never
    share the same list or dict.
    """

    # Input settings
    input_file: str = ""
    input_type: str = "csv"  # "csv" or "excel"
    sheet_name: Optional[str] = None

    # Column mapping
    image_columns: Optional[List[str]] = None  # image column names
    harmful_image_columns: Optional[List[str]] = None  # harmful-image columns
    harmful_text_columns: Optional[List[str]] = None  # harmful-text columns
    text_columns: Optional[List[str]] = None  # text columns

    # Output settings
    output_dir: str = "./output"
    checkpoint_dir: str = "./checkpoints"

    # Per-step settings, keyed by step name
    steps_config: Optional[Dict[str, Dict[str, Any]]] = None

    # MLLM settings
    mllm_config: Optional[Dict[str, Any]] = None

    def __post_init__(self):
        """Replace every ``None`` container field with a fresh empty one."""
        list_fields = (
            "image_columns",
            "harmful_image_columns",
            "harmful_text_columns",
            "text_columns",
        )
        for field_name in list_fields:
            if getattr(self, field_name) is None:
                setattr(self, field_name, [])
        for field_name in ("steps_config", "mllm_config"):
            if getattr(self, field_name) is None:
                setattr(self, field_name, {})

    def to_dict(self) -> Dict[str, Any]:
        """Return the configuration as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "PipelineConfig":
        """Build a configuration from a dict whose keys are field names.

        Raises:
            TypeError: if ``data`` contains a key that is not a field.
        """
        return cls(**data)

    def save(self, filepath: str):
        """Write the configuration to ``filepath`` as UTF-8 JSON.

        Missing parent directories are created first, matching how the
        pipeline and checkpoint code create their output directories.
        """
        target = Path(filepath)
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, ensure_ascii=False, indent=2)

    @classmethod
    def load(cls, filepath: str) -> "PipelineConfig":
        """Read a configuration previously written by :meth:`save`."""
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
        return cls.from_dict(data)
75
+
76
+
77
@dataclass
class StepResult:
    """Outcome of one pipeline step, with CSV/JSON checkpoint support."""

    step_name: str
    success: bool
    data: pd.DataFrame
    metadata: Dict[str, Any]
    error: Optional[str] = None
    execution_time: Optional[float] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-friendly summary of the result.

        The DataFrame itself is not included, only its shape. ``timestamp``
        is the time of this call, not the time the step ran.
        """
        return {
            "step_name": self.step_name,
            "success": self.success,
            "metadata": self.metadata,
            "error": self.error,
            "execution_time": self.execution_time,
            "data_shape": self.data.shape if self.data is not None else None,
            "timestamp": datetime.now().isoformat(),
        }

    def save_checkpoint(self, checkpoint_dir: str):
        """Write ``<step>_data.csv`` and ``<step>_metadata.json`` into ``checkpoint_dir``.

        The directory is created if needed. When ``data`` is ``None`` no CSV
        is written and any CSV left by an earlier run is removed, so stale
        data is never paired with the new metadata.
        """
        checkpoint_path = Path(checkpoint_dir)
        checkpoint_path.mkdir(parents=True, exist_ok=True)

        data_file = checkpoint_path / f"{self.step_name}_data.csv"
        if self.data is not None:
            self.data.to_csv(data_file, index=False, encoding="utf-8")
        elif data_file.exists():
            data_file.unlink()

        meta_file = checkpoint_path / f"{self.step_name}_metadata.json"
        with open(meta_file, "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, ensure_ascii=False, indent=2)

    @classmethod
    def load_checkpoint(
        cls, step_name: str, checkpoint_dir: str
    ) -> Optional["StepResult"]:
        """Load the checkpoint for ``step_name``.

        Returns:
            The restored result, or ``None`` when either the data file or the
            metadata file is missing.
        """
        checkpoint_path = Path(checkpoint_dir)
        data_file = checkpoint_path / f"{step_name}_data.csv"
        meta_file = checkpoint_path / f"{step_name}_metadata.json"

        if not (data_file.exists() and meta_file.exists()):
            return None

        try:
            data = pd.read_csv(data_file, encoding="utf-8")
        except pd.errors.EmptyDataError:
            # An empty, column-less DataFrame is saved as a file with no
            # header; restore it as an empty frame instead of failing.
            data = pd.DataFrame()

        with open(meta_file, "r", encoding="utf-8") as f:
            metadata_info = json.load(f)

        return cls(
            step_name=metadata_info["step_name"],
            success=metadata_info["success"],
            data=data,
            metadata=metadata_info["metadata"],
            error=metadata_info.get("error"),
            execution_time=metadata_info.get("execution_time"),
        )
142
+
143
+
144
class PipelineStep(ABC):
    """Abstract base class for a single pipeline step."""

    def __init__(self, name: str, config: Dict[str, Any] = None):
        # A missing or empty config becomes a fresh dict.
        self.name = name
        self.config = config if config else {}
        self.logger = logger

    @abstractmethod
    async def execute(self, data: pd.DataFrame, config: PipelineConfig) -> StepResult:
        """Run the step on ``data`` and return its result; subclasses implement this."""

    def validate_input(self, data: pd.DataFrame, config: PipelineConfig) -> bool:
        """Return ``True`` only when ``data`` is a non-empty DataFrame."""
        if data is None:
            return False
        return not data.empty

    def get_step_config(self, config: PipelineConfig) -> Dict[str, Any]:
        """Return the ``steps_config`` entry stored under this step's name, or ``{}``."""
        per_step = config.steps_config
        return per_step.get(self.name, {})
164
+
165
+
166
class DataProcessorPipeline:
    """Runs an ordered list of steps over a DataFrame, checkpointing after each one."""

    def __init__(self, config: PipelineConfig):
        """Store the configuration and create the output and checkpoint directories."""
        self.config = config
        self.steps: List[PipelineStep] = []
        self.results: List[StepResult] = []
        self.logger = logger
        self.status_callback: Optional[Callable] = None

        Path(config.output_dir).mkdir(parents=True, exist_ok=True)
        Path(config.checkpoint_dir).mkdir(parents=True, exist_ok=True)

    def add_step(self, step: PipelineStep) -> "DataProcessorPipeline":
        """Append ``step`` and return ``self`` so calls can be chained."""
        self.steps.append(step)
        return self

    def set_status_callback(self, callback: Callable[[str, Dict[str, Any]], None]):
        """Register a callable invoked as ``callback(status, data)`` on status changes."""
        self.status_callback = callback

    def _notify_status(self, status: str, data: Dict[str, Any] = None):
        """Invoke the status callback, if one is registered."""
        if self.status_callback:
            self.status_callback(status, data or {})

    async def load_data(self) -> pd.DataFrame:
        """Load the input file named in the configuration.

        Raises:
            ValueError: if ``input_type`` is neither ``"csv"`` nor ``"excel"``.
        """
        self._notify_status("loading_data", {"file": self.config.input_file})

        if self.config.input_type == "csv":
            data = pd.read_csv(self.config.input_file, encoding="utf-8")
        elif self.config.input_type == "excel":
            if self.config.image_columns:
                # Image columns are configured, so use the project's Excel helper.
                from maque.utils.excel_helper import extract_excel_with_images

                data = extract_excel_with_images(
                    excel_path=self.config.input_file,
                    image_column_names=self.config.image_columns,
                    sheet_name=self.config.sheet_name,
                    image_output_dir=Path(self.config.output_dir) / "images",
                )
            else:
                # pandas treats sheet_name=None as "all sheets" and returns a
                # dict; an unset sheet name means the first sheet here.
                sheet = 0 if self.config.sheet_name is None else self.config.sheet_name
                data = pd.read_excel(self.config.input_file, sheet_name=sheet)
        else:
            raise ValueError(f"不支持的输入类型: {self.config.input_type}")

        self.logger.info(f"加载数据完成,共{len(data)}行")
        return data

    def _resume_position(self, resume_from: str):
        """Work out where to restart for a resume request.

        Returns:
            ``(data, start_idx, restored_result)``, or ``None`` when the
            pipeline should run from the beginning. ``restored_result`` is
            ``None`` when the named step has to be re-run.
        """
        step_idx = next(
            (i for i, step in enumerate(self.steps) if step.name == resume_from),
            None,
        )
        if step_idx is None:
            # An unknown name used to fall back to index 0 and then skip the
            # first step; run everything instead.
            self.logger.warning(f"步骤 {resume_from} 不在Pipeline中,将从头开始执行")
            return None

        checkpoint = StepResult.load_checkpoint(resume_from, self.config.checkpoint_dir)
        if checkpoint is None:
            self.logger.warning(f"未找到检查点 {resume_from},将从头开始执行")
            return None

        if checkpoint.success:
            return checkpoint.data, step_idx + 1, checkpoint

        # A failed checkpoint written by run() holds the data that was fed
        # into the failed step, so re-run that step rather than skipping it.
        return checkpoint.data, step_idx, None

    async def run(self, resume_from: Optional[str] = None) -> List[StepResult]:
        """Run the steps in order and return the accumulated results.

        Args:
            resume_from: name of a step whose checkpoint should be restored.
                Execution continues after that step, or at that step when the
                checkpoint records a failure.

        Raises:
            Exception: whatever a step raises is re-raised after a failed
                result has been checkpointed and reported.
        """
        import time

        with MeasureTime("Pipeline执行"):
            self._notify_status("starting", {"total_steps": len(self.steps)})

            current_data = await self.load_data()

            start_idx = 0
            if resume_from:
                resumed = self._resume_position(resume_from)
                if resumed is not None:
                    current_data, start_idx, restored = resumed
                    if restored is not None:
                        self.results.append(restored)
                    self.logger.info(f"从检查点恢复: {resume_from}")

            for i, step in enumerate(self.steps[start_idx:], start_idx):
                self._notify_status(
                    "executing_step",
                    {
                        "step_name": step.name,
                        "step_index": i,
                        "total_steps": len(self.steps),
                    },
                )

                started = time.perf_counter()
                try:
                    self.logger.info(f"执行步骤 {i + 1}/{len(self.steps)}: {step.name}")

                    if not step.validate_input(current_data, self.config):
                        raise ValueError(f"步骤 {step.name} 输入验证失败")

                    result = await step.execute(current_data, self.config)

                    # Fill in the timing unless the step measured it itself.
                    if result.execution_time is None:
                        result.execution_time = time.perf_counter() - started

                    result.save_checkpoint(self.config.checkpoint_dir)

                    current_data = result.data
                    self.results.append(result)

                    self.logger.info(f"步骤 {step.name} 执行完成")

                except Exception as e:
                    error_msg = f"步骤 {step.name} 执行失败: {e}"
                    self.logger.error(error_msg)

                    # Checkpoint the step's input so a later resume can retry it.
                    failed_result = StepResult(
                        step_name=step.name,
                        success=False,
                        data=current_data,
                        metadata={},
                        error=error_msg,
                        execution_time=time.perf_counter() - started,
                    )
                    failed_result.save_checkpoint(self.config.checkpoint_dir)
                    self.results.append(failed_result)

                    self._notify_status(
                        "step_failed", {"step_name": step.name, "error": error_msg}
                    )

                    raise

            self._notify_status("completed", {"total_results": len(self.results)})
            self.logger.info("Pipeline执行完成")

            return self.results

    def get_status(self) -> Dict[str, Any]:
        """Return step counts, step configs and per-result summaries."""
        return {
            "total_steps": len(self.steps),
            "completed_steps": len([r for r in self.results if r.success]),
            "failed_steps": len([r for r in self.results if not r.success]),
            "current_step": len(self.results),
            "steps": [
                {"name": step.name, "config": step.config} for step in self.steps
            ],
            "results": [result.to_dict() for result in self.results],
        }

    def save_final_results(self):
        """Write ``final_output.csv`` and ``execution_summary.json`` to the output directory.

        Does nothing when no step has run. The CSV is written only when the
        last result succeeded and carries data; the summary is always written.
        """
        if not self.results:
            return

        final_result = self.results[-1]
        if final_result.success and final_result.data is not None:
            output_file = Path(self.config.output_dir) / "final_output.csv"
            final_result.data.to_csv(output_file, index=False, encoding="utf-8")
            self.logger.info(f"最终结果已保存至: {output_file}")

        summary = {
            "config": self.config.to_dict(),
            "execution_summary": self.get_status(),
            "timestamp": datetime.now().isoformat(),
        }

        summary_file = Path(self.config.output_dir) / "execution_summary.json"
        with open(summary_file, "w", encoding="utf-8") as f:
            json.dump(summary, f, ensure_ascii=False, indent=2)
@@ -0,0 +1,291 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ MLLM Data Processor Pipeline 使用示例
4
+
5
+ 这个示例展示了如何使用Pipeline进行多模态大模型训练数据处理。
6
+ """
7
+
8
+ import asyncio
9
+ import sys
10
+ from pathlib import Path
11
+
12
+ # 添加项目根目录到Python路径
13
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
14
+
15
+ from maque.mllm_data_processor_pipeline import (
16
+ DataProcessorPipeline,
17
+ PipelineConfig,
18
+ WebApp
19
+ )
20
+ from maque.mllm_data_processor_pipeline.steps import *
21
+
22
+
23
async def example_basic_usage():
    """Build a full seven-step pipeline over a CSV file, run it and print the outcome."""
    print("=== 基本使用示例 ===")

    # Per-step settings, built separately to keep the config call readable.
    annotation_settings = {
        "concurrent_config": {"max_workers": 5, "rate_limit": 1.0},
        "annotation_prompts": {
            "system_prompt": "你是一个专业的数据标注员,请根据给定的图像和文本内容进行标注。",
            "user_prompt_template": (
                "请对以下内容进行标注:\n"
                "文本:{text}\n"
                "\n"
                "请提供:\n"
                "1. 内容摘要(50字以内)\n"
                "2. 主要标签(用逗号分隔)\n"
                "3. 情感倾向(正面/负面/中性)\n"
                "4. 质量评估(高/中/低)"
            ),
        },
    }
    per_step_settings = {
        "data_validation": {
            "skip_validation": False,
            "min_text_length": 10,
            "require_images": False,
        },
        "mllm_annotation": annotation_settings,
        "mllm_refinement": {
            "concurrent_config": {"max_workers": 3, "rate_limit": 0.5},
            "refinement_criteria": {
                "quality_threshold": 0.7,
                "skip_high_quality": True,
            },
        },
        "format_conversion": {
            "output_formats": ["jsonl", "csv", "json"],
            "field_mapping": {
                "text_field": "text",
                "images_field": "images",
                "labels_field": "labels",
            },
        },
    }

    # 1. Configuration
    config = PipelineConfig(
        input_file="example_data.csv",
        input_type="csv",
        image_columns=["image_url"],
        text_columns=["text", "content"],
        harmful_image_columns=["harmful_img"],
        harmful_text_columns=["harmful_text"],
        output_dir="./output",
        checkpoint_dir="./checkpoints",
        mllm_config={
            "model_name": "gpt-4o",
            "api_key": "your-api-key-here",
            "temperature": 0.7,
            "max_tokens": 2048,
        },
        steps_config=per_step_settings,
    )

    # 2. Pipeline
    pipeline = DataProcessorPipeline(config)

    # 3. Steps, instantiated and added in processing order
    step_classes = (
        DataLoaderStep,
        DataAlignmentStep,
        DataValidationStep,
        MllmAnnotationStep,
        MllmRefinementStep,
        FormatConversionStep,
        ResultValidationStep,
    )
    for step_class in step_classes:
        pipeline.add_step(step_class())

    # 4. Optional status callback
    def status_callback(status, data):
        print(f"状态更新: {status} - {data}")

    pipeline.set_status_callback(status_callback)

    # 5. Run
    try:
        results = await pipeline.run()

        print("\n=== 执行结果 ===")
        for position, result in enumerate(results, start=1):
            mark = "✓" if result.success else "✗"
            print(f"{position}. {result.step_name}: {mark}")
            if result.error:
                print(f" 错误: {result.error}")

        # 6. Persist the final output
        pipeline.save_final_results()
        print("\n处理完成!结果已保存到output目录。")

    except Exception as e:
        print(f"Pipeline执行失败: {e}")
123
+
124
+
125
async def example_resume_from_checkpoint():
    """Show how to restart a pipeline from the checkpoint of a named step."""
    print("=== 从检查点恢复示例 ===")

    pipeline = DataProcessorPipeline(
        PipelineConfig(
            input_file="example_data.csv",
            input_type="csv",
            checkpoint_dir="./checkpoints",
        )
    )
    # Add steps here...

    # Resume after the named step
    try:
        await pipeline.run(resume_from="mllm_annotation")
    except Exception as e:
        print(f"从检查点恢复失败: {e}")
    else:
        print("从检查点恢复执行成功!")
144
+
145
+
146
def example_web_interface():
    """Start the web front end on port 8000, bound to all interfaces."""
    print("=== Web界面示例 ===")

    # Build the web application
    web_app = WebApp(static_dir="./static")

    # Announce, then start the server
    for line in ("启动Web界面...", "访问: http://localhost:8000"):
        print(line)
    web_app.run(host="0.0.0.0", port=8000)
157
+
158
+
159
def example_custom_step():
    """Define a custom step and add it to a pipeline (the pipeline is not run)."""
    # The module-level imports bring in neither of these names, so import
    # them here; without this the class definition below raises NameError.
    from maque.mllm_data_processor_pipeline.core import PipelineStep, StepResult

    print("=== 自定义步骤示例 ===")

    class CustomProcessingStep(PipelineStep):
        """Example step that adds two constant columns."""

        def __init__(self, name: str = "custom_processing"):
            super().__init__(name)

        async def execute(self, data, config):
            """Return a copy of ``data`` with ``custom_score`` and ``custom_tag`` added."""
            try:
                processed_data = data.copy()

                processed_data['custom_score'] = 0.85
                processed_data['custom_tag'] = 'processed'

                return StepResult(
                    step_name=self.name,
                    success=True,
                    data=processed_data,
                    metadata={"custom_metric": 42}
                )

            except Exception as e:
                # Report the failure as a result rather than raising.
                return StepResult(
                    step_name=self.name,
                    success=False,
                    data=data,
                    metadata={},
                    error=str(e)
                )

    # Use the custom step between two built-in ones
    config = PipelineConfig(input_file="example_data.csv")
    pipeline = DataProcessorPipeline(config)

    pipeline.add_step(DataLoaderStep()) \
        .add_step(CustomProcessingStep()) \
        .add_step(FormatConversionStep())

    print("自定义步骤已添加到Pipeline")
204
+
205
+
206
async def example_with_excel():
    """Run a three-step pipeline over an Excel workbook with image columns."""
    print("=== Excel文件处理示例 ===")

    loader_settings = {
        "extract_images_from_excel": True,
        "image_output_dir": "extracted_images",
    }
    config = PipelineConfig(
        input_file="example_data.xlsx",
        input_type="excel",
        sheet_name=None,  # no explicit sheet
        image_columns=["image1", "image2"],  # columns holding images in the workbook
        text_columns=["title", "description"],
        output_dir="./excel_output",
        steps_config={"data_loader": loader_settings},
    )

    pipeline = DataProcessorPipeline(config)
    for step_class in (DataLoaderStep, DataAlignmentStep, FormatConversionStep):
        pipeline.add_step(step_class())

    try:
        await pipeline.run()
    except Exception as e:
        print(f"Excel文件处理失败: {e}")
    else:
        print("Excel文件处理完成!")
235
+
236
+
237
def create_sample_data():
    """Write a five-row ``example_data.csv`` into the current working directory."""
    import pandas as pd

    # One tuple per row: text, image path, harmful-image flag, harmful-text flag.
    rows = [
        ('这是一个关于猫的有趣故事。', 'images/cat1.jpg', 0, 0),
        ('人工智能技术正在快速发展。', 'images/ai_tech.png', 0, 0),
        ('今天的天气非常好,适合出游。', 'images/sunny_day.jpg', 0, 0),
        ('这款产品的质量令人担忧。', 'images/product.jpg', 1, 1),
        ('学习编程需要持续的练习和思考。', 'images/coding.png', 0, 0),
    ]
    column_names = ['text', 'image_url', 'harmful_img', 'harmful_text']

    frame = pd.DataFrame(rows, columns=column_names)
    frame.to_csv('example_data.csv', index=False, encoding='utf-8')
    print("示例数据文件 example_data.csv 已创建")
264
+
265
+
266
def main():
    """Parse ``--mode`` from the command line and run the matching example."""
    import argparse

    # Coroutine examples are driven through asyncio.run; the rest are called directly.
    async_examples = {
        'basic': example_basic_usage,
        'resume': example_resume_from_checkpoint,
        'excel': example_with_excel,
    }
    sync_examples = {
        'create-sample': create_sample_data,
        'web': example_web_interface,
        'custom': example_custom_step,
    }

    parser = argparse.ArgumentParser(description='MLLM Data Processor Pipeline 示例')
    parser.add_argument(
        '--mode',
        choices=['basic', 'resume', 'web', 'custom', 'excel', 'create-sample'],
        default='basic',
        help='运行模式',
    )
    mode = parser.parse_args().mode

    if mode in async_examples:
        asyncio.run(async_examples[mode]())
    elif mode in sync_examples:
        sync_examples[mode]()
288
+
289
+
290
+ if __name__ == "__main__":
291
+ main()
@@ -0,0 +1,56 @@
1
+ """
2
+ Pipeline处理步骤实现
3
+ """
4
+
5
+ from .data_loader import DataLoaderStep
6
+ from .data_alignment import DataAlignmentStep
7
+ from .data_validation import DataValidationStep
8
+ from .mllm_annotation import MllmAnnotationStep
9
+ from .mllm_refinement import MllmRefinementStep
10
+ from .format_conversion import FormatConversionStep
11
+ from .result_validation import ResultValidationStep
12
+
13
+ # 所有可用的步骤类
14
+ ALL_STEPS = [
15
+ DataLoaderStep,
16
+ DataAlignmentStep,
17
+ DataValidationStep,
18
+ MllmAnnotationStep,
19
+ MllmRefinementStep,
20
+ FormatConversionStep,
21
+ ResultValidationStep
22
+ ]
23
+
24
def get_all_steps():
    """Return the module-level list of available step classes (the list itself, not a copy)."""
    available = ALL_STEPS
    return available
27
+
28
def create_step_from_config(step_config: dict):
    """Instantiate a step from a config dict.

    Args:
        step_config: mapping with ``"type"`` (a step class name from
            ``ALL_STEPS``), optional ``"name"`` and optional ``"params"``.

    Returns:
        A new instance of the matching step class.

    Raises:
        ValueError: if ``"type"`` does not name a class in ``ALL_STEPS``.
    """
    step_type = step_config.get("type")

    # Find the step class whose name matches the requested type.
    step_class = next((cls for cls in ALL_STEPS if cls.__name__ == step_type), None)
    if step_class is None:
        raise ValueError(f"未知的步骤类型: {step_type}")

    # An explicit null for "params" is treated like a missing key.
    kwargs = {"config": step_config.get("params") or {}}

    # Pass the name only when one was given, so the step class keeps its own
    # default name instead of being built with name=None.
    step_name = step_config.get("name")
    if step_name is not None:
        kwargs["name"] = step_name

    return step_class(**kwargs)
45
+
46
+ __all__ = [
47
+ "DataLoaderStep",
48
+ "DataAlignmentStep",
49
+ "DataValidationStep",
50
+ "MllmAnnotationStep",
51
+ "MllmRefinementStep",
52
+ "FormatConversionStep",
53
+ "ResultValidationStep",
54
+ "get_all_steps",
55
+ "create_step_from_config"
56
+ ]