maque 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maque/__init__.py +30 -0
- maque/__main__.py +926 -0
- maque/ai_platform/__init__.py +0 -0
- maque/ai_platform/crawl.py +45 -0
- maque/ai_platform/metrics.py +258 -0
- maque/ai_platform/nlp_preprocess.py +67 -0
- maque/ai_platform/webpage_screen_shot.py +195 -0
- maque/algorithms/__init__.py +78 -0
- maque/algorithms/bezier.py +15 -0
- maque/algorithms/bktree.py +117 -0
- maque/algorithms/core.py +104 -0
- maque/algorithms/hilbert.py +16 -0
- maque/algorithms/rate_function.py +92 -0
- maque/algorithms/transform.py +27 -0
- maque/algorithms/trie.py +272 -0
- maque/algorithms/utils.py +63 -0
- maque/algorithms/video.py +587 -0
- maque/api/__init__.py +1 -0
- maque/api/common.py +110 -0
- maque/api/fetch.py +26 -0
- maque/api/static/icon.png +0 -0
- maque/api/static/redoc.standalone.js +1782 -0
- maque/api/static/swagger-ui-bundle.js +3 -0
- maque/api/static/swagger-ui.css +3 -0
- maque/cli/__init__.py +1 -0
- maque/cli/clean_invisible_chars.py +324 -0
- maque/cli/core.py +34 -0
- maque/cli/groups/__init__.py +26 -0
- maque/cli/groups/config.py +205 -0
- maque/cli/groups/data.py +615 -0
- maque/cli/groups/doctor.py +259 -0
- maque/cli/groups/embedding.py +222 -0
- maque/cli/groups/git.py +29 -0
- maque/cli/groups/help.py +410 -0
- maque/cli/groups/llm.py +223 -0
- maque/cli/groups/mcp.py +241 -0
- maque/cli/groups/mllm.py +1795 -0
- maque/cli/groups/mllm_simple.py +60 -0
- maque/cli/groups/quant.py +210 -0
- maque/cli/groups/service.py +490 -0
- maque/cli/groups/system.py +570 -0
- maque/cli/mllm_run.py +1451 -0
- maque/cli/script.py +52 -0
- maque/cli/tree.py +49 -0
- maque/clustering/__init__.py +52 -0
- maque/clustering/analyzer.py +347 -0
- maque/clustering/clusterers.py +464 -0
- maque/clustering/sampler.py +134 -0
- maque/clustering/visualizer.py +205 -0
- maque/constant.py +13 -0
- maque/core.py +133 -0
- maque/cv/__init__.py +1 -0
- maque/cv/image.py +219 -0
- maque/cv/utils.py +68 -0
- maque/cv/video/__init__.py +3 -0
- maque/cv/video/keyframe_extractor.py +368 -0
- maque/embedding/__init__.py +43 -0
- maque/embedding/base.py +56 -0
- maque/embedding/multimodal.py +308 -0
- maque/embedding/server.py +523 -0
- maque/embedding/text.py +311 -0
- maque/git/__init__.py +24 -0
- maque/git/pure_git.py +912 -0
- maque/io/__init__.py +29 -0
- maque/io/core.py +38 -0
- maque/io/ops.py +194 -0
- maque/llm/__init__.py +111 -0
- maque/llm/backend.py +416 -0
- maque/llm/base.py +411 -0
- maque/llm/server.py +366 -0
- maque/mcp_server.py +1096 -0
- maque/mllm_data_processor_pipeline/__init__.py +17 -0
- maque/mllm_data_processor_pipeline/core.py +341 -0
- maque/mllm_data_processor_pipeline/example.py +291 -0
- maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
- maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
- maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
- maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
- maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
- maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
- maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
- maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
- maque/mllm_data_processor_pipeline/web_app.py +317 -0
- maque/nlp/__init__.py +14 -0
- maque/nlp/ngram.py +9 -0
- maque/nlp/parser.py +63 -0
- maque/nlp/risk_matcher.py +543 -0
- maque/nlp/sentence_splitter.py +202 -0
- maque/nlp/simple_tradition_cvt.py +31 -0
- maque/performance/__init__.py +21 -0
- maque/performance/_measure_time.py +70 -0
- maque/performance/_profiler.py +367 -0
- maque/performance/_stat_memory.py +51 -0
- maque/pipelines/__init__.py +15 -0
- maque/pipelines/clustering.py +252 -0
- maque/quantization/__init__.py +42 -0
- maque/quantization/auto_round.py +120 -0
- maque/quantization/base.py +145 -0
- maque/quantization/bitsandbytes.py +127 -0
- maque/quantization/llm_compressor.py +102 -0
- maque/retriever/__init__.py +35 -0
- maque/retriever/chroma.py +654 -0
- maque/retriever/document.py +140 -0
- maque/retriever/milvus.py +1140 -0
- maque/table_ops/__init__.py +1 -0
- maque/table_ops/core.py +133 -0
- maque/table_viewer/__init__.py +4 -0
- maque/table_viewer/download_assets.py +57 -0
- maque/table_viewer/server.py +698 -0
- maque/table_viewer/static/element-plus-icons.js +5791 -0
- maque/table_viewer/static/element-plus.css +1 -0
- maque/table_viewer/static/element-plus.js +65236 -0
- maque/table_viewer/static/main.css +268 -0
- maque/table_viewer/static/main.js +669 -0
- maque/table_viewer/static/vue.global.js +18227 -0
- maque/table_viewer/templates/index.html +401 -0
- maque/utils/__init__.py +56 -0
- maque/utils/color.py +68 -0
- maque/utils/color_string.py +45 -0
- maque/utils/compress.py +66 -0
- maque/utils/constant.py +183 -0
- maque/utils/core.py +261 -0
- maque/utils/cursor.py +143 -0
- maque/utils/distance.py +58 -0
- maque/utils/docker.py +96 -0
- maque/utils/downloads.py +51 -0
- maque/utils/excel_helper.py +542 -0
- maque/utils/helper_metrics.py +121 -0
- maque/utils/helper_parser.py +168 -0
- maque/utils/net.py +64 -0
- maque/utils/nvidia_stat.py +140 -0
- maque/utils/ops.py +53 -0
- maque/utils/packages.py +31 -0
- maque/utils/path.py +57 -0
- maque/utils/tar.py +260 -0
- maque/utils/untar.py +129 -0
- maque/web/__init__.py +0 -0
- maque/web/image_downloader.py +1410 -0
- maque-0.2.1.dist-info/METADATA +450 -0
- maque-0.2.1.dist-info/RECORD +143 -0
- maque-0.2.1.dist-info/WHEEL +4 -0
- maque-0.2.1.dist-info/entry_points.txt +3 -0
- maque-0.2.1.dist-info/licenses/LICENSE +21 -0
maque/mllm_data_processor_pipeline/steps/format_conversion.py
@@ -0,0 +1,411 @@
+"""
+Step 6: format conversion
+Organizes data into the training format; supports multiple output formats.
+"""
+
+import pandas as pd
+import json
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+from ..core import PipelineStep, StepResult, PipelineConfig
+from maque.performance import MeasureTime
+
+
+class FormatConversionStep(PipelineStep):
+    """Format conversion step."""
+
+    CONFIG_SCHEMA = {
+        "type": "object",
+        "properties": {
+            "output_formats": {
+                "type": "array",
+                "items": {
+                    "type": "string",
+                    "enum": ["jsonl", "csv", "json", "parquet", "hf_dataset"]
+                },
+                "default": ["jsonl", "csv"],
+                "description": "List of output formats"
+            },
+            "format_configs": {
+                "type": "object",
+                "properties": {
+                    "jsonl": {
+                        "type": "object",
+                        "properties": {
+                            "filename": {"type": "string", "default": "training_data.jsonl"},
+                            "include_metadata": {"type": "boolean", "default": False}
+                        }
+                    },
+                    "csv": {
+                        "type": "object",
+                        "properties": {
+                            "filename": {"type": "string", "default": "training_data.csv"},
+                            "encoding": {"type": "string", "default": "utf-8"}
+                        }
+                    },
+                    "json": {
+                        "type": "object",
+                        "properties": {
+                            "filename": {"type": "string", "default": "training_data.json"},
+                            "indent": {"type": "integer", "default": 2}
+                        }
+                    },
+                    "parquet": {
+                        "type": "object",
+                        "properties": {
+                            "filename": {"type": "string", "default": "training_data.parquet"}
+                        }
+                    },
+                    "hf_dataset": {
+                        "type": "object",
+                        "properties": {
+                            "dataset_name": {"type": "string", "default": "mllm_training_data"},
+                            "split_ratios": {
+                                "type": "object",
+                                "properties": {
+                                    "train": {"type": "number", "default": 0.8},
+                                    "validation": {"type": "number", "default": 0.1},
+                                    "test": {"type": "number", "default": 0.1}
+                                }
+                            }
+                        }
+                    }
+                },
+                "default": {}
+            },
+            "field_mapping": {
+                "type": "object",
+                "properties": {
+                    "text_field": {"type": "string", "default": "text"},
+                    "images_field": {"type": "string", "default": "images"},
+                    "labels_field": {"type": "string", "default": "labels"},
+                    "metadata_fields": {
+                        "type": "array",
+                        "items": {"type": "string"},
+                        "default": ["summary", "tags", "sentiment", "quality"]
+                    }
+                },
+                "default": {}
+            },
+            "filtering": {
+                "type": "object",
+                "properties": {
+                    "only_successful": {"type": "boolean", "default": True},
+                    "quality_threshold": {"type": "number", "default": 0.0},
+                    "exclude_columns": {
+                        "type": "array",
+                        "items": {"type": "string"},
+                        "default": ["__row_id", "__processing_status", "__loaded_at", "__aligned_at", "__validated_at", "__annotated_at", "__refined_at"]
+                    }
+                },
+                "default": {}
+            },
+            "data_augmentation": {
+                "type": "object",
+                "properties": {
+                    "duplicate_successful": {"type": "boolean", "default": False},
+                    "add_negative_samples": {"type": "boolean", "default": False}
+                },
+                "default": {}
+            }
+        }
+    }
+
+    def __init__(self, name: str = "format_conversion", config: Dict[str, Any] = None):
+        super().__init__(name, config)
+
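The CONFIG_SCHEMA above is the step's entire configuration surface. As a hedged sketch (the keys and defaults come from the schema, but how `get_step_config` in `core.py` merges constructor config with pipeline-level config is not part of this diff), a config selecting JSONL and Parquet output might look like:

```python
# Hypothetical config dict; keys and defaults follow CONFIG_SCHEMA above.
format_conversion_config = {
    "output_formats": ["jsonl", "parquet"],
    "format_configs": {
        "jsonl": {"filename": "train.jsonl", "include_metadata": False},
        "parquet": {"filename": "train.parquet"},
    },
    "field_mapping": {
        "text_field": "text",
        "images_field": "images",
        "labels_field": "labels",
        "metadata_fields": ["summary", "tags"],
    },
    "filtering": {"only_successful": True, "quality_threshold": 0.0},
}

step = FormatConversionStep(config=format_conversion_config)
```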
+    async def execute(self, data: pd.DataFrame, config: PipelineConfig) -> StepResult:
+        """Run the format conversion."""
+        with MeasureTime(f"Step {self.name}"):
+            try:
+                step_config = self.get_step_config(config)
+
+                # Filter and preprocess the data
+                processed_data = await self._preprocess_data(data, step_config)
+
+                # Convert to the training format
+                training_data = await self._convert_to_training_format(processed_data, step_config)
+
+                # Export to multiple formats
+                output_files = await self._export_multiple_formats(training_data, config, step_config)
+
+                # Generate statistics
+                stats = await self._generate_statistics(training_data, processed_data)
+
+                metadata = {
+                    "original_rows": len(data),
+                    "processed_rows": len(processed_data),
+                    "training_samples": len(training_data),
+                    "output_files": output_files,
+                    "statistics": stats
+                }
+
+                self.logger.info(f"Format conversion finished; produced {len(training_data)} training samples")
+
+                return StepResult(
+                    step_name=self.name,
+                    success=True,
+                    data=processed_data,  # return the processed data
+                    metadata=metadata
+                )
+
+            except Exception as e:
+                self.logger.error(f"Format conversion failed: {e}")
+                return StepResult(
+                    step_name=self.name,
+                    success=False,
+                    data=data,
+                    metadata={},
+                    error=str(e)
+                )
+
+    async def _preprocess_data(self, data: pd.DataFrame, step_config: Dict[str, Any]) -> pd.DataFrame:
+        """Preprocess the data."""
+        data_copy = data.copy()
+        filtering = step_config.get("filtering", {})
+
+        # Keep only successfully processed rows
+        if filtering.get("only_successful", True):
+            if '__mllm_annotation_success' in data_copy.columns:
+                data_copy = data_copy[data_copy['__mllm_annotation_success'] == True]
+                self.logger.info(f"Kept successfully annotated rows; {len(data_copy)} rows remain")
+
+        # Quality-threshold filtering
+        quality_threshold = filtering.get("quality_threshold", 0.0)
+        if quality_threshold > 0:
+            # Filtering here should follow the actual quality-assessment logic
+            pass
+
+        # Drop unwanted columns
+        exclude_columns = filtering.get("exclude_columns", [])
+        columns_to_drop = [col for col in exclude_columns if col in data_copy.columns]
+        if columns_to_drop:
+            data_copy = data_copy.drop(columns=columns_to_drop)
+            self.logger.info(f"Dropped {len(columns_to_drop)} metadata columns")
+
+        return data_copy
+
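To make the filtering concrete, a minimal sketch of what `_preprocess_data` does to a small frame (the frame is invented for illustration; the column names are the pipeline's defaults from the schema above):

```python
import pandas as pd

# One successful and one failed row, plus a bookkeeping column that is
# in the default exclude_columns list.
df = pd.DataFrame({
    "text": ["good sample", "bad sample"],
    "__mllm_annotation_success": [True, False],
    "__row_id": [0, 1],
})

# only_successful keeps the first row; exclude_columns drops __row_id.
kept = df[df["__mllm_annotation_success"] == True].drop(columns=["__row_id"])
```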
+    async def _convert_to_training_format(self, data: pd.DataFrame, step_config: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """Convert to the training format."""
+        field_mapping = step_config.get("field_mapping", {})
+        text_field = field_mapping.get("text_field", "text")
+        images_field = field_mapping.get("images_field", "images")
+        labels_field = field_mapping.get("labels_field", "labels")
+        metadata_fields = field_mapping.get("metadata_fields", ["summary", "tags", "sentiment", "quality"])
+
+        training_data = []
+
+        for idx, row in data.iterrows():
+            sample = {}
+
+            # Text field
+            text_content = str(row.get('text', '')) if pd.notna(row.get('text')) else ""
+            sample[text_field] = text_content
+
+            # Image field
+            images_content = str(row.get('images', '')) if pd.notna(row.get('images')) else ""
+            if images_content:
+                # Split the image paths
+                image_paths = [p.strip() for p in images_content.split('|') if p.strip()]
+                sample[images_field] = image_paths
+            else:
+                sample[images_field] = []
+
+            # Label fields (based on annotation results)
+            labels = {}
+
+            # Prefer refined annotations when present, otherwise fall back to the initial ones
+            for field in metadata_fields:
+                refined_field = f"refined_{field}"
+                original_field = f"mllm_{field}"
+
+                if refined_field in data.columns and pd.notna(row.get(refined_field)):
+                    labels[field] = str(row[refined_field])
+                elif original_field in data.columns and pd.notna(row.get(original_field)):
+                    labels[field] = str(row[original_field])
+                else:
+                    labels[field] = ""
+
+            sample[labels_field] = labels
+
+            # Add other useful fields
+            sample["id"] = f"sample_{idx}"
+            sample["source"] = "mllm_pipeline"
+
+            # Add harmful-content flags if present
+            if 'harmful_images' in data.columns:
+                sample["harmful_images"] = str(row.get('harmful_images', ''))
+            if 'harmful_text' in data.columns:
+                sample["harmful_text"] = str(row.get('harmful_text', ''))
+
+            training_data.append(sample)
+
+        return training_data
+
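With the default `field_mapping`, each element of the returned `training_data` list is a flat dict with a nested `labels` object. A representative sample (values invented; the structure follows the code above):

```python
# Shape of one training sample produced by _convert_to_training_format.
sample = {
    "text": "A dog running on the beach.",
    "images": ["imgs/0001_a.jpg", "imgs/0001_b.jpg"],  # 'images' cell split on '|'
    "labels": {                      # refined_* wins over mllm_* when present
        "summary": "dog on beach",
        "tags": "dog,beach,outdoor",
        "sentiment": "positive",
        "quality": "high",
    },
    "id": "sample_42",               # f"sample_{idx}" from the DataFrame index
    "source": "mllm_pipeline",
}
```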
+    async def _export_multiple_formats(self, training_data: List[Dict[str, Any]], config: PipelineConfig, step_config: Dict[str, Any]) -> List[str]:
+        """Export to multiple formats."""
+        output_formats = step_config.get("output_formats", ["jsonl", "csv"])
+        format_configs = step_config.get("format_configs", {})
+        output_files = []
+
+        output_dir = Path(config.output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        for format_type in output_formats:
+            try:
+                if format_type == "jsonl":
+                    file_path = await self._export_jsonl(training_data, output_dir, format_configs.get("jsonl", {}))
+                elif format_type == "csv":
+                    file_path = await self._export_csv(training_data, output_dir, format_configs.get("csv", {}))
+                elif format_type == "json":
+                    file_path = await self._export_json(training_data, output_dir, format_configs.get("json", {}))
+                elif format_type == "parquet":
+                    file_path = await self._export_parquet(training_data, output_dir, format_configs.get("parquet", {}))
+                elif format_type == "hf_dataset":
+                    file_path = await self._export_hf_dataset(training_data, output_dir, format_configs.get("hf_dataset", {}))
+                else:
+                    self.logger.warning(f"Unsupported output format: {format_type}")
+                    continue
+
+                output_files.append(file_path)
+                self.logger.info(f"Exported {format_type} format: {file_path}")
+
+            except Exception as e:
+                self.logger.error(f"Failed to export {format_type} format: {e}")
+
+        return output_files
+
+    async def _export_jsonl(self, training_data: List[Dict[str, Any]], output_dir: Path, config: Dict[str, Any]) -> str:
+        """Export as JSONL."""
+        filename = config.get("filename", "training_data.jsonl")
+        file_path = output_dir / filename
+
+        with open(file_path, 'w', encoding='utf-8') as f:
+            for sample in training_data:
+                f.write(json.dumps(sample, ensure_ascii=False) + '\n')
+
+        return str(file_path)
+
+    async def _export_csv(self, training_data: List[Dict[str, Any]], output_dir: Path, config: Dict[str, Any]) -> str:
+        """Export as CSV."""
+        filename = config.get("filename", "training_data.csv")
+        encoding = config.get("encoding", "utf-8")
+        file_path = output_dir / filename
+
+        # Serialize nested dicts and lists to strings
+        flattened_data = []
+        for sample in training_data:
+            flat_sample = {}
+            for key, value in sample.items():
+                if isinstance(value, (dict, list)):
+                    flat_sample[key] = json.dumps(value, ensure_ascii=False)
+                else:
+                    flat_sample[key] = value
+            flattened_data.append(flat_sample)
+
+        df = pd.DataFrame(flattened_data)
+        df.to_csv(file_path, index=False, encoding=encoding)
+
+        return str(file_path)
+
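Because the CSV export (and the Parquet export below) serializes nested dicts and lists with `json.dumps`, columns such as `labels` and `images` come back as JSON strings. A sketch of the corresponding read-side parsing:

```python
import json
import pandas as pd

# Recover the nested fields that _export_csv flattened to JSON strings.
df = pd.read_csv("training_data.csv")
df["labels"] = df["labels"].apply(json.loads)
df["images"] = df["images"].apply(json.loads)
```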
+    async def _export_json(self, training_data: List[Dict[str, Any]], output_dir: Path, config: Dict[str, Any]) -> str:
+        """Export as JSON."""
+        filename = config.get("filename", "training_data.json")
+        indent = config.get("indent", 2)
+        file_path = output_dir / filename
+
+        with open(file_path, 'w', encoding='utf-8') as f:
+            json.dump(training_data, f, ensure_ascii=False, indent=indent)
+
+        return str(file_path)
+
+    async def _export_parquet(self, training_data: List[Dict[str, Any]], output_dir: Path, config: Dict[str, Any]) -> str:
+        """Export as Parquet."""
+        filename = config.get("filename", "training_data.parquet")
+        file_path = output_dir / filename
+
+        # Serialize nested dicts and lists to strings
+        flattened_data = []
+        for sample in training_data:
+            flat_sample = {}
+            for key, value in sample.items():
+                if isinstance(value, (dict, list)):
+                    flat_sample[key] = json.dumps(value, ensure_ascii=False)
+                else:
+                    flat_sample[key] = value
+            flattened_data.append(flat_sample)
+
+        df = pd.DataFrame(flattened_data)
+        df.to_parquet(file_path, index=False)
+
+        return str(file_path)
+
+    async def _export_hf_dataset(self, training_data: List[Dict[str, Any]], output_dir: Path, config: Dict[str, Any]) -> str:
+        """Export as a HuggingFace Dataset."""
+        try:
+            from datasets import Dataset
+        except ImportError:
+            self.logger.error("The datasets library is required to export the HuggingFace format")
+            raise ImportError("pip install datasets")
+
+        dataset_name = config.get("dataset_name", "mllm_training_data")
+        split_ratios = config.get("split_ratios", {"train": 0.8, "validation": 0.1, "test": 0.1})
+
+        # Build the dataset
+        dataset = Dataset.from_list(training_data)
+
+        # Split the dataset
+        if len(split_ratios) > 1:
+            train_ratio = split_ratios.get("train", 0.8)
+            val_ratio = split_ratios.get("validation", 0.1)
+
+            # First split the train set from the rest
+            train_test_split = dataset.train_test_split(test_size=1-train_ratio, seed=42)
+            train_dataset = train_test_split["train"]
+            temp_dataset = train_test_split["test"]
+
+            # Then split the remainder into validation and test sets
+            if val_ratio > 0:
+                val_test_ratio = val_ratio / (val_ratio + split_ratios.get("test", 0.1))
+                val_test_split = temp_dataset.train_test_split(test_size=1-val_test_ratio, seed=42)
+                val_dataset = val_test_split["train"]
+                test_dataset = val_test_split["test"]
+            else:
+                val_dataset = None
+                test_dataset = temp_dataset
+
+            # Save the split datasets
+            dataset_dir = output_dir / dataset_name
+            train_dataset.save_to_disk(str(dataset_dir / "train"))
+            if val_dataset:
+                val_dataset.save_to_disk(str(dataset_dir / "validation"))
+            test_dataset.save_to_disk(str(dataset_dir / "test"))
+        else:
+            # No split; save directly
+            dataset_dir = output_dir / dataset_name
+            dataset.save_to_disk(str(dataset_dir))
+
+        return str(dataset_dir)
+
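The two-stage split reproduces the requested ratios. Worked through for the defaults `{"train": 0.8, "validation": 0.1, "test": 0.1}`: the first `train_test_split` holds out `1 - 0.8 = 0.2` of the data, and `val_test_ratio = 0.1 / (0.1 + 0.1) = 0.5` then splits that remainder evenly:

```python
# Split arithmetic for 1,000 samples with the default ratios.
n = 1000
train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1

held_out = 1 - train_ratio                              # 0.2 -> 200 samples
val_test_ratio = val_ratio / (val_ratio + test_ratio)   # 0.5

print(int(n * train_ratio))                      # 800 train
print(int(n * held_out * val_test_ratio))        # 100 validation
print(int(n * held_out * (1 - val_test_ratio)))  # 100 test
```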
+    async def _generate_statistics(self, training_data: List[Dict[str, Any]], processed_data: pd.DataFrame) -> Dict[str, Any]:
+        """Generate statistics."""
+        stats = {
+            "total_samples": len(training_data),
+            "samples_with_images": len([s for s in training_data if s.get("images", [])]),
+            "samples_with_text": len([s for s in training_data if s.get("text", "").strip()]),
+            "average_text_length": 0,
+            "image_count_distribution": {},
+            "quality_distribution": {}
+        }
+
+        # Text-length statistics
+        text_lengths = [len(s.get("text", "")) for s in training_data]
+        if text_lengths:
+            stats["average_text_length"] = sum(text_lengths) / len(text_lengths)
+
+        # Image-count distribution
+        image_counts = [len(s.get("images", [])) for s in training_data]
+        for count in image_counts:
+            stats["image_count_distribution"][str(count)] = stats["image_count_distribution"].get(str(count), 0) + 1
+
+        return stats