maque 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. maque/__init__.py +30 -0
  2. maque/__main__.py +926 -0
  3. maque/ai_platform/__init__.py +0 -0
  4. maque/ai_platform/crawl.py +45 -0
  5. maque/ai_platform/metrics.py +258 -0
  6. maque/ai_platform/nlp_preprocess.py +67 -0
  7. maque/ai_platform/webpage_screen_shot.py +195 -0
  8. maque/algorithms/__init__.py +78 -0
  9. maque/algorithms/bezier.py +15 -0
  10. maque/algorithms/bktree.py +117 -0
  11. maque/algorithms/core.py +104 -0
  12. maque/algorithms/hilbert.py +16 -0
  13. maque/algorithms/rate_function.py +92 -0
  14. maque/algorithms/transform.py +27 -0
  15. maque/algorithms/trie.py +272 -0
  16. maque/algorithms/utils.py +63 -0
  17. maque/algorithms/video.py +587 -0
  18. maque/api/__init__.py +1 -0
  19. maque/api/common.py +110 -0
  20. maque/api/fetch.py +26 -0
  21. maque/api/static/icon.png +0 -0
  22. maque/api/static/redoc.standalone.js +1782 -0
  23. maque/api/static/swagger-ui-bundle.js +3 -0
  24. maque/api/static/swagger-ui.css +3 -0
  25. maque/cli/__init__.py +1 -0
  26. maque/cli/clean_invisible_chars.py +324 -0
  27. maque/cli/core.py +34 -0
  28. maque/cli/groups/__init__.py +26 -0
  29. maque/cli/groups/config.py +205 -0
  30. maque/cli/groups/data.py +615 -0
  31. maque/cli/groups/doctor.py +259 -0
  32. maque/cli/groups/embedding.py +222 -0
  33. maque/cli/groups/git.py +29 -0
  34. maque/cli/groups/help.py +410 -0
  35. maque/cli/groups/llm.py +223 -0
  36. maque/cli/groups/mcp.py +241 -0
  37. maque/cli/groups/mllm.py +1795 -0
  38. maque/cli/groups/mllm_simple.py +60 -0
  39. maque/cli/groups/quant.py +210 -0
  40. maque/cli/groups/service.py +490 -0
  41. maque/cli/groups/system.py +570 -0
  42. maque/cli/mllm_run.py +1451 -0
  43. maque/cli/script.py +52 -0
  44. maque/cli/tree.py +49 -0
  45. maque/clustering/__init__.py +52 -0
  46. maque/clustering/analyzer.py +347 -0
  47. maque/clustering/clusterers.py +464 -0
  48. maque/clustering/sampler.py +134 -0
  49. maque/clustering/visualizer.py +205 -0
  50. maque/constant.py +13 -0
  51. maque/core.py +133 -0
  52. maque/cv/__init__.py +1 -0
  53. maque/cv/image.py +219 -0
  54. maque/cv/utils.py +68 -0
  55. maque/cv/video/__init__.py +3 -0
  56. maque/cv/video/keyframe_extractor.py +368 -0
  57. maque/embedding/__init__.py +43 -0
  58. maque/embedding/base.py +56 -0
  59. maque/embedding/multimodal.py +308 -0
  60. maque/embedding/server.py +523 -0
  61. maque/embedding/text.py +311 -0
  62. maque/git/__init__.py +24 -0
  63. maque/git/pure_git.py +912 -0
  64. maque/io/__init__.py +29 -0
  65. maque/io/core.py +38 -0
  66. maque/io/ops.py +194 -0
  67. maque/llm/__init__.py +111 -0
  68. maque/llm/backend.py +416 -0
  69. maque/llm/base.py +411 -0
  70. maque/llm/server.py +366 -0
  71. maque/mcp_server.py +1096 -0
  72. maque/mllm_data_processor_pipeline/__init__.py +17 -0
  73. maque/mllm_data_processor_pipeline/core.py +341 -0
  74. maque/mllm_data_processor_pipeline/example.py +291 -0
  75. maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
  76. maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
  77. maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
  78. maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
  79. maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
  80. maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
  81. maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
  82. maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
  83. maque/mllm_data_processor_pipeline/web_app.py +317 -0
  84. maque/nlp/__init__.py +14 -0
  85. maque/nlp/ngram.py +9 -0
  86. maque/nlp/parser.py +63 -0
  87. maque/nlp/risk_matcher.py +543 -0
  88. maque/nlp/sentence_splitter.py +202 -0
  89. maque/nlp/simple_tradition_cvt.py +31 -0
  90. maque/performance/__init__.py +21 -0
  91. maque/performance/_measure_time.py +70 -0
  92. maque/performance/_profiler.py +367 -0
  93. maque/performance/_stat_memory.py +51 -0
  94. maque/pipelines/__init__.py +15 -0
  95. maque/pipelines/clustering.py +252 -0
  96. maque/quantization/__init__.py +42 -0
  97. maque/quantization/auto_round.py +120 -0
  98. maque/quantization/base.py +145 -0
  99. maque/quantization/bitsandbytes.py +127 -0
  100. maque/quantization/llm_compressor.py +102 -0
  101. maque/retriever/__init__.py +35 -0
  102. maque/retriever/chroma.py +654 -0
  103. maque/retriever/document.py +140 -0
  104. maque/retriever/milvus.py +1140 -0
  105. maque/table_ops/__init__.py +1 -0
  106. maque/table_ops/core.py +133 -0
  107. maque/table_viewer/__init__.py +4 -0
  108. maque/table_viewer/download_assets.py +57 -0
  109. maque/table_viewer/server.py +698 -0
  110. maque/table_viewer/static/element-plus-icons.js +5791 -0
  111. maque/table_viewer/static/element-plus.css +1 -0
  112. maque/table_viewer/static/element-plus.js +65236 -0
  113. maque/table_viewer/static/main.css +268 -0
  114. maque/table_viewer/static/main.js +669 -0
  115. maque/table_viewer/static/vue.global.js +18227 -0
  116. maque/table_viewer/templates/index.html +401 -0
  117. maque/utils/__init__.py +56 -0
  118. maque/utils/color.py +68 -0
  119. maque/utils/color_string.py +45 -0
  120. maque/utils/compress.py +66 -0
  121. maque/utils/constant.py +183 -0
  122. maque/utils/core.py +261 -0
  123. maque/utils/cursor.py +143 -0
  124. maque/utils/distance.py +58 -0
  125. maque/utils/docker.py +96 -0
  126. maque/utils/downloads.py +51 -0
  127. maque/utils/excel_helper.py +542 -0
  128. maque/utils/helper_metrics.py +121 -0
  129. maque/utils/helper_parser.py +168 -0
  130. maque/utils/net.py +64 -0
  131. maque/utils/nvidia_stat.py +140 -0
  132. maque/utils/ops.py +53 -0
  133. maque/utils/packages.py +31 -0
  134. maque/utils/path.py +57 -0
  135. maque/utils/tar.py +260 -0
  136. maque/utils/untar.py +129 -0
  137. maque/web/__init__.py +0 -0
  138. maque/web/image_downloader.py +1410 -0
  139. maque-0.2.1.dist-info/METADATA +450 -0
  140. maque-0.2.1.dist-info/RECORD +143 -0
  141. maque-0.2.1.dist-info/WHEEL +4 -0
  142. maque-0.2.1.dist-info/entry_points.txt +3 -0
  143. maque-0.2.1.dist-info/licenses/LICENSE +21 -0
maque/mllm_data_processor_pipeline/steps/format_conversion.py
@@ -0,0 +1,411 @@
+ """
+ Step 6: format conversion.
+ Arranges data into the training format; supports multiple output formats.
+ """
+
+ import pandas as pd
+ import json
+ from pathlib import Path
+ from typing import List, Dict, Any, Optional
+ from ..core import PipelineStep, StepResult, PipelineConfig
+ from maque.performance import MeasureTime
+
+
+ class FormatConversionStep(PipelineStep):
+     """Format conversion step."""
+
+     CONFIG_SCHEMA = {
+         "type": "object",
+         "properties": {
+             "output_formats": {
+                 "type": "array",
+                 "items": {
+                     "type": "string",
+                     "enum": ["jsonl", "csv", "json", "parquet", "hf_dataset"]
+                 },
+                 "default": ["jsonl", "csv"],
+                 "description": "List of output formats"
+             },
+             "format_configs": {
+                 "type": "object",
+                 "properties": {
+                     "jsonl": {
+                         "type": "object",
+                         "properties": {
+                             "filename": {"type": "string", "default": "training_data.jsonl"},
+                             "include_metadata": {"type": "boolean", "default": False}
+                         }
+                     },
+                     "csv": {
+                         "type": "object",
+                         "properties": {
+                             "filename": {"type": "string", "default": "training_data.csv"},
+                             "encoding": {"type": "string", "default": "utf-8"}
+                         }
+                     },
+                     "json": {
+                         "type": "object",
+                         "properties": {
+                             "filename": {"type": "string", "default": "training_data.json"},
+                             "indent": {"type": "integer", "default": 2}
+                         }
+                     },
+                     "parquet": {
+                         "type": "object",
+                         "properties": {
+                             "filename": {"type": "string", "default": "training_data.parquet"}
+                         }
+                     },
+                     "hf_dataset": {
+                         "type": "object",
+                         "properties": {
+                             "dataset_name": {"type": "string", "default": "mllm_training_data"},
+                             "split_ratios": {
+                                 "type": "object",
+                                 "properties": {
+                                     "train": {"type": "number", "default": 0.8},
+                                     "validation": {"type": "number", "default": 0.1},
+                                     "test": {"type": "number", "default": 0.1}
+                                 }
+                             }
+                         }
+                     }
+                 },
+                 "default": {}
+             },
+             "field_mapping": {
+                 "type": "object",
+                 "properties": {
+                     "text_field": {"type": "string", "default": "text"},
+                     "images_field": {"type": "string", "default": "images"},
+                     "labels_field": {"type": "string", "default": "labels"},
+                     "metadata_fields": {
+                         "type": "array",
+                         "items": {"type": "string"},
+                         "default": ["summary", "tags", "sentiment", "quality"]
+                     }
+                 },
+                 "default": {}
+             },
+             "filtering": {
+                 "type": "object",
+                 "properties": {
+                     "only_successful": {"type": "boolean", "default": True},
+                     "quality_threshold": {"type": "number", "default": 0.0},
+                     "exclude_columns": {
+                         "type": "array",
+                         "items": {"type": "string"},
+                         "default": ["__row_id", "__processing_status", "__loaded_at", "__aligned_at", "__validated_at", "__annotated_at", "__refined_at"]
+                     }
+                 },
+                 "default": {}
+             },
+             "data_augmentation": {
+                 "type": "object",
+                 "properties": {
+                     "duplicate_successful": {"type": "boolean", "default": False},
+                     "add_negative_samples": {"type": "boolean", "default": False}
+                 },
+                 "default": {}
+             }
+         }
+     }
+
+     def __init__(self, name: str = "format_conversion", config: Optional[Dict[str, Any]] = None):
+         super().__init__(name, config)
+
+     async def execute(self, data: pd.DataFrame, config: PipelineConfig) -> StepResult:
+         """Run the format conversion."""
+         with MeasureTime(f"Step {self.name}"):
+             try:
+                 step_config = self.get_step_config(config)
+
+                 # Filter and preprocess the data
+                 processed_data = await self._preprocess_data(data, step_config)
+
+                 # Convert to the training format
+                 training_data = await self._convert_to_training_format(processed_data, step_config)
+
+                 # Export to the requested formats
+                 output_files = await self._export_multiple_formats(training_data, config, step_config)
+
+                 # Generate statistics
+                 stats = await self._generate_statistics(training_data, processed_data)
+
+                 metadata = {
+                     "original_rows": len(data),
+                     "processed_rows": len(processed_data),
+                     "training_samples": len(training_data),
+                     "output_files": output_files,
+                     "statistics": stats
+                 }
+
+                 self.logger.info(f"Format conversion finished; generated {len(training_data)} training samples")
+
+                 return StepResult(
+                     step_name=self.name,
+                     success=True,
+                     data=processed_data,  # return the processed data
+                     metadata=metadata
+                 )
+
+             except Exception as e:
+                 self.logger.error(f"Format conversion failed: {e}")
+                 return StepResult(
+                     step_name=self.name,
+                     success=False,
+                     data=data,
+                     metadata={},
+                     error=str(e)
+                 )
+
+     async def _preprocess_data(self, data: pd.DataFrame, step_config: Dict[str, Any]) -> pd.DataFrame:
+         """Filter and preprocess the data."""
+         data_copy = data.copy()
+         filtering = step_config.get("filtering", {})
+
+         # Keep only rows that were annotated successfully
+         if filtering.get("only_successful", True):
+             if '__mllm_annotation_success' in data_copy.columns:
+                 data_copy = data_copy[data_copy['__mllm_annotation_success'] == True]
+                 self.logger.info(f"Kept successfully annotated rows; {len(data_copy)} rows remain")
+
+         # Quality-threshold filtering
+         quality_threshold = filtering.get("quality_threshold", 0.0)
+         if quality_threshold > 0:
+             # Filtering here depends on the actual quality-scoring logic
+             pass
+
+         # Drop unwanted columns
+         exclude_columns = filtering.get("exclude_columns", [])
+         columns_to_drop = [col for col in exclude_columns if col in data_copy.columns]
+         if columns_to_drop:
+             data_copy = data_copy.drop(columns=columns_to_drop)
+             self.logger.info(f"Dropped {len(columns_to_drop)} metadata columns")
+
+         return data_copy
+
+     async def _convert_to_training_format(self, data: pd.DataFrame, step_config: Dict[str, Any]) -> List[Dict[str, Any]]:
+         """Convert rows into the training format."""
+         field_mapping = step_config.get("field_mapping", {})
+         text_field = field_mapping.get("text_field", "text")
+         images_field = field_mapping.get("images_field", "images")
+         labels_field = field_mapping.get("labels_field", "labels")
+         metadata_fields = field_mapping.get("metadata_fields", ["summary", "tags", "sentiment", "quality"])
+
+         training_data = []
+
+         for idx, row in data.iterrows():
+             sample = {}
+
+             # Text field
+             text_content = str(row.get('text', '')) if pd.notna(row.get('text')) else ""
+             sample[text_field] = text_content
+
+             # Images field
+             images_content = str(row.get('images', '')) if pd.notna(row.get('images')) else ""
+             if images_content:
+                 # Split the image paths
+                 image_paths = [p.strip() for p in images_content.split('|') if p.strip()]
+                 sample[images_field] = image_paths
+             else:
+                 sample[images_field] = []
+
+             # Label fields (based on annotation results)
+             labels = {}
+
+             # Prefer the refined annotation when present, otherwise fall back to the initial one
+             for field in metadata_fields:
+                 refined_field = f"refined_{field}"
+                 original_field = f"mllm_{field}"
+
+                 if refined_field in data.columns and pd.notna(row.get(refined_field)):
+                     labels[field] = str(row[refined_field])
+                 elif original_field in data.columns and pd.notna(row.get(original_field)):
+                     labels[field] = str(row[original_field])
+                 else:
+                     labels[field] = ""
+
+             sample[labels_field] = labels
+
+             # Add other useful fields
+             sample["id"] = f"sample_{idx}"
+             sample["source"] = "mllm_pipeline"
+
+             # Carry over harmful-content flags if present
+             if 'harmful_images' in data.columns:
+                 sample["harmful_images"] = str(row.get('harmful_images', ''))
+             if 'harmful_text' in data.columns:
+                 sample["harmful_text"] = str(row.get('harmful_text', ''))
+
+             training_data.append(sample)
+
+         return training_data
+
+     async def _export_multiple_formats(self, training_data: List[Dict[str, Any]], config: PipelineConfig, step_config: Dict[str, Any]) -> List[str]:
+         """Export to multiple formats."""
+         output_formats = step_config.get("output_formats", ["jsonl", "csv"])
+         format_configs = step_config.get("format_configs", {})
+         output_files = []
+
+         output_dir = Path(config.output_dir)
+         output_dir.mkdir(parents=True, exist_ok=True)
+
+         for format_type in output_formats:
+             try:
+                 if format_type == "jsonl":
+                     file_path = await self._export_jsonl(training_data, output_dir, format_configs.get("jsonl", {}))
+                 elif format_type == "csv":
+                     file_path = await self._export_csv(training_data, output_dir, format_configs.get("csv", {}))
+                 elif format_type == "json":
+                     file_path = await self._export_json(training_data, output_dir, format_configs.get("json", {}))
+                 elif format_type == "parquet":
+                     file_path = await self._export_parquet(training_data, output_dir, format_configs.get("parquet", {}))
+                 elif format_type == "hf_dataset":
+                     file_path = await self._export_hf_dataset(training_data, output_dir, format_configs.get("hf_dataset", {}))
+                 else:
+                     self.logger.warning(f"Unsupported output format: {format_type}")
+                     continue
+
+                 output_files.append(file_path)
+                 self.logger.info(f"Exported {format_type} format: {file_path}")
+
+             except Exception as e:
+                 self.logger.error(f"Failed to export {format_type} format: {e}")
+
+         return output_files
+
+     async def _export_jsonl(self, training_data: List[Dict[str, Any]], output_dir: Path, config: Dict[str, Any]) -> str:
+         """Export as JSONL."""
+         filename = config.get("filename", "training_data.jsonl")
+         file_path = output_dir / filename
+
+         with open(file_path, 'w', encoding='utf-8') as f:
+             for sample in training_data:
+                 f.write(json.dumps(sample, ensure_ascii=False) + '\n')
+
+         return str(file_path)
+
+     async def _export_csv(self, training_data: List[Dict[str, Any]], output_dir: Path, config: Dict[str, Any]) -> str:
+         """Export as CSV."""
+         filename = config.get("filename", "training_data.csv")
+         encoding = config.get("encoding", "utf-8")
+         file_path = output_dir / filename
+
+         # Serialize nested dicts and lists as JSON strings
+         flattened_data = []
+         for sample in training_data:
+             flat_sample = {}
+             for key, value in sample.items():
+                 if isinstance(value, (dict, list)):
+                     flat_sample[key] = json.dumps(value, ensure_ascii=False)
+                 else:
+                     flat_sample[key] = value
+             flattened_data.append(flat_sample)
+
+         df = pd.DataFrame(flattened_data)
+         df.to_csv(file_path, index=False, encoding=encoding)
+
+         return str(file_path)
+
+     async def _export_json(self, training_data: List[Dict[str, Any]], output_dir: Path, config: Dict[str, Any]) -> str:
+         """Export as JSON."""
+         filename = config.get("filename", "training_data.json")
+         indent = config.get("indent", 2)
+         file_path = output_dir / filename
+
+         with open(file_path, 'w', encoding='utf-8') as f:
+             json.dump(training_data, f, ensure_ascii=False, indent=indent)
+
+         return str(file_path)
+
+     async def _export_parquet(self, training_data: List[Dict[str, Any]], output_dir: Path, config: Dict[str, Any]) -> str:
+         """Export as Parquet."""
+         filename = config.get("filename", "training_data.parquet")
+         file_path = output_dir / filename
+
+         # Serialize nested dicts and lists as JSON strings
+         flattened_data = []
+         for sample in training_data:
+             flat_sample = {}
+             for key, value in sample.items():
+                 if isinstance(value, (dict, list)):
+                     flat_sample[key] = json.dumps(value, ensure_ascii=False)
+                 else:
+                     flat_sample[key] = value
+             flattened_data.append(flat_sample)
+
+         df = pd.DataFrame(flattened_data)
+         df.to_parquet(file_path, index=False)
+
+         return str(file_path)
+
+     async def _export_hf_dataset(self, training_data: List[Dict[str, Any]], output_dir: Path, config: Dict[str, Any]) -> str:
+         """Export as a HuggingFace Dataset."""
+         try:
+             from datasets import Dataset
+         except ImportError:
+             self.logger.error("The datasets library is required to export the HuggingFace format")
+             raise ImportError("pip install datasets")
+
+         dataset_name = config.get("dataset_name", "mllm_training_data")
+         split_ratios = config.get("split_ratios", {"train": 0.8, "validation": 0.1, "test": 0.1})
+
+         # Build the dataset
+         dataset = Dataset.from_list(training_data)
+
+         # Split the dataset
+         if len(split_ratios) > 1:
+             train_ratio = split_ratios.get("train", 0.8)
+             val_ratio = split_ratios.get("validation", 0.1)
+
+             # First split off the training set
+             train_test_split = dataset.train_test_split(test_size=1 - train_ratio, seed=42)
+             train_dataset = train_test_split["train"]
+             temp_dataset = train_test_split["test"]
+
+             # Then split the remainder into validation and test sets
+             if val_ratio > 0:
+                 val_test_ratio = val_ratio / (val_ratio + split_ratios.get("test", 0.1))
+                 val_test_split = temp_dataset.train_test_split(test_size=1 - val_test_ratio, seed=42)
+                 val_dataset = val_test_split["train"]
+                 test_dataset = val_test_split["test"]
+             else:
+                 val_dataset = None
+                 test_dataset = temp_dataset
+
+             # Save the split datasets
+             dataset_dir = output_dir / dataset_name
+             train_dataset.save_to_disk(str(dataset_dir / "train"))
+             if val_dataset:
+                 val_dataset.save_to_disk(str(dataset_dir / "validation"))
+             test_dataset.save_to_disk(str(dataset_dir / "test"))
+         else:
+             # No split; save directly
+             dataset_dir = output_dir / dataset_name
+             dataset.save_to_disk(str(dataset_dir))
+
+         return str(dataset_dir)
+
+     async def _generate_statistics(self, training_data: List[Dict[str, Any]], processed_data: pd.DataFrame) -> Dict[str, Any]:
+         """Generate statistics."""
+         stats = {
+             "total_samples": len(training_data),
+             "samples_with_images": len([s for s in training_data if s.get("images", [])]),
+             "samples_with_text": len([s for s in training_data if s.get("text", "").strip()]),
+             "average_text_length": 0,
+             "image_count_distribution": {},
+             "quality_distribution": {}
+         }
+
+         # Text-length statistics
+         text_lengths = [len(s.get("text", "")) for s in training_data]
+         if text_lengths:
+             stats["average_text_length"] = sum(text_lengths) / len(text_lengths)
+
+         # Image-count distribution
+         image_counts = [len(s.get("images", [])) for s in training_data]
+         for count in image_counts:
+             stats["image_count_distribution"][str(count)] = stats["image_count_distribution"].get(str(count), 0) + 1
+
+         return stats
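
For context, here is a minimal driver sketch for running this step on its own, outside the full pipeline. It assumes PipelineConfig accepts an output_dir argument (the only attribute this step reads from it) and that get_step_config picks up the dict passed to the constructor; both are defined in maque/mllm_data_processor_pipeline/core.py, which is not part of this diff, so treat the wiring as illustrative rather than authoritative.

    import asyncio
    import pandas as pd

    from maque.mllm_data_processor_pipeline.core import PipelineConfig
    from maque.mllm_data_processor_pipeline.steps.format_conversion import FormatConversionStep

    # Toy input; column names mirror what FormatConversionStep reads above.
    df = pd.DataFrame({
        "text": ["a red bicycle leaning against a wall"],
        "images": ["imgs/001.jpg|imgs/002.jpg"],           # '|'-separated paths
        "mllm_summary": ["bicycle photo"],                  # initial annotation
        "refined_summary": ["red bicycle against a wall"],  # refined value takes precedence
        "__mllm_annotation_success": [True],                # passes the only_successful filter
    })

    async def main():
        config = PipelineConfig(output_dir="./out")  # assumed constructor signature
        step = FormatConversionStep(config={"output_formats": ["jsonl", "csv"]})
        result = await step.execute(df, config)
        print(result.success, result.metadata["output_files"])

    asyncio.run(main())

Note on the hf_dataset export: with the default split_ratios (train 0.8, validation 0.1, test 0.1), train_test_split first carves off 80% for train, and val_test_ratio = 0.1 / (0.1 + 0.1) = 0.5 then splits the remaining 20% evenly into validation and test.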