maque 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. maque/__init__.py +30 -0
  2. maque/__main__.py +926 -0
  3. maque/ai_platform/__init__.py +0 -0
  4. maque/ai_platform/crawl.py +45 -0
  5. maque/ai_platform/metrics.py +258 -0
  6. maque/ai_platform/nlp_preprocess.py +67 -0
  7. maque/ai_platform/webpage_screen_shot.py +195 -0
  8. maque/algorithms/__init__.py +78 -0
  9. maque/algorithms/bezier.py +15 -0
  10. maque/algorithms/bktree.py +117 -0
  11. maque/algorithms/core.py +104 -0
  12. maque/algorithms/hilbert.py +16 -0
  13. maque/algorithms/rate_function.py +92 -0
  14. maque/algorithms/transform.py +27 -0
  15. maque/algorithms/trie.py +272 -0
  16. maque/algorithms/utils.py +63 -0
  17. maque/algorithms/video.py +587 -0
  18. maque/api/__init__.py +1 -0
  19. maque/api/common.py +110 -0
  20. maque/api/fetch.py +26 -0
  21. maque/api/static/icon.png +0 -0
  22. maque/api/static/redoc.standalone.js +1782 -0
  23. maque/api/static/swagger-ui-bundle.js +3 -0
  24. maque/api/static/swagger-ui.css +3 -0
  25. maque/cli/__init__.py +1 -0
  26. maque/cli/clean_invisible_chars.py +324 -0
  27. maque/cli/core.py +34 -0
  28. maque/cli/groups/__init__.py +26 -0
  29. maque/cli/groups/config.py +205 -0
  30. maque/cli/groups/data.py +615 -0
  31. maque/cli/groups/doctor.py +259 -0
  32. maque/cli/groups/embedding.py +222 -0
  33. maque/cli/groups/git.py +29 -0
  34. maque/cli/groups/help.py +410 -0
  35. maque/cli/groups/llm.py +223 -0
  36. maque/cli/groups/mcp.py +241 -0
  37. maque/cli/groups/mllm.py +1795 -0
  38. maque/cli/groups/mllm_simple.py +60 -0
  39. maque/cli/groups/quant.py +210 -0
  40. maque/cli/groups/service.py +490 -0
  41. maque/cli/groups/system.py +570 -0
  42. maque/cli/mllm_run.py +1451 -0
  43. maque/cli/script.py +52 -0
  44. maque/cli/tree.py +49 -0
  45. maque/clustering/__init__.py +52 -0
  46. maque/clustering/analyzer.py +347 -0
  47. maque/clustering/clusterers.py +464 -0
  48. maque/clustering/sampler.py +134 -0
  49. maque/clustering/visualizer.py +205 -0
  50. maque/constant.py +13 -0
  51. maque/core.py +133 -0
  52. maque/cv/__init__.py +1 -0
  53. maque/cv/image.py +219 -0
  54. maque/cv/utils.py +68 -0
  55. maque/cv/video/__init__.py +3 -0
  56. maque/cv/video/keyframe_extractor.py +368 -0
  57. maque/embedding/__init__.py +43 -0
  58. maque/embedding/base.py +56 -0
  59. maque/embedding/multimodal.py +308 -0
  60. maque/embedding/server.py +523 -0
  61. maque/embedding/text.py +311 -0
  62. maque/git/__init__.py +24 -0
  63. maque/git/pure_git.py +912 -0
  64. maque/io/__init__.py +29 -0
  65. maque/io/core.py +38 -0
  66. maque/io/ops.py +194 -0
  67. maque/llm/__init__.py +111 -0
  68. maque/llm/backend.py +416 -0
  69. maque/llm/base.py +411 -0
  70. maque/llm/server.py +366 -0
  71. maque/mcp_server.py +1096 -0
  72. maque/mllm_data_processor_pipeline/__init__.py +17 -0
  73. maque/mllm_data_processor_pipeline/core.py +341 -0
  74. maque/mllm_data_processor_pipeline/example.py +291 -0
  75. maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
  76. maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
  77. maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
  78. maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
  79. maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
  80. maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
  81. maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
  82. maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
  83. maque/mllm_data_processor_pipeline/web_app.py +317 -0
  84. maque/nlp/__init__.py +14 -0
  85. maque/nlp/ngram.py +9 -0
  86. maque/nlp/parser.py +63 -0
  87. maque/nlp/risk_matcher.py +543 -0
  88. maque/nlp/sentence_splitter.py +202 -0
  89. maque/nlp/simple_tradition_cvt.py +31 -0
  90. maque/performance/__init__.py +21 -0
  91. maque/performance/_measure_time.py +70 -0
  92. maque/performance/_profiler.py +367 -0
  93. maque/performance/_stat_memory.py +51 -0
  94. maque/pipelines/__init__.py +15 -0
  95. maque/pipelines/clustering.py +252 -0
  96. maque/quantization/__init__.py +42 -0
  97. maque/quantization/auto_round.py +120 -0
  98. maque/quantization/base.py +145 -0
  99. maque/quantization/bitsandbytes.py +127 -0
  100. maque/quantization/llm_compressor.py +102 -0
  101. maque/retriever/__init__.py +35 -0
  102. maque/retriever/chroma.py +654 -0
  103. maque/retriever/document.py +140 -0
  104. maque/retriever/milvus.py +1140 -0
  105. maque/table_ops/__init__.py +1 -0
  106. maque/table_ops/core.py +133 -0
  107. maque/table_viewer/__init__.py +4 -0
  108. maque/table_viewer/download_assets.py +57 -0
  109. maque/table_viewer/server.py +698 -0
  110. maque/table_viewer/static/element-plus-icons.js +5791 -0
  111. maque/table_viewer/static/element-plus.css +1 -0
  112. maque/table_viewer/static/element-plus.js +65236 -0
  113. maque/table_viewer/static/main.css +268 -0
  114. maque/table_viewer/static/main.js +669 -0
  115. maque/table_viewer/static/vue.global.js +18227 -0
  116. maque/table_viewer/templates/index.html +401 -0
  117. maque/utils/__init__.py +56 -0
  118. maque/utils/color.py +68 -0
  119. maque/utils/color_string.py +45 -0
  120. maque/utils/compress.py +66 -0
  121. maque/utils/constant.py +183 -0
  122. maque/utils/core.py +261 -0
  123. maque/utils/cursor.py +143 -0
  124. maque/utils/distance.py +58 -0
  125. maque/utils/docker.py +96 -0
  126. maque/utils/downloads.py +51 -0
  127. maque/utils/excel_helper.py +542 -0
  128. maque/utils/helper_metrics.py +121 -0
  129. maque/utils/helper_parser.py +168 -0
  130. maque/utils/net.py +64 -0
  131. maque/utils/nvidia_stat.py +140 -0
  132. maque/utils/ops.py +53 -0
  133. maque/utils/packages.py +31 -0
  134. maque/utils/path.py +57 -0
  135. maque/utils/tar.py +260 -0
  136. maque/utils/untar.py +129 -0
  137. maque/web/__init__.py +0 -0
  138. maque/web/image_downloader.py +1410 -0
  139. maque-0.2.1.dist-info/METADATA +450 -0
  140. maque-0.2.1.dist-info/RECORD +143 -0
  141. maque-0.2.1.dist-info/WHEEL +4 -0
  142. maque-0.2.1.dist-info/entry_points.txt +3 -0
  143. maque-0.2.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,331 @@
1
+ """
2
+ 第4步:第一轮大模型标注
3
+ 使用多模态大模型对数据进行初步标注
4
+ """
5
+
6
+ import pandas as pd
7
+ import asyncio
8
+ from typing import List, Dict, Any, Optional
9
+ from ..core import PipelineStep, StepResult, PipelineConfig
10
+ from maque.performance import MeasureTime
11
+ from flexllm.mllm_client import MllmClient
12
+ from flexllm.async_api import ConcurrentExecutor
13
+
14
+
15
class MllmAnnotationStep(PipelineStep):
    """Pipeline step 4: first-round annotation with a multimodal LLM (MLLM).

    For every row of the input DataFrame the step builds a prompt from the
    ``text`` and ``images`` columns, sends it to the model, parses the reply
    into summary / tags / sentiment / quality fields and writes those fields
    into new columns on a copy of the data.

    ``self.name``, ``self.logger`` and ``self.get_step_config`` are used here
    but not defined in this class; they are presumably provided by
    ``PipelineStep`` -- confirm against the base class.
    """

    CONFIG_SCHEMA = {
        "type": "object",
        "properties": {
            "mllm_config": {
                "type": "object",
                "properties": {
                    "model_name": {"type": "string", "default": "gpt-4o"},
                    "base_url": {"type": "string"},
                    "api_key": {"type": "string"},
                    "temperature": {"type": "number", "default": 0.7},
                    "max_tokens": {"type": "integer", "default": 2048}
                },
                "required": ["model_name"]
            },
            "annotation_prompts": {
                "type": "object",
                "properties": {
                    "system_prompt": {
                        "type": "string",
                        "default": "你是一个专业的数据标注员,请根据给定的图像和文本内容进行标注。"
                    },
                    "user_prompt_template": {
                        "type": "string",
                        "default": "请对以下内容进行标注:\n文本:{text}\n\n请提供:\n1. 内容摘要\n2. 主要标签\n3. 情感倾向\n4. 质量评估"
                    }
                }
            },
            "concurrent_config": {
                "type": "object",
                "properties": {
                    "max_workers": {"type": "integer", "default": 5},
                    "batch_size": {"type": "integer", "default": 10},
                    "rate_limit": {"type": "number", "default": 1.0}
                }
            },
            "output_columns": {
                "type": "object",
                "properties": {
                    "summary": {"type": "string", "default": "mllm_summary"},
                    "tags": {"type": "string", "default": "mllm_tags"},
                    "sentiment": {"type": "string", "default": "mllm_sentiment"},
                    "quality": {"type": "string", "default": "mllm_quality"},
                    "raw_response": {"type": "string", "default": "mllm_raw_response"}
                }
            },
            "retry_config": {
                "type": "object",
                "properties": {
                    "max_retries": {"type": "integer", "default": 3},
                    "retry_delay": {"type": "number", "default": 1.0}
                }
            }
        }
    }

    # Parsed fields, in the order they are produced and written out.
    _RESULT_FIELDS = ("summary", "tags", "sentiment", "quality")

    # Section name -> substrings that mark a line as that section's header.
    # Checked in this order against the lower-cased line; first match wins.
    _SECTION_KEYWORDS = (
        ("summary", ("摘要", "summary")),
        ("tags", ("标签", "tag")),
        ("sentiment", ("情感", "sentiment")),
        ("quality", ("质量", "quality")),
    )

    def __init__(self, name: str = "mllm_annotation", config: Optional[Dict[str, Any]] = None):
        """Create the step; the MLLM client is built lazily in ``execute``.

        Args:
            name: Step name forwarded to ``PipelineStep``.
            config: Optional step configuration forwarded to ``PipelineStep``.
        """
        super().__init__(name, config)
        self.mllm_client: Optional[MllmClient] = None

    async def execute(self, data: pd.DataFrame, config: PipelineConfig) -> StepResult:
        """Annotate every row of *data* and report the outcome.

        Args:
            data: Input rows; expected to carry ``text`` and ``images``
                columns. The frame itself is never modified.
            config: Pipeline configuration from which this step's settings
                are resolved via ``self.get_step_config``.

        Returns:
            On success, a ``StepResult`` holding an annotated copy of *data*
            plus counters (total rows, successes, failures, success rate,
            model name) in ``metadata``. If anything raises, a ``StepResult``
            with ``success=False``, the original *data* and the error text.
        """
        with MeasureTime(f"步骤 {self.name}"):
            try:
                step_config = self.get_step_config(config)
                data_copy = data.copy()

                # Build the MLLM client.
                await self._initialize_mllm_client(step_config)

                # One task per row.
                annotation_tasks = await self._prepare_annotation_tasks(data_copy, step_config)

                # Run the tasks concurrently.
                annotation_results = await self._execute_concurrent_annotation(
                    annotation_tasks, step_config
                )

                # Write the parsed fields into the copy.
                await self._process_annotation_results(
                    data_copy, annotation_results, step_config
                )

                # Mark every row as processed by this step; per-row success
                # is recorded separately in '__mllm_annotation_success'.
                data_copy['__processing_status'] = 'annotated'
                data_copy['__annotated_at'] = pd.Timestamp.now()

                # Summary statistics.
                successful_annotations = sum(
                    1 for r in annotation_results if r.get('success', False)
                )
                failed_annotations = len(annotation_results) - successful_annotations

                metadata = {
                    "total_rows": len(data_copy),
                    "successful_annotations": successful_annotations,
                    "failed_annotations": failed_annotations,
                    "success_rate": successful_annotations / len(annotation_results) if annotation_results else 0,
                    "mllm_model": step_config.get("mllm_config", {}).get("model_name", "unknown")
                }

                self.logger.info(f"MLLM标注完成,成功率: {metadata['success_rate']:.2%}")

                return StepResult(
                    step_name=self.name,
                    success=True,
                    data=data_copy,
                    metadata=metadata
                )

            except Exception as e:
                # Step boundary: convert any failure into a failed StepResult
                # carrying the untouched input instead of raising.
                self.logger.error(f"MLLM标注失败: {e}")
                return StepResult(
                    step_name=self.name,
                    success=False,
                    data=data,
                    metadata={},
                    error=str(e)
                )

    async def _initialize_mllm_client(self, step_config: Dict[str, Any]):
        """Instantiate ``self.mllm_client`` from ``step_config["mllm_config"]``.

        Missing keys fall back to model ``gpt-4o``, temperature ``0.7`` and
        ``max_tokens`` ``2048``; ``base_url`` and ``api_key`` default to None.
        """
        mllm_config = step_config.get("mllm_config", {})
        model_name = mllm_config.get("model_name", "gpt-4o")

        self.mllm_client = MllmClient(
            model_name=model_name,
            base_url=mllm_config.get("base_url"),
            api_key=mllm_config.get("api_key"),
            temperature=mllm_config.get("temperature", 0.7),
            max_tokens=mllm_config.get("max_tokens", 2048)
        )

        self.logger.info(f"初始化MLLM客户端: {model_name}")

    async def _prepare_annotation_tasks(self, data: pd.DataFrame, step_config: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Build one annotation task dict per row of *data*.

        Args:
            data: Frame with ``text`` and ``images`` columns.
            step_config: Step settings; reads ``annotation_prompts`` and the
                optional top-level ``image_separator`` (default ``"|"``).

        Returns:
            A list of dicts with keys ``row_index`` (the row's index label),
            ``text``, ``images`` (list of path strings), ``system_prompt``
            and ``user_prompt``.

        Raises:
            KeyError: If a row lacks the ``text`` or ``images`` column, or if
                the prompt template names a placeholder other than
                ``{text}`` / ``{images}``.
        """
        prompts = step_config.get("annotation_prompts", {})
        system_prompt = prompts.get("system_prompt", "你是一个专业的数据标注员。")
        user_prompt_template = prompts.get("user_prompt_template", "请对以下内容进行标注:\n文本:{text}")

        # Column names as emitted by the data_alignment step (per the
        # original author's note).
        text_col = "text"
        image_col = "images"

        # Loop-invariant, so resolved once rather than per row.
        separator = step_config.get("image_separator", "|")

        tasks = []
        for idx, row in data.iterrows():
            text_content = str(row[text_col]) if pd.notna(row[text_col]) else ""
            image_paths = str(row[image_col]) if pd.notna(row[image_col]) else ""

            # Fill the user prompt; both placeholders are always offered.
            user_prompt = user_prompt_template.format(
                text=text_content,
                images=image_paths
            )

            # Split the joined path string into individual non-empty paths.
            image_list = []
            if image_paths:
                image_list = [p.strip() for p in image_paths.split(separator) if p.strip()]

            tasks.append({
                "row_index": idx,
                "text": text_content,
                "images": image_list,
                "system_prompt": system_prompt,
                "user_prompt": user_prompt
            })

        self.logger.info(f"准备了 {len(tasks)} 个标注任务")
        return tasks

    async def _execute_concurrent_annotation(self, tasks: List[Dict[str, Any]], step_config: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run ``_annotate_single_item`` for every task through the executor.

        Reads ``max_workers`` (default 5) and ``rate_limit`` (default 1.0)
        from ``step_config["concurrent_config"]`` and passes them to
        ``ConcurrentExecutor``.

        Returns:
            Whatever ``executor.execute_all`` returns; callers treat it as a
            list of result dicts from ``_annotate_single_item``.
        """
        concurrent_config = step_config.get("concurrent_config", {})
        max_workers = concurrent_config.get("max_workers", 5)
        rate_limit = concurrent_config.get("rate_limit", 1.0)
        # NOTE(review): CONFIG_SCHEMA also declares "batch_size", but nothing
        # in this step consumes it; the unused local was dropped.

        # Build the concurrent executor.
        executor = ConcurrentExecutor(
            max_concurrent=max_workers,
            rate_limit=rate_limit
        )

        # One coroutine per task; awaited by the executor.
        async_tasks = [self._annotate_single_item(task, step_config) for task in tasks]

        return await executor.execute_all(async_tasks)

    async def _annotate_single_item(self, task: Dict[str, Any], step_config: Dict[str, Any]) -> Dict[str, Any]:
        """Annotate one task, retrying on failure.

        Makes up to ``max_retries + 1`` attempts (``retry_config``, default
        3 retries). After a failed attempt that is not the last, it sleeps
        ``retry_delay * attempt_number`` seconds before trying again.

        Returns:
            Always a dict. On success: ``row_index``, ``success=True``,
            ``raw_response`` and the parsed fields. After the final failure:
            ``row_index``, ``success=False``, ``error`` and empty strings for
            ``raw_response`` and every parsed field.
        """
        retry_config = step_config.get("retry_config", {})
        # Clamp so a negative setting still makes one attempt; otherwise the
        # loop would not run and the method would return None.
        max_retries = max(0, int(retry_config.get("max_retries", 3)))
        retry_delay = retry_config.get("retry_delay", 1.0)

        last_error = ""
        for attempt in range(max_retries + 1):
            try:
                # Call the MLLM.
                response = await self.mllm_client.chat_async(
                    messages=[
                        {"role": "system", "content": task["system_prompt"]},
                        {"role": "user", "content": task["user_prompt"]}
                    ],
                    images=task["images"]
                )

                # Parse the reply into the four annotation fields.
                parsed_result = self._parse_mllm_response(response)

                return {
                    "row_index": task["row_index"],
                    "success": True,
                    "raw_response": response,
                    **parsed_result
                }

            except Exception as e:
                last_error = str(e)
                self.logger.warning(f"第 {attempt + 1} 次标注失败 (行 {task['row_index']}): {e}")

                if attempt < max_retries:
                    await asyncio.sleep(retry_delay * (attempt + 1))

        # Every attempt failed.
        return {
            "row_index": task["row_index"],
            "success": False,
            "error": last_error,
            "raw_response": "",
            "summary": "",
            "tags": "",
            "sentiment": "",
            "quality": ""
        }

    @staticmethod
    def _append_section_text(result: Dict[str, str], section: str, text: str) -> None:
        """Append *text* to ``result[section]``, space-separated."""
        result[section] = f"{result[section]} {text}" if result[section] else text

    @staticmethod
    def _inline_header_content(line: str) -> str:
        """Return the text following the first colon of a header line.

        Both the full-width and the ASCII colon are recognised. Returns an
        empty string when the line has no colon or nothing follows it.
        """
        positions = [pos for pos in (line.find(":"), line.find(":")) if pos != -1]
        if not positions:
            return ""
        # Drop markdown emphasis left over from headers such as "**摘要:** x".
        return line[min(positions) + 1:].strip().lstrip("*").strip()

    def _parse_mllm_response(self, response: str) -> Dict[str, str]:
        """Split a free-text model reply into the four annotation fields.

        This is a keyword heuristic, not a strict parser. A line containing
        one of the section keywords (摘要/summary, 标签/tag, 情感/sentiment,
        质量/quality; case-insensitive) starts that section, and any text
        after the first colon on that same line is kept as content. Later
        non-header lines are appended to the current section, except lines
        that begin with ``1.``-``4.``, which are skipped.

        Returns:
            Dict with keys ``summary``, ``tags``, ``sentiment``, ``quality``.
            If nothing could be extracted, ``summary`` holds the first 500
            characters of the reply.
        """
        result = {field: "" for field in self._RESULT_FIELDS}

        try:
            current_section = None

            for raw_line in response.strip().split('\n'):
                line = raw_line.strip()
                if not line:
                    continue

                lowered = line.lower()
                header = next(
                    (
                        section
                        for section, keywords in self._SECTION_KEYWORDS
                        if any(keyword in lowered for keyword in keywords)
                    ),
                    None,
                )

                if header is not None:
                    current_section = header
                    # Replies usually put the value on the header line
                    # itself ("1. 内容摘要:..."); keep it instead of
                    # discarding the whole line.
                    inline = self._inline_header_content(line)
                    if inline:
                        self._append_section_text(result, header, inline)
                    continue

                if current_section is None:
                    continue

                # Strip only leading/trailing colons so that colons inside
                # the content (times, URLs) survive.
                content = line.strip(":: \t")
                if content and not content.startswith(("1.", "2.", "3.", "4.")):
                    self._append_section_text(result, current_section, content)

            # Nothing recognised: fall back to the raw reply as the summary.
            if not any(result.values()):
                result["summary"] = response[:500]  # first 500 characters

        except (AttributeError, TypeError) as e:
            # Reached when *response* is not a string (e.g. None).
            self.logger.warning(f"解析MLLM响应失败: {e}")
            result["summary"] = str(response)[:500] if response else ""

        return result

    async def _process_annotation_results(self, data: pd.DataFrame, results: List[Dict[str, Any]], step_config: Dict[str, Any]):
        """Write annotation results into *data* in place.

        Creates the five output columns (names taken from
        ``step_config["output_columns"]``, defaulting to ``mllm_*``) plus
        ``__mllm_annotation_success``, then fills each result's row.

        NOTE(review): rows are addressed by index label via ``DataFrame.at``,
        so this assumes the index labels of *data* are unique -- confirm
        against upstream steps.
        """
        output_columns = step_config.get("output_columns", {})

        # Result key -> destination column, in column-creation order.
        column_for = {
            "summary": output_columns.get("summary", "mllm_summary"),
            "tags": output_columns.get("tags", "mllm_tags"),
            "sentiment": output_columns.get("sentiment", "mllm_sentiment"),
            "quality": output_columns.get("quality", "mllm_quality"),
            "raw_response": output_columns.get("raw_response", "mllm_raw_response"),
        }

        # Initialise the new columns so rows without a result stay empty.
        for column in column_for.values():
            data[column] = ""
        data['__mllm_annotation_success'] = False

        # Fill in each row's result.
        for result in results:
            row_idx = result["row_index"]
            for key, column in column_for.items():
                data.at[row_idx, column] = result.get(key, "")
            data.at[row_idx, '__mllm_annotation_success'] = result.get("success", False)