maque 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. maque/__init__.py +30 -0
  2. maque/__main__.py +926 -0
  3. maque/ai_platform/__init__.py +0 -0
  4. maque/ai_platform/crawl.py +45 -0
  5. maque/ai_platform/metrics.py +258 -0
  6. maque/ai_platform/nlp_preprocess.py +67 -0
  7. maque/ai_platform/webpage_screen_shot.py +195 -0
  8. maque/algorithms/__init__.py +78 -0
  9. maque/algorithms/bezier.py +15 -0
  10. maque/algorithms/bktree.py +117 -0
  11. maque/algorithms/core.py +104 -0
  12. maque/algorithms/hilbert.py +16 -0
  13. maque/algorithms/rate_function.py +92 -0
  14. maque/algorithms/transform.py +27 -0
  15. maque/algorithms/trie.py +272 -0
  16. maque/algorithms/utils.py +63 -0
  17. maque/algorithms/video.py +587 -0
  18. maque/api/__init__.py +1 -0
  19. maque/api/common.py +110 -0
  20. maque/api/fetch.py +26 -0
  21. maque/api/static/icon.png +0 -0
  22. maque/api/static/redoc.standalone.js +1782 -0
  23. maque/api/static/swagger-ui-bundle.js +3 -0
  24. maque/api/static/swagger-ui.css +3 -0
  25. maque/cli/__init__.py +1 -0
  26. maque/cli/clean_invisible_chars.py +324 -0
  27. maque/cli/core.py +34 -0
  28. maque/cli/groups/__init__.py +26 -0
  29. maque/cli/groups/config.py +205 -0
  30. maque/cli/groups/data.py +615 -0
  31. maque/cli/groups/doctor.py +259 -0
  32. maque/cli/groups/embedding.py +222 -0
  33. maque/cli/groups/git.py +29 -0
  34. maque/cli/groups/help.py +410 -0
  35. maque/cli/groups/llm.py +223 -0
  36. maque/cli/groups/mcp.py +241 -0
  37. maque/cli/groups/mllm.py +1795 -0
  38. maque/cli/groups/mllm_simple.py +60 -0
  39. maque/cli/groups/quant.py +210 -0
  40. maque/cli/groups/service.py +490 -0
  41. maque/cli/groups/system.py +570 -0
  42. maque/cli/mllm_run.py +1451 -0
  43. maque/cli/script.py +52 -0
  44. maque/cli/tree.py +49 -0
  45. maque/clustering/__init__.py +52 -0
  46. maque/clustering/analyzer.py +347 -0
  47. maque/clustering/clusterers.py +464 -0
  48. maque/clustering/sampler.py +134 -0
  49. maque/clustering/visualizer.py +205 -0
  50. maque/constant.py +13 -0
  51. maque/core.py +133 -0
  52. maque/cv/__init__.py +1 -0
  53. maque/cv/image.py +219 -0
  54. maque/cv/utils.py +68 -0
  55. maque/cv/video/__init__.py +3 -0
  56. maque/cv/video/keyframe_extractor.py +368 -0
  57. maque/embedding/__init__.py +43 -0
  58. maque/embedding/base.py +56 -0
  59. maque/embedding/multimodal.py +308 -0
  60. maque/embedding/server.py +523 -0
  61. maque/embedding/text.py +311 -0
  62. maque/git/__init__.py +24 -0
  63. maque/git/pure_git.py +912 -0
  64. maque/io/__init__.py +29 -0
  65. maque/io/core.py +38 -0
  66. maque/io/ops.py +194 -0
  67. maque/llm/__init__.py +111 -0
  68. maque/llm/backend.py +416 -0
  69. maque/llm/base.py +411 -0
  70. maque/llm/server.py +366 -0
  71. maque/mcp_server.py +1096 -0
  72. maque/mllm_data_processor_pipeline/__init__.py +17 -0
  73. maque/mllm_data_processor_pipeline/core.py +341 -0
  74. maque/mllm_data_processor_pipeline/example.py +291 -0
  75. maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
  76. maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
  77. maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
  78. maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
  79. maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
  80. maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
  81. maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
  82. maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
  83. maque/mllm_data_processor_pipeline/web_app.py +317 -0
  84. maque/nlp/__init__.py +14 -0
  85. maque/nlp/ngram.py +9 -0
  86. maque/nlp/parser.py +63 -0
  87. maque/nlp/risk_matcher.py +543 -0
  88. maque/nlp/sentence_splitter.py +202 -0
  89. maque/nlp/simple_tradition_cvt.py +31 -0
  90. maque/performance/__init__.py +21 -0
  91. maque/performance/_measure_time.py +70 -0
  92. maque/performance/_profiler.py +367 -0
  93. maque/performance/_stat_memory.py +51 -0
  94. maque/pipelines/__init__.py +15 -0
  95. maque/pipelines/clustering.py +252 -0
  96. maque/quantization/__init__.py +42 -0
  97. maque/quantization/auto_round.py +120 -0
  98. maque/quantization/base.py +145 -0
  99. maque/quantization/bitsandbytes.py +127 -0
  100. maque/quantization/llm_compressor.py +102 -0
  101. maque/retriever/__init__.py +35 -0
  102. maque/retriever/chroma.py +654 -0
  103. maque/retriever/document.py +140 -0
  104. maque/retriever/milvus.py +1140 -0
  105. maque/table_ops/__init__.py +1 -0
  106. maque/table_ops/core.py +133 -0
  107. maque/table_viewer/__init__.py +4 -0
  108. maque/table_viewer/download_assets.py +57 -0
  109. maque/table_viewer/server.py +698 -0
  110. maque/table_viewer/static/element-plus-icons.js +5791 -0
  111. maque/table_viewer/static/element-plus.css +1 -0
  112. maque/table_viewer/static/element-plus.js +65236 -0
  113. maque/table_viewer/static/main.css +268 -0
  114. maque/table_viewer/static/main.js +669 -0
  115. maque/table_viewer/static/vue.global.js +18227 -0
  116. maque/table_viewer/templates/index.html +401 -0
  117. maque/utils/__init__.py +56 -0
  118. maque/utils/color.py +68 -0
  119. maque/utils/color_string.py +45 -0
  120. maque/utils/compress.py +66 -0
  121. maque/utils/constant.py +183 -0
  122. maque/utils/core.py +261 -0
  123. maque/utils/cursor.py +143 -0
  124. maque/utils/distance.py +58 -0
  125. maque/utils/docker.py +96 -0
  126. maque/utils/downloads.py +51 -0
  127. maque/utils/excel_helper.py +542 -0
  128. maque/utils/helper_metrics.py +121 -0
  129. maque/utils/helper_parser.py +168 -0
  130. maque/utils/net.py +64 -0
  131. maque/utils/nvidia_stat.py +140 -0
  132. maque/utils/ops.py +53 -0
  133. maque/utils/packages.py +31 -0
  134. maque/utils/path.py +57 -0
  135. maque/utils/tar.py +260 -0
  136. maque/utils/untar.py +129 -0
  137. maque/web/__init__.py +0 -0
  138. maque/web/image_downloader.py +1410 -0
  139. maque-0.2.1.dist-info/METADATA +450 -0
  140. maque-0.2.1.dist-info/RECORD +143 -0
  141. maque-0.2.1.dist-info/WHEEL +4 -0
  142. maque-0.2.1.dist-info/entry_points.txt +3 -0
  143. maque-0.2.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,317 @@
1
+ """
2
+ Web界面应用框架
3
+ """
4
+
5
+ import asyncio
6
+ import json
7
+ import uuid
8
+ from pathlib import Path
9
+ from typing import Dict, Any, Optional, List
10
+ from datetime import datetime
11
+
12
+ from fastapi import FastAPI, WebSocket, HTTPException, UploadFile, File, BackgroundTasks
13
+ from fastapi.responses import HTMLResponse, FileResponse
14
+ from fastapi.staticfiles import StaticFiles
15
+ from pydantic import BaseModel
16
+ import uvicorn
17
+
18
+ from .core import DataProcessorPipeline, PipelineConfig, StepResult
19
+ from .steps import get_all_steps, create_step_from_config
20
+ from loguru import logger
21
+
22
+
23
class PipelineRunRequest(BaseModel):
    """Request body for pipeline creation.

    The create route in this module passes ``config`` to
    ``PipelineConfig.from_dict``.
    """

    # Raw pipeline configuration as a free-form mapping.
    config: Dict[str, Any]
    # Optional resume marker; presumably a step name -- the create route
    # visible in this file does not read it (TODO confirm intended use).
    resume_from: Optional[str] = None
28
+
29
+
30
class PipelineStatusResponse(BaseModel):
    """Response schema describing the state of a single pipeline."""

    # Identifier of the pipeline this status belongs to.
    pipeline_id: str
    # One of: idle, running, completed, failed.
    status: str
    # Name of the step currently executing, if any.
    current_step: Optional[str] = None
    # Completion fraction; defaults to 0.0.
    progress: float = 0.0
    # Per-step result dictionaries.
    results: List[Dict[str, Any]] = []
    # Error description when the pipeline has failed.
    error: Optional[str] = None
39
+
40
+
41
class WebSocketManager:
    """In-memory registry of open WebSocket connections keyed by client id."""

    def __init__(self):
        self.connections: Dict[str, WebSocket] = {}

    async def connect(self, websocket: WebSocket, client_id: str):
        """Accept the handshake and register the socket under ``client_id``.

        An existing entry for the same id is overwritten.
        """
        await websocket.accept()
        self.connections[client_id] = websocket
        logger.info(f"WebSocket连接建立: {client_id}")

    def disconnect(self, client_id: str):
        """Forget ``client_id``; a no-op (and no log line) if it is unknown."""
        if client_id not in self.connections:
            return
        del self.connections[client_id]
        logger.info(f"WebSocket连接断开: {client_id}")

    async def send_message(self, client_id: str, message: Dict[str, Any]):
        """Send ``message`` as JSON text to ``client_id`` if it is connected.

        Any failure (serialisation or transport) is logged and the client is
        dropped from the registry; nothing is raised to the caller.
        """
        if client_id not in self.connections:
            return
        try:
            socket = self.connections[client_id]
            await socket.send_text(json.dumps(message))
        except Exception as e:
            logger.error(f"发送WebSocket消息失败: {e}")
            self.disconnect(client_id)
64
+
65
+
66
class PipelineManager:
    """Create, run and track pipelines, pushing status changes over WebSocket.

    All state is held in memory: ``pipelines`` maps a generated id to its
    ``DataProcessorPipeline`` and ``pipeline_status`` maps the same id to a
    plain status dictionary. Status updates are sent through
    ``websocket_manager`` using the pipeline id as the client id.
    """

    def __init__(self):
        self.pipelines: Dict[str, DataProcessorPipeline] = {}
        self.pipeline_status: Dict[str, Dict[str, Any]] = {}
        self.websocket_manager = WebSocketManager()
        # The event loop keeps only weak references to tasks, so fire-and-forget
        # status updates must be referenced here until they finish.
        self._status_tasks = set()

    def create_pipeline(self, config: PipelineConfig) -> str:
        """Build a pipeline from ``config`` and register it.

        Returns:
            The newly generated pipeline id (a UUID4 string).
        """
        pipeline_id = str(uuid.uuid4())
        pipeline = DataProcessorPipeline(config)

        self.pipelines[pipeline_id] = pipeline
        self.pipeline_status[pipeline_id] = {
            "status": "idle",
            "current_step": None,
            "progress": 0.0,
            "results": [],
            "error": None,
            "created_at": datetime.now().isoformat(),
        }

        # The pipeline invokes this callback synchronously; the actual update
        # is a coroutine, so it is scheduled on the running event loop.
        pipeline.set_status_callback(
            lambda status, data: self._schedule_status_update(
                pipeline_id, status, data
            )
        )

        return pipeline_id

    def _schedule_status_update(
        self, pipeline_id: str, status: str, data: Dict[str, Any]
    ):
        """Schedule a status update as a task and keep it alive until done."""
        task = asyncio.create_task(
            self._update_pipeline_status(pipeline_id, status, data)
        )
        self._status_tasks.add(task)
        task.add_done_callback(self._on_status_task_done)
        return task

    def _on_status_task_done(self, task):
        """Release a finished status task and log any exception it raised."""
        self._status_tasks.discard(task)
        if task.cancelled():
            return
        error = task.exception()
        if error is not None:
            logger.error(f"Pipeline状态更新失败: {error}")

    async def _update_pipeline_status(
        self, pipeline_id: str, status: str, data: Dict[str, Any]
    ):
        """Apply a pipeline event to the stored status and broadcast it.

        Recognised events are ``starting``, ``executing_step``,
        ``step_failed`` and ``completed``; any other value leaves the stored
        status untouched but still triggers a broadcast. Unknown pipeline ids
        are ignored.
        """
        if pipeline_id not in self.pipeline_status:
            return

        pipeline_status = self.pipeline_status[pipeline_id]

        if status == "starting":
            pipeline_status["status"] = "running"
            pipeline_status["progress"] = 0.0
        elif status == "executing_step":
            pipeline_status["current_step"] = data.get("step_name")
            # `or 1` also covers an explicit 0/None total, so a degenerate
            # payload cannot raise ZeroDivisionError inside the task.
            total_steps = data.get("total_steps", 1) or 1
            pipeline_status["progress"] = (
                data.get("step_index", 0) + 1
            ) / total_steps
        elif status == "step_failed":
            pipeline_status["status"] = "failed"
            pipeline_status["error"] = data.get("error")
        elif status == "completed":
            pipeline_status["status"] = "completed"
            pipeline_status["progress"] = 1.0
            pipeline_status["current_step"] = None

        # Push the updated status to the client registered under this id.
        await self.websocket_manager.send_message(
            pipeline_id,
            {
                "type": "status_update",
                "pipeline_id": pipeline_id,
                "status": pipeline_status,
            },
        )

    async def run_pipeline(self, pipeline_id: str, resume_from: Optional[str] = None):
        """Run a registered pipeline and record its results.

        Execution errors are not propagated: they are logged and stored in
        the pipeline's status (``status="failed"``, ``error=str(e)``).

        Raises:
            ValueError: if ``pipeline_id`` is not registered.
        """
        if pipeline_id not in self.pipelines:
            raise ValueError(f"Pipeline {pipeline_id} 不存在")

        pipeline = self.pipelines[pipeline_id]

        try:
            results = await pipeline.run(resume_from=resume_from)
            self.pipeline_status[pipeline_id]["results"] = [
                r.to_dict() for r in results
            ]
            pipeline.save_final_results()
        except Exception as e:
            logger.error(f"Pipeline {pipeline_id} 执行失败: {e}")
            self.pipeline_status[pipeline_id]["status"] = "failed"
            self.pipeline_status[pipeline_id]["error"] = str(e)

    def get_pipeline_status(self, pipeline_id: str) -> Dict[str, Any]:
        """Return the live status dictionary for ``pipeline_id``.

        Raises:
            ValueError: if ``pipeline_id`` is not registered.
        """
        if pipeline_id not in self.pipeline_status:
            raise ValueError(f"Pipeline {pipeline_id} 不存在")

        return self.pipeline_status[pipeline_id]

    def list_pipelines(self) -> List[Dict[str, Any]]:
        """Return one dict per pipeline: its status plus a ``pipeline_id`` key."""
        return [
            {"pipeline_id": pid, **status}
            for pid, status in self.pipeline_status.items()
        ]
164
+
165
+
166
class WebApp:
    """FastAPI application exposing the pipeline manager over HTTP/WebSocket.

    Args:
        static_dir: Directory holding static assets and an optional
            ``index.html``. Defaults to the ``static`` folder next to this
            module.
    """

    def __init__(self, static_dir: Optional[str] = None):
        self.app = FastAPI(title="MLLM Data Processor Pipeline", version="0.1.0")
        self.pipeline_manager = PipelineManager()
        self.static_dir = static_dir or str(Path(__file__).parent / "static")

        self._setup_routes()
        self._setup_static_files()

    def _setup_static_files(self):
        """Mount ``static_dir`` at ``/static`` when the directory exists."""
        static_path = Path(self.static_dir)
        if static_path.exists():
            self.app.mount(
                "/static", StaticFiles(directory=str(static_path)), name="static"
            )

    def _setup_routes(self):
        """Register every HTTP and WebSocket route on ``self.app``."""

        @self.app.get("/", response_class=HTMLResponse)
        async def index():
            """Serve ``index.html`` from the static dir, or a built-in page."""
            html_file = Path(self.static_dir) / "index.html"
            if html_file.exists():
                return FileResponse(html_file)
            return self._get_default_html()

        @self.app.post("/api/pipeline/create")
        async def create_pipeline(request: PipelineRunRequest):
            """Create a pipeline from the posted configuration."""
            try:
                config = PipelineConfig.from_dict(request.config)
                pipeline_id = self.pipeline_manager.create_pipeline(config)
                return {"pipeline_id": pipeline_id, "status": "created"}
            except Exception as e:
                logger.error(f"创建Pipeline失败: {e}")
                raise HTTPException(status_code=400, detail=str(e))

        @self.app.post("/api/pipeline/{pipeline_id}/run")
        async def run_pipeline(
            pipeline_id: str,
            background_tasks: BackgroundTasks,
            resume_from: Optional[str] = None,
        ):
            """Start a pipeline in the background."""
            # Validate up front: the manager only raises for an unknown id
            # inside the background task, after the response has been sent.
            if pipeline_id not in self.pipeline_manager.pipelines:
                raise HTTPException(
                    status_code=404, detail=f"Pipeline {pipeline_id} 不存在"
                )
            try:
                background_tasks.add_task(
                    self.pipeline_manager.run_pipeline, pipeline_id, resume_from
                )
                return {"message": "Pipeline开始执行", "pipeline_id": pipeline_id}
            except Exception as e:
                logger.error(f"运行Pipeline失败: {e}")
                raise HTTPException(status_code=400, detail=str(e))

        @self.app.get("/api/pipeline/{pipeline_id}/status")
        async def get_pipeline_status(pipeline_id: str):
            """Return the status of one pipeline, or 404 if it is unknown."""
            try:
                return self.pipeline_manager.get_pipeline_status(pipeline_id)
            except Exception as e:
                raise HTTPException(status_code=404, detail=str(e))

        @self.app.get("/api/pipelines")
        async def list_pipelines():
            """List every known pipeline with its status."""
            return self.pipeline_manager.list_pipelines()

        @self.app.get("/api/steps")
        async def get_available_steps():
            """Describe the available processing steps."""
            return {
                "steps": [
                    {
                        "name": step_class.__name__,
                        "description": getattr(step_class, "__doc__", ""),
                        "config_schema": getattr(step_class, "CONFIG_SCHEMA", {}),
                    }
                    for step_class in get_all_steps()
                ]
            }

        @self.app.post("/api/upload")
        async def upload_file(file: UploadFile = File(...)):
            """Store an uploaded file under the local ``uploads`` directory."""
            # The filename is client-controlled. Keep only its last path
            # component so "../x" or an absolute path cannot escape the
            # upload directory; backslashes are normalised first so
            # Windows-style paths are reduced the same way on POSIX hosts.
            raw_name = (file.filename or "").replace("\\", "/")
            safe_name = Path(raw_name).name
            if safe_name in ("", ".", ".."):
                raise HTTPException(status_code=400, detail="无效的文件名")

            try:
                upload_dir = Path("uploads")
                upload_dir.mkdir(exist_ok=True)

                file_path = upload_dir / safe_name
                # Read before opening so a failed read leaves no empty file.
                content = await file.read()
                with open(file_path, "wb") as f:
                    f.write(content)

                return {
                    "filename": safe_name,
                    "file_path": str(file_path),
                    "size": len(content),
                }
            except Exception as e:
                logger.error(f"文件上传失败: {e}")
                raise HTTPException(status_code=500, detail=str(e))

        @self.app.websocket("/ws/{client_id}")
        async def websocket_endpoint(websocket: WebSocket, client_id: str):
            """Register the client and log every JSON message it sends."""
            await self.pipeline_manager.websocket_manager.connect(websocket, client_id)
            try:
                while True:
                    data = await websocket.receive_text()
                    # Incoming messages are only parsed and logged for now.
                    message = json.loads(data)
                    logger.info(f"收到WebSocket消息: {message}")
            except Exception as e:
                logger.error(f"WebSocket错误: {e}")
            finally:
                self.pipeline_manager.websocket_manager.disconnect(client_id)

    def _get_default_html(self) -> str:
        """Return the built-in landing page used when no ``index.html`` exists."""
        return """
        <!DOCTYPE html>
        <html>
        <head>
            <title>MLLM Data Processor Pipeline</title>
            <meta charset="utf-8">
            <style>
                body { font-family: Arial, sans-serif; margin: 40px; }
                .container { max-width: 1200px; margin: 0 auto; }
                h1 { color: #333; }
                .section { margin: 20px 0; padding: 20px; border: 1px solid #ddd; border-radius: 5px; }
            </style>
        </head>
        <body>
            <div class="container">
                <h1>MLLM Data Processor Pipeline</h1>
                <div class="section">
                    <h2>欢迎使用多模态大模型训练数据处理Pipeline</h2>
                    <p>这是一个灵活的、模块化的数据处理流水线,支持Web界面交互和断点续传。</p>
                    <p>请访问 <a href="/docs">/docs</a> 查看API文档</p>
                </div>
            </div>
        </body>
        </html>
        """

    def run(self, host: str = "127.0.0.1", port: int = 8000, **kwargs):
        """Serve the application with uvicorn; ``kwargs`` go to ``uvicorn.run``."""
        logger.info(f"启动Web应用: http://{host}:{port}")
        uvicorn.run(self.app, host=host, port=port, **kwargs)
maque/nlp/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ from .sentence_splitter import SentenceSplitter, Sentence, split_sentences
2
+ from .risk_matcher import RiskMatcher, MatchResult, ExtractResult, match_and_extract
3
+
4
+ __all__ = [
5
+ # sentence_splitter
6
+ "SentenceSplitter",
7
+ "Sentence",
8
+ "split_sentences",
9
+ # risk_matcher
10
+ "RiskMatcher",
11
+ "MatchResult",
12
+ "ExtractResult",
13
+ "match_and_extract",
14
+ ]
maque/nlp/ngram.py ADDED
@@ -0,0 +1,9 @@
1
+
2
def ngram(words: str, n: int, reverse=True):
    """Count the n-grams of *words* after padding with boundary tokens.

    The input is split into its individual elements (characters for a
    string), ``'<S>'`` is prepended and ``'<E>'`` appended, and every run of
    ``n`` consecutive tokens is counted.

    Args:
        words: Sequence to analyse; iterated element by element.
        n: Size of each n-gram. Values below 1, or larger than the padded
            token count, produce an empty result.
        reverse: When true (default) the most frequent n-grams come first.

    Returns:
        A list of ``(ngram_tuple, count)`` pairs sorted by count. Ties keep
        first-occurrence order.
    """
    tokens = ["<S>"] + list(words) + ["<E>"]
    counts = {}
    # Zipping n progressively shifted views of the token list yields each
    # window of n consecutive tokens; no eval of generated source is needed.
    for gram in zip(*(tokens[offset:] for offset in range(n))):
        counts[gram] = counts.get(gram, 0) + 1
    return sorted(counts.items(), key=lambda item: item[1], reverse=reverse)
maque/nlp/parser.py ADDED
@@ -0,0 +1,63 @@
1
+ import json5
2
+ import re
3
+ import ast
4
+ from typing import Optional
5
+
6
def extract_code_snippets(text, strict=True):
    """Collect code snippets embedded in *text*.

    Triple-backtick fenced blocks are always extracted; the optional word
    directly after the opening fence is reported as the language. When
    ``strict`` is false, the fenced blocks are removed from the text and
    every remaining ``{ ... }`` span (shortest match) is added as well.

    Returns:
        A list of ``{"language": ..., "code": ...}`` dicts in order of
        appearance, fenced blocks first. ``language`` is ``"unknown"`` when
        none was given; ``code`` is stripped of surrounding whitespace.
    """
    fence_pattern = r"```(\w+)?\s*([\s\S]*?)```"

    snippets = [
        {
            "language": lang.strip() if lang else "unknown",
            "code": body.strip(),
        }
        for lang, body in re.findall(fence_pattern, text)
    ]

    if strict:
        return snippets

    # Drop the fenced blocks so their contents are not matched a second time.
    remainder = re.sub(fence_pattern, "", text)
    snippets.extend(
        {"language": "unknown", "code": braced.strip()}
        for braced in re.findall(r"\{[\s\S]*?\}", remainder)
    )
    return snippets
34
+
35
+
36
def parse_to_obj(text: str, strict=False, raise_error=True):
    """Parse the last code snippet found in *text* into a Python object.

    Snippets are located with ``extract_code_snippets``. The last non-empty
    one is tried first as a Python literal (``ast.literal_eval``) and then as
    JSON5 (``json5.loads``).

    Args:
        text: Text that may contain fenced or brace-delimited snippets.
        strict: Forwarded to ``extract_code_snippets``.
        raise_error: When true, raise if the snippet cannot be parsed;
            otherwise return ``None``.

    Returns:
        The parsed object, or ``None`` when no snippet is found (regardless
        of ``raise_error``) or when parsing fails with ``raise_error=False``.

    Raises:
        ValueError: if both parsers fail and ``raise_error`` is true.
    """
    candidates = [
        snippet["code"].strip()
        for snippet in extract_code_snippets(text, strict=strict)
    ]
    candidates = [code for code in candidates if code]
    if not candidates:
        return None

    code_str = candidates[-1]
    # Catch Exception rather than using a bare except, so KeyboardInterrupt
    # and SystemExit are never mistaken for a parse failure.
    try:
        return ast.literal_eval(code_str)
    except Exception:
        # Not a Python literal; fall back to the more lenient JSON5 parser.
        try:
            return json5.loads(code_str)
        except Exception as exc:
            if raise_error:
                raise ValueError(f"Failed to parse to obj: {text}") from exc
            return None
53
+
54
+
55
def parse_to_code(text: str, strict=False) -> Optional[str]:
    """Return the last non-empty code snippet found in *text*.

    Snippets are located with ``extract_code_snippets`` (``strict`` is
    forwarded unchanged). ``None`` is returned when nothing usable is found.
    """
    candidates = []
    for snippet in extract_code_snippets(text, strict=strict):
        body = snippet["code"].strip()
        if body:
            candidates.append(body)
    return candidates[-1] if candidates else None