maque 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143):
  1. maque/__init__.py +30 -0
  2. maque/__main__.py +926 -0
  3. maque/ai_platform/__init__.py +0 -0
  4. maque/ai_platform/crawl.py +45 -0
  5. maque/ai_platform/metrics.py +258 -0
  6. maque/ai_platform/nlp_preprocess.py +67 -0
  7. maque/ai_platform/webpage_screen_shot.py +195 -0
  8. maque/algorithms/__init__.py +78 -0
  9. maque/algorithms/bezier.py +15 -0
  10. maque/algorithms/bktree.py +117 -0
  11. maque/algorithms/core.py +104 -0
  12. maque/algorithms/hilbert.py +16 -0
  13. maque/algorithms/rate_function.py +92 -0
  14. maque/algorithms/transform.py +27 -0
  15. maque/algorithms/trie.py +272 -0
  16. maque/algorithms/utils.py +63 -0
  17. maque/algorithms/video.py +587 -0
  18. maque/api/__init__.py +1 -0
  19. maque/api/common.py +110 -0
  20. maque/api/fetch.py +26 -0
  21. maque/api/static/icon.png +0 -0
  22. maque/api/static/redoc.standalone.js +1782 -0
  23. maque/api/static/swagger-ui-bundle.js +3 -0
  24. maque/api/static/swagger-ui.css +3 -0
  25. maque/cli/__init__.py +1 -0
  26. maque/cli/clean_invisible_chars.py +324 -0
  27. maque/cli/core.py +34 -0
  28. maque/cli/groups/__init__.py +26 -0
  29. maque/cli/groups/config.py +205 -0
  30. maque/cli/groups/data.py +615 -0
  31. maque/cli/groups/doctor.py +259 -0
  32. maque/cli/groups/embedding.py +222 -0
  33. maque/cli/groups/git.py +29 -0
  34. maque/cli/groups/help.py +410 -0
  35. maque/cli/groups/llm.py +223 -0
  36. maque/cli/groups/mcp.py +241 -0
  37. maque/cli/groups/mllm.py +1795 -0
  38. maque/cli/groups/mllm_simple.py +60 -0
  39. maque/cli/groups/quant.py +210 -0
  40. maque/cli/groups/service.py +490 -0
  41. maque/cli/groups/system.py +570 -0
  42. maque/cli/mllm_run.py +1451 -0
  43. maque/cli/script.py +52 -0
  44. maque/cli/tree.py +49 -0
  45. maque/clustering/__init__.py +52 -0
  46. maque/clustering/analyzer.py +347 -0
  47. maque/clustering/clusterers.py +464 -0
  48. maque/clustering/sampler.py +134 -0
  49. maque/clustering/visualizer.py +205 -0
  50. maque/constant.py +13 -0
  51. maque/core.py +133 -0
  52. maque/cv/__init__.py +1 -0
  53. maque/cv/image.py +219 -0
  54. maque/cv/utils.py +68 -0
  55. maque/cv/video/__init__.py +3 -0
  56. maque/cv/video/keyframe_extractor.py +368 -0
  57. maque/embedding/__init__.py +43 -0
  58. maque/embedding/base.py +56 -0
  59. maque/embedding/multimodal.py +308 -0
  60. maque/embedding/server.py +523 -0
  61. maque/embedding/text.py +311 -0
  62. maque/git/__init__.py +24 -0
  63. maque/git/pure_git.py +912 -0
  64. maque/io/__init__.py +29 -0
  65. maque/io/core.py +38 -0
  66. maque/io/ops.py +194 -0
  67. maque/llm/__init__.py +111 -0
  68. maque/llm/backend.py +416 -0
  69. maque/llm/base.py +411 -0
  70. maque/llm/server.py +366 -0
  71. maque/mcp_server.py +1096 -0
  72. maque/mllm_data_processor_pipeline/__init__.py +17 -0
  73. maque/mllm_data_processor_pipeline/core.py +341 -0
  74. maque/mllm_data_processor_pipeline/example.py +291 -0
  75. maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
  76. maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
  77. maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
  78. maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
  79. maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
  80. maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
  81. maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
  82. maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
  83. maque/mllm_data_processor_pipeline/web_app.py +317 -0
  84. maque/nlp/__init__.py +14 -0
  85. maque/nlp/ngram.py +9 -0
  86. maque/nlp/parser.py +63 -0
  87. maque/nlp/risk_matcher.py +543 -0
  88. maque/nlp/sentence_splitter.py +202 -0
  89. maque/nlp/simple_tradition_cvt.py +31 -0
  90. maque/performance/__init__.py +21 -0
  91. maque/performance/_measure_time.py +70 -0
  92. maque/performance/_profiler.py +367 -0
  93. maque/performance/_stat_memory.py +51 -0
  94. maque/pipelines/__init__.py +15 -0
  95. maque/pipelines/clustering.py +252 -0
  96. maque/quantization/__init__.py +42 -0
  97. maque/quantization/auto_round.py +120 -0
  98. maque/quantization/base.py +145 -0
  99. maque/quantization/bitsandbytes.py +127 -0
  100. maque/quantization/llm_compressor.py +102 -0
  101. maque/retriever/__init__.py +35 -0
  102. maque/retriever/chroma.py +654 -0
  103. maque/retriever/document.py +140 -0
  104. maque/retriever/milvus.py +1140 -0
  105. maque/table_ops/__init__.py +1 -0
  106. maque/table_ops/core.py +133 -0
  107. maque/table_viewer/__init__.py +4 -0
  108. maque/table_viewer/download_assets.py +57 -0
  109. maque/table_viewer/server.py +698 -0
  110. maque/table_viewer/static/element-plus-icons.js +5791 -0
  111. maque/table_viewer/static/element-plus.css +1 -0
  112. maque/table_viewer/static/element-plus.js +65236 -0
  113. maque/table_viewer/static/main.css +268 -0
  114. maque/table_viewer/static/main.js +669 -0
  115. maque/table_viewer/static/vue.global.js +18227 -0
  116. maque/table_viewer/templates/index.html +401 -0
  117. maque/utils/__init__.py +56 -0
  118. maque/utils/color.py +68 -0
  119. maque/utils/color_string.py +45 -0
  120. maque/utils/compress.py +66 -0
  121. maque/utils/constant.py +183 -0
  122. maque/utils/core.py +261 -0
  123. maque/utils/cursor.py +143 -0
  124. maque/utils/distance.py +58 -0
  125. maque/utils/docker.py +96 -0
  126. maque/utils/downloads.py +51 -0
  127. maque/utils/excel_helper.py +542 -0
  128. maque/utils/helper_metrics.py +121 -0
  129. maque/utils/helper_parser.py +168 -0
  130. maque/utils/net.py +64 -0
  131. maque/utils/nvidia_stat.py +140 -0
  132. maque/utils/ops.py +53 -0
  133. maque/utils/packages.py +31 -0
  134. maque/utils/path.py +57 -0
  135. maque/utils/tar.py +260 -0
  136. maque/utils/untar.py +129 -0
  137. maque/web/__init__.py +0 -0
  138. maque/web/image_downloader.py +1410 -0
  139. maque-0.2.1.dist-info/METADATA +450 -0
  140. maque-0.2.1.dist-info/RECORD +143 -0
  141. maque-0.2.1.dist-info/WHEEL +4 -0
  142. maque-0.2.1.dist-info/entry_points.txt +3 -0
  143. maque-0.2.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,308 @@
1
+ #! /usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ 多模态 Embedding - 支持文本和图片 (jina-clip-v2 等)
6
+ """
7
+
8
+ import asyncio
9
+ import base64
10
+ import time
11
+ from pathlib import Path
12
+ from typing import List, Optional, Union, Literal
13
+
14
+ import aiohttp
15
+ import requests
16
+
17
+ from .base import BaseEmbedding
18
+
19
+
20
# Accepted values for the ``input_type`` arguments below: "text" and "image"
# force the modality, while "auto" lets the client guess it for each input.
InputType = Literal["text", "image", "auto"]
21
+
22
+
23
class MultiModalEmbedding(BaseEmbedding):
    """Client for a multimodal (text + image) embedding service.

    Requests are POSTed to ``{base_url}/v1/embeddings``. Each input is sent as
    either ``{"text": ...}`` or ``{"image": ...}``; the response is expected to
    contain a ``data`` list whose items carry ``index`` and ``embedding``.
    The default model is ``jinaai/jina-clip-v2``.
    """

    # Recognised image extensions mapped to the MIME type used when a local
    # file is inlined as a base64 data URL. Also drives image auto-detection.
    _IMAGE_MIME_TYPES = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
        ".bmp": "image/bmp",
    }

    def __init__(
        self,
        base_url: str,
        model: str = "jinaai/jina-clip-v2",
        api_key: str = "EMPTY",
        dimensions: Optional[int] = None,
        timeout: float = 60.0,
        max_retries: int = 3,
    ):
        """Initialise the client.

        Args:
            base_url: Base URL of the API; a trailing ``/`` is stripped.
            model: Model name sent with every request.
            api_key: Key sent as a ``Bearer`` token.
            dimensions: Default output dimension, sent only when truthy.
            timeout: Request timeout in seconds.
            max_retries: Maximum number of attempts per request. Values
                below 1 are treated as 1 when a request is made.
        """
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.api_key = api_key
        self._dimensions = dimensions
        self.timeout = timeout
        self.max_retries = max_retries
        # Filled in from the first successfully parsed response.
        self._actual_dimension: Optional[int] = None

    def _get_headers(self) -> dict:
        """Return the JSON content-type and bearer-auth request headers."""
        return {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }

    def _is_image(self, input_str: str) -> bool:
        """Heuristically decide whether ``input_str`` refers to an image.

        An input counts as an image when it is a ``data:image`` URL, an
        http(s) URL containing a known image extension, or a path with a
        known image extension that exists on disk.
        """
        # Checked first so a (possibly huge) base64 payload is never handed
        # to the filesystem.
        if input_str.startswith("data:image"):
            return True

        if input_str.startswith(("http://", "https://")):
            lower = input_str.lower()
            # Substring match on purpose: the extension may sit in the query
            # string rather than at the end of the URL.
            return any(ext in lower for ext in self._IMAGE_MIME_TYPES)

        try:
            path = Path(input_str)
            if path.suffix.lower() in self._IMAGE_MIME_TYPES:
                return path.exists()
        except (OSError, ValueError):
            # Plain text that merely ends in ".png" can be too long (or
            # otherwise invalid) as a file name; treat it as text.
            return False
        return False

    def _encode_image(self, image_path: str) -> str:
        """Read a local image and return it as a base64 data URL.

        Raises:
            FileNotFoundError: If ``image_path`` does not exist.
        """
        path = Path(image_path)
        if not path.exists():
            raise FileNotFoundError(f"图片不存在: {image_path}")

        # Unknown extensions fall back to JPEG.
        mime = self._IMAGE_MIME_TYPES.get(path.suffix.lower(), "image/jpeg")
        data = base64.b64encode(path.read_bytes()).decode("utf-8")
        return f"data:{mime};base64,{data}"

    def _prepare_input(
        self,
        input_str: str,
        input_type: InputType = "auto",
    ) -> dict:
        """Convert one input into the API's per-item format.

        Returns:
            ``{"text": "..."}`` or ``{"image": "..."}``.
        """
        if input_type == "auto":
            is_image = self._is_image(input_str)
        else:
            is_image = input_type == "image"

        if not is_image:
            return {"text": input_str}

        # Remote URLs and data URLs are passed through; anything else is
        # treated as a local file and inlined.
        if not input_str.startswith(("http://", "https://", "data:")):
            input_str = self._encode_image(input_str)
        return {"image": input_str}

    def _build_payload(
        self,
        inputs: List[str],
        input_type: InputType = "auto",
        dimensions: Optional[int] = None,
    ) -> dict:
        """Build the JSON request body for ``inputs``."""
        payload = {
            "model": self.model,
            "input": [self._prepare_input(inp, input_type) for inp in inputs],
        }

        # A per-call value wins over the instance default.
        dimensions = dimensions or self._dimensions
        if dimensions:
            payload["dimensions"] = dimensions

        return payload

    def _parse_response(self, data: dict) -> List[List[float]]:
        """Extract embeddings from a response, ordered by their ``index``."""
        results = sorted(data["data"], key=lambda item: item["index"])
        embeddings = [item["embedding"] for item in results]

        # Remember the dimension the service actually returned.
        if embeddings and not self._actual_dimension:
            self._actual_dimension = len(embeddings[0])

        return embeddings

    # ========== BaseEmbedding interface ==========

    def embed(
        self,
        inputs: Union[str, List[str]],
        input_type: InputType = "auto",
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Embed text and/or image inputs synchronously.

        Args:
            inputs: One input or a list of inputs (text, image path, image
                URL or image data URL).
            input_type: ``"text"``, ``"image"`` or ``"auto"``.
            dimensions: Output dimension; overrides the instance default.

        Returns:
            One embedding per input, in input order. An empty input list
            yields ``[]`` without contacting the service.

        Raises:
            requests.RequestException: If the final attempt fails.
        """
        if isinstance(inputs, str):
            inputs = [inputs]
        if not inputs:
            return []

        url = f"{self.base_url}/v1/embeddings"
        payload = self._build_payload(inputs, input_type, dimensions)

        # Always try at least once so a non-positive ``max_retries`` cannot
        # make this method fall through and return None.
        attempts = max(1, self.max_retries)
        attempt = 0
        while True:
            attempt += 1
            try:
                response = requests.post(
                    url,
                    json=payload,
                    headers=self._get_headers(),
                    timeout=self.timeout,
                )
                response.raise_for_status()
                return self._parse_response(response.json())
            except requests.RequestException:
                if attempt >= attempts:
                    raise
                # Linear back-off: 0.5s, 1.0s, 1.5s, ...
                time.sleep(0.5 * attempt)

    async def aembed(
        self,
        inputs: Union[str, List[str]],
        input_type: InputType = "auto",
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Embed text and/or image inputs asynchronously.

        Takes the same arguments and returns the same value as :meth:`embed`.

        Raises:
            aiohttp.ClientError: If the final attempt fails.
            asyncio.TimeoutError: If the final attempt times out.
        """
        if isinstance(inputs, str):
            inputs = [inputs]
        if not inputs:
            return []

        url = f"{self.base_url}/v1/embeddings"
        payload = self._build_payload(inputs, input_type, dimensions)

        attempts = max(1, self.max_retries)
        timeout = aiohttp.ClientTimeout(total=self.timeout)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            attempt = 0
            while True:
                attempt += 1
                try:
                    async with session.post(
                        url,
                        json=payload,
                        headers=self._get_headers(),
                    ) as response:
                        response.raise_for_status()
                        return self._parse_response(await response.json())
                # A total-timeout expiry is an asyncio.TimeoutError, which is
                # not an aiohttp.ClientError; catch both so timeouts are
                # retried just like in the synchronous path.
                except (aiohttp.ClientError, asyncio.TimeoutError):
                    if attempt >= attempts:
                        raise
                    await asyncio.sleep(0.5 * attempt)

    @property
    def dimension(self) -> int:
        """Embedding dimension.

        Prefers the configured value, then the dimension observed in a
        previous response, and finally falls back to 768.
        """
        if self._dimensions:
            return self._dimensions
        if self._actual_dimension:
            return self._actual_dimension
        return 768  # default CLIP dimension

    @property
    def supports_image(self) -> bool:
        """Always ``True``: this client accepts image inputs."""
        return True

    # ========== Convenience helpers ==========

    def embed_text(
        self,
        texts: Union[str, List[str]],
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Embed inputs, forcing them to be treated as text."""
        return self.embed(texts, input_type="text", dimensions=dimensions)

    def embed_image(
        self,
        images: Union[str, List[str]],
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Embed inputs, forcing them to be treated as images."""
        return self.embed(images, input_type="image", dimensions=dimensions)

    async def aembed_text(
        self,
        texts: Union[str, List[str]],
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Async variant of :meth:`embed_text`."""
        return await self.aembed(texts, input_type="text", dimensions=dimensions)

    async def aembed_image(
        self,
        images: Union[str, List[str]],
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Async variant of :meth:`embed_image`."""
        return await self.aembed(images, input_type="image", dimensions=dimensions)

    def embed_batch(
        self,
        inputs: List[str],
        batch_size: int = 16,
        input_type: InputType = "auto",
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Embed ``inputs`` in batches, blocking until all are done.

        Drives :meth:`aembed_batch` to completion. When called from a thread
        that already runs an event loop (where ``asyncio.run`` would raise
        ``RuntimeError``), the work is executed on a helper thread instead.
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop in this thread: the straightforward case.
            return asyncio.run(
                self.aembed_batch(inputs, batch_size, input_type, dimensions)
            )

        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=1) as pool:
            future = pool.submit(
                asyncio.run,
                self.aembed_batch(inputs, batch_size, input_type, dimensions),
            )
            return future.result()

    async def aembed_batch(
        self,
        inputs: List[str],
        batch_size: int = 16,
        input_type: InputType = "auto",
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Embed ``inputs`` in sequential batches of ``batch_size``.

        Returns:
            One embedding per input, in input order.

        Raises:
            ValueError: If ``batch_size`` is below 1, or the service returns
                a different number of embeddings than inputs in a batch.
        """
        # A negative step would make the loop a no-op and hand back a list of
        # placeholders, so reject it up front.
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        all_embeddings: List[List[float]] = []
        for start in range(0, len(inputs), batch_size):
            batch = inputs[start : start + batch_size]
            vectors = await self.aembed(batch, input_type, dimensions)
            if len(vectors) != len(batch):
                raise ValueError(
                    f"expected {len(batch)} embeddings, got {len(vectors)}"
                )
            all_embeddings.extend(vectors)

        return all_embeddings

    def __repr__(self) -> str:
        return f"MultiModalEmbedding(base_url={self.base_url!r}, model={self.model!r})"