maque 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maque/__init__.py +30 -0
- maque/__main__.py +926 -0
- maque/ai_platform/__init__.py +0 -0
- maque/ai_platform/crawl.py +45 -0
- maque/ai_platform/metrics.py +258 -0
- maque/ai_platform/nlp_preprocess.py +67 -0
- maque/ai_platform/webpage_screen_shot.py +195 -0
- maque/algorithms/__init__.py +78 -0
- maque/algorithms/bezier.py +15 -0
- maque/algorithms/bktree.py +117 -0
- maque/algorithms/core.py +104 -0
- maque/algorithms/hilbert.py +16 -0
- maque/algorithms/rate_function.py +92 -0
- maque/algorithms/transform.py +27 -0
- maque/algorithms/trie.py +272 -0
- maque/algorithms/utils.py +63 -0
- maque/algorithms/video.py +587 -0
- maque/api/__init__.py +1 -0
- maque/api/common.py +110 -0
- maque/api/fetch.py +26 -0
- maque/api/static/icon.png +0 -0
- maque/api/static/redoc.standalone.js +1782 -0
- maque/api/static/swagger-ui-bundle.js +3 -0
- maque/api/static/swagger-ui.css +3 -0
- maque/cli/__init__.py +1 -0
- maque/cli/clean_invisible_chars.py +324 -0
- maque/cli/core.py +34 -0
- maque/cli/groups/__init__.py +26 -0
- maque/cli/groups/config.py +205 -0
- maque/cli/groups/data.py +615 -0
- maque/cli/groups/doctor.py +259 -0
- maque/cli/groups/embedding.py +222 -0
- maque/cli/groups/git.py +29 -0
- maque/cli/groups/help.py +410 -0
- maque/cli/groups/llm.py +223 -0
- maque/cli/groups/mcp.py +241 -0
- maque/cli/groups/mllm.py +1795 -0
- maque/cli/groups/mllm_simple.py +60 -0
- maque/cli/groups/quant.py +210 -0
- maque/cli/groups/service.py +490 -0
- maque/cli/groups/system.py +570 -0
- maque/cli/mllm_run.py +1451 -0
- maque/cli/script.py +52 -0
- maque/cli/tree.py +49 -0
- maque/clustering/__init__.py +52 -0
- maque/clustering/analyzer.py +347 -0
- maque/clustering/clusterers.py +464 -0
- maque/clustering/sampler.py +134 -0
- maque/clustering/visualizer.py +205 -0
- maque/constant.py +13 -0
- maque/core.py +133 -0
- maque/cv/__init__.py +1 -0
- maque/cv/image.py +219 -0
- maque/cv/utils.py +68 -0
- maque/cv/video/__init__.py +3 -0
- maque/cv/video/keyframe_extractor.py +368 -0
- maque/embedding/__init__.py +43 -0
- maque/embedding/base.py +56 -0
- maque/embedding/multimodal.py +308 -0
- maque/embedding/server.py +523 -0
- maque/embedding/text.py +311 -0
- maque/git/__init__.py +24 -0
- maque/git/pure_git.py +912 -0
- maque/io/__init__.py +29 -0
- maque/io/core.py +38 -0
- maque/io/ops.py +194 -0
- maque/llm/__init__.py +111 -0
- maque/llm/backend.py +416 -0
- maque/llm/base.py +411 -0
- maque/llm/server.py +366 -0
- maque/mcp_server.py +1096 -0
- maque/mllm_data_processor_pipeline/__init__.py +17 -0
- maque/mllm_data_processor_pipeline/core.py +341 -0
- maque/mllm_data_processor_pipeline/example.py +291 -0
- maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
- maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
- maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
- maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
- maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
- maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
- maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
- maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
- maque/mllm_data_processor_pipeline/web_app.py +317 -0
- maque/nlp/__init__.py +14 -0
- maque/nlp/ngram.py +9 -0
- maque/nlp/parser.py +63 -0
- maque/nlp/risk_matcher.py +543 -0
- maque/nlp/sentence_splitter.py +202 -0
- maque/nlp/simple_tradition_cvt.py +31 -0
- maque/performance/__init__.py +21 -0
- maque/performance/_measure_time.py +70 -0
- maque/performance/_profiler.py +367 -0
- maque/performance/_stat_memory.py +51 -0
- maque/pipelines/__init__.py +15 -0
- maque/pipelines/clustering.py +252 -0
- maque/quantization/__init__.py +42 -0
- maque/quantization/auto_round.py +120 -0
- maque/quantization/base.py +145 -0
- maque/quantization/bitsandbytes.py +127 -0
- maque/quantization/llm_compressor.py +102 -0
- maque/retriever/__init__.py +35 -0
- maque/retriever/chroma.py +654 -0
- maque/retriever/document.py +140 -0
- maque/retriever/milvus.py +1140 -0
- maque/table_ops/__init__.py +1 -0
- maque/table_ops/core.py +133 -0
- maque/table_viewer/__init__.py +4 -0
- maque/table_viewer/download_assets.py +57 -0
- maque/table_viewer/server.py +698 -0
- maque/table_viewer/static/element-plus-icons.js +5791 -0
- maque/table_viewer/static/element-plus.css +1 -0
- maque/table_viewer/static/element-plus.js +65236 -0
- maque/table_viewer/static/main.css +268 -0
- maque/table_viewer/static/main.js +669 -0
- maque/table_viewer/static/vue.global.js +18227 -0
- maque/table_viewer/templates/index.html +401 -0
- maque/utils/__init__.py +56 -0
- maque/utils/color.py +68 -0
- maque/utils/color_string.py +45 -0
- maque/utils/compress.py +66 -0
- maque/utils/constant.py +183 -0
- maque/utils/core.py +261 -0
- maque/utils/cursor.py +143 -0
- maque/utils/distance.py +58 -0
- maque/utils/docker.py +96 -0
- maque/utils/downloads.py +51 -0
- maque/utils/excel_helper.py +542 -0
- maque/utils/helper_metrics.py +121 -0
- maque/utils/helper_parser.py +168 -0
- maque/utils/net.py +64 -0
- maque/utils/nvidia_stat.py +140 -0
- maque/utils/ops.py +53 -0
- maque/utils/packages.py +31 -0
- maque/utils/path.py +57 -0
- maque/utils/tar.py +260 -0
- maque/utils/untar.py +129 -0
- maque/web/__init__.py +0 -0
- maque/web/image_downloader.py +1410 -0
- maque-0.2.1.dist-info/METADATA +450 -0
- maque-0.2.1.dist-info/RECORD +143 -0
- maque-0.2.1.dist-info/WHEEL +4 -0
- maque-0.2.1.dist-info/entry_points.txt +3 -0
- maque-0.2.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
#! /usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
多模态 Embedding - 支持文本和图片 (jina-clip-v2 等)
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
import base64
|
|
10
|
+
import time
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import List, Optional, Union, Literal
|
|
13
|
+
|
|
14
|
+
import aiohttp
|
|
15
|
+
import requests
|
|
16
|
+
|
|
17
|
+
from .base import BaseEmbedding
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Accepted values for the ``input_type`` arguments below: "text" / "image"
# force the kind of every input, "auto" lets the client detect it per input.
InputType = Literal["text", "image", "auto"]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class MultiModalEmbedding(BaseEmbedding):
    """Client for multimodal embedding models (jina-clip-v2 and similar).

    Vectorizes text and images by POSTing to ``{base_url}/v1/embeddings``.
    Images may be passed as http(s) URLs, ``data:`` URLs, or local file
    paths; local files are inlined into the request as base64 data URLs.
    """

    # File suffix -> MIME type for the image formats this client recognises.
    # Single source of truth for both auto-detection and data-URL encoding.
    _IMAGE_MIME_TYPES = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
        ".bmp": "image/bmp",
    }

    def __init__(
        self,
        base_url: str,
        model: str = "jinaai/jina-clip-v2",
        api_key: str = "EMPTY",
        dimensions: Optional[int] = None,
        timeout: float = 60.0,
        max_retries: int = 3,
    ):
        """Initialise the multimodal embedding client.

        Args:
            base_url: Base URL of the API; a trailing "/" is stripped.
            model: Model name sent with every request.
            api_key: API key sent as a Bearer token.
            dimensions: Output dimension to request (if the model supports it).
            timeout: Request timeout in seconds.
            max_retries: Maximum number of attempts per request. Values
                below 1 are treated as 1 so a request is always attempted.
        """
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.api_key = api_key
        self._dimensions = dimensions
        self.timeout = timeout
        self.max_retries = max_retries
        # Filled in from the first successful response; see ``dimension``.
        self._actual_dimension: Optional[int] = None

    def _get_headers(self) -> dict:
        """Return the HTTP headers (JSON content type + Bearer auth)."""
        return {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }

    def _is_image(self, input_str: str) -> bool:
        """Guess whether ``input_str`` refers to an image.

        An input counts as an image when it is:
          * an http(s) URL containing a known image extension,
          * a ``data:image...`` URL, or
          * a path with a known image suffix that exists on disk.

        Anything else (including text that merely ends in ".png") is text.
        """
        if input_str.startswith(("http://", "https://")):
            lower = input_str.lower()
            # Substring (not suffix) match is kept so URLs that carry a
            # query string after the extension are still detected.
            return any(ext in lower for ext in self._IMAGE_MIME_TYPES)

        # Checked before touching the filesystem: data URLs can be huge and
        # are never local paths.
        if input_str.startswith("data:image"):
            return True

        try:
            candidate = Path(input_str)
            if candidate.suffix.lower() in self._IMAGE_MIME_TYPES:
                return candidate.exists()
        except (OSError, ValueError):
            # Arbitrary text is probed here; e.g. a very long sentence that
            # ends in ".png" makes stat() fail with "File name too long".
            # Such input is not a usable path, so treat it as text.
            return False
        return False

    def _encode_image(self, image_path: str) -> str:
        """Encode a local image file as a base64 data URL.

        Args:
            image_path: Path of the image on disk.

        Returns:
            ``data:<mime>;base64,<payload>``; the MIME type is derived from
            the file suffix and defaults to ``image/jpeg`` when unknown.

        Raises:
            FileNotFoundError: If ``image_path`` does not exist.
        """
        path = Path(image_path)
        if not path.exists():
            raise FileNotFoundError(f"图片不存在: {image_path}")

        mime = self._IMAGE_MIME_TYPES.get(path.suffix.lower(), "image/jpeg")
        data = base64.b64encode(path.read_bytes()).decode("utf-8")
        return f"data:{mime};base64,{data}"

    def _prepare_input(
        self,
        input_str: str,
        input_type: InputType = "auto",
    ) -> dict:
        """Convert one input into the per-item format used in the payload.

        Args:
            input_str: Text, image path, image URL or data URL.
            input_type: "text" / "image" to force the kind, "auto" to detect
                it with ``_is_image``.

        Returns:
            ``{"text": "..."}`` or ``{"image": "..."}``.

        Raises:
            FileNotFoundError: If the input is treated as a local image and
                the file does not exist.
        """
        if input_type == "auto":
            is_image = self._is_image(input_str)
        else:
            is_image = input_type == "image"

        if not is_image:
            return {"text": input_str}

        # URLs and data URLs are sent as-is; local files must be inlined.
        if not input_str.startswith(("http://", "https://", "data:")):
            input_str = self._encode_image(input_str)
        return {"image": input_str}

    def _build_payload(
        self,
        inputs: List[str],
        input_type: InputType = "auto",
        dimensions: Optional[int] = None,
    ) -> dict:
        """Build the JSON request body for the embeddings endpoint.

        ``dimensions`` falls back to the value given at construction time and
        is only included in the payload when one of the two is set.
        """
        payload = {
            "model": self.model,
            "input": [self._prepare_input(inp, input_type) for inp in inputs],
        }

        dimensions = dimensions or self._dimensions
        if dimensions:
            payload["dimensions"] = dimensions

        return payload

    def _parse_response(self, data: dict) -> List[List[float]]:
        """Extract the embeddings from a response body, ordered by ``index``.

        Also records the length of the first vector as the observed
        dimension the first time a non-empty response is parsed.
        """
        results = sorted(data["data"], key=lambda x: x["index"])
        embeddings = [item["embedding"] for item in results]

        if embeddings and not self._actual_dimension:
            self._actual_dimension = len(embeddings[0])

        return embeddings

    @staticmethod
    def _validate_batch_size(batch_size: int) -> None:
        """Reject batch sizes that would make the batching loop misbehave.

        Zero makes ``range()`` fail with an unrelated message and a negative
        step yields no batches at all, so both are refused up front.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # ========== BaseEmbedding interface ==========

    def embed(
        self,
        inputs: Union[str, List[str]],
        input_type: InputType = "auto",
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Vectorize inputs (text or images) synchronously.

        Args:
            inputs: One input or a list of inputs (text / image path /
                image URL).
            input_type: "text" / "image" / "auto".
            dimensions: Output dimension; overrides the constructor value.

        Returns:
            One vector per input, in input order. An empty list of inputs
            yields ``[]`` without sending a request.

        Raises:
            requests.RequestException: If the last attempt fails.
        """
        if isinstance(inputs, str):
            inputs = [inputs]
        if not inputs:
            return []

        url = f"{self.base_url}/v1/embeddings"
        payload = self._build_payload(inputs, input_type, dimensions)

        # Always try at least once, even if max_retries was configured <= 0;
        # otherwise the loop would be skipped and None returned silently.
        attempts = max(1, self.max_retries)
        for attempt in range(attempts):
            try:
                response = requests.post(
                    url,
                    json=payload,
                    headers=self._get_headers(),
                    timeout=self.timeout,
                )
                response.raise_for_status()
                return self._parse_response(response.json())
            except requests.RequestException:
                if attempt == attempts - 1:
                    raise
                # Linear back-off: 0.5s, 1.0s, 1.5s, ...
                time.sleep(0.5 * (attempt + 1))

    async def aembed(
        self,
        inputs: Union[str, List[str]],
        input_type: InputType = "auto",
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Vectorize inputs asynchronously; same contract as ``embed``.

        Raises:
            aiohttp.ClientError: If the last attempt fails.
            asyncio.TimeoutError: If the last attempt times out.
        """
        if isinstance(inputs, str):
            inputs = [inputs]
        if not inputs:
            return []

        url = f"{self.base_url}/v1/embeddings"
        payload = self._build_payload(inputs, input_type, dimensions)

        attempts = max(1, self.max_retries)
        timeout = aiohttp.ClientTimeout(total=self.timeout)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            for attempt in range(attempts):
                try:
                    async with session.post(
                        url,
                        json=payload,
                        headers=self._get_headers(),
                    ) as response:
                        response.raise_for_status()
                        data = await response.json()
                        return self._parse_response(data)
                # A ClientTimeout expiry surfaces as asyncio.TimeoutError,
                # which is not an aiohttp.ClientError; catch both so that
                # timeouts are retried just like in the sync path.
                except (aiohttp.ClientError, asyncio.TimeoutError):
                    if attempt == attempts - 1:
                        raise
                    await asyncio.sleep(0.5 * (attempt + 1))

    @property
    def dimension(self) -> int:
        """Vector dimension.

        Preference order: the configured ``dimensions``, then the dimension
        observed in a previous response, then 768 as a fallback.
        """
        if self._dimensions:
            return self._dimensions
        if self._actual_dimension:
            return self._actual_dimension
        return 768  # CLIP default dimension

    @property
    def supports_image(self) -> bool:
        """Whether image inputs are supported (always ``True`` here)."""
        return True

    # ========== Convenience methods ==========

    def embed_text(
        self,
        texts: Union[str, List[str]],
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Vectorize inputs, treating every one of them as text."""
        return self.embed(texts, input_type="text", dimensions=dimensions)

    def embed_image(
        self,
        images: Union[str, List[str]],
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Vectorize inputs, treating every one of them as an image."""
        return self.embed(images, input_type="image", dimensions=dimensions)

    async def aembed_text(
        self,
        texts: Union[str, List[str]],
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Async variant of ``embed_text``."""
        return await self.aembed(texts, input_type="text", dimensions=dimensions)

    async def aembed_image(
        self,
        images: Union[str, List[str]],
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Async variant of ``embed_image``."""
        return await self.aembed(images, input_type="image", dimensions=dimensions)

    def embed_batch(
        self,
        inputs: List[str],
        batch_size: int = 16,
        input_type: InputType = "auto",
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Vectorize ``inputs`` in sequential batches of ``batch_size``.

        Runs synchronously through ``embed`` instead of wrapping the async
        variant in ``asyncio.run``, so it can also be called from code that
        is already running inside an event loop.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
            requests.RequestException: If a batch still fails on its last
                attempt.
        """
        self._validate_batch_size(batch_size)
        all_embeddings = [None] * len(inputs)

        for start in range(0, len(inputs), batch_size):
            batch = inputs[start : start + batch_size]
            vectors = self.embed(batch, input_type, dimensions)
            for offset, vec in enumerate(vectors):
                all_embeddings[start + offset] = vec

        return all_embeddings

    async def aembed_batch(
        self,
        inputs: List[str],
        batch_size: int = 16,
        input_type: InputType = "auto",
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Async batch vectorization; batches are awaited one after another.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        self._validate_batch_size(batch_size)
        all_embeddings = [None] * len(inputs)

        for start in range(0, len(inputs), batch_size):
            batch = inputs[start : start + batch_size]
            vectors = await self.aembed(batch, input_type, dimensions)
            for offset, vec in enumerate(vectors):
                all_embeddings[start + offset] = vec

        return all_embeddings

    def __repr__(self) -> str:
        return f"MultiModalEmbedding(base_url={self.base_url!r}, model={self.model!r})"
|