maque 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maque/__init__.py +30 -0
- maque/__main__.py +926 -0
- maque/ai_platform/__init__.py +0 -0
- maque/ai_platform/crawl.py +45 -0
- maque/ai_platform/metrics.py +258 -0
- maque/ai_platform/nlp_preprocess.py +67 -0
- maque/ai_platform/webpage_screen_shot.py +195 -0
- maque/algorithms/__init__.py +78 -0
- maque/algorithms/bezier.py +15 -0
- maque/algorithms/bktree.py +117 -0
- maque/algorithms/core.py +104 -0
- maque/algorithms/hilbert.py +16 -0
- maque/algorithms/rate_function.py +92 -0
- maque/algorithms/transform.py +27 -0
- maque/algorithms/trie.py +272 -0
- maque/algorithms/utils.py +63 -0
- maque/algorithms/video.py +587 -0
- maque/api/__init__.py +1 -0
- maque/api/common.py +110 -0
- maque/api/fetch.py +26 -0
- maque/api/static/icon.png +0 -0
- maque/api/static/redoc.standalone.js +1782 -0
- maque/api/static/swagger-ui-bundle.js +3 -0
- maque/api/static/swagger-ui.css +3 -0
- maque/cli/__init__.py +1 -0
- maque/cli/clean_invisible_chars.py +324 -0
- maque/cli/core.py +34 -0
- maque/cli/groups/__init__.py +26 -0
- maque/cli/groups/config.py +205 -0
- maque/cli/groups/data.py +615 -0
- maque/cli/groups/doctor.py +259 -0
- maque/cli/groups/embedding.py +222 -0
- maque/cli/groups/git.py +29 -0
- maque/cli/groups/help.py +410 -0
- maque/cli/groups/llm.py +223 -0
- maque/cli/groups/mcp.py +241 -0
- maque/cli/groups/mllm.py +1795 -0
- maque/cli/groups/mllm_simple.py +60 -0
- maque/cli/groups/quant.py +210 -0
- maque/cli/groups/service.py +490 -0
- maque/cli/groups/system.py +570 -0
- maque/cli/mllm_run.py +1451 -0
- maque/cli/script.py +52 -0
- maque/cli/tree.py +49 -0
- maque/clustering/__init__.py +52 -0
- maque/clustering/analyzer.py +347 -0
- maque/clustering/clusterers.py +464 -0
- maque/clustering/sampler.py +134 -0
- maque/clustering/visualizer.py +205 -0
- maque/constant.py +13 -0
- maque/core.py +133 -0
- maque/cv/__init__.py +1 -0
- maque/cv/image.py +219 -0
- maque/cv/utils.py +68 -0
- maque/cv/video/__init__.py +3 -0
- maque/cv/video/keyframe_extractor.py +368 -0
- maque/embedding/__init__.py +43 -0
- maque/embedding/base.py +56 -0
- maque/embedding/multimodal.py +308 -0
- maque/embedding/server.py +523 -0
- maque/embedding/text.py +311 -0
- maque/git/__init__.py +24 -0
- maque/git/pure_git.py +912 -0
- maque/io/__init__.py +29 -0
- maque/io/core.py +38 -0
- maque/io/ops.py +194 -0
- maque/llm/__init__.py +111 -0
- maque/llm/backend.py +416 -0
- maque/llm/base.py +411 -0
- maque/llm/server.py +366 -0
- maque/mcp_server.py +1096 -0
- maque/mllm_data_processor_pipeline/__init__.py +17 -0
- maque/mllm_data_processor_pipeline/core.py +341 -0
- maque/mllm_data_processor_pipeline/example.py +291 -0
- maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
- maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
- maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
- maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
- maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
- maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
- maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
- maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
- maque/mllm_data_processor_pipeline/web_app.py +317 -0
- maque/nlp/__init__.py +14 -0
- maque/nlp/ngram.py +9 -0
- maque/nlp/parser.py +63 -0
- maque/nlp/risk_matcher.py +543 -0
- maque/nlp/sentence_splitter.py +202 -0
- maque/nlp/simple_tradition_cvt.py +31 -0
- maque/performance/__init__.py +21 -0
- maque/performance/_measure_time.py +70 -0
- maque/performance/_profiler.py +367 -0
- maque/performance/_stat_memory.py +51 -0
- maque/pipelines/__init__.py +15 -0
- maque/pipelines/clustering.py +252 -0
- maque/quantization/__init__.py +42 -0
- maque/quantization/auto_round.py +120 -0
- maque/quantization/base.py +145 -0
- maque/quantization/bitsandbytes.py +127 -0
- maque/quantization/llm_compressor.py +102 -0
- maque/retriever/__init__.py +35 -0
- maque/retriever/chroma.py +654 -0
- maque/retriever/document.py +140 -0
- maque/retriever/milvus.py +1140 -0
- maque/table_ops/__init__.py +1 -0
- maque/table_ops/core.py +133 -0
- maque/table_viewer/__init__.py +4 -0
- maque/table_viewer/download_assets.py +57 -0
- maque/table_viewer/server.py +698 -0
- maque/table_viewer/static/element-plus-icons.js +5791 -0
- maque/table_viewer/static/element-plus.css +1 -0
- maque/table_viewer/static/element-plus.js +65236 -0
- maque/table_viewer/static/main.css +268 -0
- maque/table_viewer/static/main.js +669 -0
- maque/table_viewer/static/vue.global.js +18227 -0
- maque/table_viewer/templates/index.html +401 -0
- maque/utils/__init__.py +56 -0
- maque/utils/color.py +68 -0
- maque/utils/color_string.py +45 -0
- maque/utils/compress.py +66 -0
- maque/utils/constant.py +183 -0
- maque/utils/core.py +261 -0
- maque/utils/cursor.py +143 -0
- maque/utils/distance.py +58 -0
- maque/utils/docker.py +96 -0
- maque/utils/downloads.py +51 -0
- maque/utils/excel_helper.py +542 -0
- maque/utils/helper_metrics.py +121 -0
- maque/utils/helper_parser.py +168 -0
- maque/utils/net.py +64 -0
- maque/utils/nvidia_stat.py +140 -0
- maque/utils/ops.py +53 -0
- maque/utils/packages.py +31 -0
- maque/utils/path.py +57 -0
- maque/utils/tar.py +260 -0
- maque/utils/untar.py +129 -0
- maque/web/__init__.py +0 -0
- maque/web/image_downloader.py +1410 -0
- maque-0.2.1.dist-info/METADATA +450 -0
- maque-0.2.1.dist-info/RECORD +143 -0
- maque-0.2.1.dist-info/WHEEL +4 -0
- maque-0.2.1.dist-info/entry_points.txt +3 -0
- maque-0.2.1.dist-info/licenses/LICENSE +21 -0
maque/embedding/text.py
ADDED
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
#! /usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
文本 Embedding - 支持 vLLM/OpenAI 兼容的 API
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
import time
|
|
10
|
+
from typing import List, Optional, Union, Literal
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
|
|
13
|
+
import aiohttp
|
|
14
|
+
import requests
|
|
15
|
+
|
|
16
|
+
from .base import BaseEmbedding
|
|
17
|
+
|
|
18
|
+
# Task types supported by jina-embeddings-v3
TaskType = Literal[
    "text-matching",  # semantic similarity, symmetric retrieval
    "retrieval.query",  # asymmetric retrieval - query side
    "retrieval.passage",  # asymmetric retrieval - document side
    "classification",  # classification tasks
    "separation",  # clustering, reranking
]
|
|
26
|
+
|
|
27
|
+
@dataclass
class EmbeddingResult:
    """One embedding vector taken from an API response.

    Attributes:
        index: Position reported by the API for this item.
        embedding: The vector itself.
        text: The input text associated with ``index``; empty when unknown.
    """

    index: int
    embedding: List[float]
    text: str = field(default="")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class EmbeddingResponse:
    """Embedding response: the parsed result of one embeddings API call."""

    # Per-input results (the parser in this file stores them sorted by index).
    embeddings: List[EmbeddingResult]
    # Model name reported by the server, or the client's model as a fallback.
    model: str
    # The response's "usage" object; empty dict when the server sends none.
    usage: dict = field(default_factory=dict)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class TextEmbedding(BaseEmbedding):
    """
    Text embedding client.

    Works against vLLM and other OpenAI-compatible embedding APIs
    (jina-v3, bge-m3, etc.) by POSTing to ``{base_url}/v1/embeddings``.
    """

    def __init__(
        self,
        base_url: str,
        model: str,
        api_key: str = "EMPTY",
        task: Optional[TaskType] = None,
        dimensions: Optional[int] = None,
        local_truncate: bool = False,
        timeout: float = 60.0,
        max_retries: int = 3,
    ):
        """
        Initialize the text embedding client.

        Args:
            base_url: API base URL, e.g. http://localhost:8000
            model: Model name, e.g. jinaai/jina-embeddings-v3
            api_key: API key; vLLM does not require one by default
            task: Default task type (supported by jina-v3)
            dimensions: Default output dimension (Matryoshka)
            local_truncate: Truncate vectors locally instead of sending
                ``dimensions`` (for servers that do not support it)
            timeout: Request timeout in seconds
            max_retries: Total number of attempts per request; values
                below 1 are treated as 1
        """
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.api_key = api_key
        self.task = task
        self._dimensions = dimensions
        self.local_truncate = local_truncate
        self.timeout = timeout
        self.max_retries = max_retries
        # Filled in by the first parsed response.
        self._actual_dimension: Optional[int] = None

    def _get_headers(self) -> dict:
        """Return the HTTP headers sent with every request."""
        return {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }

    def _build_payload(
        self,
        texts: List[str],
        task: Optional[TaskType] = None,
        dimensions: Optional[int] = None,
        include_dimensions: bool = True,
    ) -> dict:
        """Build the JSON request payload.

        Per-call ``task`` / ``dimensions`` take precedence over the values
        given to the constructor. ``dimensions`` is only added when
        ``include_dimensions`` is true.
        """
        payload = {
            "model": self.model,
            "input": texts,
        }
        task = task or self.task
        dimensions = dimensions or self._dimensions

        if task:
            payload["task"] = task
        if dimensions and include_dimensions:
            payload["dimensions"] = dimensions

        return payload

    def _parse_response(
        self, data: dict, texts: List[str], truncate_to: Optional[int] = None
    ) -> EmbeddingResponse:
        """Parse an API response.

        Args:
            data: Decoded JSON body of the API response
            texts: The original input texts
            truncate_to: Local truncation dimension (MRL mode); vectors
                longer than this are cut to their first ``truncate_to`` values

        Returns:
            EmbeddingResponse with results sorted by index
        """
        results = []
        for item in data["data"]:
            embedding = item["embedding"]
            # Local truncation (MRL mode)
            if truncate_to and len(embedding) > truncate_to:
                embedding = embedding[:truncate_to]
            results.append(
                EmbeddingResult(
                    index=item["index"],
                    embedding=embedding,
                    text=texts[item["index"]] if item["index"] < len(texts) else "",
                )
            )

        # Remember the actual dimension (after truncation) on first use.
        if results and not self._actual_dimension:
            self._actual_dimension = len(results[0].embedding)

        return EmbeddingResponse(
            embeddings=sorted(results, key=lambda x: x.index),
            model=data.get("model", self.model),
            usage=data.get("usage", {}),
        )

    def _prepare_request(
        self,
        texts: Union[str, List[str]],
        task: Optional[TaskType],
        dimensions: Optional[int],
    ) -> tuple:
        """Build ``(texts, url, payload, truncate_to)`` for the sync and async request paths."""
        if isinstance(texts, str):
            texts = [texts]

        url = f"{self.base_url}/v1/embeddings"
        # With local_truncate=True the `dimensions` field is not sent; the
        # full vector is fetched and truncated locally instead.
        payload = self._build_payload(
            texts, task, dimensions, include_dimensions=not self.local_truncate
        )
        truncate_to = (dimensions or self._dimensions) if self.local_truncate else None
        return texts, url, payload, truncate_to

    @staticmethod
    def _check_batch_size(batch_size: int) -> None:
        """Reject batch sizes that cannot partition the input."""
        if batch_size < 1:
            raise ValueError(
                f"batch_size must be a positive integer, got {batch_size!r}"
            )

    # ========== BaseEmbedding interface ==========

    def embed(
        self,
        inputs: Union[str, List[str]],
        task: Optional[TaskType] = None,
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """
        Embed text.

        Args:
            inputs: A single text or a list of texts
            task: Task type
            dimensions: Output dimension

        Returns:
            List of vectors
        """
        response = self.embed_with_response(inputs, task, dimensions)
        return [r.embedding for r in response.embeddings]

    async def aembed(
        self,
        inputs: Union[str, List[str]],
        task: Optional[TaskType] = None,
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Embed text asynchronously; see :meth:`embed`."""
        response = await self.aembed_with_response(inputs, task, dimensions)
        return [r.embedding for r in response.embeddings]

    @property
    def dimension(self) -> int:
        """Vector dimension.

        Returns the configured dimension if set, otherwise the dimension
        observed in the first parsed response, otherwise 1024.
        """
        if self._dimensions:
            return self._dimensions
        if self._actual_dimension:
            return self._actual_dimension
        # Default; replaced by the observed value after the first call.
        return 1024

    @property
    def supports_image(self) -> bool:
        """This client handles text only."""
        return False

    # ========== Extended methods ==========

    def embed_with_response(
        self,
        texts: Union[str, List[str]],
        task: Optional[TaskType] = None,
        dimensions: Optional[int] = None,
    ) -> EmbeddingResponse:
        """
        Fetch embeddings synchronously and return the full response.

        Args:
            texts: A single text or a list of texts
            task: Task type
            dimensions: Output dimension

        Returns:
            EmbeddingResponse object

        Raises:
            requests.RequestException: if the final attempt fails
        """
        texts, url, payload, truncate_to = self._prepare_request(texts, task, dimensions)
        # Always make at least one attempt so the method never falls through
        # and returns None when max_retries <= 0.
        attempts = max(1, self.max_retries)

        for attempt in range(attempts):
            try:
                response = requests.post(
                    url,
                    json=payload,
                    headers=self._get_headers(),
                    timeout=self.timeout,
                )
                response.raise_for_status()
                data = response.json()
            except requests.RequestException:
                if attempt == attempts - 1:
                    raise
                # Linear back-off between attempts.
                time.sleep(0.5 * (attempt + 1))
            else:
                return self._parse_response(data, texts, truncate_to=truncate_to)

    async def aembed_with_response(
        self,
        texts: Union[str, List[str]],
        task: Optional[TaskType] = None,
        dimensions: Optional[int] = None,
    ) -> EmbeddingResponse:
        """Fetch embeddings asynchronously and return the full response.

        Raises:
            aiohttp.ClientError: if the final attempt fails
            asyncio.TimeoutError: if the final attempt times out
        """
        texts, url, payload, truncate_to = self._prepare_request(texts, task, dimensions)
        attempts = max(1, self.max_retries)

        timeout = aiohttp.ClientTimeout(total=self.timeout)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            for attempt in range(attempts):
                try:
                    async with session.post(
                        url,
                        json=payload,
                        headers=self._get_headers(),
                    ) as response:
                        response.raise_for_status()
                        data = await response.json()
                # A total-timeout surfaces as asyncio.TimeoutError, which is
                # not an aiohttp.ClientError; retry it like the sync path
                # retries requests.Timeout.
                except (aiohttp.ClientError, asyncio.TimeoutError):
                    if attempt == attempts - 1:
                        raise
                    await asyncio.sleep(0.5 * (attempt + 1))
                else:
                    return self._parse_response(data, texts, truncate_to=truncate_to)

    def embed_batch(
        self,
        texts: List[str],
        batch_size: int = 32,
        task: Optional[TaskType] = None,
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Fetch embeddings for many texts synchronously, in batches.

        Raises:
            ValueError: if ``batch_size`` is less than 1
        """
        self._check_batch_size(batch_size)

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No event loop is running in this thread: drive the async path.
            return asyncio.run(
                self.aembed_batch(texts, batch_size, task, dimensions)
            )

        # An event loop is already running, so asyncio.run() would raise
        # RuntimeError. Fall back to the blocking per-batch path.
        all_embeddings: List[Optional[List[float]]] = [None] * len(texts)
        for start in range(0, len(texts), batch_size):
            vectors = self.embed(texts[start : start + batch_size], task, dimensions)
            for offset, vec in enumerate(vectors):
                all_embeddings[start + offset] = vec
        return all_embeddings

    async def aembed_batch(
        self,
        texts: List[str],
        batch_size: int = 32,
        task: Optional[TaskType] = None,
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Fetch embeddings for many texts asynchronously, batch by batch.

        Batches are requested one after another; results are placed at the
        position of their input text.

        Raises:
            ValueError: if ``batch_size`` is less than 1
        """
        self._check_batch_size(batch_size)
        all_embeddings: List[Optional[List[float]]] = [None] * len(texts)

        for start in range(0, len(texts), batch_size):
            batch = texts[start : start + batch_size]
            vectors = await self.aembed(batch, task, dimensions)
            for offset, vec in enumerate(vectors):
                all_embeddings[start + offset] = vec

        return all_embeddings

    def __repr__(self) -> str:
        return f"TextEmbedding(base_url={self.base_url!r}, model={self.model!r})"
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
# Backward-compatible alias
EmbeddingClient = TextEmbedding
|
maque/git/__init__.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""
Git module - pure-Python Git operations.

Implemented on top of Dulwich; does not depend on a git client.
"""

try:
    from .pure_git import (
        PureGitRepo,
        GitStatus,
        GitCommitInfo,
        GitStashEntry,
        GitBlameEntry,
    )
except ImportError:
    # Best-effort import (presumably the optional Dulwich dependency is
    # missing). Export nothing, so `from maque.git import *` does not fail
    # with AttributeError on names that were never bound.
    __all__ = []
else:
    __all__ = [
        'PureGitRepo',
        'GitStatus',
        'GitCommitInfo',
        'GitStashEntry',
        'GitBlameEntry',
    ]
|