maque 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. maque/__init__.py +30 -0
  2. maque/__main__.py +926 -0
  3. maque/ai_platform/__init__.py +0 -0
  4. maque/ai_platform/crawl.py +45 -0
  5. maque/ai_platform/metrics.py +258 -0
  6. maque/ai_platform/nlp_preprocess.py +67 -0
  7. maque/ai_platform/webpage_screen_shot.py +195 -0
  8. maque/algorithms/__init__.py +78 -0
  9. maque/algorithms/bezier.py +15 -0
  10. maque/algorithms/bktree.py +117 -0
  11. maque/algorithms/core.py +104 -0
  12. maque/algorithms/hilbert.py +16 -0
  13. maque/algorithms/rate_function.py +92 -0
  14. maque/algorithms/transform.py +27 -0
  15. maque/algorithms/trie.py +272 -0
  16. maque/algorithms/utils.py +63 -0
  17. maque/algorithms/video.py +587 -0
  18. maque/api/__init__.py +1 -0
  19. maque/api/common.py +110 -0
  20. maque/api/fetch.py +26 -0
  21. maque/api/static/icon.png +0 -0
  22. maque/api/static/redoc.standalone.js +1782 -0
  23. maque/api/static/swagger-ui-bundle.js +3 -0
  24. maque/api/static/swagger-ui.css +3 -0
  25. maque/cli/__init__.py +1 -0
  26. maque/cli/clean_invisible_chars.py +324 -0
  27. maque/cli/core.py +34 -0
  28. maque/cli/groups/__init__.py +26 -0
  29. maque/cli/groups/config.py +205 -0
  30. maque/cli/groups/data.py +615 -0
  31. maque/cli/groups/doctor.py +259 -0
  32. maque/cli/groups/embedding.py +222 -0
  33. maque/cli/groups/git.py +29 -0
  34. maque/cli/groups/help.py +410 -0
  35. maque/cli/groups/llm.py +223 -0
  36. maque/cli/groups/mcp.py +241 -0
  37. maque/cli/groups/mllm.py +1795 -0
  38. maque/cli/groups/mllm_simple.py +60 -0
  39. maque/cli/groups/quant.py +210 -0
  40. maque/cli/groups/service.py +490 -0
  41. maque/cli/groups/system.py +570 -0
  42. maque/cli/mllm_run.py +1451 -0
  43. maque/cli/script.py +52 -0
  44. maque/cli/tree.py +49 -0
  45. maque/clustering/__init__.py +52 -0
  46. maque/clustering/analyzer.py +347 -0
  47. maque/clustering/clusterers.py +464 -0
  48. maque/clustering/sampler.py +134 -0
  49. maque/clustering/visualizer.py +205 -0
  50. maque/constant.py +13 -0
  51. maque/core.py +133 -0
  52. maque/cv/__init__.py +1 -0
  53. maque/cv/image.py +219 -0
  54. maque/cv/utils.py +68 -0
  55. maque/cv/video/__init__.py +3 -0
  56. maque/cv/video/keyframe_extractor.py +368 -0
  57. maque/embedding/__init__.py +43 -0
  58. maque/embedding/base.py +56 -0
  59. maque/embedding/multimodal.py +308 -0
  60. maque/embedding/server.py +523 -0
  61. maque/embedding/text.py +311 -0
  62. maque/git/__init__.py +24 -0
  63. maque/git/pure_git.py +912 -0
  64. maque/io/__init__.py +29 -0
  65. maque/io/core.py +38 -0
  66. maque/io/ops.py +194 -0
  67. maque/llm/__init__.py +111 -0
  68. maque/llm/backend.py +416 -0
  69. maque/llm/base.py +411 -0
  70. maque/llm/server.py +366 -0
  71. maque/mcp_server.py +1096 -0
  72. maque/mllm_data_processor_pipeline/__init__.py +17 -0
  73. maque/mllm_data_processor_pipeline/core.py +341 -0
  74. maque/mllm_data_processor_pipeline/example.py +291 -0
  75. maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
  76. maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
  77. maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
  78. maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
  79. maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
  80. maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
  81. maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
  82. maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
  83. maque/mllm_data_processor_pipeline/web_app.py +317 -0
  84. maque/nlp/__init__.py +14 -0
  85. maque/nlp/ngram.py +9 -0
  86. maque/nlp/parser.py +63 -0
  87. maque/nlp/risk_matcher.py +543 -0
  88. maque/nlp/sentence_splitter.py +202 -0
  89. maque/nlp/simple_tradition_cvt.py +31 -0
  90. maque/performance/__init__.py +21 -0
  91. maque/performance/_measure_time.py +70 -0
  92. maque/performance/_profiler.py +367 -0
  93. maque/performance/_stat_memory.py +51 -0
  94. maque/pipelines/__init__.py +15 -0
  95. maque/pipelines/clustering.py +252 -0
  96. maque/quantization/__init__.py +42 -0
  97. maque/quantization/auto_round.py +120 -0
  98. maque/quantization/base.py +145 -0
  99. maque/quantization/bitsandbytes.py +127 -0
  100. maque/quantization/llm_compressor.py +102 -0
  101. maque/retriever/__init__.py +35 -0
  102. maque/retriever/chroma.py +654 -0
  103. maque/retriever/document.py +140 -0
  104. maque/retriever/milvus.py +1140 -0
  105. maque/table_ops/__init__.py +1 -0
  106. maque/table_ops/core.py +133 -0
  107. maque/table_viewer/__init__.py +4 -0
  108. maque/table_viewer/download_assets.py +57 -0
  109. maque/table_viewer/server.py +698 -0
  110. maque/table_viewer/static/element-plus-icons.js +5791 -0
  111. maque/table_viewer/static/element-plus.css +1 -0
  112. maque/table_viewer/static/element-plus.js +65236 -0
  113. maque/table_viewer/static/main.css +268 -0
  114. maque/table_viewer/static/main.js +669 -0
  115. maque/table_viewer/static/vue.global.js +18227 -0
  116. maque/table_viewer/templates/index.html +401 -0
  117. maque/utils/__init__.py +56 -0
  118. maque/utils/color.py +68 -0
  119. maque/utils/color_string.py +45 -0
  120. maque/utils/compress.py +66 -0
  121. maque/utils/constant.py +183 -0
  122. maque/utils/core.py +261 -0
  123. maque/utils/cursor.py +143 -0
  124. maque/utils/distance.py +58 -0
  125. maque/utils/docker.py +96 -0
  126. maque/utils/downloads.py +51 -0
  127. maque/utils/excel_helper.py +542 -0
  128. maque/utils/helper_metrics.py +121 -0
  129. maque/utils/helper_parser.py +168 -0
  130. maque/utils/net.py +64 -0
  131. maque/utils/nvidia_stat.py +140 -0
  132. maque/utils/ops.py +53 -0
  133. maque/utils/packages.py +31 -0
  134. maque/utils/path.py +57 -0
  135. maque/utils/tar.py +260 -0
  136. maque/utils/untar.py +129 -0
  137. maque/web/__init__.py +0 -0
  138. maque/web/image_downloader.py +1410 -0
  139. maque-0.2.1.dist-info/METADATA +450 -0
  140. maque-0.2.1.dist-info/RECORD +143 -0
  141. maque-0.2.1.dist-info/WHEEL +4 -0
  142. maque-0.2.1.dist-info/entry_points.txt +3 -0
  143. maque-0.2.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,311 @@
1
+ #! /usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ 文本 Embedding - 支持 vLLM/OpenAI 兼容的 API
6
+ """
7
+
8
+ import asyncio
9
+ import time
10
+ from typing import List, Optional, Union, Literal
11
+ from dataclasses import dataclass, field
12
+
13
+ import aiohttp
14
+ import requests
15
+
16
+ from .base import BaseEmbedding
17
+
18
# Task types accepted by jina-embeddings-v3.
TaskType = Literal[
    # semantic similarity / symmetric retrieval
    "text-matching",
    # asymmetric retrieval - query side
    "retrieval.query",
    # asymmetric retrieval - passage (document) side
    "retrieval.passage",
    # classification tasks
    "classification",
    # clustering / re-ranking
    "separation",
]
26
+
27
@dataclass
class EmbeddingResult:
    """One embedding vector plus the index and text it belongs to."""

    # Index reported for this item.
    index: int
    # The embedding vector itself.
    embedding: List[float]
    # Text associated with the vector; empty string when not provided.
    text: str = field(default="")
34
+
35
+
36
@dataclass
class EmbeddingResponse:
    """Container for a full embeddings response."""

    # Individual embedding results.
    embeddings: List[EmbeddingResult]
    # Model name associated with the response.
    model: str
    # Usage information; each instance gets its own fresh dict by default.
    usage: dict = field(default_factory=dict)
43
+
44
+
45
class TextEmbedding(BaseEmbedding):
    """
    Text embedding client for vLLM / OpenAI-compatible HTTP APIs.

    Every request is a ``POST`` to ``{base_url}/v1/embeddings``. Both a
    synchronous (``requests``) and an asynchronous (``aiohttp``) transport
    are provided; they share payload construction, response parsing and
    retry policy.
    """

    def __init__(
        self,
        base_url: str,
        model: str,
        api_key: str = "EMPTY",
        task: Optional[TaskType] = None,
        dimensions: Optional[int] = None,
        local_truncate: bool = False,
        timeout: float = 60.0,
        max_retries: int = 3,
    ):
        """
        Initialize the text embedding client.

        Args:
            base_url: API base URL, e.g. http://localhost:8000. A trailing
                slash is stripped.
            model: Model name, e.g. jinaai/jina-embeddings-v3.
            api_key: API key sent as a bearer token.
            task: Default task type, added to the payload when set.
            dimensions: Default output dimension.
            local_truncate: When True the ``dimensions`` field is not sent to
                the server; vectors are truncated locally instead (for
                servers that do not support the ``dimensions`` parameter).
            timeout: Request timeout in seconds.
            max_retries: Maximum number of attempts per request. Values below
                1 are treated as 1 so that a request is always attempted.
        """
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.api_key = api_key
        self.task = task
        self._dimensions = dimensions
        self.local_truncate = local_truncate
        self.timeout = timeout
        self.max_retries = max_retries
        # Length of the first embedding seen; filled in by _parse_response.
        self._actual_dimension: Optional[int] = None

    def _get_headers(self) -> dict:
        """Return the HTTP headers (JSON content type + bearer token)."""
        return {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }

    def _build_payload(
        self,
        texts: List[str],
        task: Optional[TaskType] = None,
        dimensions: Optional[int] = None,
        include_dimensions: bool = True,
    ) -> dict:
        """
        Build the JSON request payload.

        Args:
            texts: Texts to embed.
            task: Per-call task type; falls back to the instance default.
            dimensions: Per-call output dimension; falls back to the
                instance default.
            include_dimensions: When False the ``dimensions`` field is never
                sent, even if a dimension is configured.

        Returns:
            The payload dict with ``model`` and ``input`` and, when set,
            ``task`` and ``dimensions``.
        """
        payload = {
            "model": self.model,
            "input": texts,
        }
        task = task or self.task
        dimensions = dimensions or self._dimensions

        if task:
            payload["task"] = task
        if dimensions and include_dimensions:
            payload["dimensions"] = dimensions

        return payload

    def _parse_response(
        self, data: dict, texts: List[str], truncate_to: Optional[int] = None
    ) -> EmbeddingResponse:
        """
        Convert the decoded API response into an EmbeddingResponse.

        Args:
            data: Decoded JSON response; must contain a ``data`` list whose
                items carry ``index`` and ``embedding``.
            texts: The texts that were sent, used to attach the source text
                to each result.
            truncate_to: When set, vectors longer than this are cut to this
                length (local truncation mode).

        Returns:
            EmbeddingResponse with results sorted by index.
        """
        results = []
        for item in data["data"]:
            index = item["index"]
            embedding = item["embedding"]
            # Local truncation mode.
            if truncate_to and len(embedding) > truncate_to:
                embedding = embedding[:truncate_to]
            # Reject negative indices too: they would silently wrap around
            # and attach the wrong text.
            text = texts[index] if 0 <= index < len(texts) else ""
            results.append(
                EmbeddingResult(index=index, embedding=embedding, text=text)
            )

        # Remember the effective (post-truncation) dimension on first use.
        if results and not self._actual_dimension:
            self._actual_dimension = len(results[0].embedding)

        return EmbeddingResponse(
            embeddings=sorted(results, key=lambda r: r.index),
            model=data.get("model", self.model),
            usage=data.get("usage", {}),
        )

    def _prepare_request(
        self,
        texts: Union[str, List[str]],
        task: Optional[TaskType],
        dimensions: Optional[int],
    ) -> tuple:
        """
        Shared request preparation for the sync and async transports.

        Returns:
            A ``(texts, url, payload, truncate_to)`` tuple where ``texts`` is
            always a list and ``truncate_to`` is None unless local
            truncation is enabled.
        """
        if isinstance(texts, str):
            texts = [texts]

        url = f"{self.base_url}/v1/embeddings"
        # With local_truncate the dimensions field is withheld so the server
        # returns full vectors, which are then cut locally.
        payload = self._build_payload(
            texts, task, dimensions, include_dimensions=not self.local_truncate
        )
        truncate_to = (dimensions or self._dimensions) if self.local_truncate else None
        return texts, url, payload, truncate_to

    # ========== BaseEmbedding interface ==========

    def embed(
        self,
        inputs: Union[str, List[str]],
        task: Optional[TaskType] = None,
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """
        Embed one text or a list of texts.

        Args:
            inputs: A text or a list of texts.
            task: Task type.
            dimensions: Output dimension.

        Returns:
            One vector per input, ordered by response index.
        """
        response = self.embed_with_response(inputs, task, dimensions)
        return [r.embedding for r in response.embeddings]

    async def aembed(
        self,
        inputs: Union[str, List[str]],
        task: Optional[TaskType] = None,
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """Asynchronous counterpart of embed()."""
        response = await self.aembed_with_response(inputs, task, dimensions)
        return [r.embedding for r in response.embeddings]

    @property
    def dimension(self) -> int:
        """
        Vector dimension.

        Returns the configured dimension if set, otherwise the dimension
        observed in the first parsed response, otherwise 1024 as a
        placeholder until a request has been made.
        """
        if self._dimensions:
            return self._dimensions
        if self._actual_dimension:
            return self._actual_dimension
        return 1024

    @property
    def supports_image(self) -> bool:
        """This client does not embed images."""
        return False

    # ========== Extended methods ==========

    def embed_with_response(
        self,
        texts: Union[str, List[str]],
        task: Optional[TaskType] = None,
        dimensions: Optional[int] = None,
    ) -> EmbeddingResponse:
        """
        Synchronously fetch embeddings and return the full response.

        Args:
            texts: A single text or a list of texts.
            task: Task type.
            dimensions: Output dimension.

        Returns:
            EmbeddingResponse object.

        Raises:
            requests.RequestException: When the last attempt fails.
        """
        texts, url, payload, truncate_to = self._prepare_request(
            texts, task, dimensions
        )

        # Always make at least one attempt; with max_retries <= 0 the loop
        # would otherwise be skipped and None returned silently.
        attempts = max(1, self.max_retries)
        for attempt in range(attempts):
            try:
                response = requests.post(
                    url,
                    json=payload,
                    headers=self._get_headers(),
                    timeout=self.timeout,
                )
                response.raise_for_status()
                data = response.json()
            except requests.RequestException:
                if attempt == attempts - 1:
                    raise
                # Linear back-off: 0.5s, 1.0s, 1.5s, ...
                time.sleep(0.5 * (attempt + 1))
            else:
                return self._parse_response(data, texts, truncate_to=truncate_to)

        # Unreachable: every iteration either returns or re-raises.
        raise RuntimeError("embedding request finished without a result")

    async def aembed_with_response(
        self,
        texts: Union[str, List[str]],
        task: Optional[TaskType] = None,
        dimensions: Optional[int] = None,
    ) -> EmbeddingResponse:
        """
        Asynchronously fetch embeddings and return the full response.

        Raises:
            aiohttp.ClientError: When the last attempt fails with a client
                error.
            asyncio.TimeoutError: When the last attempt times out.
        """
        texts, url, payload, truncate_to = self._prepare_request(
            texts, task, dimensions
        )

        attempts = max(1, self.max_retries)
        timeout = aiohttp.ClientTimeout(total=self.timeout)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            for attempt in range(attempts):
                try:
                    async with session.post(
                        url,
                        json=payload,
                        headers=self._get_headers(),
                    ) as response:
                        response.raise_for_status()
                        data = await response.json()
                # A total-timeout expiry surfaces as asyncio.TimeoutError,
                # which is not an aiohttp.ClientError; retry it as well so
                # the async path matches the sync one.
                except (aiohttp.ClientError, asyncio.TimeoutError):
                    if attempt == attempts - 1:
                        raise
                    await asyncio.sleep(0.5 * (attempt + 1))
                else:
                    return self._parse_response(
                        data, texts, truncate_to=truncate_to
                    )

        # Unreachable: every iteration either returns or re-raises.
        raise RuntimeError("embedding request finished without a result")

    def embed_batch(
        self,
        texts: List[str],
        batch_size: int = 32,
        task: Optional[TaskType] = None,
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """
        Synchronously embed texts in batches.

        Drives aembed_batch() through ``asyncio.run``, so it must not be
        called from a thread that already has a running event loop (use
        aembed_batch() there instead).
        """
        return asyncio.run(
            self.aembed_batch(texts, batch_size, task, dimensions)
        )

    async def aembed_batch(
        self,
        texts: List[str],
        batch_size: int = 32,
        task: Optional[TaskType] = None,
        dimensions: Optional[int] = None,
    ) -> List[List[float]]:
        """
        Asynchronously embed texts, splitting them into batches.

        Batches are sent one after another; results are written back at the
        position of their source text.

        Raises:
            ValueError: If batch_size is smaller than 1.
        """
        if batch_size < 1:
            # A negative step would skip the loop and return only None
            # placeholders; zero makes range() fail with an unclear message.
            raise ValueError("batch_size must be a positive integer")

        all_embeddings = [None] * len(texts)

        for start in range(0, len(texts), batch_size):
            batch = texts[start : start + batch_size]
            vectors = await self.aembed(batch, task, dimensions)
            for offset, vec in enumerate(vectors):
                all_embeddings[start + offset] = vec

        return all_embeddings

    def __repr__(self) -> str:
        return f"TextEmbedding(base_url={self.base_url!r}, model={self.model!r})"
308
+
309
+
310
+ # Backward-compatible alias for TextEmbedding.
311
+ EmbeddingClient = TextEmbedding
maque/git/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ """
2
+ Git 模块 - 纯 Python Git 操作
3
+
4
+ 基于 Dulwich 实现,不依赖 git 客户端。
5
+ """
6
+
7
try:
    from .pure_git import (
        PureGitRepo,
        GitStatus,
        GitCommitInfo,
        GitStashEntry,
        GitBlameEntry,
    )
except ImportError:
    # Best-effort import: the package stays importable when ``pure_git``
    # cannot be loaded (presumably because the Dulwich dependency is not
    # installed -- TODO confirm). Export nothing in that case; advertising
    # names that were never bound makes ``from maque.git import *`` fail
    # with AttributeError.
    __all__ = []
else:
    __all__ = [
        'PureGitRepo',
        'GitStatus',
        'GitCommitInfo',
        'GitStashEntry',
        'GitBlameEntry',
    ]