maque-0.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. maque/__init__.py +30 -0
  2. maque/__main__.py +926 -0
  3. maque/ai_platform/__init__.py +0 -0
  4. maque/ai_platform/crawl.py +45 -0
  5. maque/ai_platform/metrics.py +258 -0
  6. maque/ai_platform/nlp_preprocess.py +67 -0
  7. maque/ai_platform/webpage_screen_shot.py +195 -0
  8. maque/algorithms/__init__.py +78 -0
  9. maque/algorithms/bezier.py +15 -0
  10. maque/algorithms/bktree.py +117 -0
  11. maque/algorithms/core.py +104 -0
  12. maque/algorithms/hilbert.py +16 -0
  13. maque/algorithms/rate_function.py +92 -0
  14. maque/algorithms/transform.py +27 -0
  15. maque/algorithms/trie.py +272 -0
  16. maque/algorithms/utils.py +63 -0
  17. maque/algorithms/video.py +587 -0
  18. maque/api/__init__.py +1 -0
  19. maque/api/common.py +110 -0
  20. maque/api/fetch.py +26 -0
  21. maque/api/static/icon.png +0 -0
  22. maque/api/static/redoc.standalone.js +1782 -0
  23. maque/api/static/swagger-ui-bundle.js +3 -0
  24. maque/api/static/swagger-ui.css +3 -0
  25. maque/cli/__init__.py +1 -0
  26. maque/cli/clean_invisible_chars.py +324 -0
  27. maque/cli/core.py +34 -0
  28. maque/cli/groups/__init__.py +26 -0
  29. maque/cli/groups/config.py +205 -0
  30. maque/cli/groups/data.py +615 -0
  31. maque/cli/groups/doctor.py +259 -0
  32. maque/cli/groups/embedding.py +222 -0
  33. maque/cli/groups/git.py +29 -0
  34. maque/cli/groups/help.py +410 -0
  35. maque/cli/groups/llm.py +223 -0
  36. maque/cli/groups/mcp.py +241 -0
  37. maque/cli/groups/mllm.py +1795 -0
  38. maque/cli/groups/mllm_simple.py +60 -0
  39. maque/cli/groups/quant.py +210 -0
  40. maque/cli/groups/service.py +490 -0
  41. maque/cli/groups/system.py +570 -0
  42. maque/cli/mllm_run.py +1451 -0
  43. maque/cli/script.py +52 -0
  44. maque/cli/tree.py +49 -0
  45. maque/clustering/__init__.py +52 -0
  46. maque/clustering/analyzer.py +347 -0
  47. maque/clustering/clusterers.py +464 -0
  48. maque/clustering/sampler.py +134 -0
  49. maque/clustering/visualizer.py +205 -0
  50. maque/constant.py +13 -0
  51. maque/core.py +133 -0
  52. maque/cv/__init__.py +1 -0
  53. maque/cv/image.py +219 -0
  54. maque/cv/utils.py +68 -0
  55. maque/cv/video/__init__.py +3 -0
  56. maque/cv/video/keyframe_extractor.py +368 -0
  57. maque/embedding/__init__.py +43 -0
  58. maque/embedding/base.py +56 -0
  59. maque/embedding/multimodal.py +308 -0
  60. maque/embedding/server.py +523 -0
  61. maque/embedding/text.py +311 -0
  62. maque/git/__init__.py +24 -0
  63. maque/git/pure_git.py +912 -0
  64. maque/io/__init__.py +29 -0
  65. maque/io/core.py +38 -0
  66. maque/io/ops.py +194 -0
  67. maque/llm/__init__.py +111 -0
  68. maque/llm/backend.py +416 -0
  69. maque/llm/base.py +411 -0
  70. maque/llm/server.py +366 -0
  71. maque/mcp_server.py +1096 -0
  72. maque/mllm_data_processor_pipeline/__init__.py +17 -0
  73. maque/mllm_data_processor_pipeline/core.py +341 -0
  74. maque/mllm_data_processor_pipeline/example.py +291 -0
  75. maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
  76. maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
  77. maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
  78. maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
  79. maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
  80. maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
  81. maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
  82. maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
  83. maque/mllm_data_processor_pipeline/web_app.py +317 -0
  84. maque/nlp/__init__.py +14 -0
  85. maque/nlp/ngram.py +9 -0
  86. maque/nlp/parser.py +63 -0
  87. maque/nlp/risk_matcher.py +543 -0
  88. maque/nlp/sentence_splitter.py +202 -0
  89. maque/nlp/simple_tradition_cvt.py +31 -0
  90. maque/performance/__init__.py +21 -0
  91. maque/performance/_measure_time.py +70 -0
  92. maque/performance/_profiler.py +367 -0
  93. maque/performance/_stat_memory.py +51 -0
  94. maque/pipelines/__init__.py +15 -0
  95. maque/pipelines/clustering.py +252 -0
  96. maque/quantization/__init__.py +42 -0
  97. maque/quantization/auto_round.py +120 -0
  98. maque/quantization/base.py +145 -0
  99. maque/quantization/bitsandbytes.py +127 -0
  100. maque/quantization/llm_compressor.py +102 -0
  101. maque/retriever/__init__.py +35 -0
  102. maque/retriever/chroma.py +654 -0
  103. maque/retriever/document.py +140 -0
  104. maque/retriever/milvus.py +1140 -0
  105. maque/table_ops/__init__.py +1 -0
  106. maque/table_ops/core.py +133 -0
  107. maque/table_viewer/__init__.py +4 -0
  108. maque/table_viewer/download_assets.py +57 -0
  109. maque/table_viewer/server.py +698 -0
  110. maque/table_viewer/static/element-plus-icons.js +5791 -0
  111. maque/table_viewer/static/element-plus.css +1 -0
  112. maque/table_viewer/static/element-plus.js +65236 -0
  113. maque/table_viewer/static/main.css +268 -0
  114. maque/table_viewer/static/main.js +669 -0
  115. maque/table_viewer/static/vue.global.js +18227 -0
  116. maque/table_viewer/templates/index.html +401 -0
  117. maque/utils/__init__.py +56 -0
  118. maque/utils/color.py +68 -0
  119. maque/utils/color_string.py +45 -0
  120. maque/utils/compress.py +66 -0
  121. maque/utils/constant.py +183 -0
  122. maque/utils/core.py +261 -0
  123. maque/utils/cursor.py +143 -0
  124. maque/utils/distance.py +58 -0
  125. maque/utils/docker.py +96 -0
  126. maque/utils/downloads.py +51 -0
  127. maque/utils/excel_helper.py +542 -0
  128. maque/utils/helper_metrics.py +121 -0
  129. maque/utils/helper_parser.py +168 -0
  130. maque/utils/net.py +64 -0
  131. maque/utils/nvidia_stat.py +140 -0
  132. maque/utils/ops.py +53 -0
  133. maque/utils/packages.py +31 -0
  134. maque/utils/path.py +57 -0
  135. maque/utils/tar.py +260 -0
  136. maque/utils/untar.py +129 -0
  137. maque/web/__init__.py +0 -0
  138. maque/web/image_downloader.py +1410 -0
  139. maque-0.2.1.dist-info/METADATA +450 -0
  140. maque-0.2.1.dist-info/RECORD +143 -0
  141. maque-0.2.1.dist-info/WHEEL +4 -0
  142. maque-0.2.1.dist-info/entry_points.txt +3 -0
  143. maque-0.2.1.dist-info/licenses/LICENSE +21 -0
maque/llm/backend.py ADDED
@@ -0,0 +1,416 @@
+ #! /usr/bin/env python3
+ # -*- coding: utf-8 -*-
+
+ """
+ Transformers backend implementation
+
+ An LLM/MLLM backend built on HuggingFace Transformers.
+ Supports selecting the model and processor classes dynamically via configuration.
+ """
+
+ from typing import List, Optional, Type
+
+ from .base import BaseLLMBackend, ChatMessage, GenerateConfig, ModelConfig
+
+
+ _awq_patched = False
+
+
+ def _patch_awq_compat():
+     """Patch a compatibility issue between autoawq and newer transformers.
+
+     autoawq is officially deprecated, but transformers still depends on it
+     to load AWQ models. Newer transformers releases (>=4.50) renamed
+     PytorchGELUTanh to GELUTanh, which breaks the import in awq.quantize.scale.
+
+     This function patches dynamically at runtime, so the awq source files
+     do not need to be modified.
+     """
+     global _awq_patched
+     if _awq_patched:
+         return
+
+     try:
+         # Patch transformers.activations first, adding the old name as an alias
+         from transformers import activations
+         if not hasattr(activations, "PytorchGELUTanh"):
+             if hasattr(activations, "GELUTanh"):
+                 activations.PytorchGELUTanh = activations.GELUTanh
+         _awq_patched = True
+     except Exception:
+         pass
+
+
+ def _get_model_class(class_name: str) -> Type:
+     """Dynamically resolve a model class.
+
+     Args:
+         class_name: Class name, e.g. "AutoModelForCausalLM" or
+             "Qwen3VLForConditionalGeneration"
+
+     Returns:
+         The model class
+     """
+     import transformers
+
+     # First, try to get the class directly from transformers
+     if hasattr(transformers, class_name):
+         return getattr(transformers, class_name)
+
+     # Then try the submodules under transformers.models,
+     # e.g. HunYuanVLForConditionalGeneration
+     for module_name in dir(transformers.models):
+         try:
+             module = getattr(transformers.models, module_name)
+             if hasattr(module, class_name):
+                 return getattr(module, class_name)
+         except Exception:
+             continue
+
+     raise ValueError(f"Model class not found: {class_name}")
+
+
+ def _get_processor_class(class_name: str) -> Type:
+     """Dynamically resolve a processor class.
+
+     Args:
+         class_name: Class name, e.g. "AutoTokenizer" or "AutoProcessor"
+
+     Returns:
+         The processor class
+     """
+     import transformers
+
+     if hasattr(transformers, class_name):
+         return getattr(transformers, class_name)
+
+     raise ValueError(f"Processor class not found: {class_name}")
+
+
+ class TransformersBackend(BaseLLMBackend):
+     """Transformers-based LLM backend.
+
+     Supports:
+     - Text-only LLMs (AutoModelForCausalLM)
+     - Multimodal VL models (AutoModelForVision2Seq or others)
+     - Streaming output (TextIteratorStreamer)
+     - Dynamic model-class and processor-class configuration
+
+     Config examples:
+         # Using HunyuanOCR
+         config = ModelConfig(
+             model_id="tencent/HunyuanOCR",
+             model_class="HunYuanVLForConditionalGeneration",
+             processor_class="AutoProcessor",
+             vision_processor="general",
+         )
+
+         # Using Qwen3 with thinking enabled
+         config = ModelConfig(
+             model_id="Qwen/Qwen3-0.6B",
+             chat_template_kwargs={"enable_thinking": True},
+         )
+     """
+
+     def __init__(self):
+         super().__init__()
+         self._tokenizer = None
+         self._processor = None  # used for multimodal models
+
+     # ============== Abstract method implementations ==============
+
+     def _load_model_impl(self, model_path: str, config: ModelConfig) -> None:
+         """Load a Transformers model."""
+         import torch
+
+         # Fix autoawq compatibility with newer transformers
+         _patch_awq_compat()
+
+         # Determine the dtype
+         if config.torch_dtype:
+             torch_dtype = getattr(torch, config.torch_dtype, torch.float16)
+         else:
+             # Auto-select the best dtype
+             # CUDA: prefer bfloat16, then float16
+             # MPS/CPU: use float32 (more stable)
+             if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
+                 torch_dtype = torch.bfloat16
+             elif torch.cuda.is_available():
+                 torch_dtype = torch.float16
+             else:
+                 # float32 is more stable on MPS and CPU
+                 torch_dtype = torch.float32
+
+         if self._is_multimodal:
+             self._load_multimodal_model(model_path, config, torch_dtype)
+         else:
+             self._load_text_model(model_path, config, torch_dtype)
+
+     def _generate_impl(
+         self, messages: List[ChatMessage], config: GenerateConfig
+     ) -> tuple[str, int, int]:
+         """Transformers generation implementation."""
+         import torch
+
+         # Build the inputs
+         if self._is_multimodal:
+             inputs = self._build_multimodal_inputs(messages)
+         else:
+             inputs = self._build_text_inputs(messages)
+
+         prompt_tokens = inputs["input_ids"].shape[1]
+
+         # Generation parameters
+         gen_kwargs = {
+             "max_new_tokens": config.max_tokens,
+             "temperature": config.temperature if config.temperature > 0 else 1.0,
+             "top_p": config.top_p,
+             "do_sample": config.temperature > 0,
+             "pad_token_id": self._get_pad_token_id(),
+         }
+
+         with torch.no_grad():
+             outputs = self._model.generate(**inputs, **gen_kwargs)
+
+         # Decode only the newly generated tokens
+         new_tokens = outputs[0][prompt_tokens:]
+         completion_tokens = len(new_tokens)
+
+         if self._is_multimodal:
+             text = self._processor.decode(new_tokens, skip_special_tokens=True)
+         else:
+             text = self._tokenizer.decode(new_tokens, skip_special_tokens=True)
+
+         return text, prompt_tokens, completion_tokens
+
+     def _generate_stream_impl(
+         self, messages: List[ChatMessage], config: GenerateConfig
+     ):
+         """Transformers streaming generation implementation."""
+         from threading import Thread
+         from transformers import TextIteratorStreamer
+
+         # Build the inputs
+         if self._is_multimodal:
+             inputs = self._build_multimodal_inputs(messages)
+             tokenizer = self._processor.tokenizer
+         else:
+             inputs = self._build_text_inputs(messages)
+             tokenizer = self._tokenizer
+
+         # Create the streamer
+         streamer = TextIteratorStreamer(
+             tokenizer,
+             skip_prompt=True,
+             skip_special_tokens=True,
+         )
+
+         gen_kwargs = {
+             **inputs,
+             "max_new_tokens": config.max_tokens,
+             "temperature": config.temperature if config.temperature > 0 else 1.0,
+             "top_p": config.top_p,
+             "do_sample": config.temperature > 0,
+             "pad_token_id": self._get_pad_token_id(),
+             "streamer": streamer,
+         }
+
+         # Run generation in a background thread
+         thread = Thread(target=self._model.generate, kwargs=gen_kwargs)
+         thread.start()
+
+         # Stream the output as it is produced
+         for text in streamer:
+             yield text
+
+         thread.join()
+
+     # ============== Internal helpers ==============
+
+     def _load_text_model(self, model_path: str, config: ModelConfig, torch_dtype) -> None:
+         """Load a text-only model."""
+         from transformers import AutoModelForCausalLM, AutoTokenizer
+
+         # Determine the processor class
+         processor_class_name = config.processor_class or "AutoTokenizer"
+         ProcessorClass = _get_processor_class(processor_class_name)
+
+         self._tokenizer = ProcessorClass.from_pretrained(
+             model_path, trust_remote_code=config.trust_remote_code
+         )
+
+         # Determine the model class
+         model_class_name = config.model_class or "AutoModelForCausalLM"
+         ModelClass = _get_model_class(model_class_name)
+
+         model_kwargs = {
+             "torch_dtype": torch_dtype,
+             "device_map": self._device,
+             "trust_remote_code": config.trust_remote_code,
+         }
+         if config.attn_implementation:
+             model_kwargs["attn_implementation"] = config.attn_implementation
+
+         self._model = ModelClass.from_pretrained(model_path, **model_kwargs)
+         self._model.eval()
+
+     def _load_multimodal_model(self, model_path: str, config: ModelConfig, torch_dtype) -> None:
+         """Load a multimodal model."""
+         from transformers import AutoProcessor, AutoModelForVision2Seq
+
+         # Determine the processor class
+         processor_class_name = config.processor_class or "AutoProcessor"
+         ProcessorClass = _get_processor_class(processor_class_name)
+
+         processor_kwargs = {"trust_remote_code": config.trust_remote_code}
+         # HunyuanOCR requires use_fast=False
+         if "hunyuan" in model_path.lower():
+             processor_kwargs["use_fast"] = False
+
+         self._processor = ProcessorClass.from_pretrained(model_path, **processor_kwargs)
+
+         # Determine the model class
+         model_class_name = config.model_class or "AutoModelForVision2Seq"
+         ModelClass = _get_model_class(model_class_name)
+
+         model_kwargs = {
+             "torch_dtype": torch_dtype,
+             "device_map": self._device,
+             "trust_remote_code": config.trust_remote_code,
+         }
+         if config.attn_implementation:
+             model_kwargs["attn_implementation"] = config.attn_implementation
+
+         self._model = ModelClass.from_pretrained(model_path, **model_kwargs)
+         self._model.eval()
+
+     def _build_text_inputs(self, messages: List[ChatMessage]):
+         """Build text-only inputs."""
+         # Convert to the standard message format
+         formatted = []
+         for msg in messages:
+             content = msg.content if isinstance(msg.content, str) else " ".join(
+                 p.text for p in msg.content if p.type == "text" and p.text
+             )
+             formatted.append({"role": msg.role, "content": content})
+
+         # Merge in chat_template_kwargs from the config
+         chat_kwargs = {"tokenize": False, "add_generation_prompt": True}
+         if self._config and self._config.chat_template_kwargs:
+             chat_kwargs.update(self._config.chat_template_kwargs)
+
+         text = self._tokenizer.apply_chat_template(formatted, **chat_kwargs)
+         inputs = self._tokenizer(text, return_tensors="pt")
+         return inputs.to(self._device)
+
+     def _build_multimodal_inputs(self, messages: List[ChatMessage]):
+         """Build multimodal inputs.
+
+         Chooses a processing path based on the vision_processor config:
+         - qwen_vl: uses qwen_vl_utils.process_vision_info (Qwen-VL family)
+         - general: generic processing (HunyuanOCR, dots.ocr, etc.)
+         """
+         if self._vision_processor == "qwen_vl":
+             return self._build_qwen_vl_inputs(messages)
+         else:
+             return self._build_general_vl_inputs(messages)
+
+     def _build_qwen_vl_inputs(self, messages: List[ChatMessage]):
+         """Build Qwen-VL style inputs."""
+         from qwen_vl_utils import process_vision_info
+
+         # Convert to the Qwen-VL message format
+         qwen_messages = []
+         for msg in messages:
+             if isinstance(msg.content, str):
+                 qwen_messages.append({"role": msg.role, "content": msg.content})
+             else:
+                 content_parts = []
+                 for part in msg.content:
+                     if part.type == "text":
+                         content_parts.append({"type": "text", "text": part.text})
+                     elif part.type == "image_url" and part.image_url:
+                         image_url = part.image_url.url
+                         if image_url.startswith("data:"):
+                             image = self._process_image(image_url)
+                             content_parts.append({"type": "image", "image": image})
+                         else:
+                             content_parts.append({"type": "image", "image": image_url})
+                 qwen_messages.append({"role": msg.role, "content": content_parts})
+
+         # Merge in chat_template_kwargs from the config
+         chat_kwargs = {"tokenize": False, "add_generation_prompt": True}
+         if self._config and self._config.chat_template_kwargs:
+             chat_kwargs.update(self._config.chat_template_kwargs)
+
+         # Let the processor handle both text and vision inputs
+         text = self._processor.apply_chat_template(qwen_messages, **chat_kwargs)
+         image_inputs, video_inputs = process_vision_info(qwen_messages)
+
+         inputs = self._processor(
+             text=[text],
+             images=image_inputs,
+             videos=video_inputs,
+             padding=True,
+             return_tensors="pt",
+         )
+         return inputs.to(self._device)
+
+     def _build_general_vl_inputs(self, messages: List[ChatMessage]):
+         """Build generic multimodal inputs (suitable for most VL models)."""
+         # Extract images and text
+         images = []
+         formatted_messages = []
+
+         for msg in messages:
+             if isinstance(msg.content, str):
+                 formatted_messages.append({"role": msg.role, "content": msg.content})
+             else:
+                 content_parts = []
+                 for part in msg.content:
+                     if part.type == "text" and part.text:
+                         content_parts.append({"type": "text", "text": part.text})
+                     elif part.type == "image_url" and part.image_url:
+                         image = self._process_image(part.image_url.url)
+                         images.append(image)
+                         content_parts.append({"type": "image"})
+
+                 formatted_messages.append({"role": msg.role, "content": content_parts})
+
+         # Merge in chat_template_kwargs from the config
+         chat_kwargs = {"tokenize": False, "add_generation_prompt": True}
+         if self._config and self._config.chat_template_kwargs:
+             chat_kwargs.update(self._config.chat_template_kwargs)
+
+         # Apply the chat template
+         text = self._processor.apply_chat_template(formatted_messages, **chat_kwargs)
+
+         # Process the inputs
+         if images:
+             inputs = self._processor(
+                 text=[text],
+                 images=images,
+                 padding=True,
+                 return_tensors="pt",
+             )
+         else:
+             inputs = self._processor(
+                 text=[text],
+                 padding=True,
+                 return_tensors="pt",
+             )
+
+         return inputs.to(self._device)
+
+     def _get_pad_token_id(self) -> int:
+         """Get the pad token id."""
+         if self._is_multimodal:
+             tokenizer = self._processor.tokenizer
+         else:
+             tokenizer = self._tokenizer
+
+         if tokenizer.pad_token_id is not None:
+             return tokenizer.pad_token_id
+         return tokenizer.eos_token_id
+
+
+ # Default backend alias
+ LLMBackend = TransformersBackend
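
Usage note: the class above only implements the backend hooks (_load_model_impl, _generate_impl, _generate_stream_impl); the public entry points live in maque/llm/base.py, which this diff does not show. The following is therefore a minimal sketch under that assumption — the load_model/generate method names and the exact ModelConfig/GenerateConfig constructor arguments are inferred from the docstrings and field accesses above, not confirmed API:

    # Minimal usage sketch (assumed API; verify against maque/llm/base.py)
    from maque.llm.backend import TransformersBackend
    from maque.llm.base import ChatMessage, GenerateConfig, ModelConfig

    backend = TransformersBackend()
    # Assumed: the base class exposes a load_model() that resolves model_id
    # to a local path and then calls _load_model_impl().
    backend.load_model(ModelConfig(model_id="Qwen/Qwen3-0.6B"))

    messages = [ChatMessage(role="user", content="Briefly explain AWQ quantization.")]
    config = GenerateConfig(max_tokens=256, temperature=0.7, top_p=0.9)
    # Assumed: generate() wraps _generate_impl(), which returns
    # (text, prompt_tokens, completion_tokens).
    print(backend.generate(messages, config))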