maque-0.2.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maque/__init__.py +30 -0
- maque/__main__.py +926 -0
- maque/ai_platform/__init__.py +0 -0
- maque/ai_platform/crawl.py +45 -0
- maque/ai_platform/metrics.py +258 -0
- maque/ai_platform/nlp_preprocess.py +67 -0
- maque/ai_platform/webpage_screen_shot.py +195 -0
- maque/algorithms/__init__.py +78 -0
- maque/algorithms/bezier.py +15 -0
- maque/algorithms/bktree.py +117 -0
- maque/algorithms/core.py +104 -0
- maque/algorithms/hilbert.py +16 -0
- maque/algorithms/rate_function.py +92 -0
- maque/algorithms/transform.py +27 -0
- maque/algorithms/trie.py +272 -0
- maque/algorithms/utils.py +63 -0
- maque/algorithms/video.py +587 -0
- maque/api/__init__.py +1 -0
- maque/api/common.py +110 -0
- maque/api/fetch.py +26 -0
- maque/api/static/icon.png +0 -0
- maque/api/static/redoc.standalone.js +1782 -0
- maque/api/static/swagger-ui-bundle.js +3 -0
- maque/api/static/swagger-ui.css +3 -0
- maque/cli/__init__.py +1 -0
- maque/cli/clean_invisible_chars.py +324 -0
- maque/cli/core.py +34 -0
- maque/cli/groups/__init__.py +26 -0
- maque/cli/groups/config.py +205 -0
- maque/cli/groups/data.py +615 -0
- maque/cli/groups/doctor.py +259 -0
- maque/cli/groups/embedding.py +222 -0
- maque/cli/groups/git.py +29 -0
- maque/cli/groups/help.py +410 -0
- maque/cli/groups/llm.py +223 -0
- maque/cli/groups/mcp.py +241 -0
- maque/cli/groups/mllm.py +1795 -0
- maque/cli/groups/mllm_simple.py +60 -0
- maque/cli/groups/quant.py +210 -0
- maque/cli/groups/service.py +490 -0
- maque/cli/groups/system.py +570 -0
- maque/cli/mllm_run.py +1451 -0
- maque/cli/script.py +52 -0
- maque/cli/tree.py +49 -0
- maque/clustering/__init__.py +52 -0
- maque/clustering/analyzer.py +347 -0
- maque/clustering/clusterers.py +464 -0
- maque/clustering/sampler.py +134 -0
- maque/clustering/visualizer.py +205 -0
- maque/constant.py +13 -0
- maque/core.py +133 -0
- maque/cv/__init__.py +1 -0
- maque/cv/image.py +219 -0
- maque/cv/utils.py +68 -0
- maque/cv/video/__init__.py +3 -0
- maque/cv/video/keyframe_extractor.py +368 -0
- maque/embedding/__init__.py +43 -0
- maque/embedding/base.py +56 -0
- maque/embedding/multimodal.py +308 -0
- maque/embedding/server.py +523 -0
- maque/embedding/text.py +311 -0
- maque/git/__init__.py +24 -0
- maque/git/pure_git.py +912 -0
- maque/io/__init__.py +29 -0
- maque/io/core.py +38 -0
- maque/io/ops.py +194 -0
- maque/llm/__init__.py +111 -0
- maque/llm/backend.py +416 -0
- maque/llm/base.py +411 -0
- maque/llm/server.py +366 -0
- maque/mcp_server.py +1096 -0
- maque/mllm_data_processor_pipeline/__init__.py +17 -0
- maque/mllm_data_processor_pipeline/core.py +341 -0
- maque/mllm_data_processor_pipeline/example.py +291 -0
- maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
- maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
- maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
- maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
- maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
- maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
- maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
- maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
- maque/mllm_data_processor_pipeline/web_app.py +317 -0
- maque/nlp/__init__.py +14 -0
- maque/nlp/ngram.py +9 -0
- maque/nlp/parser.py +63 -0
- maque/nlp/risk_matcher.py +543 -0
- maque/nlp/sentence_splitter.py +202 -0
- maque/nlp/simple_tradition_cvt.py +31 -0
- maque/performance/__init__.py +21 -0
- maque/performance/_measure_time.py +70 -0
- maque/performance/_profiler.py +367 -0
- maque/performance/_stat_memory.py +51 -0
- maque/pipelines/__init__.py +15 -0
- maque/pipelines/clustering.py +252 -0
- maque/quantization/__init__.py +42 -0
- maque/quantization/auto_round.py +120 -0
- maque/quantization/base.py +145 -0
- maque/quantization/bitsandbytes.py +127 -0
- maque/quantization/llm_compressor.py +102 -0
- maque/retriever/__init__.py +35 -0
- maque/retriever/chroma.py +654 -0
- maque/retriever/document.py +140 -0
- maque/retriever/milvus.py +1140 -0
- maque/table_ops/__init__.py +1 -0
- maque/table_ops/core.py +133 -0
- maque/table_viewer/__init__.py +4 -0
- maque/table_viewer/download_assets.py +57 -0
- maque/table_viewer/server.py +698 -0
- maque/table_viewer/static/element-plus-icons.js +5791 -0
- maque/table_viewer/static/element-plus.css +1 -0
- maque/table_viewer/static/element-plus.js +65236 -0
- maque/table_viewer/static/main.css +268 -0
- maque/table_viewer/static/main.js +669 -0
- maque/table_viewer/static/vue.global.js +18227 -0
- maque/table_viewer/templates/index.html +401 -0
- maque/utils/__init__.py +56 -0
- maque/utils/color.py +68 -0
- maque/utils/color_string.py +45 -0
- maque/utils/compress.py +66 -0
- maque/utils/constant.py +183 -0
- maque/utils/core.py +261 -0
- maque/utils/cursor.py +143 -0
- maque/utils/distance.py +58 -0
- maque/utils/docker.py +96 -0
- maque/utils/downloads.py +51 -0
- maque/utils/excel_helper.py +542 -0
- maque/utils/helper_metrics.py +121 -0
- maque/utils/helper_parser.py +168 -0
- maque/utils/net.py +64 -0
- maque/utils/nvidia_stat.py +140 -0
- maque/utils/ops.py +53 -0
- maque/utils/packages.py +31 -0
- maque/utils/path.py +57 -0
- maque/utils/tar.py +260 -0
- maque/utils/untar.py +129 -0
- maque/web/__init__.py +0 -0
- maque/web/image_downloader.py +1410 -0
- maque-0.2.1.dist-info/METADATA +450 -0
- maque-0.2.1.dist-info/RECORD +143 -0
- maque-0.2.1.dist-info/WHEEL +4 -0
- maque-0.2.1.dist-info/entry_points.txt +3 -0
- maque-0.2.1.dist-info/licenses/LICENSE +21 -0
maque/quantization/__init__.py

```diff
@@ -0,0 +1,42 @@
+"""Model quantization module
+
+Provides a unified interface over several quantization schemes, supporting vLLM inference and QLoRA fine-tuning scenarios.
+
+Supported quantization methods:
+- auto-round: Intel's SGD-optimized weight rounding, good accuracy (recommended)
+- awq: Activation-aware Weight Quantization
+- gptq: classic GPTQ quantization
+- bnb-nf4: 4-bit NormalFloat quantization (QLoRA)
+- bnb-int8: 8-bit integer quantization
+
+Examples:
+    >>> from maque.quantization import get_quantizer
+    >>> quantizer = get_quantizer("auto-round")
+    >>> quantizer.quantize("Qwen/Qwen3-4B", "./Qwen3-4B-quant")
+
+    >>> from maque.quantization import AutoRoundQuantizer
+    >>> quantizer = AutoRoundQuantizer(bits=4, group_size=128)
+    >>> quantizer.quantize(model_path, output_path)
+"""
+
+from .base import (
+    BaseQuantizer,
+    QuantConfig,
+    get_quantizer,
+    list_methods,
+    QUANTIZATION_METHODS,
+)
+from .auto_round import AutoRoundQuantizer
+from .llm_compressor import LLMCompressorQuantizer
+from .bitsandbytes import BitsAndBytesQuantizer
+
+__all__ = [
+    "BaseQuantizer",
+    "QuantConfig",
+    "get_quantizer",
+    "list_methods",
+    "QUANTIZATION_METHODS",
+    "AutoRoundQuantizer",
+    "LLMCompressorQuantizer",
+    "BitsAndBytesQuantizer",
+]
```
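For orientation, a minimal sketch of the public surface this `__init__.py` exports. The model id and output directory are the illustrative placeholders from the docstrings, not fixed values:

```python
# Minimal sketch of the maque.quantization API exported above.
# "Qwen/Qwen3-4B" and "./Qwen3-4B-quant" are illustrative placeholders.
from maque.quantization import get_quantizer, list_methods

# Inspect the registered methods and what each is intended for.
for name, info in list_methods().items():
    print(f"{name}: {info['precision']} via {info['library']} ({info['use_case']})")

# Look a quantizer up by name; extra kwargs are forwarded to its constructor.
quantizer = get_quantizer("auto-round", bits=4, group_size=128)
quantizer.quantize("Qwen/Qwen3-4B", "./Qwen3-4B-quant")
```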
maque/quantization/auto_round.py

```diff
@@ -0,0 +1,120 @@
+"""AutoRound quantizer
+
+Quantizes with Intel's auto-round library, which uses SGD-optimized weight rounding for low accuracy loss.
+"""
+
+from .base import BaseQuantizer, QuantConfig
+from typing import Optional
+from pathlib import Path
+
+
+class AutoRoundQuantizer(BaseQuantizer):
+    """AutoRound quantizer
+
+    Uses Intel's auto-round library to achieve high-quality quantization via SGD-optimized weight rounding.
+
+    Args:
+        bits: quantization bit width, default 4
+        group_size: quantization group size, default 128
+        sym: symmetric quantization, default True
+        iters: number of optimization iterations, default 200
+        seqlen: calibration sequence length, default 512
+        nsamples: number of calibration samples, default 256
+        batch_size: batch size, default 4
+        low_gpu_mem_usage: low-GPU-memory mode, default True
+        format: output format (auto_round, auto_gptq), default auto_round
+        dataset: calibration dataset, default NeelNanda/pile-10k
+
+    Examples:
+        >>> from maque.quantization import AutoRoundQuantizer
+        >>> quantizer = AutoRoundQuantizer(bits=4)
+        >>> quantizer.quantize("Qwen/Qwen3-4B", "./Qwen3-4B-quant")
+
+        # With a custom calibration dataset
+        >>> quantizer = AutoRoundQuantizer(dataset="wikitext2")
+        >>> quantizer.quantize(model_path, output_path)
+    """
+
+    def __init__(
+        self,
+        bits: int = 4,
+        group_size: int = 128,
+        sym: bool = True,
+        iters: int = 200,
+        seqlen: int = 512,
+        nsamples: int = 256,
+        batch_size: int = 4,
+        low_gpu_mem_usage: bool = True,
+        format: str = "auto_round",
+        dataset: str = "NeelNanda/pile-10k",
+        **kwargs,
+    ):
+        config = QuantConfig(
+            bits=bits,
+            group_size=group_size,
+            sym=sym,
+            seqlen=seqlen,
+            nsamples=nsamples,
+            batch_size=batch_size,
+            low_gpu_mem_usage=low_gpu_mem_usage,
+        )
+        super().__init__(config)
+        self.iters = iters
+        self.format = format
+        self.dataset = dataset
+
+    @property
+    def method_name(self) -> str:
+        return "auto-round"
+
+    @property
+    def supported_formats(self):
+        return ["auto_round", "auto_gptq"]
+
+    def quantize(self, model_path: str, output_path: str, **kwargs) -> str:
+        """Run quantization.
+
+        Args:
+            model_path: path to the original model
+            output_path: where to save the quantized model
+            **kwargs: extra arguments forwarded to AutoRound
+
+        Returns:
+            Path to the quantized model.
+        """
+        try:
+            from auto_round import AutoRound
+        except ImportError:
+            raise ImportError(
+                "auto-round is not installed; run: pip install auto-round"
+            )
+
+        output_path = Path(output_path)
+        output_path.mkdir(parents=True, exist_ok=True)
+
+        print(f"[auto-round] Loading model: {model_path}")
+        print(f"[auto-round] Config: bits={self.config.bits}, group_size={self.config.group_size}, "
+              f"seqlen={self.config.seqlen}, nsamples={self.config.nsamples}")
+        print(f"[auto-round] Calibration dataset: {self.dataset}")
+
+        # Create the AutoRound instance
+        autoround = AutoRound(
+            model=model_path,
+            scheme=f"W{self.config.bits}A16",
+            iters=self.iters,
+            seqlen=self.config.seqlen,
+            nsamples=self.config.nsamples,
+            batch_size=self.config.batch_size,
+            low_gpu_mem_usage=self.config.low_gpu_mem_usage,
+            dataset=self.dataset,
+            **kwargs,
+        )
+
+        print(f"[auto-round] Starting quantization (iters={self.iters})...")
+        autoround.quantize()
+
+        print(f"[auto-round] Saving to: {output_path}")
+        autoround.save_quantized(str(output_path), format=self.format)
+
+        print("[auto-round] Quantization finished!")
+        return str(output_path)
```
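Since the docstring pitches auto-round output at vLLM inference, a hedged follow-up sketch: loading the exported checkpoint with vLLM, which reads the quantization_config written by `save_quantized()`. This assumes a vLLM build that supports the chosen export format (auto_round or auto_gptq); the path is the illustrative output directory from the example above:

```python
# Hedged sketch: serving the quantized checkpoint with vLLM.
# Assumes vLLM supports the export format; the path is illustrative.
from vllm import LLM, SamplingParams

llm = LLM(model="./Qwen3-4B-quant")  # vLLM picks up quantization_config itself
params = SamplingParams(temperature=0.7, max_tokens=64)
outputs = llm.generate(["What is weight-only quantization?"], params)
print(outputs[0].outputs[0].text)
```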
maque/quantization/base.py

```diff
@@ -0,0 +1,145 @@
+"""Abstract base class for quantizers"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Optional, List
+from pathlib import Path
+
+
+@dataclass
+class QuantConfig:
+    """Quantization configuration"""
+    bits: int = 4
+    group_size: int = 128
+    sym: bool = True
+    seqlen: int = 512
+    nsamples: int = 128
+    batch_size: int = 4
+    low_gpu_mem_usage: bool = True
+
+
+class BaseQuantizer(ABC):
+    """Abstract base class for quantizers"""
+
+    def __init__(self, config: Optional[QuantConfig] = None):
+        self.config = config or QuantConfig()
+
+    @abstractmethod
+    def quantize(self, model_path: str, output_path: str, **kwargs) -> str:
+        """Quantize a model.
+
+        Args:
+            model_path: path to the original model
+            output_path: where to save the quantized model
+            **kwargs: extra arguments
+
+        Returns:
+            Path to the quantized model.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def method_name(self) -> str:
+        """Name of the quantization method"""
+        pass
+
+    @property
+    def supported_formats(self) -> List[str]:
+        """Supported output formats"""
+        return ["auto"]
+
+    def get_model_info(self, model_path: str) -> dict:
+        """Read a model's quantization info, if any"""
+        import json
+        model_path = Path(model_path)
+
+        # Check quantization_config.json
+        quant_config_path = model_path / "quantization_config.json"
+        if quant_config_path.exists():
+            with open(quant_config_path, "r") as f:
+                return json.load(f)
+
+        # Check for quantization_config inside config.json
+        config_path = model_path / "config.json"
+        if config_path.exists():
+            with open(config_path, "r") as f:
+                config = json.load(f)
+            if "quantization_config" in config:
+                return config["quantization_config"]
+
+        return {}
+
+
+# Supported quantization methods
+QUANTIZATION_METHODS = {
+    "auto-round": "AutoRoundQuantizer",
+    "awq": "LLMCompressorQuantizer",
+    "gptq": "LLMCompressorQuantizer",
+    "bnb-nf4": "BitsAndBytesQuantizer",
+    "bnb-int8": "BitsAndBytesQuantizer",
+}
+
+
+def get_quantizer(method: str, **kwargs) -> BaseQuantizer:
+    """Look up a quantizer by method name.
+
+    Args:
+        method: quantization method name (auto-round, awq, gptq, bnb-nf4, bnb-int8)
+        **kwargs: arguments forwarded to the quantizer
+
+    Returns:
+        A BaseQuantizer instance.
+    """
+    if method not in QUANTIZATION_METHODS:
+        available = ", ".join(QUANTIZATION_METHODS.keys())
+        raise ValueError(f"Unsupported quantization method: {method}; available: {available}")
+
+    if method == "auto-round":
+        from .auto_round import AutoRoundQuantizer
+        return AutoRoundQuantizer(**kwargs)
+    elif method in ("awq", "gptq"):
+        from .llm_compressor import LLMCompressorQuantizer
+        return LLMCompressorQuantizer(scheme=method, **kwargs)
+    elif method in ("bnb-nf4", "bnb-int8"):
+        from .bitsandbytes import BitsAndBytesQuantizer
+        bits = 4 if method == "bnb-nf4" else 8
+        return BitsAndBytesQuantizer(bits=bits, **kwargs)
+    else:
+        raise ValueError(f"Quantization method not implemented: {method}")
+
+
+def list_methods() -> dict:
+    """List all supported quantization methods with descriptions"""
+    return {
+        "auto-round": {
+            "library": "auto-round",
+            "precision": "W4A16",
+            "description": "From Intel; SGD-optimized weight rounding, good accuracy",
+            "use_case": "vLLM inference",
+        },
+        "awq": {
+            "library": "llm-compressor",
+            "precision": "W4A16",
+            "description": "Activation-aware Weight Quantization",
+            "use_case": "vLLM inference",
+        },
+        "gptq": {
+            "library": "llm-compressor",
+            "precision": "W4A16",
+            "description": "Classic GPTQ quantization",
+            "use_case": "general inference",
+        },
+        "bnb-nf4": {
+            "library": "bitsandbytes",
+            "precision": "NF4",
+            "description": "4-bit NormalFloat quantization",
+            "use_case": "QLoRA fine-tuning",
+        },
+        "bnb-int8": {
+            "library": "bitsandbytes",
+            "precision": "INT8",
+            "description": "8-bit integer quantization",
+            "use_case": "memory savings",
+        },
+    }
```
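The ABC above leaves exactly two members abstract: `quantize()` and `method_name`. A sketch of a hypothetical third-party quantizer (not part of the package) that satisfies the contract:

```python
# Hypothetical example, not shipped with maque: the smallest class that
# satisfies the BaseQuantizer contract (quantize + method_name).
from pathlib import Path
from maque.quantization import BaseQuantizer, QuantConfig

class PassthroughQuantizer(BaseQuantizer):
    """Illustrative no-op quantizer demonstrating the interface."""

    @property
    def method_name(self) -> str:
        return "passthrough"

    def quantize(self, model_path: str, output_path: str, **kwargs) -> str:
        Path(output_path).mkdir(parents=True, exist_ok=True)
        print(f"[{self.method_name}] would quantize {model_path} "
              f"to {self.config.bits} bits")
        return str(output_path)

q = PassthroughQuantizer(QuantConfig(bits=8))
q.quantize("some/model", "./out")  # placeholder paths
```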
maque/quantization/bitsandbytes.py

```diff
@@ -0,0 +1,127 @@
+"""BitsAndBytes quantizer
+
+Uses the bitsandbytes library for NF4/INT8 quantization, mainly for QLoRA fine-tuning.
+Note: bitsandbytes quantizes dynamically at load time and does not produce a standalone quantized model file.
+"""
+
+from .base import BaseQuantizer, QuantConfig
+from typing import Literal
+from pathlib import Path
+
+
+class BitsAndBytesQuantizer(BaseQuantizer):
+    """BitsAndBytes quantizer
+
+    Uses the bitsandbytes library for NF4/INT8 quantization.
+
+    Note: bitsandbytes quantizes dynamically at load time, so calling quantize() will:
+    1. Load the model with the quantization config applied
+    2. Save the model config with a quantization_config entry
+
+    When loading, pass load_in_4bit=True or load_in_8bit=True.
+
+    Args:
+        bits: quantization bit width (4 or 8), default 4
+        bnb_4bit_compute_dtype: compute precision for 4-bit, default bfloat16
+        bnb_4bit_quant_type: 4-bit quantization type (nf4, fp4), default nf4
+        bnb_4bit_use_double_quant: whether to use double quantization, default True
+
+    Examples:
+        >>> from maque.quantization import BitsAndBytesQuantizer
+        >>> quantizer = BitsAndBytesQuantizer(bits=4)
+        >>> quantizer.quantize("Qwen/Qwen3-4B", "./Qwen3-4B-bnb")
+    """
+
+    def __init__(
+        self,
+        bits: Literal[4, 8] = 4,
+        bnb_4bit_compute_dtype: str = "bfloat16",
+        bnb_4bit_quant_type: Literal["nf4", "fp4"] = "nf4",
+        bnb_4bit_use_double_quant: bool = True,
+        **kwargs,
+    ):
+        config = QuantConfig(bits=bits)
+        super().__init__(config)
+        self.bnb_4bit_compute_dtype = bnb_4bit_compute_dtype
+        self.bnb_4bit_quant_type = bnb_4bit_quant_type
+        self.bnb_4bit_use_double_quant = bnb_4bit_use_double_quant
+
+    @property
+    def method_name(self) -> str:
+        return f"bnb-{'nf4' if self.config.bits == 4 else 'int8'}"
+
+    def quantize(self, model_path: str, output_path: str, **kwargs) -> str:
+        """Apply the quantization config and save the model.
+
+        Note: bitsandbytes quantizes dynamically at load time, so this method will:
+        1. Load the model with the quantization config
+        2. Save the model plus a config file carrying quantization_config
+
+        Args:
+            model_path: path to the original model
+            output_path: output path
+            **kwargs: extra arguments
+
+        Returns:
+            The output path.
+        """
+        try:
+            import torch
+            from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+        except ImportError as e:
+            if "bitsandbytes" in str(e):
+                raise ImportError(
+                    "bitsandbytes is not installed; run: pip install bitsandbytes"
+                )
+            raise
+
+        output_path = Path(output_path)
+        output_path.mkdir(parents=True, exist_ok=True)
+
+        print(f"[bnb] Loading model: {model_path}")
+
+        # Configure BitsAndBytes
+        if self.config.bits == 4:
+            compute_dtype = getattr(torch, self.bnb_4bit_compute_dtype)
+            bnb_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_compute_dtype=compute_dtype,
+                bnb_4bit_quant_type=self.bnb_4bit_quant_type,
+                bnb_4bit_use_double_quant=self.bnb_4bit_use_double_quant,
+            )
+            print(f"[bnb] Config: NF4, compute_dtype={self.bnb_4bit_compute_dtype}")
+        else:
+            bnb_config = BitsAndBytesConfig(load_in_8bit=True)
+            print("[bnb] Config: INT8")
+
+        # Load the model
+        print("[bnb] Loading model with the quantization config applied...")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            quantization_config=bnb_config,
+            device_map="auto",
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+        # Save the model and config
+        print(f"[bnb] Saving to: {output_path}")
+        model.save_pretrained(str(output_path))
+        tokenizer.save_pretrained(str(output_path))
+
+        print(f"[bnb] Done! When loading, pass load_in_{self.config.bits}bit=True")
+        return str(output_path)
+
+    def get_load_kwargs(self) -> dict:
+        """Keyword arguments needed to load the quantized model"""
+        import torch
+
+        if self.config.bits == 4:
+            compute_dtype = getattr(torch, self.bnb_4bit_compute_dtype)
+            return {
+                "load_in_4bit": True,
+                "bnb_4bit_compute_dtype": compute_dtype,
+                "bnb_4bit_quant_type": self.bnb_4bit_quant_type,
+                "bnb_4bit_use_double_quant": self.bnb_4bit_use_double_quant,
+            }
+        else:
+            return {"load_in_8bit": True}
```
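Because bitsandbytes re-quantizes at load time, `get_load_kwargs()` exists to hand back the matching flags. A hedged reload sketch (recent transformers versions prefer an explicit `BitsAndBytesConfig` over the legacy `load_in_*` kwargs, but both spellings exist):

```python
# Hedged sketch: reloading a checkpoint saved by BitsAndBytesQuantizer.
# "./Qwen3-4B-bnb" is the illustrative path from the docstring example.
from transformers import AutoModelForCausalLM
from maque.quantization import BitsAndBytesQuantizer

quantizer = BitsAndBytesQuantizer(bits=4)
model = AutoModelForCausalLM.from_pretrained(
    "./Qwen3-4B-bnb",
    device_map="auto",
    **quantizer.get_load_kwargs(),  # load_in_4bit, compute dtype, nf4, ...
)
```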
maque/quantization/llm_compressor.py

```diff
@@ -0,0 +1,102 @@
+"""LLM Compressor quantizer
+
+Uses vLLM's official llm-compressor library for AWQ/GPTQ quantization.
+"""
+
+from .base import BaseQuantizer, QuantConfig
+from typing import Literal
+from pathlib import Path
+
+
+class LLMCompressorQuantizer(BaseQuantizer):
+    """LLM Compressor quantizer
+
+    Uses vLLM's official llm-compressor library; supports the AWQ and GPTQ schemes.
+
+    Args:
+        scheme: quantization scheme (awq, gptq), default awq
+        bits: quantization bit width, default 4
+        group_size: quantization group size, default 128
+        sym: symmetric quantization, default True
+
+    Examples:
+        >>> from maque.quantization import LLMCompressorQuantizer
+        >>> quantizer = LLMCompressorQuantizer(scheme="awq")
+        >>> quantizer.quantize("Qwen/Qwen3-4B", "./Qwen3-4B-awq")
+    """
+
+    def __init__(
+        self,
+        scheme: Literal["awq", "gptq"] = "awq",
+        bits: int = 4,
+        group_size: int = 128,
+        sym: bool = True,
+        **kwargs,
+    ):
+        config = QuantConfig(
+            bits=bits,
+            group_size=group_size,
+            sym=sym,
+        )
+        super().__init__(config)
+        self.scheme = scheme
+
+    @property
+    def method_name(self) -> str:
+        return self.scheme
+
+    def quantize(self, model_path: str, output_path: str, **kwargs) -> str:
+        """Run quantization.
+
+        Args:
+            model_path: path to the original model
+            output_path: where to save the quantized model
+            **kwargs: extra arguments
+
+        Returns:
+            Path to the quantized model.
+        """
+        try:
+            from transformers import AutoModelForCausalLM, AutoTokenizer
+            from llmcompressor.modifiers.quantization import QuantizationModifier
+            from llmcompressor import oneshot
+        except ImportError as e:
+            if "llmcompressor" in str(e) or "oneshot" in str(e):
+                raise ImportError(
+                    "llm-compressor is not installed; run: pip install llmcompressor"
+                )
+            raise
+
+        output_path = Path(output_path)
+        output_path.mkdir(parents=True, exist_ok=True)
+
+        print(f"[{self.scheme}] Loading model: {model_path}")
+
+        # Load the model
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            device_map="auto",
+            torch_dtype="auto",
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+        # Configure the quantization scheme
+        scheme_name = f"W{self.config.bits}A16"
+        print(f"[{self.scheme}] Config: scheme={scheme_name}, group_size={self.config.group_size}")
+
+        recipe = QuantizationModifier(
+            targets="Linear",
+            scheme=scheme_name,
+            ignore=["lm_head"],
+        )
+
+        print(f"[{self.scheme}] Starting quantization...")
+        oneshot(
+            model=model,
+            tokenizer=tokenizer,
+            recipe=recipe,
+            output_dir=str(output_path),
+        )
+
+        print(f"[{self.scheme}] Quantization finished! Saved to: {output_path}")
+        return str(output_path)
```
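Note that this wrapper accepts `**kwargs` but does not forward them to `oneshot`, so calibration stays at llm-compressor's defaults. For reference, a hedged sketch of driving `oneshot` directly with an explicit calibration set; the `dataset` and `num_calibration_samples` parameter names follow recent llmcompressor releases and may differ across versions, and `open_platypus` is an illustrative dataset name:

```python
# Hedged sketch of the underlying llm-compressor one-shot flow with
# explicit calibration; parameter names are version-dependent assumptions.
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

recipe = QuantizationModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
oneshot(
    model="Qwen/Qwen3-4B",           # oneshot also accepts a model id or path
    dataset="open_platypus",         # calibration data (assumed name)
    num_calibration_samples=256,
    recipe=recipe,
    output_dir="./Qwen3-4B-w4a16",
)
```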
maque/retriever/__init__.py

```diff
@@ -0,0 +1,35 @@
+#! /usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Retriever module - vector retrieval functionality
+
+Supports ChromaDB and Milvus as vector-database backends; usable standalone.
+"""
+
+from .document import Document, SearchResult, Modality
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from .chroma import ChromaRetriever
+    from .milvus import MilvusRetriever
+
+
+def __getattr__(name: str):
+    """Lazy imports, to avoid pulling in unused dependencies"""
+    if name == "ChromaRetriever":
+        from .chroma import ChromaRetriever
+        return ChromaRetriever
+    elif name == "MilvusRetriever":
+        from .milvus import MilvusRetriever
+        return MilvusRetriever
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+__all__ = [
+    "Document",
+    "SearchResult",
+    "Modality",
+    "ChromaRetriever",
+    "MilvusRetriever",
+]
```
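The module-level `__getattr__` is the PEP 562 lazy-import pattern: lightweight types import eagerly, heavy backends only on first access. A short sketch of the resulting behavior:

```python
# Works without chromadb or pymilvus installed: only .document is imported.
from maque.retriever import Document, SearchResult, Modality

# This line triggers the module __getattr__, which imports the ChromaDB
# backend on demand, so chromadb must be installed before it runs.
from maque.retriever import ChromaRetriever
```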