gitinstall 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gitinstall/__init__.py +61 -0
- gitinstall/_sdk.py +541 -0
- gitinstall/academic.py +831 -0
- gitinstall/admin.html +327 -0
- gitinstall/auto_update.py +384 -0
- gitinstall/autopilot.py +349 -0
- gitinstall/badge.py +476 -0
- gitinstall/checkpoint.py +330 -0
- gitinstall/cicd.py +499 -0
- gitinstall/clawhub.html +718 -0
- gitinstall/config_schema.py +353 -0
- gitinstall/db.py +984 -0
- gitinstall/db_backend.py +445 -0
- gitinstall/dep_chain.py +337 -0
- gitinstall/dependency_audit.py +1153 -0
- gitinstall/detector.py +542 -0
- gitinstall/doctor.py +493 -0
- gitinstall/education.py +869 -0
- gitinstall/enterprise.py +802 -0
- gitinstall/error_fixer.py +953 -0
- gitinstall/event_bus.py +251 -0
- gitinstall/executor.py +577 -0
- gitinstall/feature_flags.py +138 -0
- gitinstall/fetcher.py +921 -0
- gitinstall/huggingface.py +922 -0
- gitinstall/hw_detect.py +988 -0
- gitinstall/i18n.py +664 -0
- gitinstall/installer_registry.py +362 -0
- gitinstall/knowledge_base.py +379 -0
- gitinstall/license_check.py +605 -0
- gitinstall/llm.py +569 -0
- gitinstall/log.py +236 -0
- gitinstall/main.py +1408 -0
- gitinstall/mcp_agent.py +841 -0
- gitinstall/mcp_server.py +386 -0
- gitinstall/monorepo.py +810 -0
- gitinstall/multi_source.py +425 -0
- gitinstall/onboard.py +276 -0
- gitinstall/planner.py +222 -0
- gitinstall/planner_helpers.py +323 -0
- gitinstall/planner_known_projects.py +1010 -0
- gitinstall/planner_templates.py +996 -0
- gitinstall/remote_gpu.py +633 -0
- gitinstall/resilience.py +608 -0
- gitinstall/run_tests.py +572 -0
- gitinstall/skills.py +476 -0
- gitinstall/tool_schemas.py +324 -0
- gitinstall/trending.py +279 -0
- gitinstall/uninstaller.py +415 -0
- gitinstall/validate_top100.py +607 -0
- gitinstall/watchdog.py +180 -0
- gitinstall/web.py +1277 -0
- gitinstall/web_ui.html +2277 -0
- gitinstall-1.1.0.dist-info/METADATA +275 -0
- gitinstall-1.1.0.dist-info/RECORD +59 -0
- gitinstall-1.1.0.dist-info/WHEEL +5 -0
- gitinstall-1.1.0.dist-info/entry_points.txt +3 -0
- gitinstall-1.1.0.dist-info/licenses/LICENSE +21 -0
- gitinstall-1.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,922 @@
|
|
|
1
|
+
"""
|
|
2
|
+
huggingface.py - HuggingFace Hub 集成
|
|
3
|
+
======================================
|
|
4
|
+
|
|
5
|
+
填补个人/AI 开发者市场 5-6% 覆盖度缺口。
|
|
6
|
+
|
|
7
|
+
功能:
|
|
8
|
+
1. HuggingFace 模型/数据集元数据获取
|
|
9
|
+
2. VRAM 智能评估(根据模型参数量 + 量化方式)
|
|
10
|
+
3. 模型下载策略生成(全量 / GGUF / AWQ / GPTQ)
|
|
11
|
+
4. Gated Model(受限模型)访问检测
|
|
12
|
+
5. LFS 大文件智能处理
|
|
13
|
+
|
|
14
|
+
零外部依赖,纯 Python 标准库。
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
import os
|
|
21
|
+
import re
|
|
22
|
+
import urllib.error
|
|
23
|
+
import urllib.request
|
|
24
|
+
from dataclasses import dataclass, field
|
|
25
|
+
from typing import Optional
|
|
26
|
+
|
|
27
|
+
# ─────────────────────────────────────────────
# HuggingFace model VRAM database
# Sources: official model cards + community measurements
# ─────────────────────────────────────────────

# Keyed by model ID; "params_b" is the parameter count in billions,
# "family" groups models for recommendation scoring.
_MODEL_VRAM_DB: dict[str, dict] = {
    # Meta Llama series
    "meta-llama/Llama-2-7b-hf": {"params_b": 7, "family": "llama2"},
    "meta-llama/Llama-2-13b-hf": {"params_b": 13, "family": "llama2"},
    "meta-llama/Llama-2-70b-hf": {"params_b": 70, "family": "llama2"},
    "meta-llama/Llama-3.1-8B": {"params_b": 8, "family": "llama3"},
    "meta-llama/Llama-3.1-70B": {"params_b": 70, "family": "llama3"},
    "meta-llama/Llama-3.1-405B": {"params_b": 405, "family": "llama3"},
    "meta-llama/Llama-3.2-1B": {"params_b": 1, "family": "llama3"},
    "meta-llama/Llama-3.2-3B": {"params_b": 3, "family": "llama3"},
    "meta-llama/Llama-4-Scout-17B-16E": {"params_b": 109, "family": "llama4"},
    "meta-llama/Llama-4-Maverick-17B-128E": {"params_b": 400, "family": "llama4"},

    # Qwen (Tongyi Qianwen) series
    "Qwen/Qwen2.5-0.5B": {"params_b": 0.5, "family": "qwen2.5"},
    "Qwen/Qwen2.5-1.5B": {"params_b": 1.5, "family": "qwen2.5"},
    "Qwen/Qwen2.5-3B": {"params_b": 3, "family": "qwen2.5"},
    "Qwen/Qwen2.5-7B": {"params_b": 7, "family": "qwen2.5"},
    "Qwen/Qwen2.5-14B": {"params_b": 14, "family": "qwen2.5"},
    "Qwen/Qwen2.5-32B": {"params_b": 32, "family": "qwen2.5"},
    "Qwen/Qwen2.5-72B": {"params_b": 72, "family": "qwen2.5"},
    "Qwen/Qwen3-8B": {"params_b": 8, "family": "qwen3"},
    "Qwen/Qwen3-32B": {"params_b": 32, "family": "qwen3"},
    "Qwen/Qwen3-235B-A22B": {"params_b": 235, "family": "qwen3_moe"},
    "Qwen/QwQ-32B": {"params_b": 32, "family": "qwen3"},
    "Qwen/Qwen2.5-Coder-7B": {"params_b": 7, "family": "qwen2.5-coder"},
    "Qwen/Qwen2.5-Coder-32B": {"params_b": 32, "family": "qwen2.5-coder"},

    # DeepSeek series
    "deepseek-ai/DeepSeek-R1": {"params_b": 671, "family": "deepseek_moe"},
    "deepseek-ai/DeepSeek-R1-0528": {"params_b": 671, "family": "deepseek_moe"},
    "deepseek-ai/DeepSeek-V3": {"params_b": 671, "family": "deepseek_moe"},
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B": {"params_b": 1.5, "family": "qwen2.5"},
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B": {"params_b": 7, "family": "qwen2.5"},
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B": {"params_b": 32, "family": "qwen2.5"},
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B": {"params_b": 8, "family": "llama3"},
    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B": {"params_b": 70, "family": "llama3"},
    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": {"params_b": 16, "family": "deepseek"},

    # Mistral series
    "mistralai/Mistral-7B-v0.3": {"params_b": 7, "family": "mistral"},
    "mistralai/Mixtral-8x7B-v0.1": {"params_b": 47, "family": "mixtral"},
    "mistralai/Mistral-Small-24B": {"params_b": 24, "family": "mistral"},
    "mistralai/Mistral-Large-2411": {"params_b": 123, "family": "mistral"},
    "mistralai/Codestral-25.01": {"params_b": 22, "family": "mistral"},

    # Google Gemma
    "google/gemma-2-2b": {"params_b": 2, "family": "gemma2"},
    "google/gemma-2-9b": {"params_b": 9, "family": "gemma2"},
    "google/gemma-2-27b": {"params_b": 27, "family": "gemma2"},
    "google/gemma-3-1b-it": {"params_b": 1, "family": "gemma3"},
    "google/gemma-3-4b-it": {"params_b": 4, "family": "gemma3"},
    "google/gemma-3-12b-it": {"params_b": 12, "family": "gemma3"},
    "google/gemma-3-27b-it": {"params_b": 27, "family": "gemma3"},

    # Microsoft Phi
    "microsoft/phi-4": {"params_b": 14, "family": "phi"},
    "microsoft/Phi-3.5-mini-instruct": {"params_b": 3.8, "family": "phi"},
    "microsoft/Phi-3-medium-128k-instruct": {"params_b": 14, "family": "phi"},

    # Baichuan / GLM / other Chinese models
    "baichuan-inc/Baichuan2-13B-Chat": {"params_b": 13, "family": "baichuan"},
    "THUDM/chatglm3-6b": {"params_b": 6, "family": "glm"},
    "THUDM/glm-4-9b-chat": {"params_b": 9, "family": "glm4"},
    "01-ai/Yi-1.5-34B-Chat": {"params_b": 34, "family": "yi"},
    "internlm/internlm2_5-7b-chat": {"params_b": 7, "family": "internlm"},

    # Stability AI / image generation
    "stabilityai/stable-diffusion-3.5-large": {"params_b": 8.1, "family": "sd3"},
    "stabilityai/stable-diffusion-xl-base-1.0": {"params_b": 3.5, "family": "sdxl"},
    "black-forest-labs/FLUX.1-dev": {"params_b": 12, "family": "flux"},
    "black-forest-labs/FLUX.1-schnell": {"params_b": 12, "family": "flux"},

    # Whisper (speech)
    "openai/whisper-large-v3": {"params_b": 1.5, "family": "whisper"},
    "openai/whisper-large-v3-turbo": {"params_b": 0.8, "family": "whisper"},
}
|
|
109
|
+
|
|
110
|
+
# Gated models: downloading requires an HF token and accepting the
# publisher's license agreement on huggingface.co first.
_GATED_MODELS = {
    "meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-13b-hf",
    "meta-llama/Llama-2-70b-hf", "meta-llama/Llama-3.1-8B",
    "meta-llama/Llama-3.1-70B", "meta-llama/Llama-3.1-405B",
    "meta-llama/Llama-3.2-1B", "meta-llama/Llama-3.2-3B",
    "meta-llama/Llama-4-Scout-17B-16E", "meta-llama/Llama-4-Maverick-17B-128E",
    "google/gemma-2-2b", "google/gemma-2-9b", "google/gemma-2-27b",
    "google/gemma-3-1b-it", "google/gemma-3-4b-it",
    "google/gemma-3-12b-it", "google/gemma-3-27b-it",
    "mistralai/Mistral-Large-2411",
}
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@dataclass
class HFModelInfo:
    """Metadata for one HuggingFace Hub model."""
    model_id: str                 # "owner/name" Hub identifier
    params_b: float = 0.0         # parameter count in billions (0.0 = unknown)
    family: str = ""              # model family, e.g. "llama3", "qwen2.5"
    pipeline_tag: str = ""  # text-generation, image-classification, ...
    library_name: str = ""  # transformers, diffusers, ...
    is_gated: bool = False        # requires accepting a license + HF token
    license: str = ""             # license string from the model card
    downloads: int = 0            # Hub download counter
    likes: int = 0                # Hub like counter
    tags: list[str] = field(default_factory=list)      # Hub tags
    siblings: list[str] = field(default_factory=list)  # repo file listing
    error: str = ""               # non-empty when the Hub API lookup failed
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
@dataclass
class VRAMEstimate:
    """Result of a VRAM feasibility estimate for one model."""
    model_id: str             # model being evaluated
    params_b: float           # parameter count in billions
    available_vram_gb: float  # GPU memory the user has available
    can_run: bool             # True when at least one quant scheme fits
    recommended_method: str  # "fp16" / "q8" / "q4_k" / "gguf" / "api_only"
    vram_needed_gb: float     # footprint of the recommended scheme
    options: list[dict] = field(default_factory=list)  # all schemes: quant/vram_gb/fits/quality
    advice: str = ""  # human-readable summary of the verdict
    install_commands: list[str] = field(default_factory=list)  # suggested setup commands
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# ─────────────────────────────────────────────
|
|
156
|
+
# HuggingFace API 交互(零依赖)
|
|
157
|
+
# ─────────────────────────────────────────────
|
|
158
|
+
|
|
159
|
+
def _hf_token() -> str:
|
|
160
|
+
"""获取 HuggingFace token"""
|
|
161
|
+
token = os.getenv("HF_TOKEN", "").strip()
|
|
162
|
+
if not token:
|
|
163
|
+
token = os.getenv("HUGGING_FACE_HUB_TOKEN", "").strip()
|
|
164
|
+
if not token:
|
|
165
|
+
token_path = os.path.expanduser("~/.cache/huggingface/token")
|
|
166
|
+
if os.path.exists(token_path):
|
|
167
|
+
with open(token_path) as f:
|
|
168
|
+
token = f.read().strip()
|
|
169
|
+
return token
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _hf_api_get(endpoint: str, timeout: int = 10) -> dict:
    """GET a HuggingFace Hub API endpoint and decode the JSON response.

    Sends the stored token (if any) as a Bearer credential so gated
    resources are visible. Network/HTTP failures propagate as
    urllib.error exceptions; callers are expected to handle them.
    """
    request_headers = {"User-Agent": "gitinstall/1.1"}
    auth_token = _hf_token()
    if auth_token:
        request_headers["Authorization"] = f"Bearer {auth_token}"
    request = urllib.request.Request(
        f"https://huggingface.co/api/{endpoint}", headers=request_headers
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return json.loads(response.read().decode("utf-8"))
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def is_huggingface_id(identifier: str) -> bool:
    """Return True when *identifier* looks like a HuggingFace model ID.

    Accepts bare "owner/model" IDs present in the local model database,
    or identifiers mentioning huggingface.co. Explicit URLs and git
    remotes (http://, https://, git@) are rejected so GitHub-style
    inputs are not misclassified.
    """
    if "/" not in identifier:
        return False
    for prefix in ("http://", "https://", "git@"):
        if identifier.startswith(prefix):
            return False
    # Known model IDs and huggingface.co references count as HF IDs;
    # anything else is conservatively treated as non-HF.
    return identifier in _MODEL_VRAM_DB or "huggingface.co" in identifier
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def parse_hf_url(url: str) -> str:
    """Extract the "owner/model" ID from a huggingface.co URL.

    e.g. https://huggingface.co/meta-llama/Llama-2-7b-hf
         -> meta-llama/Llama-2-7b-hf

    Inputs that are not huggingface.co URLs are returned unchanged, so
    bare model IDs pass through transparently.
    """
    m = re.match(r'https?://huggingface\.co/([^/]+/[^/?#]+)', url)
    return m.group(1) if m else url
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def fetch_model_info(model_id: str) -> HFModelInfo:
    """Fetch metadata for a HuggingFace model.

    Combines the local parameter database with a live Hub API lookup.
    Network failures never raise: the exception text is recorded in
    ``info.error`` and whatever local data was found is still returned.
    """
    model_id = parse_hf_url(model_id)
    info = HFModelInfo(model_id=model_id)

    # Local database first (no network needed for known models).
    if model_id in _MODEL_VRAM_DB:
        db = _MODEL_VRAM_DB[model_id]
        info.params_b = db["params_b"]
        info.family = db["family"]

    # Known gated (license-restricted) models.
    if model_id in _GATED_MODELS:
        info.is_gated = True

    # Try the Hub API for richer metadata.
    try:
        data = _hf_api_get(f"models/{model_id}")
        info.pipeline_tag = data.get("pipeline_tag", "")
        info.library_name = data.get("library_name", "")
        info.license = data.get("cardData", {}).get("license", "")
        info.downloads = data.get("downloads", 0)
        info.likes = data.get("likes", 0)
        info.tags = data.get("tags", [])
        # Bug fix: the Hub reports "gated" as False or a mode string
        # ("auto"/"manual"); coerce to bool so the field keeps its
        # declared bool type instead of silently holding a str.
        info.is_gated = bool(data.get("gated", False)) or info.is_gated
        # Repository file listing
        siblings = data.get("siblings", [])
        info.siblings = [s.get("rfilename", "") for s in siblings]
        # If the local DB had no parameter count, infer one from metadata.
        if not info.params_b:
            info.params_b = _infer_params_from_metadata(data)
    except Exception as e:
        info.error = str(e)

    return info
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _infer_params_from_metadata(data: dict) -> float:
|
|
246
|
+
"""从 HF API 响应推断模型参数量"""
|
|
247
|
+
# safetensors.parameters 字段
|
|
248
|
+
st = data.get("safetensors", {})
|
|
249
|
+
if isinstance(st, dict):
|
|
250
|
+
params = st.get("parameters", {})
|
|
251
|
+
if isinstance(params, dict):
|
|
252
|
+
total = sum(params.values())
|
|
253
|
+
if total > 0:
|
|
254
|
+
return round(total / 1e9, 1)
|
|
255
|
+
# 从 model ID 或 tag 推断
|
|
256
|
+
model_id = data.get("modelId", "")
|
|
257
|
+
for tag in data.get("tags", []) + [model_id]:
|
|
258
|
+
match = re.search(r'(\d+)[._-]?(\d*)\s*[bB]', str(tag))
|
|
259
|
+
if match:
|
|
260
|
+
b = float(match.group(1))
|
|
261
|
+
if match.group(2):
|
|
262
|
+
b = float(f"{match.group(1)}.{match.group(2)}")
|
|
263
|
+
if 0.1 <= b <= 1000:
|
|
264
|
+
return b
|
|
265
|
+
return 0.0
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
# ─────────────────────────────────────────────
|
|
269
|
+
# VRAM 智能评估
|
|
270
|
+
# ─────────────────────────────────────────────
|
|
271
|
+
|
|
272
|
+
def estimate_vram(
    model_id: str,
    available_vram_gb: float,
    use_case: str = "inference",  # "inference" | "finetune" | "lora"
) -> VRAMEstimate:
    """Estimate whether *model_id* fits in *available_vram_gb* of GPU memory.

    Compares the model's parameter count against standard quantization
    footprints and picks the highest-quality scheme that fits. Models in
    the local database are evaluated without any network access.

    Returns:
        VRAMEstimate with can_run, the recommended scheme, per-quant
        options, human-readable advice and install commands.
    """
    # Prefer the local database to avoid a network round-trip.
    db_info = _MODEL_VRAM_DB.get(model_id)
    if db_info:
        info = HFModelInfo(model_id=model_id, params_b=db_info["params_b"])
        # Bug fix: the fast path previously skipped the gated-model check,
        # so known gated models (e.g. meta-llama/*) lost their warning.
        info.is_gated = model_id in _GATED_MODELS
    elif model_id.startswith("http") or "/" in model_id:
        info = fetch_model_info(model_id)
    else:
        info = HFModelInfo(model_id=model_id)

    # Still unknown? Try a "<n>B" size marker in the ID itself.
    params_b = info.params_b
    if not params_b:
        match = re.search(r'(\d+\.?\d*)\s*[bB]', model_id)
        if match:
            params_b = float(match.group(1))

    if not params_b:
        return VRAMEstimate(
            model_id=model_id, params_b=0, available_vram_gb=available_vram_gb,
            can_run=False, recommended_method="unknown", vram_needed_gb=0,
            advice=f"无法确定模型 {model_id} 的参数量,请手动指定",
        )

    # VRAM footprint formulas per quantization scheme (GB), highest
    # quality first — iteration order matters below.
    quant_formulas = {
        "fp32": lambda b: b * 4.0 + 2,  # 4 bytes/param + overhead
        "fp16": lambda b: b * 2.0 + 1.5,  # 2 bytes/param
        "q8": lambda b: b * 1.1 + 1.0,  # ~1.1 bytes/param
        "q6_k": lambda b: b * 0.85 + 0.8,  # ~0.85 bytes/param
        "q5_k": lambda b: b * 0.75 + 0.7,  # ~0.75 bytes/param
        "q4_k": lambda b: b * 0.6 + 0.5,  # ~0.6 bytes/param
        "q4_0": lambda b: b * 0.55 + 0.5,  # ~0.55 bytes/param
        "q3_k": lambda b: b * 0.45 + 0.4,  # ~0.45 bytes/param
        "q2_k": lambda b: b * 0.35 + 0.3,  # ~0.35 bytes/param
    }

    # Extra memory for training use cases.
    lora_overhead_gb = 0.0
    if use_case == "lora":
        lora_overhead_gb = params_b * 0.15 + 2.0  # ~15% of params + optimizer states
    elif use_case == "finetune":
        lora_overhead_gb = params_b * 3.0 + 4.0  # full finetune needs ~3x model size

    quality_labels = {
        "fp32": "完美(无损)", "fp16": "极好", "q8": "优秀",
        "q6_k": "很好", "q5_k": "良好", "q4_k": "良好(性价比最佳)",
        "q4_0": "可用", "q3_k": "一般", "q2_k": "较差",
    }

    options = []
    recommended = None
    for quant, formula in quant_formulas.items():
        vram = round(formula(params_b) + lora_overhead_gb, 1)
        fits = vram <= available_vram_gb
        options.append({
            "quant": quant, "vram_gb": vram, "fits": fits,
            "quality": quality_labels.get(quant, ""),
        })
        # Schemes run best-to-worst, so the first that fits is the pick.
        if fits and recommended is None:
            recommended = quant

    can_run = recommended is not None
    vram_needed = quant_formulas.get(recommended or "q2_k", lambda b: b * 0.35)(params_b)

    # Install commands for the chosen scheme.
    install_cmds = _generate_install_commands(model_id, info, recommended, use_case)

    # Human-readable advice.
    if not can_run:
        min_q = options[-1]["vram_gb"] if options else 0
        advice = (
            f"❌ 当前 VRAM {available_vram_gb}GB 不足以{_use_case_label(use_case)} {params_b}B 模型\n"
            f" 最低需 {min_q}GB (Q2_K 量化)\n"
            f" 建议: 使用更小的模型,或通过 API 调用"
        )
    elif recommended in ("fp32", "fp16"):
        advice = f"✅ VRAM 充足!可以 {recommended.upper()} 全精度{_use_case_label(use_case)} {params_b}B 模型"
    elif recommended in ("q8", "q6_k"):
        advice = f"✅ 推荐 {recommended.upper()} 量化,质量损失极小"
    else:
        advice = f"⚠️ 推荐 {recommended.upper()} 量化(VRAM 紧张,会有一定质量损失)"

    if info.is_gated:
        advice += "\n⚠️ 这是受限模型,需要先在 HuggingFace 网站同意使用协议,并设置 HF_TOKEN"

    return VRAMEstimate(
        model_id=model_id, params_b=params_b,
        available_vram_gb=available_vram_gb,
        can_run=can_run, recommended_method=recommended or "api_only",
        vram_needed_gb=round(vram_needed, 1),
        options=options, advice=advice,
        install_commands=install_cmds,
    )
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def _use_case_label(use_case: str) -> str:
|
|
379
|
+
return {"inference": "推理运行", "finetune": "全量微调", "lora": "LoRA 微调"}.get(use_case, "运行")
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def _generate_install_commands(
    model_id: str, info: HFModelInfo,
    recommended_quant: Optional[str], use_case: str,
) -> list[str]:
    """Generate setup commands for the model and recommended scheme.

    Branches on the model's pipeline tag (LLM / diffusion / ASR / generic)
    and on whether a quantized or full-precision run is recommended.
    The returned strings mix shell comments ("# ...") with runnable
    commands, intended for display to the user.
    """
    cmds = []

    # Gated models need credentials before anything can be downloaded.
    if info.is_gated:
        cmds.append("# 1. 请先设置 HuggingFace Token")
        cmds.append("export HF_TOKEN=hf_your_token_here")
        cmds.append("# 或: huggingface-cli login")

    # Unknown pipelines default to text-generation (the common case).
    pipeline = info.pipeline_tag or "text-generation"

    if pipeline in ("text-generation", "text2text-generation"):
        # LLM models
        if recommended_quant and recommended_quant.startswith("q"):
            # Quantized: prefer llama.cpp / Ollama runtimes
            cmds.append("# 方式一: 使用 Ollama(推荐,最简单)")
            ollama_name = _model_to_ollama_name(model_id)
            if ollama_name:
                cmds.append(f"ollama pull {ollama_name}")
                cmds.append(f"ollama run {ollama_name}")
            cmds.append("")
            cmds.append("# 方式二: 使用 llama.cpp + GGUF 量化文件")
            cmds.append("pip install llama-cpp-python")
        else:
            # FP16/FP32 full precision via transformers
            cmds.append("pip install transformers torch accelerate")
            if use_case == "lora":
                cmds.append("pip install peft bitsandbytes")
            cmds.append(f"python -c \"from transformers import AutoModelForCausalLM, AutoTokenizer; "
                        f"m = AutoModelForCausalLM.from_pretrained('{model_id}', torch_dtype='auto', "
                        f"device_map='auto'); t = AutoTokenizer.from_pretrained('{model_id}')\"")

    elif pipeline in ("text-to-image", "image-to-image"):
        # Diffusion models
        cmds.append("pip install diffusers transformers torch accelerate")
        cmds.append(f"python -c \"from diffusers import DiffusionPipeline; "
                    f"pipe = DiffusionPipeline.from_pretrained('{model_id}', torch_dtype='auto')\"")

    elif pipeline in ("automatic-speech-recognition",):
        # ASR models (e.g. Whisper)
        cmds.append("pip install transformers torch torchaudio")
        cmds.append(f"python -c \"from transformers import pipeline; "
                    f"pipe = pipeline('automatic-speech-recognition', model='{model_id}')\"")

    else:
        # Generic transformers fallback
        cmds.append("pip install transformers torch")
        cmds.append(f"python -c \"from transformers import AutoModel; "
                    f"m = AutoModel.from_pretrained('{model_id}')\"")

    return cmds
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
def _model_to_ollama_name(model_id: str) -> Optional[str]:
    """Map a HuggingFace model ID to its Ollama library name.

    Returns None for models without a known Ollama equivalent; callers
    fall back to other runtimes in that case.
    """
    mapping = {
        "meta-llama/Llama-3.1-8B": "llama3.1:8b",
        "meta-llama/Llama-3.1-70B": "llama3.1:70b",
        "meta-llama/Llama-3.2-1B": "llama3.2:1b",
        "meta-llama/Llama-3.2-3B": "llama3.2:3b",
        "Qwen/Qwen2.5-0.5B": "qwen2.5:0.5b",
        "Qwen/Qwen2.5-1.5B": "qwen2.5:1.5b",
        "Qwen/Qwen2.5-3B": "qwen2.5:3b",
        "Qwen/Qwen2.5-7B": "qwen2.5:7b",
        "Qwen/Qwen2.5-14B": "qwen2.5:14b",
        "Qwen/Qwen2.5-32B": "qwen2.5:32b",
        "Qwen/Qwen2.5-72B": "qwen2.5:72b",
        "Qwen/Qwen3-8B": "qwen3:8b",
        "Qwen/Qwen3-32B": "qwen3:32b",
        "Qwen/QwQ-32B": "qwq:32b",
        "Qwen/Qwen2.5-Coder-7B": "qwen2.5-coder:7b",
        "Qwen/Qwen2.5-Coder-32B": "qwen2.5-coder:32b",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B": "deepseek-r1:1.5b",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B": "deepseek-r1:7b",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B": "deepseek-r1:32b",
        "deepseek-ai/DeepSeek-R1-Distill-Llama-8B": "deepseek-r1:8b",
        "deepseek-ai/DeepSeek-R1-Distill-Llama-70B": "deepseek-r1:70b",
        "mistralai/Mistral-7B-v0.3": "mistral:7b",
        "google/gemma-2-2b": "gemma2:2b",
        "google/gemma-2-9b": "gemma2:9b",
        "google/gemma-2-27b": "gemma2:27b",
        "google/gemma-3-1b-it": "gemma3:1b",
        "google/gemma-3-4b-it": "gemma3:4b",
        "google/gemma-3-12b-it": "gemma3:12b",
        "google/gemma-3-27b-it": "gemma3:27b",
        "microsoft/phi-4": "phi4:14b",
    }
    return mapping.get(model_id)
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
# ─────────────────────────────────────────────
|
|
476
|
+
# 模型推荐
|
|
477
|
+
# ─────────────────────────────────────────────
|
|
478
|
+
|
|
479
|
+
def recommend_models_for_hardware(
    vram_gb: float,
    use_case: str = "inference",
    language: str = "zh",  # "zh" | "en" | "code"
) -> list[dict]:
    """Recommend the best-fitting models for the given GPU memory.

    Args:
        vram_gb: available GPU memory in GB.
        use_case: "inference" / "lora" / "finetune".
        language: "zh" (Chinese-first) / "en" (English) / "code" (coding).

    Returns:
        Up to 10 candidate dicts, best first. Scoring favours larger
        parameter counts, with a bonus for families matching *language*.
    """
    # Model families ranked per language preference.
    family_priority = {
        "zh": ["qwen3", "qwen2.5", "qwen2.5-coder", "deepseek", "glm4", "llama3", "gemma3", "phi"],
        "en": ["llama3", "gemma3", "mistral", "phi", "qwen2.5"],
        "code": ["qwen2.5-coder", "deepseek", "llama3", "phi"],
    }
    preferred = family_priority.get(language, family_priority["en"])

    candidates: list[dict] = []
    for model_id, entry in _MODEL_VRAM_DB.items():
        size_b = entry["params_b"]
        fam = entry["family"]

        estimate = estimate_vram(model_id, vram_gb, use_case)
        if not estimate.can_run:
            continue

        # Larger models score higher; preferred families earn a bonus
        # weighted by position in the priority list.
        rank = size_b * 10
        if fam in preferred:
            rank += (len(preferred) - preferred.index(fam)) * 50

        candidates.append({
            "model_id": model_id,
            "params_b": size_b,
            "family": fam,
            "quant": estimate.recommended_method,
            "vram_needed": estimate.vram_needed_gb,
            "score": rank,
            "advice": estimate.advice,
        })

    candidates.sort(key=lambda item: item["score"], reverse=True)
    return candidates[:10]
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
# ─────────────────────────────────────────────
|
|
533
|
+
# 格式化输出
|
|
534
|
+
# ─────────────────────────────────────────────
|
|
535
|
+
|
|
536
|
+
def format_vram_estimate(est: VRAMEstimate) -> str:
    """Render a VRAMEstimate as a human-readable multi-line report."""
    out = [
        f"🧠 模型 VRAM 评估: {est.model_id}",
        f" 参数量: {est.params_b}B",
        f" 可用 VRAM: {est.available_vram_gb}GB",
        "",
        est.advice,
        "",
    ]

    # Per-quantization comparison table.
    if est.options:
        out.append(" 量化方案对比:")
        for opt in est.options:
            status = "✅" if opt["fits"] else "❌"
            out.append(f" {status} {opt['quant']:6s} → {opt['vram_gb']:6.1f}GB {opt['quality']}")

    # Suggested install commands, one per line.
    if est.install_commands:
        out.extend(["", " 安装命令:"])
        out.extend(f" {cmd}" for cmd in est.install_commands)

    return "\n".join(out)
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
def format_model_recommendations(recs: list[dict], vram_gb: float) -> str:
    """Render a numbered, human-readable list of model recommendations."""
    header = [
        f"🤖 基于 {vram_gb}GB VRAM 的模型推荐:",
        "",
    ]
    rows = [
        f" {idx}. {rec['model_id']} ({rec['params_b']}B)"
        f" → {rec['quant'].upper()} ({rec['vram_needed']:.1f}GB)"
        for idx, rec in enumerate(recs, 1)
    ]
    return "\n".join(header + rows)
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
# ─────────────────────────────────────────────
|
|
577
|
+
# AI 模型部署自动化 (Market Opportunity #1)
|
|
578
|
+
# ─────────────────────────────────────────────
|
|
579
|
+
|
|
580
|
+
@dataclass
class DeploymentPlan:
    """A concrete plan for serving a model: engine, quantization, steps."""
    model_id: str = ""        # HuggingFace model being deployed
    serving_engine: str = ""  # vllm | tgi | ollama | llama_cpp | triton
    quantization: str = ""  # fp16 | int8 | int4 | awq | gptq | gguf
    gpu_type: str = ""        # GPU model name, if known
    gpu_count: int = 1        # number of GPUs to use
    port: int = 8000          # port the serving endpoint listens on
    docker: bool = True       # whether deployment is containerized
    steps: list[str] = field(default_factory=list)  # ordered shell steps
    docker_compose: str = ""  # generated docker-compose file text, if any
    env_vars: dict[str, str] = field(default_factory=dict)  # container env
    estimated_vram_gb: float = 0.0  # predicted GPU memory footprint
    estimated_tps: float = 0.0  # tokens per second
|
|
595
|
+
|
|
596
|
+
|
|
597
|
+
# Catalogue of supported inference engines: docker image, default port,
# API flavour, minimum sensible VRAM and supported quantization formats.
_SERVING_ENGINES = {
    "vllm": {
        "name": "vLLM",
        "docker_image": "vllm/vllm-openai:latest",
        "default_port": 8000,
        "api_compatible": "OpenAI",
        "min_vram_gb": 16,
        "supports": ["fp16", "awq", "gptq", "int8"],
    },
    "tgi": {
        "name": "Text Generation Inference (TGI)",
        "docker_image": "ghcr.io/huggingface/text-generation-inference:latest",
        "default_port": 8080,
        "api_compatible": "HuggingFace",
        "min_vram_gb": 16,
        "supports": ["fp16", "gptq", "awq", "int8", "int4"],
    },
    "ollama": {
        "name": "Ollama",
        "docker_image": "ollama/ollama:latest",
        "default_port": 11434,
        "api_compatible": "Ollama / OpenAI",
        "min_vram_gb": 4,
        "supports": ["gguf", "fp16"],
    },
    # llama.cpp runs natively; empty docker_image means "no container".
    "llama_cpp": {
        "name": "llama.cpp Server",
        "docker_image": "",
        "default_port": 8080,
        "api_compatible": "OpenAI",
        "min_vram_gb": 2,
        "supports": ["gguf"],
    },
    "triton": {
        "name": "NVIDIA Triton Inference Server",
        "docker_image": "nvcr.io/nvidia/tritonserver:24.05-trtllm-python-py3",
        "default_port": 8001,
        "api_compatible": "gRPC / HTTP",
        "min_vram_gb": 24,
        "supports": ["fp16", "int8", "int4"],
    },
}
|
|
639
|
+
|
|
640
|
+
|
|
641
|
+
def select_serving_engine(
    model_id: str,
    vram_gb: float,
    use_case: str = "inference",
    prefer_docker: bool = True,
) -> str:
    """Pick the best inference-engine key for the model/hardware combo.

    Heuristic, in priority order:
      * small model (<=13B) with an Ollama alias -> "ollama" (simplest)
      * >=24GB VRAM and a 7B+ model              -> "vllm" (best throughput)
      * >=16GB VRAM                              -> "tgi"
      * otherwise                                -> "llama_cpp"

    NOTE(review): use_case and prefer_docker are currently unused; kept
    for interface compatibility.
    """
    size_b = _lookup_params(model_id)

    # Small models with a known Ollama alias: easiest path.
    if size_b <= 13 and vram_gb >= 8 and _model_to_ollama_name(model_id):
        return "ollama"

    # Plenty of VRAM + a sizeable model: vLLM for throughput.
    if vram_gb >= 24 and size_b >= 7:
        return "vllm"

    # Mid-range hardware: TGI.
    if vram_gb >= 16:
        return "tgi"

    # Low-VRAM fallback: llama.cpp.
    return "llama_cpp"
|
|
666
|
+
|
|
667
|
+
|
|
668
|
+
def _lookup_params(model_id: str) -> float:
    """Look up a model's parameter count (billions) in the local DB.

    An exact (case-insensitive) ID match wins; otherwise any DB key
    appearing as a substring of *model_id* matches, so fine-tuned
    variants like "...-Instruct" resolve to their base model. Falls
    back to 7.0 (a common size) when nothing matches.
    """
    mid = model_id.lower()
    # Bug fix: exact match now takes precedence. The substring-only scan
    # depended on dict iteration order and could let an unrelated key
    # embedded in the ID shadow the precise entry.
    for key, val in _MODEL_VRAM_DB.items():
        if key.lower() == mid:
            return val.get("params_b", 7.0)
    for key, val in _MODEL_VRAM_DB.items():
        if key.lower() in mid:
            return val.get("params_b", 7.0)
    return 7.0  # default assumption: 7B
|
|
675
|
+
|
|
676
|
+
|
|
677
|
+
def generate_deployment_plan(
    model_id: str,
    vram_gb: float = 24.0,
    gpu_count: int = 1,
    engine: str = "",
    port: int = 0,
    quantization: str = "",
) -> DeploymentPlan:
    """Generate a complete model deployment plan.

    The plan includes shell deployment steps, a docker-compose config
    (when the engine ships a Docker image), and required env vars.

    Args:
        model_id: HuggingFace-style model identifier.
        vram_gb: VRAM per GPU in GB; used for engine/quantization choice.
        gpu_count: number of GPUs to deploy across.
        engine: serving engine key; auto-selected when empty.
        port: service port; engine default when 0.
        quantization: weight format; auto-selected when empty.
    """
    if not engine:
        engine = select_serving_engine(model_id, vram_gb)

    # Unknown engine keys fall back to the vLLM entry.
    engine_info = _SERVING_ENGINES.get(engine, _SERVING_ENGINES["vllm"])
    if not port:
        port = engine_info["default_port"]

    params_b = _lookup_params(model_id)

    # Auto-select a quantization scheme when the caller did not pick one.
    if not quantization:
        if engine in ("ollama", "llama_cpp"):
            # Both engines consume GGUF files.
            quantization = "gguf"
        elif params_b * 2 > vram_gb * gpu_count * 0.85:
            # fp16 weights (~2 bytes/param) won't fit in 85% of total VRAM,
            # so fall back to a quantized format.
            quantization = "awq" if engine == "vllm" else "int8"
        else:
            quantization = "fp16"

    # Estimate VRAM: bytes per parameter by format, plus 15% overhead
    # (KV cache, activations).
    bytes_per_param = {"fp16": 2, "int8": 1.1, "int4": 0.6, "awq": 0.6, "gptq": 0.6, "gguf": 0.6}
    est_vram = params_b * bytes_per_param.get(quantization, 2) * 1.15

    plan = DeploymentPlan(
        model_id=model_id,
        serving_engine=engine,
        quantization=quantization,
        gpu_type="",
        gpu_count=gpu_count,
        port=port,
        docker=bool(engine_info["docker_image"]),
        estimated_vram_gb=round(est_vram, 1),
    )

    # Build the deployment step list for the chosen engine.
    steps = []
    env_vars: dict[str, str] = {}

    if engine == "vllm":
        env_vars["HF_TOKEN"] = "${HF_TOKEN}"
        steps.append("# 拉取 vLLM Docker 镜像")
        steps.append(f"docker pull {engine_info['docker_image']}")
        steps.append("# 启动 vLLM 推理服务")
        run_cmd = (
            f"docker run --gpus all -p {port}:{port} "
            "-e HF_TOKEN=$HF_TOKEN "
            "-v ~/.cache/huggingface:/root/.cache/huggingface "
            f"{engine_info['docker_image']} "
            f"--model {model_id} "
            f"--port {port} "
            f"--tensor-parallel-size {gpu_count}"
        )
        if quantization in ("awq", "gptq"):
            run_cmd += f" --quantization {quantization}"
        steps.append(run_cmd)
        steps.append("# 测试 API")
        steps.append(
            f'curl http://localhost:{port}/v1/chat/completions '
            '-H "Content-Type: application/json" '
            f'-d \'{{"model":"{model_id}","messages":[{{"role":"user","content":"Hello"}}]}}\''
        )

    elif engine == "tgi":
        env_vars["HF_TOKEN"] = "${HF_TOKEN}"
        steps.append(f"docker pull {engine_info['docker_image']}")
        run_cmd = (
            f"docker run --gpus all -p {port}:{port} "
            "-e HF_TOKEN=$HF_TOKEN "
            "-v ~/.cache/huggingface:/data "
            f"{engine_info['docker_image']} "
            f"--model-id {model_id} "
            f"--port {port} "
            f"--num-shard {gpu_count}"
        )
        if quantization in ("gptq", "awq"):
            run_cmd += f" --quantize {quantization}"
        steps.append(run_cmd)
        steps.append(
            f'curl http://localhost:{port}/generate '
            '-H "Content-Type: application/json" '
            '-d \'{"inputs":"Hello","parameters":{"max_new_tokens":50}}\''
        )

    elif engine == "ollama":
        # Fall back to the raw model id if no Ollama alias is known.
        ollama_name = _model_to_ollama_name(model_id) or model_id
        steps.append("# 安装 Ollama")
        steps.append("curl -fsSL https://ollama.com/install.sh | sh")
        steps.append("# 拉取模型")
        steps.append(f"ollama pull {ollama_name}")
        steps.append("# 启动对话")
        steps.append(f"ollama run {ollama_name}")

    elif engine == "llama_cpp":
        steps.append("# 安装 llama-cpp-python")
        steps.append("pip install llama-cpp-python[server]")
        steps.append("# 从 HuggingFace 中下载 GGUF 模型文件")
        steps.append("# 启动服务")
        steps.append(
            f"python -m llama_cpp.server --model ./model.gguf "
            f"--host 0.0.0.0 --port {port} --n_gpu_layers -1"
        )

    elif engine == "triton":
        steps.append(f"docker pull {engine_info['docker_image']}")
        steps.append("# 准备模型仓库 (需转为 TensorRT-LLM 格式)")
        steps.append("# 启动 Triton")
        steps.append(
            f"docker run --gpus all -p {port}:{port} -p 8002:8002 "
            f"-v ./model_repository:/models "
            f"{engine_info['docker_image']} tritonserver --model-repository=/models"
        )

    plan.steps = steps
    plan.env_vars = env_vars

    # Emit a docker-compose file when the engine ships a Docker image.
    if plan.docker and engine_info["docker_image"]:
        plan.docker_compose = _generate_docker_compose(plan, engine_info)

    return plan
|
|
809
|
+
|
|
810
|
+
|
|
811
|
+
def _generate_docker_compose(plan: DeploymentPlan, engine_info: dict) -> str:
    """Render a docker-compose.yml for the plan's serving engine.

    Produces one service named after the engine with the HF cache mounted,
    HF_TOKEN passed through, and an nvidia GPU reservation.  vLLM and TGI
    additionally get a ``command:`` line with model/port/parallelism flags.
    """
    out = [
        "version: '3.8'",
        "services:",
    ]
    out.append(f"  {plan.serving_engine}:")
    out.append(f"    image: {engine_info['docker_image']}")
    out.append("    ports:")
    out.append(f"      - '{plan.port}:{plan.port}'")
    out.append("    environment:")
    out.append("      - HF_TOKEN=${HF_TOKEN}")
    out.append("    volumes:")
    out.append("      - ~/.cache/huggingface:/root/.cache/huggingface")
    out.append("    deploy:")
    out.append("      resources:")
    out.append("        reservations:")
    out.append("          devices:")
    out.append("            - driver: nvidia")
    out.append(f"              count: {plan.gpu_count}")
    out.append("              capabilities: [gpu]")

    engine = plan.serving_engine
    if engine in ("vllm", "tgi"):
        if engine == "vllm":
            cmd = f"--model {plan.model_id} --port {plan.port} --tensor-parallel-size {plan.gpu_count}"
            quant_flag = "--quantization"
        else:
            cmd = f"--model-id {plan.model_id} --port {plan.port} --num-shard {plan.gpu_count}"
            quant_flag = "--quantize"
        if plan.quantization in ("awq", "gptq"):
            cmd += f" {quant_flag} {plan.quantization}"
        out.append(f"    command: {cmd}")

    return "\n".join(out)
|
|
845
|
+
|
|
846
|
+
|
|
847
|
+
def format_deployment_plan(plan: DeploymentPlan) -> str:
    """Format a deployment plan as human-readable text.

    Comment steps (lines starting with ``#``) are shown as-is; shell
    commands are prefixed with ``$``.  The docker-compose file, when
    present, is appended at the end.
    """
    engine_info = _SERVING_ENGINES.get(plan.serving_engine, {})
    lines = [
        "🚀 AI 模型部署计划",
        f"   模型: {plan.model_id}",
        f"   引擎: {engine_info.get('name', plan.serving_engine)}",
        f"   量化: {plan.quantization.upper()}",
        f"   GPU: ×{plan.gpu_count} (预估 VRAM: {plan.estimated_vram_gb}GB)",
        f"   端口: {plan.port}",
        f"   API: {engine_info.get('api_compatible', 'REST')} 兼容",
        "",
        "📋 部署步骤:",
    ]
    # The numeric index from enumerate() was never displayed, so iterate
    # the steps directly.
    for step in plan.steps:
        if step.startswith("#"):
            lines.append(f"   {step}")
        else:
            lines.append(f"   $ {step}")

    if plan.docker_compose:
        lines.extend(["", "📦 docker-compose.yml:", plan.docker_compose])

    return "\n".join(lines)
|
|
871
|
+
|
|
872
|
+
|
|
873
|
+
def generate_model_api_client(model_id: str, engine: str = "vllm", port: int = 0) -> str:
    """Generate stdlib-only Python client code for a locally deployed model.

    OpenAI-compatible engines (vLLM, Ollama) get a ``chat()`` helper;
    TGI gets a ``generate()`` helper; other engines return a placeholder
    comment.  ``port`` defaults to the engine's standard port.
    """
    info = _SERVING_ENGINES[engine] if engine in _SERVING_ENGINES else _SERVING_ENGINES["vllm"]
    port = port or info["default_port"]

    # Unsupported engines: emit a placeholder instead of broken code.
    if engine not in ("vllm", "ollama", "tgi"):
        return "# 暂不支持此引擎的客户端生成"

    if engine == "tgi":
        return f'''import urllib.request, json

def generate(prompt: str) -> str:
    """调用本地部署的 TGI 模型"""
    data = json.dumps({{
        "inputs": prompt,
        "parameters": {{"max_new_tokens": 512, "temperature": 0.7}},
    }}).encode()
    req = urllib.request.Request(
        "http://localhost:{port}/generate",
        data=data,
        headers={{"Content-Type": "application/json"}},
    )
    with urllib.request.urlopen(req) as resp:
        result = json.loads(resp.read())
    return result["generated_text"]

# 使用示例
# print(generate("你好"))
'''

    # vLLM / Ollama: both speak the OpenAI chat-completions protocol.
    return f'''import urllib.request, json

def chat(message: str, model: str = "{model_id}") -> str:
    """调用本地部署的 {info.get("name", engine)} 模型"""
    data = json.dumps({{
        "model": model,
        "messages": [{{"role": "user", "content": message}}],
        "temperature": 0.7,
    }}).encode()
    req = urllib.request.Request(
        "http://localhost:{port}/v1/chat/completions",
        data=data,
        headers={{"Content-Type": "application/json"}},
    )
    with urllib.request.urlopen(req) as resp:
        result = json.loads(resp.read())
    return result["choices"][0]["message"]["content"]

# 使用示例
# print(chat("你好,请介绍一下你自己"))
'''
|