gitinstall 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. gitinstall/__init__.py +61 -0
  2. gitinstall/_sdk.py +541 -0
  3. gitinstall/academic.py +831 -0
  4. gitinstall/admin.html +327 -0
  5. gitinstall/auto_update.py +384 -0
  6. gitinstall/autopilot.py +349 -0
  7. gitinstall/badge.py +476 -0
  8. gitinstall/checkpoint.py +330 -0
  9. gitinstall/cicd.py +499 -0
  10. gitinstall/clawhub.html +718 -0
  11. gitinstall/config_schema.py +353 -0
  12. gitinstall/db.py +984 -0
  13. gitinstall/db_backend.py +445 -0
  14. gitinstall/dep_chain.py +337 -0
  15. gitinstall/dependency_audit.py +1153 -0
  16. gitinstall/detector.py +542 -0
  17. gitinstall/doctor.py +493 -0
  18. gitinstall/education.py +869 -0
  19. gitinstall/enterprise.py +802 -0
  20. gitinstall/error_fixer.py +953 -0
  21. gitinstall/event_bus.py +251 -0
  22. gitinstall/executor.py +577 -0
  23. gitinstall/feature_flags.py +138 -0
  24. gitinstall/fetcher.py +921 -0
  25. gitinstall/huggingface.py +922 -0
  26. gitinstall/hw_detect.py +988 -0
  27. gitinstall/i18n.py +664 -0
  28. gitinstall/installer_registry.py +362 -0
  29. gitinstall/knowledge_base.py +379 -0
  30. gitinstall/license_check.py +605 -0
  31. gitinstall/llm.py +569 -0
  32. gitinstall/log.py +236 -0
  33. gitinstall/main.py +1408 -0
  34. gitinstall/mcp_agent.py +841 -0
  35. gitinstall/mcp_server.py +386 -0
  36. gitinstall/monorepo.py +810 -0
  37. gitinstall/multi_source.py +425 -0
  38. gitinstall/onboard.py +276 -0
  39. gitinstall/planner.py +222 -0
  40. gitinstall/planner_helpers.py +323 -0
  41. gitinstall/planner_known_projects.py +1010 -0
  42. gitinstall/planner_templates.py +996 -0
  43. gitinstall/remote_gpu.py +633 -0
  44. gitinstall/resilience.py +608 -0
  45. gitinstall/run_tests.py +572 -0
  46. gitinstall/skills.py +476 -0
  47. gitinstall/tool_schemas.py +324 -0
  48. gitinstall/trending.py +279 -0
  49. gitinstall/uninstaller.py +415 -0
  50. gitinstall/validate_top100.py +607 -0
  51. gitinstall/watchdog.py +180 -0
  52. gitinstall/web.py +1277 -0
  53. gitinstall/web_ui.html +2277 -0
  54. gitinstall-1.1.0.dist-info/METADATA +275 -0
  55. gitinstall-1.1.0.dist-info/RECORD +59 -0
  56. gitinstall-1.1.0.dist-info/WHEEL +5 -0
  57. gitinstall-1.1.0.dist-info/entry_points.txt +3 -0
  58. gitinstall-1.1.0.dist-info/licenses/LICENSE +21 -0
  59. gitinstall-1.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,922 @@
+ """
+ huggingface.py - HuggingFace Hub integration
+ =============================================
+
+ Fills the 5-6% coverage gap in the individual / AI-developer market.
+
+ Features:
+ 1. Fetch HuggingFace model/dataset metadata
+ 2. Smart VRAM estimation (based on parameter count + quantization method)
+ 3. Model download strategy generation (full precision / GGUF / AWQ / GPTQ)
+ 4. Gated (restricted) model access detection
+ 5. Smart handling of large LFS files
+
+ Zero external dependencies; pure Python standard library.
+ """
+
+ from __future__ import annotations
+
+ import json
+ import os
+ import re
+ import urllib.error
+ import urllib.request
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ # ─────────────────────────────────────────────
+ # HuggingFace model VRAM database
+ # Sources: official model cards + community measurements
+ # ─────────────────────────────────────────────
+
+ _MODEL_VRAM_DB: dict[str, dict] = {
+     # Meta Llama family
+     "meta-llama/Llama-2-7b-hf": {"params_b": 7, "family": "llama2"},
+     "meta-llama/Llama-2-13b-hf": {"params_b": 13, "family": "llama2"},
+     "meta-llama/Llama-2-70b-hf": {"params_b": 70, "family": "llama2"},
+     "meta-llama/Llama-3.1-8B": {"params_b": 8, "family": "llama3"},
+     "meta-llama/Llama-3.1-70B": {"params_b": 70, "family": "llama3"},
+     "meta-llama/Llama-3.1-405B": {"params_b": 405, "family": "llama3"},
+     "meta-llama/Llama-3.2-1B": {"params_b": 1, "family": "llama3"},
+     "meta-llama/Llama-3.2-3B": {"params_b": 3, "family": "llama3"},
+     "meta-llama/Llama-4-Scout-17B-16E": {"params_b": 109, "family": "llama4"},
+     "meta-llama/Llama-4-Maverick-17B-128E": {"params_b": 400, "family": "llama4"},
+
+     # Qwen (Tongyi Qianwen) family
+     "Qwen/Qwen2.5-0.5B": {"params_b": 0.5, "family": "qwen2.5"},
+     "Qwen/Qwen2.5-1.5B": {"params_b": 1.5, "family": "qwen2.5"},
+     "Qwen/Qwen2.5-3B": {"params_b": 3, "family": "qwen2.5"},
+     "Qwen/Qwen2.5-7B": {"params_b": 7, "family": "qwen2.5"},
+     "Qwen/Qwen2.5-14B": {"params_b": 14, "family": "qwen2.5"},
+     "Qwen/Qwen2.5-32B": {"params_b": 32, "family": "qwen2.5"},
+     "Qwen/Qwen2.5-72B": {"params_b": 72, "family": "qwen2.5"},
+     "Qwen/Qwen3-8B": {"params_b": 8, "family": "qwen3"},
+     "Qwen/Qwen3-32B": {"params_b": 32, "family": "qwen3"},
+     "Qwen/Qwen3-235B-A22B": {"params_b": 235, "family": "qwen3_moe"},
+     "Qwen/QwQ-32B": {"params_b": 32, "family": "qwen3"},
+     "Qwen/Qwen2.5-Coder-7B": {"params_b": 7, "family": "qwen2.5-coder"},
+     "Qwen/Qwen2.5-Coder-32B": {"params_b": 32, "family": "qwen2.5-coder"},
+
+     # DeepSeek family
+     "deepseek-ai/DeepSeek-R1": {"params_b": 671, "family": "deepseek_moe"},
+     "deepseek-ai/DeepSeek-R1-0528": {"params_b": 671, "family": "deepseek_moe"},
+     "deepseek-ai/DeepSeek-V3": {"params_b": 671, "family": "deepseek_moe"},
+     "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B": {"params_b": 1.5, "family": "qwen2.5"},
+     "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B": {"params_b": 7, "family": "qwen2.5"},
+     "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B": {"params_b": 32, "family": "qwen2.5"},
+     "deepseek-ai/DeepSeek-R1-Distill-Llama-8B": {"params_b": 8, "family": "llama3"},
+     "deepseek-ai/DeepSeek-R1-Distill-Llama-70B": {"params_b": 70, "family": "llama3"},
+     "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": {"params_b": 16, "family": "deepseek"},
+
+     # Mistral family
+     "mistralai/Mistral-7B-v0.3": {"params_b": 7, "family": "mistral"},
+     "mistralai/Mixtral-8x7B-v0.1": {"params_b": 47, "family": "mixtral"},
+     "mistralai/Mistral-Small-24B": {"params_b": 24, "family": "mistral"},
+     "mistralai/Mistral-Large-2411": {"params_b": 123, "family": "mistral"},
+     "mistralai/Codestral-25.01": {"params_b": 22, "family": "mistral"},
+
+     # Google Gemma
+     "google/gemma-2-2b": {"params_b": 2, "family": "gemma2"},
+     "google/gemma-2-9b": {"params_b": 9, "family": "gemma2"},
+     "google/gemma-2-27b": {"params_b": 27, "family": "gemma2"},
+     "google/gemma-3-1b-it": {"params_b": 1, "family": "gemma3"},
+     "google/gemma-3-4b-it": {"params_b": 4, "family": "gemma3"},
+     "google/gemma-3-12b-it": {"params_b": 12, "family": "gemma3"},
+     "google/gemma-3-27b-it": {"params_b": 27, "family": "gemma3"},
+
+     # Microsoft Phi
+     "microsoft/phi-4": {"params_b": 14, "family": "phi"},
+     "microsoft/Phi-3.5-mini-instruct": {"params_b": 3.8, "family": "phi"},
+     "microsoft/Phi-3-medium-128k-instruct": {"params_b": 14, "family": "phi"},
+
+     # Baichuan / GLM / other Chinese models
+     "baichuan-inc/Baichuan2-13B-Chat": {"params_b": 13, "family": "baichuan"},
+     "THUDM/chatglm3-6b": {"params_b": 6, "family": "glm"},
+     "THUDM/glm-4-9b-chat": {"params_b": 9, "family": "glm4"},
+     "01-ai/Yi-1.5-34B-Chat": {"params_b": 34, "family": "yi"},
+     "internlm/internlm2_5-7b-chat": {"params_b": 7, "family": "internlm"},
+
+     # Stability AI
+     "stabilityai/stable-diffusion-3.5-large": {"params_b": 8.1, "family": "sd3"},
+     "stabilityai/stable-diffusion-xl-base-1.0": {"params_b": 3.5, "family": "sdxl"},
+     "black-forest-labs/FLUX.1-dev": {"params_b": 12, "family": "flux"},
+     "black-forest-labs/FLUX.1-schnell": {"params_b": 12, "family": "flux"},
+
+     # Whisper (speech)
+     "openai/whisper-large-v3": {"params_b": 1.5, "family": "whisper"},
+     "openai/whisper-large-v3-turbo": {"params_b": 0.8, "family": "whisper"},
+ }
+
+ # Gated models (require an HF token + license agreement)
+ _GATED_MODELS = {
+     "meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-13b-hf",
+     "meta-llama/Llama-2-70b-hf", "meta-llama/Llama-3.1-8B",
+     "meta-llama/Llama-3.1-70B", "meta-llama/Llama-3.1-405B",
+     "meta-llama/Llama-3.2-1B", "meta-llama/Llama-3.2-3B",
+     "meta-llama/Llama-4-Scout-17B-16E", "meta-llama/Llama-4-Maverick-17B-128E",
+     "google/gemma-2-2b", "google/gemma-2-9b", "google/gemma-2-27b",
+     "google/gemma-3-1b-it", "google/gemma-3-4b-it",
+     "google/gemma-3-12b-it", "google/gemma-3-27b-it",
+     "mistralai/Mistral-Large-2411",
+ }
+
+
+ @dataclass
+ class HFModelInfo:
+     """HuggingFace model metadata."""
+     model_id: str
+     params_b: float = 0.0
+     family: str = ""
+     pipeline_tag: str = ""  # text-generation, image-classification, ...
+     library_name: str = ""  # transformers, diffusers, ...
+     is_gated: bool = False
+     license: str = ""
+     downloads: int = 0
+     likes: int = 0
+     tags: list[str] = field(default_factory=list)
+     siblings: list[str] = field(default_factory=list)  # file list
+     error: str = ""
+
+
+ @dataclass
+ class VRAMEstimate:
+     """VRAM estimation result."""
+     model_id: str
+     params_b: float
+     available_vram_gb: float
+     can_run: bool
+     recommended_method: str  # "fp16" / "q8" / "q4_k" / "gguf" / "api_only"
+     vram_needed_gb: float
+     options: list[dict] = field(default_factory=list)
+     advice: str = ""
+     install_commands: list[str] = field(default_factory=list)
+
+
+ # ─────────────────────────────────────────────
+ # HuggingFace API interaction (zero dependencies)
+ # ─────────────────────────────────────────────
+
+ def _hf_token() -> str:
+     """Get the HuggingFace token."""
+     token = os.getenv("HF_TOKEN", "").strip()
+     if not token:
+         token = os.getenv("HUGGING_FACE_HUB_TOKEN", "").strip()
+     if not token:
+         token_path = os.path.expanduser("~/.cache/huggingface/token")
+         if os.path.exists(token_path):
+             with open(token_path) as f:
+                 token = f.read().strip()
+     return token
+
+
+ def _hf_api_get(endpoint: str, timeout: int = 10) -> dict:
+     """Call the HuggingFace API."""
+     url = f"https://huggingface.co/api/{endpoint}"
+     headers = {"User-Agent": "gitinstall/1.1"}
+     token = _hf_token()
+     if token:
+         headers["Authorization"] = f"Bearer {token}"
+     req = urllib.request.Request(url, headers=headers)
+     with urllib.request.urlopen(req, timeout=timeout) as resp:
+         return json.loads(resp.read().decode("utf-8"))
+
+
+ def is_huggingface_id(identifier: str) -> bool:
+     """Check whether identifier is a HuggingFace model ID (owner/model form, not a GitHub URL)."""
+     if "/" not in identifier:
+         return False
+     if any(identifier.startswith(p) for p in ("http://", "https://", "git@")):
+         return False
+     # An HF model ID is usually in owner/model-name form
+     # Check the known-model database, or whether the string refers to huggingface.co
+     if identifier in _MODEL_VRAM_DB:
+         return True
+     if "huggingface.co" in identifier:
+         return True
+     return False
+
+
+ def parse_hf_url(url: str) -> str:
+     """Extract the model ID from a HuggingFace URL."""
+     # https://huggingface.co/meta-llama/Llama-2-7b-hf → meta-llama/Llama-2-7b-hf
+     match = re.match(r'https?://huggingface\.co/([^/]+/[^/?#]+)', url)
+     if match:
+         return match.group(1)
+     return url
+
+
+ def fetch_model_info(model_id: str) -> HFModelInfo:
+     """Fetch detailed information for a HuggingFace model."""
+     model_id = parse_hf_url(model_id)
+     info = HFModelInfo(model_id=model_id)
+
+     # Check the local database first
+     if model_id in _MODEL_VRAM_DB:
+         db = _MODEL_VRAM_DB[model_id]
+         info.params_b = db["params_b"]
+         info.family = db["family"]
+
+     # Check for gated models
+     if model_id in _GATED_MODELS:
+         info.is_gated = True
+
+     # Try the API for more details
+     try:
+         data = _hf_api_get(f"models/{model_id}")
+         info.pipeline_tag = data.get("pipeline_tag", "")
+         info.library_name = data.get("library_name", "")
+         info.license = data.get("cardData", {}).get("license", "")
+         info.downloads = data.get("downloads", 0)
+         info.likes = data.get("likes", 0)
+         info.tags = data.get("tags", [])
+         info.is_gated = data.get("gated", False) or info.is_gated
+         # File list
+         siblings = data.get("siblings", [])
+         info.siblings = [s.get("rfilename", "") for s in siblings]
+         # If the local database has no parameter count, try to infer it from safetensors metadata
+         if not info.params_b:
+             info.params_b = _infer_params_from_metadata(data)
+     except Exception as e:
+         info.error = str(e)
+
+     return info
+
+
+ def _infer_params_from_metadata(data: dict) -> float:
+     """Infer the model parameter count from an HF API response."""
+     # safetensors.parameters field
+     st = data.get("safetensors", {})
+     if isinstance(st, dict):
+         params = st.get("parameters", {})
+         if isinstance(params, dict):
+             total = sum(params.values())
+             if total > 0:
+                 return round(total / 1e9, 1)
+     # Infer from the model ID or tags
+     model_id = data.get("modelId", "")
+     for tag in data.get("tags", []) + [model_id]:
+         match = re.search(r'(\d+)[._-]?(\d*)\s*[bB]', str(tag))
+         if match:
+             b = float(match.group(1))
+             if match.group(2):
+                 b = float(f"{match.group(1)}.{match.group(2)}")
+             if 0.1 <= b <= 1000:
+                 return b
+     return 0.0
+
+
+ # ─────────────────────────────────────────────
+ # Smart VRAM estimation
+ # ─────────────────────────────────────────────
+
+ def estimate_vram(
+     model_id: str,
+     available_vram_gb: float,
+     use_case: str = "inference",  # "inference" | "finetune" | "lora"
+ ) -> VRAMEstimate:
+     """
+     Estimate whether a given model is usable on the target hardware.
+
+     Returns:
+         VRAMEstimate with whether it can run, the recommended method, install commands, etc.
+     """
+     # Prefer the local database for the parameter count to avoid a network request
+     db_info = _MODEL_VRAM_DB.get(model_id)
+     if db_info:
+         info = HFModelInfo(model_id=model_id, params_b=db_info["params_b"])
+     elif model_id.startswith("http") or "/" in model_id:
+         info = fetch_model_info(model_id)
+     else:
+         info = HFModelInfo(model_id=model_id)
+
+     # If the parameter count is still unknown, try to infer it from the ID
+     params_b = info.params_b
+     if not params_b:
+         match = re.search(r'(\d+\.?\d*)\s*[bB]', model_id)
+         if match:
+             params_b = float(match.group(1))
+
+     if not params_b:
+         return VRAMEstimate(
+             model_id=model_id, params_b=0, available_vram_gb=available_vram_gb,
+             can_run=False, recommended_method="unknown", vram_needed_gb=0,
+             advice=f"Cannot determine the parameter count of model {model_id}; please specify it manually",
+         )
+
+     # Quantization methods and their VRAM usage formulas
+     quant_formulas = {
+         "fp32": lambda b: b * 4.0 + 2,     # 4 bytes/param + overhead
+         "fp16": lambda b: b * 2.0 + 1.5,   # 2 bytes/param
+         "q8": lambda b: b * 1.1 + 1.0,     # ~1.1 bytes/param
+         "q6_k": lambda b: b * 0.85 + 0.8,  # ~0.85 bytes/param
+         "q5_k": lambda b: b * 0.75 + 0.7,  # ~0.75 bytes/param
+         "q4_k": lambda b: b * 0.6 + 0.5,   # ~0.6 bytes/param
+         "q4_0": lambda b: b * 0.55 + 0.5,  # ~0.55 bytes/param
+         "q3_k": lambda b: b * 0.45 + 0.4,  # ~0.45 bytes/param
+         "q2_k": lambda b: b * 0.35 + 0.3,  # ~0.35 bytes/param
+     }
+
+     # Extra overhead for LoRA / full fine-tuning
+     lora_overhead_gb = 0.0
+     if use_case == "lora":
+         lora_overhead_gb = params_b * 0.15 + 2.0  # ~15% of params + optimizer states
+     elif use_case == "finetune":
+         lora_overhead_gb = params_b * 3.0 + 4.0  # a full finetune needs ~3x the model size
+
+     quality_labels = {
+         "fp32": "perfect (lossless)", "fp16": "excellent", "q8": "great",
+         "q6_k": "very good", "q5_k": "good", "q4_k": "good (best value)",
+         "q4_0": "usable", "q3_k": "fair", "q2_k": "poor",
+     }
+
+     options = []
+     recommended = None
+     for quant, formula in quant_formulas.items():
+         vram = round(formula(params_b) + lora_overhead_gb, 1)
+         fits = vram <= available_vram_gb
+         options.append({
+             "quant": quant, "vram_gb": vram, "fits": fits,
+             "quality": quality_labels.get(quant, ""),
+         })
+         if fits and recommended is None:
+             recommended = quant
+
+     can_run = recommended is not None
+     vram_needed = quant_formulas.get(recommended or "q2_k", lambda b: b * 0.35)(params_b)
+
+     # Generate install commands
+     install_cmds = _generate_install_commands(model_id, info, recommended, use_case)
+
+     # Generate advice
+     if not can_run:
+         min_q = options[-1]["vram_gb"] if options else 0
+         advice = (
+             f"❌ The available {available_vram_gb}GB of VRAM is not enough to {_use_case_label(use_case)} a {params_b}B model\n"
+             f"   At least {min_q}GB is required (Q2_K quantization)\n"
+             f"   Suggestion: use a smaller model, or call one via an API"
+         )
+     elif recommended in ("fp32", "fp16"):
+         advice = f"✅ Plenty of VRAM! You can {_use_case_label(use_case)} the {params_b}B model at full {recommended.upper()} precision"
+     elif recommended in ("q8", "q6_k"):
+         advice = f"✅ {recommended.upper()} quantization is recommended; quality loss is minimal"
+     else:
+         advice = f"⚠️ {recommended.upper()} quantization is recommended (VRAM is tight, expect some quality loss)"
+
+     if info.is_gated:
+         advice += "\n⚠️ This is a gated model: accept the license agreement on the HuggingFace website first and set HF_TOKEN"
+
+     return VRAMEstimate(
+         model_id=model_id, params_b=params_b,
+         available_vram_gb=available_vram_gb,
+         can_run=can_run, recommended_method=recommended or "api_only",
+         vram_needed_gb=round(vram_needed, 1),
+         options=options, advice=advice,
+         install_commands=install_cmds,
+     )
+
+
+ def _use_case_label(use_case: str) -> str:
+     return {"inference": "run inference on", "finetune": "fully fine-tune", "lora": "LoRA fine-tune"}.get(use_case, "run")
+
+
+ def _generate_install_commands(
+     model_id: str, info: HFModelInfo,
+     recommended_quant: Optional[str], use_case: str,
+ ) -> list[str]:
+     """Generate install commands from the model info and the recommended method."""
+     cmds = []
+
+     if info.is_gated:
+         cmds.append("# 1. Set your HuggingFace token first")
+         cmds.append("export HF_TOKEN=hf_your_token_here")
+         cmds.append("# or: huggingface-cli login")
+
+     pipeline = info.pipeline_tag or "text-generation"
+
+     if pipeline in ("text-generation", "text2text-generation"):
+         # LLM models
+         if recommended_quant and recommended_quant.startswith("q"):
+             # Prefer llama.cpp / Ollama for running quantized versions
+             cmds.append("# Option 1: use Ollama (recommended, simplest)")
+             ollama_name = _model_to_ollama_name(model_id)
+             if ollama_name:
+                 cmds.append(f"ollama pull {ollama_name}")
+                 cmds.append(f"ollama run {ollama_name}")
+             cmds.append("")
+             cmds.append("# Option 2: use llama.cpp with a GGUF quantized file")
+             cmds.append("pip install llama-cpp-python")
+         else:
+             # FP16/FP32 full precision
+             cmds.append("pip install transformers torch accelerate")
+             if use_case == "lora":
+                 cmds.append("pip install peft bitsandbytes")
+             cmds.append(f"python -c \"from transformers import AutoModelForCausalLM, AutoTokenizer; "
+                         f"m = AutoModelForCausalLM.from_pretrained('{model_id}', torch_dtype='auto', "
+                         f"device_map='auto'); t = AutoTokenizer.from_pretrained('{model_id}')\"")
+
+     elif pipeline in ("text-to-image", "image-to-image"):
+         # Diffusion models
+         cmds.append("pip install diffusers transformers torch accelerate")
+         cmds.append(f"python -c \"from diffusers import DiffusionPipeline; "
+                     f"pipe = DiffusionPipeline.from_pretrained('{model_id}', torch_dtype='auto')\"")
+
+     elif pipeline in ("automatic-speech-recognition",):
+         # ASR models (Whisper, etc.)
+         cmds.append("pip install transformers torch torchaudio")
+         cmds.append(f"python -c \"from transformers import pipeline; "
+                     f"pipe = pipeline('automatic-speech-recognition', model='{model_id}')\"")
+
+     else:
+         # Generic transformers
+         cmds.append("pip install transformers torch")
+         cmds.append(f"python -c \"from transformers import AutoModel; "
+                     f"m = AutoModel.from_pretrained('{model_id}')\"")
+
+     return cmds
+
+
+ def _model_to_ollama_name(model_id: str) -> Optional[str]:
+     """Map a HuggingFace model ID to an Ollama model name."""
+     mapping = {
+         "meta-llama/Llama-3.1-8B": "llama3.1:8b",
+         "meta-llama/Llama-3.1-70B": "llama3.1:70b",
+         "meta-llama/Llama-3.2-1B": "llama3.2:1b",
+         "meta-llama/Llama-3.2-3B": "llama3.2:3b",
+         "Qwen/Qwen2.5-0.5B": "qwen2.5:0.5b",
+         "Qwen/Qwen2.5-1.5B": "qwen2.5:1.5b",
+         "Qwen/Qwen2.5-3B": "qwen2.5:3b",
+         "Qwen/Qwen2.5-7B": "qwen2.5:7b",
+         "Qwen/Qwen2.5-14B": "qwen2.5:14b",
+         "Qwen/Qwen2.5-32B": "qwen2.5:32b",
+         "Qwen/Qwen2.5-72B": "qwen2.5:72b",
+         "Qwen/Qwen3-8B": "qwen3:8b",
+         "Qwen/Qwen3-32B": "qwen3:32b",
+         "Qwen/QwQ-32B": "qwq:32b",
+         "Qwen/Qwen2.5-Coder-7B": "qwen2.5-coder:7b",
+         "Qwen/Qwen2.5-Coder-32B": "qwen2.5-coder:32b",
+         "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B": "deepseek-r1:1.5b",
+         "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B": "deepseek-r1:7b",
+         "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B": "deepseek-r1:32b",
+         "deepseek-ai/DeepSeek-R1-Distill-Llama-8B": "deepseek-r1:8b",
+         "deepseek-ai/DeepSeek-R1-Distill-Llama-70B": "deepseek-r1:70b",
+         "mistralai/Mistral-7B-v0.3": "mistral:7b",
+         "google/gemma-2-2b": "gemma2:2b",
+         "google/gemma-2-9b": "gemma2:9b",
+         "google/gemma-2-27b": "gemma2:27b",
+         "google/gemma-3-1b-it": "gemma3:1b",
+         "google/gemma-3-4b-it": "gemma3:4b",
+         "google/gemma-3-12b-it": "gemma3:12b",
+         "google/gemma-3-27b-it": "gemma3:27b",
+         "microsoft/phi-4": "phi4:14b",
+     }
+     return mapping.get(model_id)
+
+
+ # ─────────────────────────────────────────────
+ # Model recommendation
+ # ─────────────────────────────────────────────
+
+ def recommend_models_for_hardware(
+     vram_gb: float,
+     use_case: str = "inference",
+     language: str = "zh",  # "zh" | "en" | "code"
+ ) -> list[dict]:
+     """
+     Recommend the models that best fit the available VRAM.
+
+     Args:
+         vram_gb: available GPU memory
+         use_case: "inference" / "lora" / "finetune"
+         language: "zh" (Chinese-first) / "en" (English) / "code" (coding)
+
+     Returns:
+         A list of recommendations, sorted by fit
+     """
+     recommendations = []
+
+     # Model families ordered by language preference
+     family_priority = {
+         "zh": ["qwen3", "qwen2.5", "qwen2.5-coder", "deepseek", "glm4", "llama3", "gemma3", "phi"],
+         "en": ["llama3", "gemma3", "mistral", "phi", "qwen2.5"],
+         "code": ["qwen2.5-coder", "deepseek", "llama3", "phi"],
+     }
+     preferred = family_priority.get(language, family_priority["en"])
+
+     for model_id, db_info in _MODEL_VRAM_DB.items():
+         params_b = db_info["params_b"]
+         family = db_info["family"]
+
+         est = estimate_vram(model_id, vram_gb, use_case)
+         if not est.can_run:
+             continue
+
+         # Scoring: a larger parameter count is better, plus a bonus for matching the language preference
+         score = params_b * 10
+         if family in preferred:
+             score += (len(preferred) - preferred.index(family)) * 50
+
+         recommendations.append({
+             "model_id": model_id,
+             "params_b": params_b,
+             "family": family,
+             "quant": est.recommended_method,
+             "vram_needed": est.vram_needed_gb,
+             "score": score,
+             "advice": est.advice,
+         })
+
+     recommendations.sort(key=lambda x: x["score"], reverse=True)
+     return recommendations[:10]
+
+
+ # ─────────────────────────────────────────────
+ # Formatted output
+ # ─────────────────────────────────────────────
+
+ def format_vram_estimate(est: VRAMEstimate) -> str:
+     """Format a VRAM estimation result."""
+     lines = [
+         f"🧠 Model VRAM assessment: {est.model_id}",
+         f"   Parameters: {est.params_b}B",
+         f"   Available VRAM: {est.available_vram_gb}GB",
+         "",
+         est.advice,
+         "",
+     ]
+
+     if est.options:
+         lines.append("   Quantization options:")
+         for opt in est.options:
+             mark = "✅" if opt["fits"] else "❌"
+             lines.append(f"   {mark} {opt['quant']:6s} → {opt['vram_gb']:6.1f}GB {opt['quality']}")
+
+     if est.install_commands:
+         lines.append("")
+         lines.append("   Install commands:")
+         for cmd in est.install_commands:
+             lines.append(f"   {cmd}")
+
+     return "\n".join(lines)
+
+
+ def format_model_recommendations(recs: list[dict], vram_gb: float) -> str:
+     """Format the model recommendation list."""
+     lines = [
+         f"🤖 Model recommendations for {vram_gb}GB of VRAM:",
+         "",
+     ]
+     for i, rec in enumerate(recs, 1):
+         lines.append(
+             f"  {i}. {rec['model_id']} ({rec['params_b']}B)"
+             f" → {rec['quant'].upper()} ({rec['vram_needed']:.1f}GB)"
+         )
+     return "\n".join(lines)
+
+
+ # ─────────────────────────────────────────────
+ # AI model deployment automation (Market Opportunity #1)
+ # ─────────────────────────────────────────────
+
+ @dataclass
+ class DeploymentPlan:
+     """Model deployment plan."""
+     model_id: str = ""
+     serving_engine: str = ""  # vllm | tgi | ollama | llama_cpp | triton
+     quantization: str = ""  # fp16 | int8 | int4 | awq | gptq | gguf
+     gpu_type: str = ""
+     gpu_count: int = 1
+     port: int = 8000
+     docker: bool = True
+     steps: list[str] = field(default_factory=list)
+     docker_compose: str = ""
+     env_vars: dict[str, str] = field(default_factory=dict)
+     estimated_vram_gb: float = 0.0
+     estimated_tps: float = 0.0  # tokens per second
+
+
+ _SERVING_ENGINES = {
+     "vllm": {
+         "name": "vLLM",
+         "docker_image": "vllm/vllm-openai:latest",
+         "default_port": 8000,
+         "api_compatible": "OpenAI",
+         "min_vram_gb": 16,
+         "supports": ["fp16", "awq", "gptq", "int8"],
+     },
+     "tgi": {
+         "name": "Text Generation Inference (TGI)",
+         "docker_image": "ghcr.io/huggingface/text-generation-inference:latest",
+         "default_port": 8080,
+         "api_compatible": "HuggingFace",
+         "min_vram_gb": 16,
+         "supports": ["fp16", "gptq", "awq", "int8", "int4"],
+     },
+     "ollama": {
+         "name": "Ollama",
+         "docker_image": "ollama/ollama:latest",
+         "default_port": 11434,
+         "api_compatible": "Ollama / OpenAI",
+         "min_vram_gb": 4,
+         "supports": ["gguf", "fp16"],
+     },
+     "llama_cpp": {
+         "name": "llama.cpp Server",
+         "docker_image": "",
+         "default_port": 8080,
+         "api_compatible": "OpenAI",
+         "min_vram_gb": 2,
+         "supports": ["gguf"],
+     },
+     "triton": {
+         "name": "NVIDIA Triton Inference Server",
+         "docker_image": "nvcr.io/nvidia/tritonserver:24.05-trtllm-python-py3",
+         "default_port": 8001,
+         "api_compatible": "gRPC / HTTP",
+         "min_vram_gb": 24,
+         "supports": ["fp16", "int8", "int4"],
+     },
+ }
+
+
+ def select_serving_engine(
+     model_id: str,
+     vram_gb: float,
+     use_case: str = "inference",
+     prefer_docker: bool = True,
+ ) -> str:
+     """Automatically pick the best serving engine for the model and hardware."""
+     params_b = _lookup_params(model_id)
+
+     # Small models → Ollama is simplest
+     if params_b <= 13 and vram_gb >= 8:
+         ollama_name = _model_to_ollama_name(model_id)
+         if ollama_name:
+             return "ollama"
+
+     # Lots of VRAM + a large model → vLLM (highest throughput)
+     if vram_gb >= 24 and params_b >= 7:
+         return "vllm"
+
+     # Mid-range setups → TGI
+     if vram_gb >= 16:
+         return "tgi"
+
+     # Low VRAM → llama.cpp
+     return "llama_cpp"
+
+
+ def _lookup_params(model_id: str) -> float:
+     """Look up the model parameter count in the local database."""
+     mid = model_id.lower()
+     for key, val in _MODEL_VRAM_DB.items():
+         if key.lower() in mid:
+             return val.get("params_b", 7.0)
+     return 7.0  # default assumption: 7B
+
+
+ def generate_deployment_plan(
+     model_id: str,
+     vram_gb: float = 24.0,
+     gpu_count: int = 1,
+     engine: str = "",
+     port: int = 0,
+     quantization: str = "",
+ ) -> DeploymentPlan:
+     """
+     Generate a complete model deployment plan.
+
+     Includes deployment steps, a Docker Compose config, environment variables, etc.
+     """
+     if not engine:
+         engine = select_serving_engine(model_id, vram_gb)
+
+     engine_info = _SERVING_ENGINES.get(engine, _SERVING_ENGINES["vllm"])
+     if not port:
+         port = engine_info["default_port"]
+
+     params_b = _lookup_params(model_id)
+
+     # Automatically pick a quantization method
+     if not quantization:
+         if engine == "ollama" or engine == "llama_cpp":
+             quantization = "gguf"
+         elif params_b * 2 > vram_gb * gpu_count * 0.85:
+             quantization = "awq" if engine == "vllm" else "int8"
+         else:
+             quantization = "fp16"
+
+     # Estimate VRAM
+     bytes_per_param = {"fp16": 2, "int8": 1.1, "int4": 0.6, "awq": 0.6, "gptq": 0.6, "gguf": 0.6}
+     est_vram = params_b * bytes_per_param.get(quantization, 2) * 1.15  # 15% overhead
+
+     plan = DeploymentPlan(
+         model_id=model_id,
+         serving_engine=engine,
+         quantization=quantization,
+         gpu_type="",
+         gpu_count=gpu_count,
+         port=port,
+         docker=bool(engine_info["docker_image"]),
+         estimated_vram_gb=round(est_vram, 1),
+     )
+
+     # Generate deployment steps
+     steps = []
+     env_vars: dict[str, str] = {}
+
+     if engine == "vllm":
+         env_vars["HF_TOKEN"] = "${HF_TOKEN}"
+         steps.append("# Pull the vLLM Docker image")
+         steps.append(f"docker pull {engine_info['docker_image']}")
+         steps.append("# Start the vLLM inference server")
+         run_cmd = (
+             f"docker run --gpus all -p {port}:{port} "
+             f"-e HF_TOKEN=$HF_TOKEN "
+             f"-v ~/.cache/huggingface:/root/.cache/huggingface "
+             f"{engine_info['docker_image']} "
+             f"--model {model_id} "
+             f"--port {port} "
+             f"--tensor-parallel-size {gpu_count}"
+         )
+         if quantization in ("awq", "gptq"):
+             run_cmd += f" --quantization {quantization}"
+         steps.append(run_cmd)
+         steps.append("# Test the API")
+         steps.append(
+             f'curl http://localhost:{port}/v1/chat/completions '
+             f'-H "Content-Type: application/json" '
+             f'-d \'{{"model":"{model_id}","messages":[{{"role":"user","content":"Hello"}}]}}\''
+         )
+
+     elif engine == "tgi":
+         env_vars["HF_TOKEN"] = "${HF_TOKEN}"
+         steps.append(f"docker pull {engine_info['docker_image']}")
+         run_cmd = (
+             f"docker run --gpus all -p {port}:{port} "
+             f"-e HF_TOKEN=$HF_TOKEN "
+             f"-v ~/.cache/huggingface:/data "
+             f"{engine_info['docker_image']} "
+             f"--model-id {model_id} "
+             f"--port {port} "
+             f"--num-shard {gpu_count}"
+         )
+         if quantization in ("gptq", "awq"):
+             run_cmd += f" --quantize {quantization}"
+         steps.append(run_cmd)
+         steps.append(
+             f'curl http://localhost:{port}/generate '
+             f'-H "Content-Type: application/json" '
+             f'-d \'{{"inputs":"Hello","parameters":{{"max_new_tokens":50}}}}\''
+         )
+
+     elif engine == "ollama":
+         ollama_name = _model_to_ollama_name(model_id) or model_id
+         steps.append("# Install Ollama")
+         steps.append("curl -fsSL https://ollama.com/install.sh | sh")
+         steps.append("# Pull the model")
+         steps.append(f"ollama pull {ollama_name}")
+         steps.append("# Start chatting")
+         steps.append(f"ollama run {ollama_name}")
+
+     elif engine == "llama_cpp":
+         steps.append("# Install llama-cpp-python")
+         steps.append("pip install llama-cpp-python[server]")
+         steps.append("# Download a GGUF model file from HuggingFace")
+         steps.append("# Start the server")
+         steps.append(
+             f"python -m llama_cpp.server --model ./model.gguf "
+             f"--host 0.0.0.0 --port {port} --n_gpu_layers -1"
+         )
+
+     elif engine == "triton":
+         steps.append(f"docker pull {engine_info['docker_image']}")
+         steps.append("# Prepare the model repository (models must be converted to TensorRT-LLM format)")
+         steps.append("# Start Triton")
+         steps.append(
+             f"docker run --gpus all -p {port}:{port} -p 8002:8002 "
+             f"-v ./model_repository:/models "
+             f"{engine_info['docker_image']} tritonserver --model-repository=/models"
+         )
+
+     plan.steps = steps
+     plan.env_vars = env_vars
+
+     # Generate docker-compose
+     if plan.docker and engine_info["docker_image"]:
+         plan.docker_compose = _generate_docker_compose(plan, engine_info)
+
+     return plan
+
+
+ def _generate_docker_compose(plan: DeploymentPlan, engine_info: dict) -> str:
+     """Generate a docker-compose.yml."""
+     lines = [
+         "version: '3.8'",
+         "services:",
+         f"  {plan.serving_engine}:",
+         f"    image: {engine_info['docker_image']}",
+         "    ports:",
+         f"      - '{plan.port}:{plan.port}'",
+         "    environment:",
+         "      - HF_TOKEN=${HF_TOKEN}",
+         "    volumes:",
+         "      - ~/.cache/huggingface:/root/.cache/huggingface",
+         "    deploy:",
+         "      resources:",
+         "        reservations:",
+         "          devices:",
+         "            - driver: nvidia",
+         f"              count: {plan.gpu_count}",
+         "              capabilities: [gpu]",
+     ]
+
+     if plan.serving_engine == "vllm":
+         cmd = f"--model {plan.model_id} --port {plan.port} --tensor-parallel-size {plan.gpu_count}"
+         if plan.quantization in ("awq", "gptq"):
+             cmd += f" --quantization {plan.quantization}"
+         lines.append(f"    command: {cmd}")
+     elif plan.serving_engine == "tgi":
+         cmd = f"--model-id {plan.model_id} --port {plan.port} --num-shard {plan.gpu_count}"
+         if plan.quantization in ("gptq", "awq"):
+             cmd += f" --quantize {plan.quantization}"
+         lines.append(f"    command: {cmd}")
+
+     return "\n".join(lines)
+
+
+ def format_deployment_plan(plan: DeploymentPlan) -> str:
+     """Format a deployment plan as readable output."""
+     engine_info = _SERVING_ENGINES.get(plan.serving_engine, {})
+     lines = [
+         "🚀 AI model deployment plan",
+         f"   Model: {plan.model_id}",
+         f"   Engine: {engine_info.get('name', plan.serving_engine)}",
+         f"   Quantization: {plan.quantization.upper()}",
+         f"   GPU: ×{plan.gpu_count} (estimated VRAM: {plan.estimated_vram_gb}GB)",
+         f"   Port: {plan.port}",
+         f"   API: {engine_info.get('api_compatible', 'REST')}-compatible",
+         "",
+         "📋 Deployment steps:",
+     ]
+     for i, step in enumerate(plan.steps, 1):
+         if step.startswith("#"):
+             lines.append(f"   {step}")
+         else:
+             lines.append(f"   $ {step}")
+
+     if plan.docker_compose:
+         lines.extend(["", "📦 docker-compose.yml:", plan.docker_compose])
+
+     return "\n".join(lines)
+
+
+ def generate_model_api_client(model_id: str, engine: str = "vllm", port: int = 0) -> str:
+     """Generate Python API client code."""
+     engine_info = _SERVING_ENGINES.get(engine, _SERVING_ENGINES["vllm"])
+     if not port:
+         port = engine_info["default_port"]
+
+     if engine in ("vllm", "ollama"):
+         return f'''import urllib.request, json
+
+ def chat(message: str, model: str = "{model_id}") -> str:
+     """Call the locally deployed {engine_info.get("name", engine)} model."""
+     data = json.dumps({{
+         "model": model,
+         "messages": [{{"role": "user", "content": message}}],
+         "temperature": 0.7,
+     }}).encode()
+     req = urllib.request.Request(
+         "http://localhost:{port}/v1/chat/completions",
+         data=data,
+         headers={{"Content-Type": "application/json"}},
+     )
+     with urllib.request.urlopen(req) as resp:
+         result = json.loads(resp.read())
+     return result["choices"][0]["message"]["content"]
+
+ # Usage example
+ # print(chat("Hello, please introduce yourself"))
+ '''
+     elif engine == "tgi":
+         return f'''import urllib.request, json
+
+ def generate(prompt: str) -> str:
+     """Call the locally deployed TGI model."""
+     data = json.dumps({{
+         "inputs": prompt,
+         "parameters": {{"max_new_tokens": 512, "temperature": 0.7}},
+     }}).encode()
+     req = urllib.request.Request(
+         "http://localhost:{port}/generate",
+         data=data,
+         headers={{"Content-Type": "application/json"}},
+     )
+     with urllib.request.urlopen(req) as resp:
+         result = json.loads(resp.read())
+     return result["generated_text"]
+
+ # Usage example
+ # print(generate("Hello"))
+ '''
+     return "# Client generation is not yet supported for this engine"