gitinstall 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. gitinstall/__init__.py +61 -0
  2. gitinstall/_sdk.py +541 -0
  3. gitinstall/academic.py +831 -0
  4. gitinstall/admin.html +327 -0
  5. gitinstall/auto_update.py +384 -0
  6. gitinstall/autopilot.py +349 -0
  7. gitinstall/badge.py +476 -0
  8. gitinstall/checkpoint.py +330 -0
  9. gitinstall/cicd.py +499 -0
  10. gitinstall/clawhub.html +718 -0
  11. gitinstall/config_schema.py +353 -0
  12. gitinstall/db.py +984 -0
  13. gitinstall/db_backend.py +445 -0
  14. gitinstall/dep_chain.py +337 -0
  15. gitinstall/dependency_audit.py +1153 -0
  16. gitinstall/detector.py +542 -0
  17. gitinstall/doctor.py +493 -0
  18. gitinstall/education.py +869 -0
  19. gitinstall/enterprise.py +802 -0
  20. gitinstall/error_fixer.py +953 -0
  21. gitinstall/event_bus.py +251 -0
  22. gitinstall/executor.py +577 -0
  23. gitinstall/feature_flags.py +138 -0
  24. gitinstall/fetcher.py +921 -0
  25. gitinstall/huggingface.py +922 -0
  26. gitinstall/hw_detect.py +988 -0
  27. gitinstall/i18n.py +664 -0
  28. gitinstall/installer_registry.py +362 -0
  29. gitinstall/knowledge_base.py +379 -0
  30. gitinstall/license_check.py +605 -0
  31. gitinstall/llm.py +569 -0
  32. gitinstall/log.py +236 -0
  33. gitinstall/main.py +1408 -0
  34. gitinstall/mcp_agent.py +841 -0
  35. gitinstall/mcp_server.py +386 -0
  36. gitinstall/monorepo.py +810 -0
  37. gitinstall/multi_source.py +425 -0
  38. gitinstall/onboard.py +276 -0
  39. gitinstall/planner.py +222 -0
  40. gitinstall/planner_helpers.py +323 -0
  41. gitinstall/planner_known_projects.py +1010 -0
  42. gitinstall/planner_templates.py +996 -0
  43. gitinstall/remote_gpu.py +633 -0
  44. gitinstall/resilience.py +608 -0
  45. gitinstall/run_tests.py +572 -0
  46. gitinstall/skills.py +476 -0
  47. gitinstall/tool_schemas.py +324 -0
  48. gitinstall/trending.py +279 -0
  49. gitinstall/uninstaller.py +415 -0
  50. gitinstall/validate_top100.py +607 -0
  51. gitinstall/watchdog.py +180 -0
  52. gitinstall/web.py +1277 -0
  53. gitinstall/web_ui.html +2277 -0
  54. gitinstall-1.1.0.dist-info/METADATA +275 -0
  55. gitinstall-1.1.0.dist-info/RECORD +59 -0
  56. gitinstall-1.1.0.dist-info/WHEEL +5 -0
  57. gitinstall-1.1.0.dist-info/entry_points.txt +3 -0
  58. gitinstall-1.1.0.dist-info/licenses/LICENSE +21 -0
  59. gitinstall-1.1.0.dist-info/top_level.txt +1 -0
gitinstall/academic.py ADDED
@@ -0,0 +1,831 @@
1
+ """
2
+ academic.py — 学术论文代码复现引擎
3
+ ====================================
4
+
5
+ 目标市场:学术论文代码复现(15万篇/年,★★★★☆)
6
+
7
+ 功能:
8
+ 1. arXiv / Semantic Scholar / Papers With Code 论文元数据提取
9
+ 2. 论文→代码仓库自动关联
10
+ 3. 可复现性评分(Reproducibility Score)
11
+ 4. 环境快照 & 恢复(冻结 Python/CUDA/库版本)
12
+ 5. 实验追踪(参数、指标、结果对比)
13
+ 6. BibTeX / 引用管理
14
+
15
+ 零外部依赖,纯 Python 标准库。
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import hashlib
21
+ import json
22
+ import os
23
+ import re
24
+ import time
25
+ import urllib.error
26
+ import urllib.request
27
+ from dataclasses import dataclass, field
28
+ from datetime import datetime, timezone
29
+ from pathlib import Path
30
+ from typing import Any, Optional
31
+
32
+
33
+ # ─────────────────────────────────────────────
34
+ # 数据结构
35
+ # ─────────────────────────────────────────────
36
+
37
@dataclass
class PaperInfo:
    """Metadata describing one paper.

    Every field defaults to an empty value, so an instance can be built with
    only the pieces of information that are actually known.
    """

    # --- identity -------------------------------------------------------
    paper_id: str = ""  # arXiv ID or DOI
    title: str = ""
    authors: list[str] = field(default_factory=list)

    # --- content --------------------------------------------------------
    abstract: str = ""
    published: str = ""  # ISO date
    categories: list[str] = field(default_factory=list)

    # --- links ----------------------------------------------------------
    pdf_url: str = ""
    code_urls: list[str] = field(default_factory=list)  # GitHub links
    bibtex: str = ""
    source: str = ""  # "arxiv" | "semantic_scholar" | "pwc"
50
+
51
+
52
@dataclass
class ReproducibilityScore:
    """Result of a reproducibility assessment.

    ``total`` is the overall score; each ``has_*`` flag records whether one
    individual criterion was satisfied.
    """

    total: float = 0.0  # 0-100

    # --- individual criteria --------------------------------------------
    has_code: bool = False
    has_requirements: bool = False
    has_dockerfile: bool = False
    has_pretrained: bool = False
    has_data_script: bool = False
    has_readme_instructions: bool = False
    has_config_files: bool = False
    has_tests: bool = False
    has_ci: bool = False

    pinned_deps: float = 0.0  # ratio in the range 0-1
    details: dict[str, Any] = field(default_factory=dict)
67
+
68
+
69
@dataclass
class EnvironmentSnapshot:
    """A recorded description of a software/hardware environment.

    All fields default to empty values.
    """

    # --- bookkeeping ----------------------------------------------------
    snapshot_id: str = ""
    paper_id: str = ""
    created_at: str = ""

    # --- captured environment -------------------------------------------
    python_version: str = ""
    cuda_version: str = ""
    os_info: str = ""
    pip_freeze: list[str] = field(default_factory=list)
    env_vars: dict[str, str] = field(default_factory=dict)
    gpu_info: str = ""

    notes: str = ""
82
+
83
+
84
@dataclass
class ExperimentRun:
    """Record of a single experiment run.

    All fields default to empty values, except ``status`` which starts as
    ``"pending"``.
    """

    # --- identity -------------------------------------------------------
    run_id: str = ""
    paper_id: str = ""
    timestamp: str = ""

    # --- what was run ---------------------------------------------------
    command: str = ""
    params: dict[str, Any] = field(default_factory=dict)

    # --- outcome --------------------------------------------------------
    metrics: dict[str, float] = field(default_factory=dict)
    duration_sec: float = 0.0
    status: str = "pending"  # pending | running | success | failed
    output_path: str = ""

    snapshot_id: str = ""
    notes: str = ""
98
+
99
+
100
+ # ─────────────────────────────────────────────
101
+ # arXiv API
102
+ # ─────────────────────────────────────────────
103
+
104
# Bare new-style identifier (YYMM.NNNN or YYMM.NNNNN) with an optional
# version suffix; only the identifier itself is captured in group 1.
_ARXIV_ID_PATTERN = re.compile(r'(\d{4}\.\d{4,5})(v\d+)?')
# Identifier embedded in an arxiv.org abstract or PDF URL.
_ARXIV_URL_PATTERN = re.compile(r'arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{4,5})')


def parse_arxiv_id(text: str) -> str:
    """Extract an arXiv identifier from free-form text.

    An identifier inside an arxiv.org URL takes precedence over a bare one.
    Any version suffix (``v2``) is dropped.

    Args:
        text: A URL, a bare identifier, or any text containing one.

    Returns:
        The identifier without version suffix, or ``""`` when none is found.
    """
    for pattern in (_ARXIV_URL_PATTERN, _ARXIV_ID_PATTERN):
        found = pattern.search(text)
        if found is not None:
            return found.group(1)
    return ""
117
+
118
+
119
def fetch_arxiv_paper(arxiv_id: str, timeout: int = 15) -> PaperInfo:
    """Fetch paper metadata from the arXiv export API.

    >>> info = fetch_arxiv_paper("2301.13688")

    Args:
        arxiv_id: An arXiv identifier, or any text/URL containing one. If no
            identifier can be parsed the raw value is sent to the API as-is.
        timeout: Network timeout in seconds.

    Returns:
        The parsed ``PaperInfo``. On any network, URL, or decoding failure a
        ``PaperInfo`` carrying only ``paper_id`` and ``source`` is returned;
        this function does not raise for those failures.
    """
    import http.client
    import urllib.parse

    arxiv_id = parse_arxiv_id(arxiv_id) or arxiv_id
    # Percent-encode the identifier: when parsing failed the raw caller input
    # is used, and a space or control character would make urlopen raise
    # http.client.InvalidURL. "/" and "," stay literal because they are
    # meaningful in an id_list value.
    encoded_id = urllib.parse.quote(arxiv_id, safe="/,")
    url = f"https://export.arxiv.org/api/query?id_list={encoded_id}"

    try:
        req = urllib.request.Request(url, headers={"User-Agent": "gitinstall/1.1"})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            xml_text = resp.read().decode("utf-8")
    except (urllib.error.URLError, OSError, http.client.HTTPException, ValueError):
        # ValueError covers UnicodeDecodeError from a non-UTF-8 body.
        return PaperInfo(paper_id=arxiv_id, source="arxiv")

    return _parse_arxiv_xml(xml_text, arxiv_id)
136
+
137
+
138
def _parse_arxiv_xml(xml_text: str, arxiv_id: str) -> PaperInfo:
    """Parse an arXiv Atom response using regular expressions only.

    Only the first ``<entry>`` element is considered.

    Args:
        xml_text: Raw Atom XML returned by the arXiv API.
        arxiv_id: Identifier stored on the result and used for the fallback
            PDF URL and the BibTeX entry.

    Returns:
        A populated ``PaperInfo``; if the response contains no ``<entry>``
        only ``paper_id`` and ``source`` are set.
    """
    import html

    def _extract(tag: str, text: str) -> str:
        """Return the stripped inner text of the first ``<tag>`` in ``text``."""
        m = re.search(rf'<{tag}[^>]*>(.*?)</{tag}>', text, re.DOTALL)
        return m.group(1).strip() if m else ""

    def _clean(raw: str) -> str:
        """Decode XML entities (``&amp;`` etc.) and collapse whitespace."""
        return re.sub(r'\s+', ' ', html.unescape(raw)).strip()

    # Take the first entry only.
    entry_m = re.search(r'<entry>(.*?)</entry>', xml_text, re.DOTALL)
    if not entry_m:
        return PaperInfo(paper_id=arxiv_id, source="arxiv")
    entry = entry_m.group(1)

    title = _clean(_extract("title", entry))
    abstract = _clean(_extract("summary", entry))

    authors = []
    for author_block in re.findall(r'<author>(.*?)</author>', entry, re.DOTALL):
        name = _clean(_extract("name", author_block))
        if name:
            authors.append(name)

    published = _extract("published", entry)[:10]  # YYYY-MM-DD

    categories = re.findall(r'<category[^>]*term="([^"]+)"', entry)

    # PDF link: the <link> element marked title="pdf" (the last one wins).
    pdf_url = ""
    for link in re.findall(r'<link[^>]*>', entry):
        if 'title="pdf"' in link:
            m2 = re.search(r'href="([^"]+)"', link)
            if m2:
                pdf_url = html.unescape(m2.group(1))
    if not pdf_url:
        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}"

    # GitHub links mentioned in the abstract. The character class admits ".",
    # so a URL at the end of a sentence captures the full stop; strip it.
    # dict.fromkeys de-duplicates while preserving first-seen order.
    raw_urls = re.findall(r'https?://github\.com/[\w.-]+/[\w.-]+', abstract)
    code_urls = list(dict.fromkeys(u.rstrip(".") for u in raw_urls))

    # Build a BibTeX entry keyed on the first author's last name and the year.
    first_author = authors[0].split()[-1] if authors else "unknown"
    # NOTE(review): "2024" is a hard-coded fallback for a missing date.
    year = published[:4] if published else "2024"
    bibtex = (
        f"@article{{{first_author}{year}arxiv,\n"
        f" title={{{title}}},\n"
        f" author={{{' and '.join(authors)}}},\n"
        f" journal={{arXiv preprint arXiv:{arxiv_id}}},\n"
        f" year={{{year}}}\n"
        f"}}"
    )

    return PaperInfo(
        paper_id=arxiv_id,
        title=title,
        authors=authors,
        abstract=abstract,
        published=published,
        categories=categories,
        pdf_url=pdf_url,
        code_urls=code_urls,
        bibtex=bibtex,
        source="arxiv",
    )
208
+
209
+
210
+ # ─────────────────────────────────────────────
211
+ # Papers With Code 关联
212
+ # ─────────────────────────────────────────────
213
+
214
def search_papers_with_code(arxiv_id: str, timeout: int = 10) -> list[str]:
    """Look up code repositories linked to a paper via the Papers With Code API.

    Two requests are made: one to resolve the paper by arXiv ID, and one to
    list the repositories of the first matching paper.

    Args:
        arxiv_id: arXiv identifier of the paper.
        timeout: Per-request network timeout in seconds.

    Returns:
        A list of GitHub repository URLs. The list is empty on any network
        error, malformed response, or when nothing is found.
    """
    import http.client
    import urllib.parse

    def _get_results(url: str) -> list:
        """GET ``url`` and return its JSON ``results`` list, or ``[]`` on failure."""
        try:
            req = urllib.request.Request(url, headers={"User-Agent": "gitinstall/1.1"})
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                payload = json.loads(resp.read().decode("utf-8"))
        except (urllib.error.URLError, OSError, http.client.HTTPException, ValueError):
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            return []
        results = payload.get("results") if isinstance(payload, dict) else None
        return results if isinstance(results, list) else []

    base = "https://paperswithcode.com/api/v1/papers/"

    papers = _get_results(f"{base}?arxiv_id={urllib.parse.quote(arxiv_id, safe='')}")
    if not papers or not isinstance(papers[0], dict):
        return []

    paper_id = papers[0].get("id", "")
    if not paper_id:
        return []

    # Query the repositories attached to that paper.
    repos = []
    repo_url = f"{base}{urllib.parse.quote(str(paper_id), safe='')}/repositories/"
    for item in _get_results(repo_url):
        gh_url = item.get("url") if isinstance(item, dict) else None
        if not isinstance(gh_url, str):
            continue
        # Compare the host rather than searching for a substring, so a URL on
        # another domain that merely contains "github.com" is not accepted.
        host = (urllib.parse.urlsplit(gh_url).hostname or "").lower()
        if host in ("github.com", "www.github.com"):
            repos.append(gh_url)

    return repos
252
+
253
+
254
+ # ─────────────────────────────────────────────
255
+ # 可复现性评分
256
+ # ─────────────────────────────────────────────
257
+
258
def score_reproducibility(project_dir: str) -> ReproducibilityScore:
    """Score how reproducible the code in ``project_dir`` is.

    Scoring dimensions (100 points in total):
        - code is present: 15
        - requirements / dependency declaration: 15
        - Dockerfile (or compose / devcontainer file): 10
        - pretrained model links / weights: 10
        - data download script: 10
        - README has install/run instructions: 10
        - config files (yaml/json/toml/...): 5
        - tests: 10
        - CI configuration: 5
        - share of pinned dependency versions: up to 10

    Args:
        project_dir: Path of the project checkout to inspect.

    Returns:
        A ``ReproducibilityScore``; an all-default instance (total 0) when
        ``project_dir`` is not a directory.
    """
    root = Path(project_dir)
    if not root.is_dir():
        return ReproducibilityScore()

    score = ReproducibilityScore(has_code=True)
    total = 15.0  # having code at all

    def _any_exists(names) -> bool:
        """True if any of the relative paths exists under the project root."""
        return any((root / name).exists() for name in names)

    def _any_match(patterns, text: str) -> bool:
        """True if any pattern matches ``text`` (case-insensitive)."""
        return any(re.search(pat, text, re.IGNORECASE) for pat in patterns)

    def _has_glob(pattern: str) -> bool:
        """True if the glob pattern matches at least one path under the root."""
        return next(iter(root.glob(pattern)), None) is not None

    # Dependency declaration
    if _any_exists((
        "requirements.txt", "setup.py", "pyproject.toml", "setup.cfg",
        "environment.yml", "Pipfile", "package.json", "Cargo.toml", "go.mod",
    )):
        score.has_requirements = True
        total += 15

    # Dockerfile
    if _any_exists((
        "Dockerfile", "docker-compose.yml", "docker-compose.yaml",
        ".devcontainer/devcontainer.json",
    )):
        score.has_dockerfile = True
        total += 10

    # README: only the first README variant that exists is read.
    readme_text = ""
    for rname in ("README.md", "README.rst", "README.txt", "README"):
        rpath = root / rname
        if rpath.exists():
            try:
                readme_text = rpath.read_text(encoding="utf-8", errors="ignore")[:50000]
            except OSError:
                pass
            break

    # README plus Python sources, capped at roughly 500k characters. Parts
    # are collected in a list and joined once instead of repeated "+=".
    parts = [readme_text]
    collected = len(readme_text)
    for pyf in root.glob("**/*.py"):
        try:
            # stat() is inside the try: a broken symlink or a file removed
            # mid-scan must not abort the whole scoring run.
            if pyf.stat().st_size < 100000:
                source = pyf.read_text(encoding="utf-8", errors="ignore")
                parts.append(source)
                collected += len(source) + 1  # +1 for the joining newline
        except OSError:
            continue
        if collected > 500000:
            break
    all_text = "\n".join(parts)

    # Pretrained models
    if _any_match((
        r'huggingface\.co/', r'drive\.google\.com', r'\.ckpt', r'\.safetensors',
        r'pretrained', r'model\.pth', r'weights/', r'checkpoint',
        r'from_pretrained', r'load_state_dict',
    ), all_text):
        score.has_pretrained = True
        total += 10

    # Data download script: a well-known file name, else a textual hint.
    if _any_exists((
        "download_data.sh", "prepare_data.py", "data/download.sh", "scripts/download.sh",
    )) or _any_match((
        r'download.*data', r'prepare.*dataset', r'fetch.*data',
        r'wget\s+.*\.tar', r'curl\s+.*\.zip',
    ), all_text):
        score.has_data_script = True
        total += 10

    # README install instructions
    if _any_match((
        r'##.*install', r'##.*setup', r'##.*getting.started',
        r'##.*usage', r'##.*quick.start', r'pip install',
        r'conda install', r'npm install',
    ), readme_text):
        score.has_readme_instructions = True
        total += 10

    # Config files: a config directory, else a config-like file in the root.
    if any((root / cd).is_dir() for cd in ("configs", "config", "conf")):
        score.has_config_files = True
    else:
        # Package manifests share these extensions but are not configs.
        not_config = {
            "package.json", "package-lock.json", "tsconfig.json",
            "pyproject.toml", "Cargo.toml",
        }
        for pat in ("*.yaml", "*.yml", "*.toml", "*.json", "*.cfg", "*.ini"):
            if any(m.name not in not_config for m in root.glob(pat)):
                score.has_config_files = True
                break
    if score.has_config_files:
        total += 5

    # Tests
    if any(_has_glob(pat) for pat in (
        "tests/", "test/", "test_*.py", "*_test.py", "*_test.go", "*.test.js", "*.test.ts",
    )):
        score.has_tests = True
        total += 10

    # CI
    if _any_exists((
        ".github/workflows", ".gitlab-ci.yml", "Jenkinsfile",
        ".circleci", ".travis.yml", "azure-pipelines.yml",
    )):
        score.has_ci = True
        total += 5

    # Share of pinned dependency versions
    pinned = _check_pinned_deps(root)
    score.pinned_deps = pinned
    total += pinned * 10

    score.total = min(total, 100.0)
    return score
408
+
409
+
410
def _check_pinned_deps(root: Path) -> float:
    """Return the share of ``requirements.txt`` entries pinned with ``==``.

    Blank lines, comment lines, and option lines (starting with ``-``) are
    not counted as dependencies.

    Args:
        root: Project root directory expected to contain ``requirements.txt``.

    Returns:
        A ratio in ``[0, 1]``; ``0.0`` when the file is missing, unreadable,
        or lists no dependencies.
    """
    try:
        content = (root / "requirements.txt").read_text(encoding="utf-8", errors="ignore")
    except OSError:
        # Covers a missing file as well as an unreadable one.
        return 0.0

    total_deps = 0
    pinned_deps = 0
    for raw in content.splitlines():
        line = raw.strip()
        if not line or line.startswith(("#", "-")):
            continue
        total_deps += 1
        # Look only at the requirement specifier. A trailing comment or an
        # environment marker (after ";") may itself contain "==", e.g.
        # `requests>=2.0; python_version == "3.8"`, which is not a pin.
        spec = re.sub(r'\s#.*$', '', line).split(";", 1)[0]
        if "==" in spec:
            pinned_deps += 1

    return pinned_deps / total_deps if total_deps else 0.0
432
+
433
+
434
def format_reproducibility_score(score: ReproducibilityScore) -> str:
    """Render a reproducibility score as a multi-line text report.

    The report has a header line with the total and a letter grade, one
    check-mark line per criterion, and a final line with the pinned ratio.

    Args:
        score: The score to render.

    Returns:
        The report as a newline-joined string.
    """
    # Grade bands, highest first; anything below the last threshold is "F".
    grade = "F (极差)"
    for threshold, label in (
        (80, "A (优秀)"),
        (60, "B (良好)"),
        (40, "C (一般)"),
        (20, "D (较差)"),
    ):
        if score.total >= threshold:
            grade = label
            break

    criteria = (
        ("📦 代码仓库", score.has_code),
        ("📋 依赖声明", score.has_requirements),
        ("🐳 Dockerfile", score.has_dockerfile),
        ("🧠 预训练权重", score.has_pretrained),
        ("📊 数据下载", score.has_data_script),
        ("📖 安装说明", score.has_readme_instructions),
        ("⚙️ 配置文件", score.has_config_files),
        ("🧪 测试用例", score.has_tests),
        ("🔄 CI 配置", score.has_ci),
    )

    report = [f"📊 可复现性评分: {score.total:.0f}/100 [{grade}]", ""]
    report += [f" {label}: {'✅' if passed else '❌'}" for label, passed in criteria]
    report.append(f" 📌 依赖固定率: {score.pinned_deps:.0%}")
    return "\n".join(report)
468
+
469
+
470
+ # ─────────────────────────────────────────────
471
+ # 环境快照
472
+ # ─────────────────────────────────────────────
473
+
474
_SNAPSHOT_DIR = os.path.expanduser("~/.gitinstall/snapshots")


def create_snapshot(paper_id: str = "", notes: str = "") -> EnvironmentSnapshot:
    """Capture the current environment and persist it as a snapshot.

    Records the Python version, CUDA version, OS description, the output of
    ``pip freeze --local``, GPU information, and a filtered subset of the
    environment variables. Every probe is best-effort: a missing or failing
    tool leaves the corresponding field empty.

    Args:
        paper_id: Identifier of the paper this environment belongs to.
        notes: Free-form notes stored with the snapshot.

    Returns:
        The snapshot, after it has been written via ``_save_snapshot``.
    """
    import platform
    import subprocess
    import sys

    def _run(cmd: list, limit: int):
        """Run ``cmd``; return the CompletedProcess, or None if it could not run."""
        try:
            return subprocess.run(cmd, capture_output=True, text=True, timeout=limit)
        except (OSError, subprocess.SubprocessError, ValueError):
            # OSError: tool missing or not executable; SubprocessError:
            # timeout; ValueError: output could not be decoded as text.
            return None

    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    # `now` has one-second resolution, so a nanosecond counter is mixed in to
    # keep two snapshots taken within the same second from sharing an ID
    # (and therefore a file).
    snap_id = hashlib.sha256(
        f"{now}-{paper_id}-{time.time_ns()}".encode()
    ).hexdigest()[:12]

    # pip freeze
    pip_freeze = []
    proc = _run([sys.executable, "-m", "pip", "freeze", "--local"], 30)
    if proc is not None and proc.returncode == 0:
        pip_freeze = [ln.strip() for ln in proc.stdout.splitlines() if ln.strip()]

    # CUDA version: ask nvcc first, then fall back to the CUDA_VERSION
    # variable whenever nvcc gave no answer (missing, failed, or output
    # without a release string).
    cuda_version = ""
    proc = _run(["nvcc", "--version"], 5)
    if proc is not None:
        m = re.search(r'release (\d+\.\d+)', proc.stdout)
        if m:
            cuda_version = m.group(1)
    if not cuda_version:
        cuda_version = os.environ.get("CUDA_VERSION", "")

    # GPU info
    gpu_info = ""
    proc = _run(
        ["nvidia-smi", "--query-gpu=name,memory.total", "--format=csv,noheader"], 5
    )
    if proc is not None and proc.returncode == 0:
        gpu_info = proc.stdout.strip()
    elif proc is None and platform.machine() == "arm64" and platform.system() == "Darwin":
        # macOS on Apple Silicon: report the CPU brand string instead.
        proc = _run(["sysctl", "-n", "machdep.cpu.brand_string"], 5)
        if proc is not None:
            gpu_info = f"Apple Silicon ({proc.stdout.strip()})"

    # Keep only development-related environment variables, and never record
    # one whose name suggests it holds a credential.
    safe_env_prefixes = (
        "PYTHON", "CUDA", "CONDA", "VIRTUAL_ENV", "PATH",
        "LD_LIBRARY", "DYLD_", "CC", "CXX", "CMAKE",
    )
    secret_markers = ("KEY", "TOKEN", "SECRET", "PASSWORD")
    env_vars = {
        k: v
        for k, v in os.environ.items()
        if k.startswith(safe_env_prefixes)
        and not any(marker in k.upper() for marker in secret_markers)
    }

    snap = EnvironmentSnapshot(
        snapshot_id=snap_id,
        paper_id=paper_id,
        created_at=now,
        python_version=f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}",
        cuda_version=cuda_version,
        os_info=f"{platform.system()} {platform.release()} ({platform.machine()})",
        pip_freeze=pip_freeze,
        env_vars=env_vars,
        gpu_info=gpu_info,
        notes=notes,
    )

    # Persist to disk
    _save_snapshot(snap)
    return snap
563
+
564
+
565
def _save_snapshot(snap: EnvironmentSnapshot) -> str:
    """Write ``snap`` as ``<snapshot_id>.json`` inside the snapshot directory.

    The directory is created if it does not exist yet.

    Args:
        snap: The snapshot to persist.

    Returns:
        The path of the file that was written.
    """
    os.makedirs(_SNAPSHOT_DIR, exist_ok=True)
    target = os.path.join(_SNAPSHOT_DIR, f"{snap.snapshot_id}.json")

    # Serialized fields, in the order they appear in the file.
    field_names = (
        "snapshot_id",
        "paper_id",
        "created_at",
        "python_version",
        "cuda_version",
        "os_info",
        "pip_freeze",
        "env_vars",
        "gpu_info",
        "notes",
    )
    payload = {name: getattr(snap, name) for name in field_names}

    with open(target, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False)
    return target
584
+
585
+
586
def load_snapshot(snapshot_id: str) -> EnvironmentSnapshot:
    """Load a snapshot from the snapshot directory.

    Args:
        snapshot_id: ID of the snapshot (the file name without ``.json``).

    Returns:
        The stored snapshot. An all-default ``EnvironmentSnapshot`` (empty
        ``snapshot_id``) is returned when the ID is invalid or the file is
        missing, unreadable, or not a JSON object.
    """
    from dataclasses import fields

    # An ID containing a path separator would address a file outside the
    # snapshot directory; treat it as "not found".
    if not snapshot_id or os.path.basename(snapshot_id) != snapshot_id:
        return EnvironmentSnapshot()

    path = os.path.join(_SNAPSHOT_DIR, f"{snapshot_id}.json")
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # Missing/unreadable file, or corrupt JSON (JSONDecodeError is a
        # ValueError).
        return EnvironmentSnapshot()

    if not isinstance(data, dict):
        return EnvironmentSnapshot()

    # Ignore unknown keys so a file with extra fields still loads instead of
    # raising TypeError.
    known = {f.name for f in fields(EnvironmentSnapshot)}
    return EnvironmentSnapshot(**{k: v for k, v in data.items() if k in known})
594
+
595
+
596
def list_snapshots() -> list[dict]:
    """List a short summary of every snapshot in the snapshot directory.

    Files are visited in sorted file-name order. Files that cannot be read,
    are not valid JSON, or do not contain a JSON object are skipped.

    Returns:
        One dict per snapshot with the keys ``snapshot_id``, ``paper_id``,
        ``created_at`` and ``python_version`` (``""`` when a key is absent).
        Empty when the snapshot directory does not exist.
    """
    if not os.path.isdir(_SNAPSHOT_DIR):
        return []

    summary_keys = ("snapshot_id", "paper_id", "created_at", "python_version")
    result = []
    for fname in sorted(os.listdir(_SNAPSHOT_DIR)):
        if not fname.endswith(".json"):
            continue
        path = os.path.join(_SNAPSHOT_DIR, fname)
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            # ValueError covers JSONDecodeError and undecodable bytes.
            continue
        if not isinstance(data, dict):
            # Valid JSON, but a list or scalar has no .get().
            continue
        result.append({key: data.get(key, "") for key in summary_keys})
    return result
616
+
617
+
618
def generate_restore_commands(snapshot_id: str) -> list[str]:
    """Build the shell lines that recreate a snapshot's Python environment.

    Args:
        snapshot_id: ID of the snapshot to restore.

    Returns:
        Comment and command lines in execution order. If the snapshot cannot
        be loaded, a single comment line saying so.
    """
    snap = load_snapshot(snapshot_id)
    if not snap.snapshot_id:
        return [f"# 快照 {snapshot_id} 不存在"]

    sid = snap.snapshot_id

    # Informational header, followed by a blank separator line.
    script = [
        f"# 环境恢复 — 快照 {sid}",
        f"# 创建于: {snap.created_at}",
        f"# Python: {snap.python_version}",
        "",
    ]
    if snap.cuda_version:
        script.append(f"# CUDA: {snap.cuda_version}")

    # A dedicated virtual environment named after the snapshot.
    script += [
        f"python -m venv .venv-{sid}",
        f"source .venv-{sid}/bin/activate",
    ]

    if snap.pip_freeze:
        # Write the frozen requirements through a quoted heredoc, then
        # install from that file.
        req_file = f"requirements-{sid}.txt"
        script += [
            f"cat > {req_file} << 'EOF'",
            *snap.pip_freeze,
            "EOF",
            f"pip install -r {req_file}",
        ]

    return script
647
+
648
+
649
+ # ─────────────────────────────────────────────
650
+ # 实验追踪
651
+ # ─────────────────────────────────────────────
652
+
653
_EXPERIMENTS_DIR = os.path.expanduser("~/.gitinstall/experiments")


def log_experiment(
    paper_id: str,
    command: str,
    params: dict[str, Any] | None = None,
    metrics: dict[str, float] | None = None,
    duration_sec: float = 0.0,
    status: str = "success",
    output_path: str = "",
    snapshot_id: str = "",
    notes: str = "",
) -> ExperimentRun:
    """Record one experiment run and persist it as a JSON file.

    The record is written to ``<experiments dir>/<paper_id>/<run_id>.json``,
    with ``/`` in ``paper_id`` replaced by ``_``.

    Args:
        paper_id: Identifier of the paper the run belongs to.
        command: Command line that was executed.
        params: Run parameters; ``None`` is stored as an empty dict.
        metrics: Resulting metrics; ``None`` is stored as an empty dict.
        duration_sec: Wall-clock duration in seconds.
        status: Run status (``pending`` / ``running`` / ``success`` / ``failed``).
        output_path: Location of the run's outputs.
        snapshot_id: ID of the associated environment snapshot.
        notes: Free-form notes.

    Returns:
        The ``ExperimentRun`` that was stored.

    Raises:
        TypeError: If ``params`` or ``metrics`` contain values that cannot be
            serialized to JSON. Nothing is written in that case.
        OSError: If the record cannot be written.
    """
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    # `now` has one-second resolution; the nanosecond counter keeps two runs
    # of the same command logged within the same second from overwriting
    # each other.
    run_id = hashlib.sha256(
        f"{now}-{paper_id}-{command}-{time.time_ns()}".encode()
    ).hexdigest()[:12]

    run = ExperimentRun(
        run_id=run_id,
        paper_id=paper_id,
        timestamp=now,
        command=command,
        params=params or {},
        metrics=metrics or {},
        duration_sec=duration_sec,
        status=status,
        output_path=output_path,
        snapshot_id=snapshot_id,
        notes=notes,
    )

    # Serialize before touching the disk so that a non-serializable value
    # cannot leave a truncated file behind.
    payload = json.dumps({
        "run_id": run.run_id,
        "paper_id": run.paper_id,
        "timestamp": run.timestamp,
        "command": run.command,
        "params": run.params,
        "metrics": run.metrics,
        "duration_sec": run.duration_sec,
        "status": run.status,
        "output_path": run.output_path,
        "snapshot_id": run.snapshot_id,
        "notes": run.notes,
    }, indent=2, ensure_ascii=False)

    # Persist
    paper_dir = os.path.join(_EXPERIMENTS_DIR, paper_id.replace("/", "_"))
    os.makedirs(paper_dir, exist_ok=True)
    path = os.path.join(paper_dir, f"{run_id}.json")
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)

    return run
705
+
706
+
707
def list_experiments(paper_id: str) -> list[ExperimentRun]:
    """Load every recorded experiment run of a paper.

    Args:
        paper_id: Identifier of the paper (``/`` is mapped to ``_`` to find
            its directory).

    Returns:
        The runs ordered by timestamp, oldest first. Files that are
        unreadable, not valid JSON, or do not match the ``ExperimentRun``
        fields are skipped. Empty when the paper has no directory.
    """
    paper_dir = os.path.join(_EXPERIMENTS_DIR, paper_id.replace("/", "_"))
    if not os.path.isdir(paper_dir):
        return []

    runs = []
    for fname in sorted(os.listdir(paper_dir)):
        if not fname.endswith(".json"):
            continue
        path = os.path.join(paper_dir, fname)
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            runs.append(ExperimentRun(**data))
        except (OSError, ValueError, TypeError):
            # ValueError: corrupt JSON; TypeError: not an object, or keys
            # that are not ExperimentRun fields.
            continue

    # File names are hash-based, so file-name order is effectively random.
    # Order chronologically instead; the sort is stable, so equal timestamps
    # keep their file-name order.
    runs.sort(key=lambda run: str(run.timestamp))
    return runs
724
+
725
+
726
def compare_experiments(paper_id: str) -> str:
    """Render a text table comparing all recorded runs of a paper.

    The table has one row per run and one column per metric name seen in
    any run; a run lacking a metric shows ``-`` in that column.

    Args:
        paper_id: Identifier of the paper whose runs are compared.

    Returns:
        The table as a newline-joined string, or a one-line message when the
        paper has no recorded runs.
    """
    runs = list_experiments(paper_id)
    if not runs:
        return f"论文 {paper_id} 没有实验记录"

    # Union of all metric names, in sorted order, defines the columns.
    metric_names = sorted({name for run in runs for name in run.metrics})

    table = [
        f"📊 实验对比 — {paper_id}",
        f" 共 {len(runs)} 次实验",
        "",
        "运行ID | 状态 | 耗时 | " + " | ".join(metric_names),
        "-" * (50 + 15 * len(metric_names)),
    ]

    for run in runs:
        elapsed = f"{run.duration_sec:.1f}s" if run.duration_sec else "N/A"
        # Each metric value is rendered as text, then right-aligned to 12.
        cells = " | ".join(
            format(run.metrics.get(name, '-')).rjust(12) for name in metric_names
        )
        table.append(f"{run.run_id:10s} | {run.status:7s} | {elapsed:8s} | " + cells)

    return "\n".join(table)
755
+
756
+
757
+ # ─────────────────────────────────────────────
758
+ # 论文→安装 一键流水线
759
+ # ─────────────────────────────────────────────
760
+
761
def paper_to_install_plan(paper_input: str) -> dict:
    """Build an installation plan from a paper ID or URL.

    Steps:
        1. Parse the arXiv ID.
        2. Fetch the paper metadata.
        3. Look up linked code repositories (Papers With Code).
        4. Return the repository URLs together with the paper information.

    Args:
        paper_input: An arXiv ID or URL, or text containing one.

    Returns:
        On success, a dict with the keys ``paper`` (a plain dict with ``id``,
        ``title``, ``authors``, ``published``, ``categories``, ``pdf_url``,
        ``bibtex``), ``repos`` (list of URLs), ``suggested_repo`` (first
        entry of ``repos`` or ``""``) and ``install_cmd`` (``""`` when no
        repository was found). When no arXiv ID can be parsed, a dict with
        the keys ``error`` and ``hint`` instead.
    """
    arxiv_id = parse_arxiv_id(paper_input)
    if not arxiv_id:
        return {
            "error": f"无法识别 arXiv ID: {paper_input}",
            "hint": "请输入 arXiv ID (如 2301.13688) 或 URL",
        }

    paper = fetch_arxiv_paper(arxiv_id)

    # Repositories: those found in the abstract first, then any additional
    # ones reported by Papers With Code.
    repos = list(paper.code_urls)
    for candidate in search_papers_with_code(arxiv_id):
        if candidate not in repos:
            repos.append(candidate)

    suggested = repos[0] if repos else ""

    # Build the install command from the owner/repo part of the URL.
    install_cmd = ""
    if suggested:
        m = re.search(r'github\.com/([\w.-]+/[\w.-]+)', suggested)
        if m:
            # The pattern admits ".", so it also captures a sentence-ending
            # period and a clone-style ".git" suffix; neither belongs to the
            # repository name.
            slug = m.group(1).rstrip(".")
            if slug.endswith(".git"):
                slug = slug[:-4]
            owner, _, repo = slug.partition("/")
            if owner and repo:
                install_cmd = f"gitinstall install {slug}"

    return {
        "paper": {
            "id": paper.paper_id,
            "title": paper.title,
            "authors": paper.authors,
            "published": paper.published,
            "categories": paper.categories,
            "pdf_url": paper.pdf_url,
            "bibtex": paper.bibtex,
        },
        "repos": repos,
        "suggested_repo": suggested,
        "install_cmd": install_cmd,
    }
817
+
818
+
819
def format_paper_info(paper: PaperInfo) -> str:
    """Render paper metadata as a short multi-line text summary.

    At most five authors and three categories are shown. The PDF and code
    lines are included only when the corresponding field is non-empty.

    Args:
        paper: The paper to describe.

    Returns:
        The summary as a newline-joined string.
    """
    shown_authors = ", ".join(paper.authors[:5])
    more_authors = " 等" if len(paper.authors) > 5 else ""
    shown_categories = ", ".join(paper.categories[:3])

    summary = [
        f"📄 {paper.title}",
        f" 作者: {shown_authors}" + more_authors,
        f" 发布: {paper.published}",
        f" 分类: {shown_categories}",
    ]
    if paper.pdf_url:
        summary.append(f" PDF: {paper.pdf_url}")
    if paper.code_urls:
        summary.append(f" 代码: {', '.join(paper.code_urls)}")
    return "\n".join(summary)