gitinstall 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gitinstall/__init__.py +61 -0
- gitinstall/_sdk.py +541 -0
- gitinstall/academic.py +831 -0
- gitinstall/admin.html +327 -0
- gitinstall/auto_update.py +384 -0
- gitinstall/autopilot.py +349 -0
- gitinstall/badge.py +476 -0
- gitinstall/checkpoint.py +330 -0
- gitinstall/cicd.py +499 -0
- gitinstall/clawhub.html +718 -0
- gitinstall/config_schema.py +353 -0
- gitinstall/db.py +984 -0
- gitinstall/db_backend.py +445 -0
- gitinstall/dep_chain.py +337 -0
- gitinstall/dependency_audit.py +1153 -0
- gitinstall/detector.py +542 -0
- gitinstall/doctor.py +493 -0
- gitinstall/education.py +869 -0
- gitinstall/enterprise.py +802 -0
- gitinstall/error_fixer.py +953 -0
- gitinstall/event_bus.py +251 -0
- gitinstall/executor.py +577 -0
- gitinstall/feature_flags.py +138 -0
- gitinstall/fetcher.py +921 -0
- gitinstall/huggingface.py +922 -0
- gitinstall/hw_detect.py +988 -0
- gitinstall/i18n.py +664 -0
- gitinstall/installer_registry.py +362 -0
- gitinstall/knowledge_base.py +379 -0
- gitinstall/license_check.py +605 -0
- gitinstall/llm.py +569 -0
- gitinstall/log.py +236 -0
- gitinstall/main.py +1408 -0
- gitinstall/mcp_agent.py +841 -0
- gitinstall/mcp_server.py +386 -0
- gitinstall/monorepo.py +810 -0
- gitinstall/multi_source.py +425 -0
- gitinstall/onboard.py +276 -0
- gitinstall/planner.py +222 -0
- gitinstall/planner_helpers.py +323 -0
- gitinstall/planner_known_projects.py +1010 -0
- gitinstall/planner_templates.py +996 -0
- gitinstall/remote_gpu.py +633 -0
- gitinstall/resilience.py +608 -0
- gitinstall/run_tests.py +572 -0
- gitinstall/skills.py +476 -0
- gitinstall/tool_schemas.py +324 -0
- gitinstall/trending.py +279 -0
- gitinstall/uninstaller.py +415 -0
- gitinstall/validate_top100.py +607 -0
- gitinstall/watchdog.py +180 -0
- gitinstall/web.py +1277 -0
- gitinstall/web_ui.html +2277 -0
- gitinstall-1.1.0.dist-info/METADATA +275 -0
- gitinstall-1.1.0.dist-info/RECORD +59 -0
- gitinstall-1.1.0.dist-info/WHEEL +5 -0
- gitinstall-1.1.0.dist-info/entry_points.txt +3 -0
- gitinstall-1.1.0.dist-info/licenses/LICENSE +21 -0
- gitinstall-1.1.0.dist-info/top_level.txt +1 -0
gitinstall/academic.py
ADDED
|
@@ -0,0 +1,831 @@
|
|
|
1
|
+
"""
|
|
2
|
+
academic.py — 学术论文代码复现引擎
|
|
3
|
+
====================================
|
|
4
|
+
|
|
5
|
+
目标市场:学术论文代码复现(15万篇/年,★★★★☆)
|
|
6
|
+
|
|
7
|
+
功能:
|
|
8
|
+
1. arXiv / Semantic Scholar / Papers With Code 论文元数据提取
|
|
9
|
+
2. 论文→代码仓库自动关联
|
|
10
|
+
3. 可复现性评分(Reproducibility Score)
|
|
11
|
+
4. 环境快照 & 恢复(冻结 Python/CUDA/库版本)
|
|
12
|
+
5. 实验追踪(参数、指标、结果对比)
|
|
13
|
+
6. BibTeX / 引用管理
|
|
14
|
+
|
|
15
|
+
零外部依赖,纯 Python 标准库。
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import hashlib
|
|
21
|
+
import json
|
|
22
|
+
import os
|
|
23
|
+
import re
|
|
24
|
+
import time
|
|
25
|
+
import urllib.error
|
|
26
|
+
import urllib.request
|
|
27
|
+
from dataclasses import dataclass, field
|
|
28
|
+
from datetime import datetime, timezone
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
from typing import Any, Optional
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# ─────────────────────────────────────────────
|
|
34
|
+
# 数据结构
|
|
35
|
+
# ─────────────────────────────────────────────
|
|
36
|
+
|
|
37
|
+
@dataclass
class PaperInfo:
    """Paper metadata.

    Produced by fetch_arxiv_paper() / _parse_arxiv_xml(); empty-string or
    empty-list defaults mean "not available from the source".
    """
    paper_id: str = ""  # arXiv ID or DOI
    title: str = ""
    authors: list[str] = field(default_factory=list)
    abstract: str = ""
    published: str = ""  # ISO date (YYYY-MM-DD)
    categories: list[str] = field(default_factory=list)
    pdf_url: str = ""
    code_urls: list[str] = field(default_factory=list)  # GitHub links
    bibtex: str = ""
    source: str = ""  # "arxiv" | "semantic_scholar" | "pwc"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
class ReproducibilityScore:
    """Reproducibility score for a paper's code repository.

    Filled in by score_reproducibility(); each boolean marks one scoring
    dimension, `total` is the weighted sum capped at 100.
    """
    total: float = 0.0  # 0-100
    has_code: bool = False
    has_requirements: bool = False
    has_dockerfile: bool = False
    has_pretrained: bool = False
    has_data_script: bool = False
    has_readme_instructions: bool = False
    has_config_files: bool = False
    has_tests: bool = False
    has_ci: bool = False
    pinned_deps: float = 0.0  # fraction of pinned requirements, 0-1
    details: dict[str, Any] = field(default_factory=dict)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dataclass
class EnvironmentSnapshot:
    """Environment snapshot.

    A frozen record of the interpreter / CUDA / installed-package state,
    created by create_snapshot() and persisted as JSON under
    ~/.gitinstall/snapshots.
    """
    snapshot_id: str = ""
    paper_id: str = ""
    created_at: str = ""
    python_version: str = ""
    cuda_version: str = ""
    os_info: str = ""
    pip_freeze: list[str] = field(default_factory=list)
    env_vars: dict[str, str] = field(default_factory=dict)
    gpu_info: str = ""
    notes: str = ""
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@dataclass
class ExperimentRun:
    """Record of one experiment run.

    Created by log_experiment() and persisted as JSON under
    ~/.gitinstall/experiments/<paper_id>/.
    """
    run_id: str = ""
    paper_id: str = ""
    timestamp: str = ""
    command: str = ""
    params: dict[str, Any] = field(default_factory=dict)
    metrics: dict[str, float] = field(default_factory=dict)
    duration_sec: float = 0.0
    status: str = "pending"  # pending | running | success | failed
    output_path: str = ""
    snapshot_id: str = ""
    notes: str = ""
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# ─────────────────────────────────────────────
|
|
101
|
+
# arXiv API
|
|
102
|
+
# ─────────────────────────────────────────────
|
|
103
|
+
|
|
104
|
+
# Modern arXiv identifiers: YYMM.NNNNN with an optional version suffix.
_ARXIV_ID_PATTERN = re.compile(r'(\d{4}\.\d{4,5})(v\d+)?')
_ARXIV_URL_PATTERN = re.compile(r'arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{4,5})')


def parse_arxiv_id(text: str) -> str:
    """Extract a bare arXiv ID from free text.

    Accepts an abs/pdf URL or a raw ID anywhere in *text*; URL matches win,
    and any version suffix ("v2") is dropped. Returns "" when no ID-like
    token is found.
    """
    for pattern in (_ARXIV_URL_PATTERN, _ARXIV_ID_PATTERN):
        match = pattern.search(text)
        if match:
            return match.group(1)
    return ""
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def fetch_arxiv_paper(arxiv_id: str, timeout: int = 15) -> PaperInfo:
    """Fetch paper metadata from the arXiv API.

    *arxiv_id* may be a bare ID or any string containing one (e.g. a URL).
    On any network failure a mostly-empty PaperInfo carrying just the ID
    is returned instead of raising.

    >>> info = fetch_arxiv_paper("2301.13688")
    """
    arxiv_id = parse_arxiv_id(arxiv_id) or arxiv_id
    endpoint = f"http://export.arxiv.org/api/query?id_list={arxiv_id}"

    try:
        request = urllib.request.Request(
            endpoint, headers={"User-Agent": "gitinstall/1.1"}
        )
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read().decode("utf-8")
    except (urllib.error.URLError, OSError):
        # Offline / API unreachable: degrade gracefully.
        return PaperInfo(paper_id=arxiv_id, source="arxiv")

    return _parse_arxiv_xml(body, arxiv_id)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _parse_arxiv_xml(xml_text: str, arxiv_id: str) -> PaperInfo:
    """Parse an arXiv Atom XML response into a PaperInfo.

    Uses regex only (no xml library), which is good enough for the
    well-formed feed arXiv emits. Only the first <entry> is read.
    """

    def first_tag(tag: str, blob: str) -> str:
        # Content of the first <tag ...>...</tag>, stripped; "" when absent.
        found = re.search(rf'<{tag}[^>]*>(.*?)</{tag}>', blob, re.DOTALL)
        return found.group(1).strip() if found else ""

    entry_match = re.search(r'<entry>(.*?)</entry>', xml_text, re.DOTALL)
    if not entry_match:
        return PaperInfo(paper_id=arxiv_id, source="arxiv")
    entry = entry_match.group(1)

    # Collapse internal whitespace (the feed wraps long lines).
    title = re.sub(r'\s+', ' ', first_tag("title", entry))
    abstract = re.sub(r'\s+', ' ', first_tag("summary", entry))

    authors = [
        name
        for author_block in re.findall(r'<author>(.*?)</author>', entry, re.DOTALL)
        if (name := first_tag("name", author_block))
    ]

    published = first_tag("published", entry)[:10]  # keep YYYY-MM-DD prefix
    categories = re.findall(r'<category[^>]*term="([^"]+)"', entry)

    # PDF link: last <link> tagged title="pdf" wins; fall back to the
    # canonical arxiv.org URL.
    pdf_url = ""
    for link_tag in re.findall(r'<link[^>]*>', entry):
        if 'title="pdf"' in link_tag:
            href = re.search(r'href="([^"]+)"', link_tag)
            if href:
                pdf_url = href.group(1)
    if not pdf_url:
        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}"

    # GitHub links mentioned in the abstract.
    code_urls = re.findall(r'https?://github\.com/[\w.-]+/[\w.-]+', abstract)

    # Minimal BibTeX entry keyed by first author's surname + year.
    surname = authors[0].split()[-1] if authors else "unknown"
    year = published[:4] if published else "2024"
    bibtex = (
        f"@article{{{surname}{year}arxiv,\n"
        f"  title={{{title}}},\n"
        f"  author={{{' and '.join(authors)}}},\n"
        f"  journal={{arXiv preprint arXiv:{arxiv_id}}},\n"
        f"  year={{{year}}}\n"
        f"}}"
    )

    return PaperInfo(
        paper_id=arxiv_id,
        title=title,
        authors=authors,
        abstract=abstract,
        published=published,
        categories=categories,
        pdf_url=pdf_url,
        code_urls=code_urls,
        bibtex=bibtex,
        source="arxiv",
    )
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
# ─────────────────────────────────────────────
|
|
211
|
+
# Papers With Code 关联
|
|
212
|
+
# ─────────────────────────────────────────────
|
|
213
|
+
|
|
214
|
+
def search_papers_with_code(arxiv_id: str, timeout: int = 10) -> list[str]:
    """Find GitHub repositories linked to a paper via the Papers With Code API.

    Two API calls: resolve the arXiv ID to a PWC paper ID, then list its
    repositories. Any network/JSON failure short-circuits and returns
    whatever has been collected so far (possibly []).
    """

    def fetch_json(url: str):
        req = urllib.request.Request(url, headers={"User-Agent": "gitinstall/1.1"})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode("utf-8"))

    try:
        listing = fetch_json(
            f"https://paperswithcode.com/api/v1/papers/?arxiv_id={arxiv_id}"
        )
    except (urllib.error.URLError, OSError, json.JSONDecodeError):
        return []

    results = listing.get("results", [])
    if not results:
        return []
    pwc_paper_id = results[0].get("id", "")
    if not pwc_paper_id:
        return []

    try:
        repo_listing = fetch_json(
            f"https://paperswithcode.com/api/v1/papers/{pwc_paper_id}/repositories/"
        )
    except (urllib.error.URLError, OSError, json.JSONDecodeError):
        return []

    return [
        entry.get("url", "")
        for entry in repo_listing.get("results", [])
        if "github.com" in entry.get("url", "")
    ]
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
# ─────────────────────────────────────────────
|
|
255
|
+
# 可复现性评分
|
|
256
|
+
# ─────────────────────────────────────────────
|
|
257
|
+
|
|
258
|
+
def score_reproducibility(project_dir: str) -> ReproducibilityScore:
    """Score a project directory for how reproducible the paper code is.

    Scoring dimensions (max 100):
    - has a code repository: 15
    - has requirements / dependency declaration: 15
    - has a Dockerfile: 10
    - mentions pretrained models / weights: 10
    - has a data download script: 10
    - README has install/run instructions: 10
    - has config files (yaml/json/toml): 5
    - has tests: 10
    - has CI configuration: 5
    - fraction of pinned dependency versions: up to 10
    """
    score = ReproducibilityScore(has_code=True)
    total = 15.0  # baseline: the code exists at all

    root = Path(project_dir)
    if not root.is_dir():
        # Not a directory: nothing to score, return an all-zero result.
        return ReproducibilityScore()

    # Dependency declaration (any ecosystem counts once).
    dep_files = [
        "requirements.txt", "setup.py", "pyproject.toml", "setup.cfg",
        "environment.yml", "Pipfile", "package.json", "Cargo.toml", "go.mod",
    ]
    for f in dep_files:
        if (root / f).exists():
            score.has_requirements = True
            total += 15
            break

    # Containerization.
    for f in ("Dockerfile", "docker-compose.yml", "docker-compose.yaml", ".devcontainer/devcontainer.json"):
        if (root / f).exists():
            score.has_dockerfile = True
            total += 10
            break

    # Read the README (first 50 KB) for text-based checks below.
    readme_text = ""
    for rname in ("README.md", "README.rst", "README.txt", "README"):
        rpath = root / rname
        if rpath.exists():
            try:
                readme_text = rpath.read_text(encoding="utf-8", errors="ignore")[:50000]
            except OSError:
                pass
            break

    # Accumulate README + small .py files (capped at ~500 KB) as a text
    # corpus for the pretrained/data-script pattern scans.
    all_text = readme_text
    for pyf in root.glob("**/*.py"):
        if pyf.stat().st_size < 100000:
            try:
                all_text += "\n" + pyf.read_text(encoding="utf-8", errors="ignore")
            except OSError:
                pass
        if len(all_text) > 500000:
            break

    # Pretrained model / weights hints.
    pretrained_patterns = [
        r'huggingface\.co/', r'drive\.google\.com', r'\.ckpt', r'\.safetensors',
        r'pretrained', r'model\.pth', r'weights/', r'checkpoint',
        r'from_pretrained', r'load_state_dict',
    ]
    for pat in pretrained_patterns:
        if re.search(pat, all_text, re.IGNORECASE):
            score.has_pretrained = True
            total += 10
            break

    # Data download: well-known script paths first, then text patterns.
    data_patterns = [
        r'download.*data', r'prepare.*dataset', r'fetch.*data',
        r'wget\s+.*\.tar', r'curl\s+.*\.zip',
    ]
    data_files = ["download_data.sh", "prepare_data.py", "data/download.sh", "scripts/download.sh"]
    for f in data_files:
        if (root / f).exists():
            score.has_data_script = True
            total += 10
            break
    if not score.has_data_script:
        for pat in data_patterns:
            if re.search(pat, all_text, re.IGNORECASE):
                score.has_data_script = True
                total += 10
                break

    # README install/usage instructions.
    install_patterns = [
        r'##.*install', r'##.*setup', r'##.*getting.started',
        r'##.*usage', r'##.*quick.start', r'pip install',
        r'conda install', r'npm install',
    ]
    for pat in install_patterns:
        if re.search(pat, readme_text, re.IGNORECASE):
            score.has_readme_instructions = True
            total += 10
            break

    # Config files: a configs/ directory, else top-level config-like files.
    config_patterns = ["*.yaml", "*.yml", "*.toml", "*.json", "*.cfg", "*.ini"]
    config_dirs = ["configs", "config", "conf"]
    for cd in config_dirs:
        if (root / cd).is_dir():
            score.has_config_files = True
            total += 5
            break
    if not score.has_config_files:
        for pat in config_patterns:
            matches = list(root.glob(pat))
            # Exclude files that are packaging metadata, not run configs.
            real_configs = [m for m in matches if m.name not in (
                "package.json", "package-lock.json", "tsconfig.json",
                "pyproject.toml", "Cargo.toml",
            )]
            if real_configs:
                score.has_config_files = True
                total += 5
                break

    # Tests (directory or filename conventions across ecosystems).
    test_indicators = ["tests/", "test/", "test_*.py", "*_test.py", "*_test.go", "*.test.js", "*.test.ts"]
    for pat in test_indicators:
        if list(root.glob(pat)):
            score.has_tests = True
            total += 10
            break

    # CI configuration.
    ci_files = [
        ".github/workflows", ".gitlab-ci.yml", "Jenkinsfile",
        ".circleci", ".travis.yml", "azure-pipelines.yml",
    ]
    for f in ci_files:
        if (root / f).exists():
            score.has_ci = True
            total += 5
            break

    # Fraction of pinned dependency versions contributes up to 10 points.
    pinned = _check_pinned_deps(root)
    score.pinned_deps = pinned
    total += pinned * 10

    score.total = min(total, 100.0)
    return score
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
def _check_pinned_deps(root: Path) -> float:
|
|
411
|
+
"""检查依赖版本固定比例"""
|
|
412
|
+
req_path = root / "requirements.txt"
|
|
413
|
+
if not req_path.exists():
|
|
414
|
+
return 0.0
|
|
415
|
+
|
|
416
|
+
try:
|
|
417
|
+
content = req_path.read_text(encoding="utf-8", errors="ignore")
|
|
418
|
+
except OSError:
|
|
419
|
+
return 0.0
|
|
420
|
+
|
|
421
|
+
total_deps = 0
|
|
422
|
+
pinned_deps = 0
|
|
423
|
+
for line in content.splitlines():
|
|
424
|
+
line = line.strip()
|
|
425
|
+
if not line or line.startswith("#") or line.startswith("-"):
|
|
426
|
+
continue
|
|
427
|
+
total_deps += 1
|
|
428
|
+
if "==" in line:
|
|
429
|
+
pinned_deps += 1
|
|
430
|
+
|
|
431
|
+
return pinned_deps / total_deps if total_deps > 0 else 0.0
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
def format_reproducibility_score(score: ReproducibilityScore) -> str:
    """Render a ReproducibilityScore as a multi-line human-readable report."""
    # Letter grade from the total (thresholds checked high to low).
    grade = "F (极差)"
    for threshold, label in ((80, "A (优秀)"), (60, "B (良好)"),
                             (40, "C (一般)"), (20, "D (较差)")):
        if score.total >= threshold:
            grade = label
            break

    checks = [
        ("📦 代码仓库", score.has_code),
        ("📋 依赖声明", score.has_requirements),
        ("🐳 Dockerfile", score.has_dockerfile),
        ("🧠 预训练权重", score.has_pretrained),
        ("📊 数据下载", score.has_data_script),
        ("📖 安装说明", score.has_readme_instructions),
        ("⚙️ 配置文件", score.has_config_files),
        ("🧪 测试用例", score.has_tests),
        ("🔄 CI 配置", score.has_ci),
    ]

    report = [
        f"📊 可复现性评分: {score.total:.0f}/100 [{grade}]",
        "",
    ]
    report.extend(f" {label}: {'✅' if ok else '❌'}" for label, ok in checks)
    report.append(f" 📌 依赖固定率: {score.pinned_deps:.0%}")
    return "\n".join(report)
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
# ─────────────────────────────────────────────
|
|
471
|
+
# 环境快照
|
|
472
|
+
# ─────────────────────────────────────────────
|
|
473
|
+
|
|
474
|
+
# Directory where environment snapshots are persisted as JSON files.
_SNAPSHOT_DIR = os.path.expanduser("~/.gitinstall/snapshots")


def create_snapshot(paper_id: str = "", notes: str = "") -> EnvironmentSnapshot:
    """Create a snapshot of the current environment.

    Captures the Python version, CUDA version, `pip freeze` output, GPU
    info and a filtered set of environment variables, then persists the
    snapshot to disk via _save_snapshot().

    Args:
        paper_id: Optional paper this snapshot belongs to.
        notes: Free-form annotation stored with the snapshot.

    Returns:
        The populated (and already saved) EnvironmentSnapshot. Every probe
        is best-effort: missing tools simply leave their field empty.
    """
    import platform
    import subprocess
    import sys

    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    # Short content-derived ID: timestamp + paper make it unique enough.
    snap_id = hashlib.sha256(f"{now}-{paper_id}".encode()).hexdigest()[:12]

    # Installed packages via `pip freeze --local` (best-effort).
    pip_freeze = []
    try:
        result = subprocess.run(
            [sys.executable, "-m", "pip", "freeze", "--local"],
            capture_output=True, text=True, timeout=30,
        )
        if result.returncode == 0:
            pip_freeze = [l.strip() for l in result.stdout.splitlines() if l.strip()]
    except (subprocess.TimeoutExpired, FileNotFoundError):
        pass

    # CUDA version from nvcc; fall back to the CUDA_VERSION env var.
    cuda_version = ""
    try:
        result = subprocess.run(
            ["nvcc", "--version"], capture_output=True, text=True, timeout=5,
        )
        m = re.search(r'release (\d+\.\d+)', result.stdout)
        if m:
            cuda_version = m.group(1)
    except (FileNotFoundError, subprocess.TimeoutExpired):
        cuda_version = os.environ.get("CUDA_VERSION", "")

    # GPU info from nvidia-smi; on Apple Silicon report the CPU brand instead.
    gpu_info = ""
    try:
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=name,memory.total", "--format=csv,noheader"],
            capture_output=True, text=True, timeout=5,
        )
        if result.returncode == 0:
            gpu_info = result.stdout.strip()
    except (FileNotFoundError, subprocess.TimeoutExpired):
        # macOS Apple Silicon
        if platform.machine() == "arm64" and platform.system() == "Darwin":
            try:
                result = subprocess.run(
                    ["sysctl", "-n", "machdep.cpu.brand_string"],
                    capture_output=True, text=True, timeout=5,
                )
                gpu_info = f"Apple Silicon ({result.stdout.strip()})"
            except (FileNotFoundError, subprocess.TimeoutExpired):
                pass

    # Record only development-related env vars, and never anything whose
    # name suggests a credential (KEY/TOKEN/SECRET/PASSWORD).
    safe_env_prefixes = (
        "PYTHON", "CUDA", "CONDA", "VIRTUAL_ENV", "PATH",
        "LD_LIBRARY", "DYLD_", "CC", "CXX", "CMAKE",
    )
    env_vars = {}
    for k, v in os.environ.items():
        if any(k.startswith(p) for p in safe_env_prefixes):
            if not any(s in k.upper() for s in ("KEY", "TOKEN", "SECRET", "PASSWORD")):
                env_vars[k] = v

    snap = EnvironmentSnapshot(
        snapshot_id=snap_id,
        paper_id=paper_id,
        created_at=now,
        python_version=f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}",
        cuda_version=cuda_version,
        os_info=f"{platform.system()} {platform.release()} ({platform.machine()})",
        pip_freeze=pip_freeze,
        env_vars=env_vars,
        gpu_info=gpu_info,
        notes=notes,
    )

    # Persist to disk before returning.
    _save_snapshot(snap)
    return snap
|
|
563
|
+
|
|
564
|
+
|
|
565
|
+
def _save_snapshot(snap: EnvironmentSnapshot) -> str:
    """Write a snapshot to <_SNAPSHOT_DIR>/<id>.json and return the path."""
    os.makedirs(_SNAPSHOT_DIR, exist_ok=True)
    target = os.path.join(_SNAPSHOT_DIR, f"{snap.snapshot_id}.json")

    # Serialize a fixed set of fields, in a stable order.
    stored_fields = (
        "snapshot_id", "paper_id", "created_at", "python_version",
        "cuda_version", "os_info", "pip_freeze", "env_vars",
        "gpu_info", "notes",
    )
    payload = {name: getattr(snap, name) for name in stored_fields}

    with open(target, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False)
    return target
|
|
584
|
+
|
|
585
|
+
|
|
586
|
+
def load_snapshot(snapshot_id: str) -> EnvironmentSnapshot:
    """Load a snapshot by ID from the snapshot directory.

    Returns an empty EnvironmentSnapshot when the file is missing — and,
    unlike before, also when it is unreadable, contains corrupt JSON, or
    carries unknown fields (TypeError from the constructor). This mirrors
    the error handling already used by list_experiments() and keeps a
    damaged snapshot file from crashing callers such as
    generate_restore_commands().
    """
    path = os.path.join(_SNAPSHOT_DIR, f"{snapshot_id}.json")
    if not os.path.isfile(path):
        return EnvironmentSnapshot()
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return EnvironmentSnapshot(**data)
    except (json.JSONDecodeError, OSError, TypeError):
        # Corrupt or incompatible snapshot file: behave as "not found".
        return EnvironmentSnapshot()
|
|
594
|
+
|
|
595
|
+
|
|
596
|
+
def list_snapshots() -> list[dict]:
    """List all stored snapshots as summary dicts, ordered by filename.

    Each summary carries snapshot_id, paper_id, created_at and
    python_version; unreadable or corrupt files are silently skipped.
    """
    snap_dir = Path(_SNAPSHOT_DIR)
    if not snap_dir.is_dir():
        return []

    summaries: list[dict] = []
    for entry in sorted(snap_dir.iterdir()):
        if not entry.name.endswith(".json"):
            continue
        try:
            with open(entry, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (json.JSONDecodeError, OSError):
            continue
        summaries.append({
            key: data.get(key, "")
            for key in ("snapshot_id", "paper_id", "created_at", "python_version")
        })
    return summaries
|
|
616
|
+
|
|
617
|
+
|
|
618
|
+
def generate_restore_commands(snapshot_id: str) -> list[str]:
    """Build a list of shell commands that recreate a snapshotted environment.

    The list starts with comment lines describing the snapshot, then
    creates/activates a dedicated venv and (when the snapshot recorded a
    pip freeze) writes and installs a requirements file.
    """
    snap = load_snapshot(snapshot_id)
    if not snap.snapshot_id:
        return [f"# 快照 {snapshot_id} 不存在"]

    commands = [
        f"# 环境恢复 — 快照 {snap.snapshot_id}",
        f"# 创建于: {snap.created_at}",
        f"# Python: {snap.python_version}",
        "",
    ]
    if snap.cuda_version:
        commands.append(f"# CUDA: {snap.cuda_version}")

    venv_name = f".venv-{snap.snapshot_id}"
    commands.append(f"python -m venv {venv_name}")
    commands.append(f"source {venv_name}/bin/activate")

    if snap.pip_freeze:
        req_file = f"requirements-{snap.snapshot_id}.txt"
        commands.append(f"cat > {req_file} << 'EOF'")
        commands.extend(snap.pip_freeze)
        commands.append("EOF")
        commands.append(f"pip install -r {req_file}")

    return commands
|
|
647
|
+
|
|
648
|
+
|
|
649
|
+
# ─────────────────────────────────────────────
|
|
650
|
+
# 实验追踪
|
|
651
|
+
# ─────────────────────────────────────────────
|
|
652
|
+
|
|
653
|
+
# Directory where experiment runs are persisted, one subdirectory per paper.
_EXPERIMENTS_DIR = os.path.expanduser("~/.gitinstall/experiments")


def log_experiment(
    paper_id: str,
    command: str,
    params: dict[str, Any] | None = None,
    metrics: dict[str, float] | None = None,
    duration_sec: float = 0.0,
    status: str = "success",
    output_path: str = "",
    snapshot_id: str = "",
    notes: str = "",
) -> ExperimentRun:
    """Record one experiment run and persist it to disk.

    Args:
        paper_id: Paper the run belongs to ("/" is replaced by "_" in the
            on-disk directory name).
        command: The command line that was executed.
        params: Hyperparameters / arguments of the run.
        metrics: Resulting metric name → value pairs.
        duration_sec: Wall-clock duration of the run.
        status: One of pending | running | success | failed.
        output_path: Where the run wrote its outputs, if anywhere.
        snapshot_id: Optional EnvironmentSnapshot this run executed under.
        notes: Free-form annotation.

    Returns:
        The ExperimentRun that was written to
        <_EXPERIMENTS_DIR>/<paper_id>/<run_id>.json.
    """
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    # Short content-derived run ID (timestamp + paper + command).
    run_id = hashlib.sha256(f"{now}-{paper_id}-{command}".encode()).hexdigest()[:12]

    run = ExperimentRun(
        run_id=run_id,
        paper_id=paper_id,
        timestamp=now,
        command=command,
        params=params or {},
        metrics=metrics or {},
        duration_sec=duration_sec,
        status=status,
        output_path=output_path,
        snapshot_id=snapshot_id,
        notes=notes,
    )

    # Persist as JSON under the per-paper directory.
    paper_dir = os.path.join(_EXPERIMENTS_DIR, paper_id.replace("/", "_"))
    os.makedirs(paper_dir, exist_ok=True)
    path = os.path.join(paper_dir, f"{run_id}.json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump({
            "run_id": run.run_id,
            "paper_id": run.paper_id,
            "timestamp": run.timestamp,
            "command": run.command,
            "params": run.params,
            "metrics": run.metrics,
            "duration_sec": run.duration_sec,
            "status": run.status,
            "output_path": run.output_path,
            "snapshot_id": run.snapshot_id,
            "notes": run.notes,
        }, f, indent=2, ensure_ascii=False)

    return run
|
|
705
|
+
|
|
706
|
+
|
|
707
|
+
def list_experiments(paper_id: str) -> list[ExperimentRun]:
    """Load every recorded ExperimentRun for a paper, ordered by filename.

    Files that are unreadable, contain corrupt JSON, or carry unknown
    fields are silently skipped.
    """
    paper_dir = Path(_EXPERIMENTS_DIR) / paper_id.replace("/", "_")
    if not paper_dir.is_dir():
        return []

    loaded: list[ExperimentRun] = []
    for entry in sorted(paper_dir.iterdir()):
        if not entry.name.endswith(".json"):
            continue
        try:
            with open(entry, "r", encoding="utf-8") as f:
                data = json.load(f)
            loaded.append(ExperimentRun(**data))
        except (json.JSONDecodeError, OSError, TypeError):
            continue
    return loaded
|
|
724
|
+
|
|
725
|
+
|
|
726
|
+
def compare_experiments(paper_id: str) -> str:
    """Render a plain-text table comparing every recorded run for a paper."""
    runs = list_experiments(paper_id)
    if not runs:
        return f"论文 {paper_id} 没有实验记录"

    # Union of metric names across runs, sorted for stable columns.
    metric_names = sorted({name for run in runs for name in run.metrics})

    table = [
        f"📊 实验对比 — {paper_id}",
        f" 共 {len(runs)} 次实验",
        "",
        "运行ID | 状态 | 耗时 | " + " | ".join(metric_names),
        "-" * (50 + 15 * len(metric_names)),
    ]

    for run in runs:
        elapsed = f"{run.duration_sec:.1f}s" if run.duration_sec else "N/A"
        cells = " | ".join(
            f"{str(run.metrics.get(name, '-')):>12s}" for name in metric_names
        )
        table.append(
            f"{run.run_id:10s} | {run.status:7s} | {elapsed:8s} | " + cells
        )

    return "\n".join(table)
|
|
755
|
+
|
|
756
|
+
|
|
757
|
+
# ─────────────────────────────────────────────
|
|
758
|
+
# 论文→安装 一键流水线
|
|
759
|
+
# ─────────────────────────────────────────────
|
|
760
|
+
|
|
761
|
+
def paper_to_install_plan(paper_input: str) -> dict:
    """Build an install plan from a paper reference.

    Pipeline:
      1. parse an arXiv ID out of *paper_input*
      2. fetch the paper metadata from arXiv
      3. collect linked code repositories (abstract links + Papers With Code)
      4. suggest a repo and the matching gitinstall command

    Args:
        paper_input: arXiv ID, URL, or any text containing one.

    Returns:
        {"paper": {...}, "repos": [...], "suggested_repo": "...",
         "install_cmd": "..."} — or {"error": ..., "hint": ...} when no
        arXiv ID could be recognized.
    """
    arxiv_id = parse_arxiv_id(paper_input)
    if not arxiv_id:
        return {
            "error": f"无法识别 arXiv ID: {paper_input}",
            "hint": "请输入 arXiv ID (如 2301.13688) 或 URL",
        }

    paper = fetch_arxiv_paper(arxiv_id)

    # Abstract-derived repos first, then Papers With Code, de-duplicated.
    repos = list(paper.code_urls)
    for repo in search_papers_with_code(arxiv_id):
        if repo not in repos:
            repos.append(repo)

    suggested = repos[0] if repos else ""

    # Turn the suggested URL into a "gitinstall install owner/repo" command.
    install_cmd = ""
    if suggested:
        owner_repo = re.search(r'github\.com/([\w.-]+/[\w.-]+)', suggested)
        if owner_repo:
            install_cmd = f"gitinstall install {owner_repo.group(1)}"

    paper_summary = {
        "id": paper.paper_id,
        "title": paper.title,
        "authors": paper.authors,
        "published": paper.published,
        "categories": paper.categories,
        "pdf_url": paper.pdf_url,
        "bibtex": paper.bibtex,
    }
    return {
        "paper": paper_summary,
        "repos": repos,
        "suggested_repo": suggested,
        "install_cmd": install_cmd,
    }
|
|
817
|
+
|
|
818
|
+
|
|
819
|
+
def format_paper_info(paper: PaperInfo) -> str:
    """Render paper metadata as a short multi-line summary.

    Shows at most 5 authors and 3 categories; PDF and code lines are
    included only when present.
    """
    author_line = f" 作者: {', '.join(paper.authors[:5])}"
    if len(paper.authors) > 5:
        author_line += " 等"

    summary = [
        f"📄 {paper.title}",
        author_line,
        f" 发布: {paper.published}",
        f" 分类: {', '.join(paper.categories[:3])}",
    ]
    if paper.pdf_url:
        summary.append(f" PDF: {paper.pdf_url}")
    if paper.code_urls:
        summary.append(f" 代码: {', '.join(paper.code_urls)}")
    return "\n".join(summary)
|