fc-data 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datasmith/__init__.py +330 -0
- datasmith/__init__.pyi +194 -0
- datasmith/agents/__init__.py +31 -0
- datasmith/agents/classifiers.py +272 -0
- datasmith/agents/codex.py +25 -0
- datasmith/agents/config.py +108 -0
- datasmith/agents/extractors.py +197 -0
- datasmith/agents/installed/README.md +52 -0
- datasmith/agents/installed/__init__.py +22 -0
- datasmith/agents/installed/base.py +240 -0
- datasmith/agents/installed/claude.py +134 -0
- datasmith/agents/installed/codex.py +91 -0
- datasmith/agents/installed/gemini.py +118 -0
- datasmith/agents/installed/none.py +27 -0
- datasmith/agents/sandbox.py +547 -0
- datasmith/agents/synthesizer.py +439 -0
- datasmith/agents/templates/AGENTS.md.j2 +150 -0
- datasmith/agents/templates/sandbox_verify.py +428 -0
- datasmith/docker/__init__.py +31 -0
- datasmith/docker/context.py +112 -0
- datasmith/docker/images.py +158 -0
- datasmith/docker/publish.py +56 -0
- datasmith/docker/templates/Dockerfile.base +26 -0
- datasmith/docker/templates/Dockerfile.pr +42 -0
- datasmith/docker/templates/Dockerfile.repo +11 -0
- datasmith/docker/templates/docker_build_base.sh +780 -0
- datasmith/docker/templates/docker_build_env.sh +309 -0
- datasmith/docker/templates/docker_build_final.sh +106 -0
- datasmith/docker/templates/docker_build_pkg.sh +99 -0
- datasmith/docker/templates/docker_build_run.sh +124 -0
- datasmith/docker/templates/entrypoint.sh +62 -0
- datasmith/docker/templates/parser.py +1405 -0
- datasmith/docker/templates/profile.sh +199 -0
- datasmith/docker/templates/pytest_runner.py +692 -0
- datasmith/docker/templates/run-tests.sh +197 -0
- datasmith/docker/verifiers.py +131 -0
- datasmith/filters.py +154 -0
- datasmith/github/__init__.py +22 -0
- datasmith/github/client.py +333 -0
- datasmith/github/hooks.py +50 -0
- datasmith/github/links.py +110 -0
- datasmith/github/models.py +206 -0
- datasmith/github/render.py +173 -0
- datasmith/github/search.py +66 -0
- datasmith/github/templates/comment.md.j2 +5 -0
- datasmith/github/templates/final.md.j2 +66 -0
- datasmith/github/templates/issues.md.j2 +21 -0
- datasmith/github/templates/repo.md.j2 +1 -0
- datasmith/preflight.py +162 -0
- datasmith/publish/__init__.py +13 -0
- datasmith/publish/huggingface.py +104 -0
- datasmith/publish/pipeline.py +60 -0
- datasmith/publish/records.py +91 -0
- datasmith/py.typed +1 -0
- datasmith/resolution/__init__.py +14 -0
- datasmith/resolution/blocklist.py +145 -0
- datasmith/resolution/cache.py +120 -0
- datasmith/resolution/constants.py +277 -0
- datasmith/resolution/dependency_resolver.py +174 -0
- datasmith/resolution/git_utils.py +378 -0
- datasmith/resolution/import_analyzer.py +66 -0
- datasmith/resolution/metadata_parser.py +412 -0
- datasmith/resolution/models.py +41 -0
- datasmith/resolution/orchestrator.py +522 -0
- datasmith/resolution/package_filters.py +312 -0
- datasmith/resolution/python_manager.py +110 -0
- datasmith/runners/__init__.py +15 -0
- datasmith/runners/base.py +112 -0
- datasmith/runners/classify_prs.py +48 -0
- datasmith/runners/render_problems.py +113 -0
- datasmith/runners/resolve_packages.py +66 -0
- datasmith/runners/scrape_commits.py +166 -0
- datasmith/runners/scrape_repos.py +44 -0
- datasmith/runners/synthesize_images.py +310 -0
- datasmith/update/__init__.py +5 -0
- datasmith/update/cli.py +169 -0
- datasmith/update/offline.py +173 -0
- datasmith/update/pipeline.py +497 -0
- datasmith/utils/__init__.py +18 -0
- datasmith/utils/core.py +67 -0
- datasmith/utils/db.py +156 -0
- datasmith/utils/tokens.py +65 -0
- fc_data-0.2.0.dist-info/METADATA +441 -0
- fc_data-0.2.0.dist-info/RECORD +87 -0
- fc_data-0.2.0.dist-info/WHEEL +4 -0
- fc_data-0.2.0.dist-info/entry_points.txt +2 -0
- fc_data-0.2.0.dist-info/licenses/LICENSE +28 -0
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""Dependency resolution using uv."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import datetime as dt
|
|
6
|
+
import os
|
|
7
|
+
import zipfile
|
|
8
|
+
from collections.abc import Iterable
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from .constants import ANSI_RE
|
|
12
|
+
from .package_filters import fix_marker_spacing
|
|
13
|
+
from .python_manager import run_uv
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def strip_ansi(s: str) -> str:
    """Return ``s`` with every ``ANSI_RE`` match (ANSI escape codes) deleted."""
    cleaned = ANSI_RE.sub("", s)
    return cleaned
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def rfc3339(ts: dt.datetime) -> str:
    """Render ``ts`` as an RFC3339 UTC timestamp ending in ``Z``.

    A naive datetime is taken to already be in UTC; an aware one is converted.
    """
    utc = dt.timezone.utc
    aware = ts.replace(tzinfo=utc) if ts.tzinfo is None else ts
    stamp = aware.astimezone(utc).isoformat()
    return stamp.replace("+00:00", "Z")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def uv_compile_from_pyproject(
    pyproject_path: Path, python_version: str | None, cutoff_rfc3339: str | None
) -> list[str]:
    """Resolve a ``pyproject.toml`` to pinned requirement lines via ``uv pip compile``.

    Args:
        pyproject_path: File handed to ``uv pip compile``; uv runs in its parent directory.
        python_version: Passed as ``--python`` when truthy.
        cutoff_rfc3339: Exported as ``UV_EXCLUDE_NEWER`` when truthy.

    Returns:
        The non-empty, non-comment lines of uv's stdout with ANSI codes removed,
        or ``[]`` when ``pyproject_path`` does not exist.

    Raises:
        RuntimeError: uv exited with a non-zero status.
    """
    if not pyproject_path.exists():
        return []

    cmd = ["pip", "compile", str(pyproject_path.resolve())]
    if python_version:
        cmd += ["--python", python_version]
    cmd.append("--all-extras")

    env_overrides: dict[str, str] = {"UV_EXCLUDE_NEWER": cutoff_rfc3339} if cutoff_rfc3339 else {}

    result = run_uv(cmd, input_text=None, extra_env=env_overrides, cwd=pyproject_path.parent)
    if result.returncode != 0:
        raise RuntimeError(f"uv pip compile failed:\n{result.stderr.decode() or result.stdout.decode()}")

    cleaned = (strip_ansi(line).strip() for line in result.stdout.decode().splitlines())
    return [line for line in cleaned if line and not line.startswith("#")]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def uv_compile(requirements: Iterable[str], *, python_version: str | None, cutoff_rfc3339: str | None) -> list[str]:
    """Resolve requirement strings to pinned lines by piping them to ``uv pip compile -``.

    Blank entries are dropped; the rest are stripped, passed through
    ``fix_marker_spacing``, de-duplicated and sorted before being sent on stdin.

    Args:
        requirements: Requirement specifiers to resolve.
        python_version: Passed as ``--python`` when truthy.
        cutoff_rfc3339: Exported as ``UV_EXCLUDE_NEWER`` when truthy.

    Returns:
        The non-empty, non-comment lines of uv's stdout with ANSI codes removed,
        or ``[]`` when nothing remains after filtering.

    Raises:
        RuntimeError: uv exited with a non-zero status.
    """
    unique: set[str] = set()
    for requirement in requirements:
        if requirement and requirement.strip():
            unique.add(fix_marker_spacing(requirement.strip()))
    if not unique:
        return []

    stdin_payload = "".join(f"{req}\n" for req in sorted(unique))

    cmd = ["pip", "compile", "-"]
    if python_version:
        cmd += ["--python", python_version]
    cmd.append("--upgrade")

    env_overrides: dict[str, str] = {"UV_EXCLUDE_NEWER": cutoff_rfc3339} if cutoff_rfc3339 else {}

    result = run_uv(cmd, input_text=stdin_payload, extra_env=env_overrides)
    if result.returncode != 0:
        raise RuntimeError(f"uv pip compile failed:\n{result.stderr.decode() or result.stdout.decode()}")

    cleaned = (strip_ansi(line).strip() for line in result.stdout.decode().splitlines())
    return [line for line in cleaned if line and not line.startswith("#")]
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def uv_dry_run_install(
    pinned: Iterable[str], *, python_version: str | None, venv_path: Path | None = None
) -> tuple[bool, str]:
    """Check that ``pinned`` is installable using ``uv pip install --dry-run``.

    Interpreter selection: the Python executable inside ``venv_path`` (POSIX
    ``bin/python`` first, then Windows ``Scripts/python.exe``) when one exists;
    otherwise ``python_version`` together with ``--system`` when it is truthy;
    otherwise no ``--python`` argument at all.

    Returns:
        ``(ok, log)`` where ``ok`` is True when uv exited with status 0 and
        ``log`` is stdout and stderr joined by a newline, ANSI codes removed.
        With no non-blank requirement, ``(True, "No runtime dependencies.")``
        is returned without invoking uv.
    """
    requirement_lines = [fix_marker_spacing(entry) for entry in pinned if entry.strip()]
    if not requirement_lines:
        return True, "No runtime dependencies."

    cmd = ["pip", "install", "--dry-run", "-r", "-"]

    interpreter: Path | None = None
    if venv_path and venv_path.exists():
        for candidate in (venv_path / "bin" / "python", venv_path / "Scripts" / "python.exe"):
            if candidate.exists():
                interpreter = candidate
                break

    if interpreter is not None:
        cmd += ["--python", str(interpreter)]
    elif python_version:
        cmd += ["--python", python_version, "--system"]

    payload = "\n".join(requirement_lines) + "\n"
    result = run_uv(cmd, input_text=payload)
    combined = result.stdout.decode() + "\n" + result.stderr.decode()
    return result.returncode == 0, strip_ansi(combined)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def uv_install_real(pinned: Iterable[str], *, python_executable: str | None = None) -> tuple[bool, str]:
    """Actually install ``pinned`` with ``uv pip install`` to surface sdist build failures.

    Args:
        pinned: Requirement lines; blank entries are ignored.
        python_executable: Passed as ``--python`` when truthy.

    Returns:
        ``(ok, log)`` where ``ok`` is True when uv exited with status 0 and
        ``log`` is stdout and stderr joined by a newline, ANSI codes removed.
        With no non-blank requirement, ``(True, "No dependencies to install.")``
        is returned without invoking uv.
    """
    requirement_lines = [fix_marker_spacing(entry) for entry in pinned if entry.strip()]
    if not requirement_lines:
        return True, "No dependencies to install."

    cmd = ["pip", "install", "-r", "-"]
    if python_executable:
        cmd += ["--python", python_executable]

    payload = "\n".join(requirement_lines) + "\n"
    result = run_uv(cmd, input_text=payload)
    combined = result.stdout.decode() + "\n" + result.stderr.decode()
    return result.returncode == 0, strip_ansi(combined)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def uv_build_and_read_metadata(project_dir: Path) -> tuple[str | None, str | None, list[str], str | None]:
    """Run ``uv build`` in ``project_dir`` and read core metadata from the built wheel.

    This is a best-effort fallback — many repos have dynamic setup.py files that
    fail in partial clones. Failures are expected and logged at debug level.

    Args:
        project_dir: Directory in which ``uv build`` is executed; wheels are
            looked up in its ``dist`` sub-directory afterwards.

    Returns:
        ``(name, version, requires_dist, requires_python)`` taken from the
        wheel's ``*.dist-info/METADATA`` headers. ``requires_dist`` entries are
        passed through ``fix_marker_spacing``. When the build cannot be run or
        fails, no wheel is found, or the wheel cannot be read, the result is
        ``(None, None, [], None)``.
    """
    import subprocess as _sp
    from email import message_from_string as _message_from_string

    from datasmith.utils import get_logger as _get_logger

    _logger = _get_logger("resolution.dependency_resolver")

    # Call subprocess directly: stdout/stderr are captured (and only surfaced
    # in the debug log below) and stdin is closed so the build cannot block
    # waiting for input.
    env = os.environ.copy()
    env.setdefault("UV_COLOR", "never")
    env.setdefault("NO_COLOR", "1")
    try:
        cp = _sp.run(
            ["uv", "build"],
            capture_output=True,
            stdin=_sp.DEVNULL,
            cwd=str(project_dir),
            env=env,
        )
    except OSError as exc:
        # e.g. the uv executable or project_dir is missing; stay best-effort.
        _logger.debug("uv build could not be started in %s: %s", project_dir.name, exc)
        return None, None, [], None
    if cp.returncode != 0:
        _logger.debug(
            "uv build failed in %s (expected for repos with dynamic setup.py): %s",
            project_dir.name,
            strip_ansi(cp.stderr.decode(errors="replace"))[-200:],
        )
        return None, None, [], None

    dist_dir = project_dir / "dist"
    if not dist_dir.exists():
        return None, None, [], None
    wheels = list(dist_dir.glob("*.whl"))
    if not wheels:
        return None, None, [], None

    try:
        # Prefer the most recently written wheel: dist/ may still hold wheels
        # from earlier builds whose names sort after the fresh one.
        newest = max(wheels, key=lambda whl: (whl.stat().st_mtime, whl.name))
        with zipfile.ZipFile(newest) as zf:
            meta_name = next((n for n in zf.namelist() if n.endswith(".dist-info/METADATA")), None)
            if not meta_name:
                return None, None, [], None
            content = zf.read(meta_name).decode("utf-8", errors="replace")
    except (OSError, zipfile.BadZipFile) as exc:
        _logger.debug("could not read wheel metadata in %s: %s", project_dir.name, exc)
        return None, None, [], None

    # METADATA uses RFC 822-style headers followed by an optional body holding
    # the long description. Parsing it as a message keeps description lines
    # such as "Version: ..." from being mistaken for real header fields.
    headers = _message_from_string(content)

    def _field(key: str) -> str | None:
        value = headers.get(key)
        return None if value is None else str(value).strip()

    requires_dist = [fix_marker_spacing(str(req).strip()) for req in headers.get_all("Requires-Dist") or []]
    return _field("Name"), _field("Version"), requires_dist, _field("Requires-Python")
|
|
@@ -0,0 +1,378 @@
|
|
|
1
|
+
"""Git repository operations for dependency resolution."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
import os
|
|
7
|
+
import shutil
|
|
8
|
+
import threading
|
|
9
|
+
import time
|
|
10
|
+
from collections.abc import Iterable
|
|
11
|
+
from contextlib import suppress
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any, Callable, cast
|
|
14
|
+
|
|
15
|
+
from git import Commit, Repo
|
|
16
|
+
|
|
17
|
+
from datasmith.utils import get_logger
|
|
18
|
+
|
|
19
|
+
from .constants import ASV_REGEX, GIT_CACHE_DIR
|
|
20
|
+
|
|
21
|
+
# Per-(repo_name, sha) locks handed out by _get_worktree_lock().
_worktree_lock_registry: dict[tuple[str, str], threading.Lock] = {}
# Guards reads and writes of _worktree_lock_registry itself.
_worktree_registry_lock = threading.Lock()
# Held by cleanup_worktree_cache() while it scans and deletes worktree directories.
_worktree_cleanup_lock = threading.Lock()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _env_non_negative_int(var: str, default: int) -> int:
    """Read environment variable ``var`` as an integer clamped to ``>= 0``.

    The raw text is parsed as a float first and then truncated, so ``"12.9"``
    yields ``12``.

    Args:
        var: Name of the environment variable.
        default: Value returned when the variable is unset or cannot be
            converted to a finite integer.

    Returns:
        The parsed value, with negatives raised to ``0``, or ``default``.
    """
    raw = os.getenv(var)
    if raw is None:
        return default
    try:
        value = int(float(raw))
    except (ValueError, OverflowError):
        # ValueError: unparsable text or "nan"; OverflowError: "inf" / "-inf".
        return default
    return max(0, value)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _env_non_negative_float(var: str, default: float) -> float:
    """Read environment variable ``var`` as a float clamped to ``>= 0.0``.

    ``default`` is returned when the variable is unset or is not a valid float
    literal.
    """
    text = os.getenv(var)
    if text is None:
        return default
    try:
        parsed = float(text)
    except ValueError:
        return default
    # Anything that is not strictly positive collapses to 0.0.
    return parsed if parsed > 0.0 else 0.0
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _default_worktree_ttl_seconds() -> int | None:
    """Return the worktree TTL configured by ``DATASMITH_GIT_WORKTREE_TTL_SECONDS``.

    Returns:
        ``24 * 3600`` when the variable is unset or cannot be converted to a
        finite integer, the truncated value when it is positive, and ``None``
        when it is zero or negative.
    """
    fallback = 24 * 3600
    raw = os.getenv("DATASMITH_GIT_WORKTREE_TTL_SECONDS")
    if raw is None:
        return fallback
    try:
        value = int(float(raw))
    except (ValueError, OverflowError):
        # ValueError: unparsable text or "nan"; OverflowError: "inf" / "-inf".
        return fallback
    return value if value > 0 else None
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# Defaults used by cleanup_worktree_cache() when the matching argument is None.
# Each is read once, at import time, from the named environment variable.
DEFAULT_MAX_WORKTREES_PER_REPO = _env_non_negative_int("DATASMITH_GIT_MAX_WORKTREES_PER_REPO", 128)
DEFAULT_WORKTREE_TTL_SECONDS = _default_worktree_ttl_seconds()
DEFAULT_WORKTREE_MIN_FREE_GB = _env_non_negative_float("DATASMITH_GIT_WORKTREE_MIN_FREE_GB", 256.0)

logger = get_logger("resolution.git_utils")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _get_worktree_lock(repo_name: str, sha: str) -> threading.Lock:
    """Fetch, creating on first use, the mutex dedicated to ``(repo_name, sha)``.

    Callers hold it to serialize worktree materialization for that commit.
    """
    with _worktree_registry_lock:
        try:
            return _worktree_lock_registry[(repo_name, sha)]
        except KeyError:
            fresh = threading.Lock()
            _worktree_lock_registry[(repo_name, sha)] = fresh
            return fresh
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def repo_key(repo_name: str) -> str:
    """Turn a GitHub ``owner/name`` string into a filesystem-safe key.

    Every ``/`` is replaced by a double underscore.
    """
    segments = repo_name.split("/")
    return "__".join(segments)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def base_clone_path(repo_name: str) -> Path:
    """Return where the base clone of ``repo_name`` lives under ``GIT_CACHE_DIR``."""
    clones_root = GIT_CACHE_DIR / "base_clones"
    return clones_root / repo_key(repo_name)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def mirror_path(repo_name: str) -> Path:
    """Return where the bare mirror of ``repo_name`` lives under ``GIT_CACHE_DIR``."""
    mirror_name = repo_key(repo_name) + ".git"
    return GIT_CACHE_DIR / "mirrors" / mirror_name
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def worktree_root(repo_name: str) -> Path:
    """Return the ``worktrees`` directory inside the base clone of ``repo_name``."""
    return base_clone_path(repo_name).joinpath("worktrees")
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _free_gb(path: Path) -> float:
    """Report the free disk space at ``path`` in GiB.

    A missing ``path`` is reported as infinite free space.
    """
    try:
        free_bytes = shutil.disk_usage(path).free
    except FileNotFoundError:
        return float("inf")
    else:
        return free_bytes / (1024**3)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _remove_worktree_dir(base_repo: Repo | None, worktree_dir: Path) -> None:
    """Delete ``worktree_dir``, first through git and then straight from disk.

    Both steps are best-effort: any exception they raise is swallowed.
    """
    if base_repo is not None:
        with suppress(Exception):
            base_repo.git.worktree("remove", str(worktree_dir), "--force")
    with suppress(Exception):
        still_present = worktree_dir.exists()
        if still_present:
            shutil.rmtree(worktree_dir)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def cleanup_worktree_cache(  # noqa: C901
    repo_name: str,
    *,
    base_repo: Repo | None = None,
    active_shas: Iterable[str] | None = None,
    max_worktrees: int | None = None,
    max_age_seconds: int | None = None,
    min_free_gb: float | None = None,
) -> list[Path]:
    """Remove stale or excess worktrees for ``repo_name``.

    Three passes run, in order, over the sub-directories of
    ``worktree_root(repo_name)`` (each directory name is treated as a SHA):

    1. Age: unprotected worktrees whose directory mtime is older than the TTL.
    2. Count: only the newest unprotected worktrees are kept, up to the limit
       minus the number of protected worktrees.
    3. Free space: the oldest surviving unprotected worktrees are removed
       until ``_free_gb(GIT_CACHE_DIR)`` reaches the minimum.

    A worktree is protected when its SHA is in ``active_shas`` or when its
    per-(repo, sha) lock in the registry is currently held.

    Args:
        repo_name: Repository whose worktree directory is cleaned.
        base_repo: Repo used for ``git worktree`` commands; when None, opening
            the base clone is attempted and failure is tolerated.
        active_shas: SHAs that must not be removed.
        max_worktrees: Count limit; None uses DEFAULT_MAX_WORKTREES_PER_REPO.
            Negative values are treated as 0.
        max_age_seconds: TTL; None uses DEFAULT_WORKTREE_TTL_SECONDS. A
            non-positive value disables the age pass.
        min_free_gb: Free-space floor; None uses DEFAULT_WORKTREE_MIN_FREE_GB.
            A non-positive value disables the free-space pass.

    Returns:
        The paths selected for removal. A path is recorded before the removal
        attempt and removal errors are suppressed, so a listed path is not
        guaranteed to be gone.
    """
    protected = set(active_shas or ())
    # Also protect every SHA of this repo whose lock is held right now.
    with _worktree_registry_lock:
        for (name, locked_sha), lock in _worktree_lock_registry.items():
            if name == repo_name and lock.locked():
                protected.add(locked_sha)
    # Resolve each limit: explicit argument first, module default otherwise.
    ttl_seconds = max_age_seconds if max_age_seconds is not None else DEFAULT_WORKTREE_TTL_SECONDS
    ttl_seconds = ttl_seconds if ttl_seconds and ttl_seconds > 0 else None
    keep_limit = max_worktrees if max_worktrees is not None else DEFAULT_MAX_WORKTREES_PER_REPO
    if keep_limit is not None:
        keep_limit = max(0, keep_limit)
    min_free: float | None = min_free_gb if min_free_gb is not None else DEFAULT_WORKTREE_MIN_FREE_GB
    min_free = min_free if min_free and min_free > 0 else None

    wroot = worktree_root(repo_name)
    if not wroot.exists():
        return []

    repo = base_repo
    if repo is None:
        # Best effort: without a Repo, removal falls back to plain rmtree.
        with suppress(Exception):
            repo = Repo(base_clone_path(repo_name))

    removed: list[Path] = []

    with _worktree_cleanup_lock:
        # Each entry is (path, mtime, sha).
        entries: list[tuple[Path, float, str]] = []
        protected_entries: list[tuple[Path, float, str]] = []
        for child in wroot.iterdir():
            if not child.is_dir():
                continue
            sha = child.name
            try:
                mtime = child.stat().st_mtime
            except FileNotFoundError:
                # Directory disappeared between iterdir() and stat().
                continue
            entry = (child, mtime, sha)
            if sha in protected:
                protected_entries.append(entry)
            else:
                entries.append(entry)

        # Pass 1: drop unprotected worktrees older than the TTL.
        now = time.time()
        if ttl_seconds is not None:
            cutoff = now - ttl_seconds
            fresh_entries: list[tuple[Path, float, str]] = []
            for path, mtime, sha in entries:
                if mtime < cutoff:
                    removed.append(path)
                    _remove_worktree_dir(repo, path)
                else:
                    fresh_entries.append((path, mtime, sha))
            entries = fresh_entries

        # Newest first, so slicing keeps the most recently modified worktrees.
        entries.sort(key=lambda item: item[1], reverse=True)
        kept_entries: list[tuple[Path, float, str]] = list(protected_entries)

        # Pass 2: enforce the count limit; protected worktrees use up slots
        # but are never removed themselves.
        if keep_limit is not None:
            available = keep_limit - len(protected_entries)
            if available <= 0:
                to_remove = entries
            else:
                kept_entries.extend(entries[:available])
                to_remove = entries[available:]
        else:
            kept_entries.extend(entries)
            to_remove = []

        for entry in to_remove:
            path = entry[0]
            removed.append(path)
            _remove_worktree_dir(repo, path)

        # Pass 3: while free space is below the floor, remove surviving
        # unprotected worktrees, oldest first, re-measuring after each one.
        if min_free is not None:
            removable = [entry for entry in kept_entries if entry[2] not in protected]
            removable.sort(key=lambda item: item[1])
            free_space = _free_gb(GIT_CACHE_DIR)
            idx = 0
            while free_space < min_free and idx < len(removable):
                entry = removable[idx]
                path = entry[0]
                idx += 1
                removed.append(path)
                _remove_worktree_dir(repo, path)
                free_space = _free_gb(GIT_CACHE_DIR)

        # Ask git to prune its worktree records after removals (best effort).
        if repo is not None and removed:
            with suppress(Exception):
                repo.git.worktree("prune", "--expire=now")

    if removed:
        logger.debug("Removed %d worktree(s) for %s", len(removed), repo_name)

    return removed
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def ensure_base_clone(repo_name: str) -> Repo:
    """Open the cached non-bare base clone of ``repo_name``, cloning it if absent.

    A fresh clone is made from GitHub with ``--filter=blob:none`` (partial clone).
    """
    clone_dir = base_clone_path(repo_name)
    clone_dir.parent.mkdir(parents=True, exist_ok=True)
    if clone_dir.exists():
        return Repo(clone_dir)
    remote_url = f"https://github.com/{repo_name}.git"
    return Repo.clone_from(remote_url, clone_dir, multi_options=["--filter=blob:none"])
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def ensure_mirror(repo_name: str) -> Path:
    """Make sure a local bare mirror of ``repo_name`` exists and return its path.

    A missing mirror is cloned from GitHub with ``--filter=blob:none``; an
    existing one gets a best-effort remote update with pruning, whose errors
    are ignored.
    """
    target = mirror_path(repo_name)
    target.parent.mkdir(parents=True, exist_ok=True)
    if target.exists():
        with suppress(Exception):
            Repo(target).remote().update(prune=True)
        return target
    remote_url = f"https://github.com/{repo_name}.git"
    Repo.clone_from(remote_url, target, mirror=True, multi_options=["--filter=blob:none"])
    return target
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def ensure_commit_available(repo: Repo, sha: str) -> None:
    """Guarantee ``repo`` can resolve ``sha``, fetching just that SHA from ``origin`` if not.

    An error from the fetch itself is not caught and reaches the caller.
    """
    try:
        repo.commit(sha)
    except Exception:
        already_present = False
    else:
        already_present = True
    if not already_present:
        repo.git.fetch("origin", sha)
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def prepare_repo_checkout(repo_name: str, sha: str, tmp_root: Path) -> tuple[Repo, Path, Callable[[], None]]:
    """Prefer a worktree from a cached base clone; fall back to a reference clone.

    Worktree path: under the per-(repo, sha) lock, the base clone is ensured,
    the worktree cache is cleaned (with ``sha`` protected), the commit is
    fetched if missing, and a detached worktree at ``worktree_root / sha`` is
    reused or created.

    Fallback path: if anything in the worktree path raises, the error is
    logged at debug level and a fresh clone into ``tmp_root / "repo"`` is made
    using the local mirror as ``--reference``, then checked out at ``sha``
    (checkout errors are suppressed).

    Args:
        repo_name: GitHub ``owner/name`` of the repository.
        sha: Commit to check out.
        tmp_root: Directory that receives the fallback clone.

    Returns:
        ``(repo, checkout_dir, cleanup)``. For a worktree, ``cleanup`` removes
        the worktree and prunes git's records; for the fallback clone it does
        nothing.
    """
    persistent_root = worktree_root(repo_name)
    worktree_dir = persistent_root / sha
    lock = _get_worktree_lock(repo_name, sha)
    with lock:
        try:
            base_repo = ensure_base_clone(repo_name)
            cleanup_worktree_cache(repo_name, base_repo=base_repo, active_shas={sha})
            ensure_commit_available(base_repo, sha)
            worktree_dir.parent.mkdir(parents=True, exist_ok=True)
            git_dir = worktree_dir / ".git"
            if git_dir.exists():
                # Reuse the existing worktree, resetting it when HEAD is not
                # the requested commit (or cannot be read).
                wt_repo = Repo(worktree_dir)
                try:
                    current = wt_repo.head.commit.hexsha
                except Exception:
                    current = None
                if current != sha:
                    with suppress(Exception):
                        wt_repo.git.reset("--hard", sha)
            else:
                # A directory without .git is a leftover: clear it and any
                # stale git record before adding the worktree again.
                if worktree_dir.exists():
                    shutil.rmtree(worktree_dir)
                with suppress(Exception):
                    base_repo.git.worktree("prune", "--expire=now")
                base_repo.git.worktree("add", "--detach", str(worktree_dir), sha)
                wt_repo = Repo(worktree_dir)

            def _cleanup_worktree() -> None:
                _remove_worktree_dir(base_repo, worktree_dir)
                with suppress(Exception):
                    base_repo.git.worktree("prune")

            return wt_repo, worktree_dir, _cleanup_worktree
        except Exception as exc:
            # Deliberately non-fatal, but record why the fast path was skipped.
            logger.debug(
                "Worktree checkout of %s@%s failed, falling back to reference clone: %s",
                repo_name,
                sha,
                exc,
            )

    # Reference clone fallback
    repo_dir = tmp_root / "repo"
    mirror = ensure_mirror(repo_name)
    url = f"https://github.com/{repo_name}.git"
    repo = Repo.clone_from(
        url,
        to_path=repo_dir,
        reference=str(mirror),
        multi_options=["--filter=blob:none", "--no-tags"],
    )
    ensure_commit_available(repo, sha)
    with suppress(Exception):
        repo.git.checkout(sha)

    def _cleanup_refclone() -> None:
        # Nothing to do here: this function does not delete the clone under tmp_root.
        return None

    return repo, repo_dir, _cleanup_refclone
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def base_tmp_for_commit(commit: Commit) -> Path:
    """Return the working-tree directory of ``commit``'s repository.

    It serves as the base directory for transient artifacts tied to the commit.

    Raises:
        ValueError: The repository has no working tree directory.
    """
    tree_dir = commit.repo.working_tree_dir
    if tree_dir is not None:
        return Path(tree_dir)
    raise ValueError("Commit repository has no working tree directory")
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def materialize_blobs(
    commit: Commit,
    predicate: Callable[[str], bool],
    out_dirname: str,
) -> dict[str, Path]:
    """Copy matching blobs from ``commit`` into a workspace-local folder.

    Args:
        commit: Commit whose tree is traversed.
        predicate: Called with each blob's repository-relative path; the blob
            is written out when it returns True.
        out_dirname: Folder, relative to ``base_tmp_for_commit(commit)``, that
            receives the files under their repository-relative paths.

    Returns:
        Mapping of repository-relative path to the file written for it. Blobs
        that expose no ``data_stream`` are skipped.
    """
    base = base_tmp_for_commit(commit) / out_dirname
    base.mkdir(parents=True, exist_ok=True)
    out: dict[str, Path] = {}
    for raw_item in commit.tree.traverse():
        item = cast(Any, raw_item)
        if getattr(item, "type", None) != "blob":
            continue
        relpath = cast(str, getattr(item, "path", ""))
        if not predicate(relpath):
            continue
        # Check readability before touching the filesystem so an unreadable
        # blob never causes existing files or directories to be removed.
        data_stream = getattr(item, "data_stream", None)
        if data_stream is None:
            continue
        dst = base / relpath
        parent = dst.parent
        # A stale file sitting where a directory is needed blocks mkdir().
        try:
            if parent.exists() and parent.is_file():
                parent.unlink()
        except Exception as e:
            logger.debug("Failed to remove parent file %s: %s", parent, e)
        parent.mkdir(parents=True, exist_ok=True)
        # Likewise a stale directory sitting where the file must be written.
        try:
            if dst.exists() and dst.is_dir():
                shutil.rmtree(dst)
        except Exception as e:
            logger.debug("Failed to remove directory %s: %s", dst, e)
        # Write the blob bytes directly, without an intermediate buffer copy.
        dst.write_bytes(data_stream.read())
        out[relpath] = dst
    return out
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def read_blob_text(commit: Commit, relpath: str, default: str | None = None) -> str | None:
    """Return the UTF-8 text of the blob at ``relpath`` in ``commit``.

    Undecodable bytes are replaced. ``default`` is returned when the path is
    not a blob, has no data stream, yields something other than bytes, or any
    exception occurs along the way.
    """
    try:
        node = cast(Any, commit.tree / relpath)
        is_blob = getattr(node, "type", None) == "blob"
        stream = getattr(node, "data_stream", None) if is_blob else None
        payload = stream.read() if stream is not None else None
        if isinstance(payload, (bytes, bytearray)):
            return bytes(payload).decode("utf-8", errors="replace")
    except Exception:
        return default
    return default
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def asv_finder(commit: Commit) -> list[Path]:
    """Locate ASV configuration files in ``commit``.

    Every blob whose path matches ``ASV_REGEX`` is materialized under
    ``_asv_blobs`` and the written paths are returned.
    """

    def _is_asv_config(rel: str) -> bool:
        return bool(ASV_REGEX.search(rel))

    materialized = materialize_blobs(commit, _is_asv_config, out_dirname="_asv_blobs")
    return [*materialized.values()]
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Analyzing Python imports to infer runtime dependencies."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import ast
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from datasmith.utils import get_logger
|
|
9
|
+
|
|
10
|
+
from .constants import NOT_REQUIREMENTS, SPECIAL_IMPORT_TO_PYPI, STDLIB
|
|
11
|
+
|
|
12
|
+
logger = get_logger("resolution.import_analyzer")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def top_level_imports_under(root: Path) -> set[str]:  # noqa: C901
    """Parse all .py files under ``root`` and return top-level imported module names.

    Files located in a test, benchmark, documentation or build directory
    *below* ``root`` are skipped. Files that cannot be read or parsed are
    skipped with a debug log entry. Relative imports (``from . import x``,
    ``from .pkg import y``) are ignored because they refer to the project's
    own modules.

    Args:
        root: Directory that is searched recursively.

    Returns:
        The first dotted component of every absolute ``import`` / ``from``
        target found.
    """
    skip_dirs = {"tests", "test", "testing", "benchmarks", "doc", "docs", ".eggs", ".tox", "build", "dist"}
    names: set[str] = set()
    for path in root.rglob("*.py"):
        # Only directories between root and the file decide skipping. Using
        # the absolute path would drop every file whenever root itself sits
        # under a directory that happens to be called e.g. "build" or "tests".
        rel_dirs = path.relative_to(root).parts[:-1]
        if skip_dirs.intersection(rel_dirs):
            continue
        try:
            src = path.read_text(encoding="utf-8", errors="replace")
        except Exception as e:
            logger.debug("Failed to read %s: %s", path, e)
            continue
        try:
            tree = ast.parse(src, filename=str(path))
        except Exception as e:
            logger.debug("Failed to parse %s: %s", path, e)
            continue
        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                for alias in node.names:
                    mod = (alias.name or "").split(".", 1)[0]
                    if mod:
                        names.add(mod)
            elif isinstance(node, ast.ImportFrom):
                # level > 0 marks a relative import; its module name is a
                # sibling inside the package, not an installable dependency.
                if getattr(node, "level", 0):
                    continue
                mod = (node.module or "").split(".", 1)[0]
                if mod:
                    names.add(mod)
    return names
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def infer_runtime_from_imports(project_dir: Path, own_import_name: str | None) -> set[str]:
    """Map the imports found under ``project_dir`` to likely PyPI package names.

    Dropped: modules whose lower-cased name is in ``STDLIB``, the project's own
    import name (also with ``-`` and ``_`` swapped), and anything listed in
    ``NOT_REQUIREMENTS``. Remaining names are translated through
    ``SPECIAL_IMPORT_TO_PYPI`` and otherwise kept unchanged.
    """
    discovered = top_level_imports_under(project_dir)

    self_names: set[str] = set()
    if own_import_name:
        self_names = {
            own_import_name,
            own_import_name.replace("-", "_"),
            own_import_name.replace("_", "-"),
        }

    return {
        SPECIAL_IMPORT_TO_PYPI.get(module, module)
        for module in discovered
        if module.lower() not in STDLIB and module not in self_names and module not in NOT_REQUIREMENTS
    }
|