fc-data 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. datasmith/__init__.py +330 -0
  2. datasmith/__init__.pyi +194 -0
  3. datasmith/agents/__init__.py +31 -0
  4. datasmith/agents/classifiers.py +272 -0
  5. datasmith/agents/codex.py +25 -0
  6. datasmith/agents/config.py +108 -0
  7. datasmith/agents/extractors.py +197 -0
  8. datasmith/agents/installed/README.md +52 -0
  9. datasmith/agents/installed/__init__.py +22 -0
  10. datasmith/agents/installed/base.py +240 -0
  11. datasmith/agents/installed/claude.py +134 -0
  12. datasmith/agents/installed/codex.py +91 -0
  13. datasmith/agents/installed/gemini.py +118 -0
  14. datasmith/agents/installed/none.py +27 -0
  15. datasmith/agents/sandbox.py +547 -0
  16. datasmith/agents/synthesizer.py +439 -0
  17. datasmith/agents/templates/AGENTS.md.j2 +150 -0
  18. datasmith/agents/templates/sandbox_verify.py +428 -0
  19. datasmith/docker/__init__.py +31 -0
  20. datasmith/docker/context.py +112 -0
  21. datasmith/docker/images.py +158 -0
  22. datasmith/docker/publish.py +56 -0
  23. datasmith/docker/templates/Dockerfile.base +26 -0
  24. datasmith/docker/templates/Dockerfile.pr +42 -0
  25. datasmith/docker/templates/Dockerfile.repo +11 -0
  26. datasmith/docker/templates/docker_build_base.sh +780 -0
  27. datasmith/docker/templates/docker_build_env.sh +309 -0
  28. datasmith/docker/templates/docker_build_final.sh +106 -0
  29. datasmith/docker/templates/docker_build_pkg.sh +99 -0
  30. datasmith/docker/templates/docker_build_run.sh +124 -0
  31. datasmith/docker/templates/entrypoint.sh +62 -0
  32. datasmith/docker/templates/parser.py +1405 -0
  33. datasmith/docker/templates/profile.sh +199 -0
  34. datasmith/docker/templates/pytest_runner.py +692 -0
  35. datasmith/docker/templates/run-tests.sh +197 -0
  36. datasmith/docker/verifiers.py +131 -0
  37. datasmith/filters.py +154 -0
  38. datasmith/github/__init__.py +22 -0
  39. datasmith/github/client.py +333 -0
  40. datasmith/github/hooks.py +50 -0
  41. datasmith/github/links.py +110 -0
  42. datasmith/github/models.py +206 -0
  43. datasmith/github/render.py +173 -0
  44. datasmith/github/search.py +66 -0
  45. datasmith/github/templates/comment.md.j2 +5 -0
  46. datasmith/github/templates/final.md.j2 +66 -0
  47. datasmith/github/templates/issues.md.j2 +21 -0
  48. datasmith/github/templates/repo.md.j2 +1 -0
  49. datasmith/preflight.py +162 -0
  50. datasmith/publish/__init__.py +13 -0
  51. datasmith/publish/huggingface.py +104 -0
  52. datasmith/publish/pipeline.py +60 -0
  53. datasmith/publish/records.py +91 -0
  54. datasmith/py.typed +1 -0
  55. datasmith/resolution/__init__.py +14 -0
  56. datasmith/resolution/blocklist.py +145 -0
  57. datasmith/resolution/cache.py +120 -0
  58. datasmith/resolution/constants.py +277 -0
  59. datasmith/resolution/dependency_resolver.py +174 -0
  60. datasmith/resolution/git_utils.py +378 -0
  61. datasmith/resolution/import_analyzer.py +66 -0
  62. datasmith/resolution/metadata_parser.py +412 -0
  63. datasmith/resolution/models.py +41 -0
  64. datasmith/resolution/orchestrator.py +522 -0
  65. datasmith/resolution/package_filters.py +312 -0
  66. datasmith/resolution/python_manager.py +110 -0
  67. datasmith/runners/__init__.py +15 -0
  68. datasmith/runners/base.py +112 -0
  69. datasmith/runners/classify_prs.py +48 -0
  70. datasmith/runners/render_problems.py +113 -0
  71. datasmith/runners/resolve_packages.py +66 -0
  72. datasmith/runners/scrape_commits.py +166 -0
  73. datasmith/runners/scrape_repos.py +44 -0
  74. datasmith/runners/synthesize_images.py +310 -0
  75. datasmith/update/__init__.py +5 -0
  76. datasmith/update/cli.py +169 -0
  77. datasmith/update/offline.py +173 -0
  78. datasmith/update/pipeline.py +497 -0
  79. datasmith/utils/__init__.py +18 -0
  80. datasmith/utils/core.py +67 -0
  81. datasmith/utils/db.py +156 -0
  82. datasmith/utils/tokens.py +65 -0
  83. fc_data-0.2.0.dist-info/METADATA +441 -0
  84. fc_data-0.2.0.dist-info/RECORD +87 -0
  85. fc_data-0.2.0.dist-info/WHEEL +4 -0
  86. fc_data-0.2.0.dist-info/entry_points.txt +2 -0
  87. fc_data-0.2.0.dist-info/licenses/LICENSE +28 -0
@@ -0,0 +1,174 @@
1
+ """Dependency resolution using uv."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import datetime as dt
6
+ import os
7
+ import zipfile
8
+ from collections.abc import Iterable
9
+ from pathlib import Path
10
+
11
+ from .constants import ANSI_RE
12
+ from .package_filters import fix_marker_spacing
13
+ from .python_manager import run_uv
14
+
15
+
16
def strip_ansi(s: str) -> str:
    """Return ``s`` with every match of ``ANSI_RE`` deleted."""
    without_codes = ANSI_RE.sub("", s)
    return without_codes
19
+
20
+
21
def rfc3339(ts: dt.datetime) -> str:
    """Render ``ts`` as a UTC timestamp string ending in ``Z``.

    A naive datetime is taken to already be in UTC; an aware one is
    converted to UTC before formatting.
    """
    utc = dt.timezone.utc
    aware = ts if ts.tzinfo is not None else ts.replace(tzinfo=utc)
    iso_text = aware.astimezone(utc).isoformat()
    return iso_text.replace("+00:00", "Z")
26
+
27
+
28
def uv_compile_from_pyproject(
    pyproject_path: Path, python_version: str | None, cutoff_rfc3339: str | None
) -> list[str]:
    """Use `uv pip compile` to resolve to pinned requirements from pyproject.toml.

    Args:
        pyproject_path: Path to the ``pyproject.toml`` to compile. When it does
            not exist an empty list is returned without invoking uv.
        python_version: Passed as ``--python`` when truthy.
        cutoff_rfc3339: Exported as ``UV_EXCLUDE_NEWER`` when truthy.

    Returns:
        The non-empty, non-comment lines of uv's stdout, ANSI-stripped.

    Raises:
        RuntimeError: If uv exits with a non-zero status.
    """
    if not pyproject_path.exists():
        return []
    args = ["pip", "compile", str(pyproject_path.resolve())]
    if python_version:
        args.extend(["--python", python_version])
    args.append("--all-extras")
    extra_env: dict[str, str] = {}
    if cutoff_rfc3339:
        extra_env["UV_EXCLUDE_NEWER"] = cutoff_rfc3339
    cp = run_uv(args, input_text=None, extra_env=extra_env, cwd=pyproject_path.parent)
    # Decode leniently: build backends may emit non-UTF-8 bytes, and a
    # UnicodeDecodeError here would hide the actual uv failure.
    stdout = cp.stdout.decode("utf-8", errors="replace")
    if cp.returncode != 0:
        stderr = cp.stderr.decode("utf-8", errors="replace")
        raise RuntimeError(f"uv pip compile failed:\n{stderr or stdout}")
    cleaned = (strip_ansi(raw).strip() for raw in stdout.splitlines())
    return [line for line in cleaned if line and not line.startswith("#")]
50
+
51
+
52
def uv_compile(requirements: Iterable[str], *, python_version: str | None, cutoff_rfc3339: str | None) -> list[str]:
    """Use `uv pip compile` to resolve to pinned requirements from stdin.

    Args:
        requirements: Requirement strings; blank entries are dropped and the
            rest are normalised with ``fix_marker_spacing``, de-duplicated and
            sorted before being fed to uv on stdin.
        python_version: Passed as ``--python`` when truthy.
        cutoff_rfc3339: Exported as ``UV_EXCLUDE_NEWER`` when truthy.

    Returns:
        The non-empty, non-comment lines of uv's stdout, ANSI-stripped. An
        empty list when there is nothing to resolve.

    Raises:
        RuntimeError: If uv exits with a non-zero status.
    """
    reqs = sorted({fix_marker_spacing(r.strip()) for r in requirements if r and r.strip()})
    if not reqs:
        return []
    req_text = "\n".join(reqs) + "\n"
    args = ["pip", "compile", "-"]
    if python_version:
        args.extend(["--python", python_version])
    args.append("--upgrade")
    extra_env: dict[str, str] = {}
    if cutoff_rfc3339:
        extra_env["UV_EXCLUDE_NEWER"] = cutoff_rfc3339
    cp = run_uv(args, input_text=req_text, extra_env=extra_env)
    # Decode leniently so stray non-UTF-8 bytes cannot mask the real error.
    stdout = cp.stdout.decode("utf-8", errors="replace")
    if cp.returncode != 0:
        stderr = cp.stderr.decode("utf-8", errors="replace")
        raise RuntimeError(f"uv pip compile failed:\n{stderr or stdout}")
    cleaned = (strip_ansi(raw).strip() for raw in stdout.splitlines())
    return [line for line in cleaned if line and not line.startswith("#")]
74
+
75
+
76
def uv_dry_run_install(
    pinned: Iterable[str], *, python_version: str | None, venv_path: Path | None = None
) -> tuple[bool, str]:
    """Run a dry-run install to validate that dependencies can be installed.

    Args:
        pinned: Requirement lines; blank entries are ignored.
        python_version: Used with ``--system`` when no interpreter is found
            inside ``venv_path``.
        venv_path: Optional virtual environment. Its ``bin/python`` (or
            ``Scripts/python.exe``) is preferred as the target interpreter.

    Returns:
        ``(ok, log)`` where ``ok`` is true when uv exited with status 0 and
        ``log`` is the ANSI-stripped stdout followed by stderr.
    """
    text_lines = [fix_marker_spacing(x) for x in pinned if x.strip()]
    if not text_lines:
        return True, "No runtime dependencies."
    text = "\n".join(text_lines) + "\n"
    args = ["pip", "install", "--dry-run", "-r", "-"]

    # Prefer an interpreter that lives inside the given venv (POSIX layout
    # first, then Windows layout).
    venv_python: Path | None = None
    if venv_path and venv_path.exists():
        candidates = (venv_path / "bin" / "python", venv_path / "Scripts" / "python.exe")
        venv_python = next((cand for cand in candidates if cand.exists()), None)

    if venv_python is not None:
        args.extend(["--python", str(venv_python)])
    elif python_version:
        args.extend(["--python", python_version, "--system"])

    cp = run_uv(args, input_text=text)
    # Lenient decoding: installer output is not guaranteed to be valid UTF-8.
    stdout = cp.stdout.decode("utf-8", errors="replace")
    stderr = cp.stderr.decode("utf-8", errors="replace")
    return cp.returncode == 0, strip_ansi(stdout + "\n" + stderr)
101
+
102
+
103
def uv_install_real(pinned: Iterable[str], *, python_executable: str | None = None) -> tuple[bool, str]:
    """Perform a real install of pinned requirements to surface sdist build failures.

    Args:
        pinned: Requirement lines; blank entries are ignored.
        python_executable: Passed as ``--python`` when truthy.

    Returns:
        ``(ok, log)`` where ``ok`` is true when uv exited with status 0 and
        ``log`` is the ANSI-stripped stdout followed by stderr.
    """
    lines = [fix_marker_spacing(x) for x in pinned if x.strip()]
    if not lines:
        return True, "No dependencies to install."
    text = "\n".join(lines) + "\n"
    args = ["pip", "install", "-r", "-"]
    if python_executable:
        args.extend(["--python", python_executable])
    cp = run_uv(args, input_text=text)
    # Build logs from sdists may contain arbitrary bytes; never let decoding
    # raise, otherwise the build failure we want to report is lost.
    stdout = cp.stdout.decode("utf-8", errors="replace")
    stderr = cp.stderr.decode("utf-8", errors="replace")
    return cp.returncode == 0, strip_ansi(stdout + "\n" + stderr)
116
+
117
+
118
def uv_build_and_read_metadata(project_dir: Path) -> tuple[str | None, str | None, list[str], str | None]:
    """Run `uv build` in the project directory, then read metadata from the wheel.

    This is a best-effort fallback — many repos have dynamic setup.py files that
    fail in partial clones. Failures are expected and logged at debug level.

    Args:
        project_dir: Directory in which ``uv build`` is executed; the wheel is
            looked up in its ``dist`` subdirectory.

    Returns:
        ``(name, version, requires_dist, requires_python)`` parsed from the
        wheel's ``METADATA`` headers, or ``(None, None, [], None)`` when the
        build fails, no wheel is found, or the wheel cannot be read.
    """
    import subprocess as _sp

    from datasmith.utils import get_logger as _get_logger

    _logger = _get_logger("resolution.dependency_resolver")

    # Call subprocess directly and capture both output streams so noisy
    # setup.py tracebacks from the build stay out of our own stderr.
    env = os.environ.copy()
    env.setdefault("UV_COLOR", "never")
    env.setdefault("NO_COLOR", "1")
    cp = _sp.run(
        ["uv", "build"],
        capture_output=True,
        stdin=_sp.DEVNULL,
        cwd=str(project_dir),
        env=env,
    )
    if cp.returncode != 0:
        _logger.debug(
            "uv build failed in %s (expected for repos with dynamic setup.py): %s",
            project_dir.name,
            # Lenient decode: tracebacks may contain non-UTF-8 bytes.
            strip_ansi(cp.stderr.decode("utf-8", errors="replace"))[-200:],
        )
        return None, None, [], None
    dist_dir = project_dir / "dist"
    if not dist_dir.exists():
        return None, None, [], None
    wheels = sorted(dist_dir.glob("*.whl"))
    if not wheels:
        return None, None, [], None

    try:
        with zipfile.ZipFile(wheels[-1]) as zf:
            meta_name = next((n for n in zf.namelist() if n.endswith(".dist-info/METADATA")), None)
            if not meta_name:
                return None, None, [], None
            content = zf.read(meta_name).decode("utf-8", errors="replace")
    except (zipfile.BadZipFile, OSError) as exc:
        # Stay best-effort: an unreadable wheel is treated like a failed build.
        _logger.debug("Could not read wheel %s: %s", wheels[-1].name, exc)
        return None, None, [], None

    name, version = None, None
    requires_dist: list[str] = []
    requires_python: str | None = None
    for line in content.splitlines():
        if not line:
            # The first empty line ends the header section; what follows is the
            # long description, which must not be mistaken for header fields.
            break
        if line.startswith("Name: "):
            name = line.split("Name:", 1)[1].strip()
        elif line.startswith("Version: "):
            version = line.split("Version:", 1)[1].strip()
        elif line.startswith("Requires-Dist: "):
            req = line.split("Requires-Dist:", 1)[1].strip()
            requires_dist.append(fix_marker_spacing(req))
        elif line.startswith("Requires-Python: "):
            requires_python = line.split("Requires-Python:", 1)[1].strip()
    return name, version, requires_dist, requires_python
@@ -0,0 +1,378 @@
1
+ """Git repository operations for dependency resolution."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import os
7
+ import shutil
8
+ import threading
9
+ import time
10
+ from collections.abc import Iterable
11
+ from contextlib import suppress
12
+ from pathlib import Path
13
+ from typing import Any, Callable, cast
14
+
15
+ from git import Commit, Repo
16
+
17
+ from datasmith.utils import get_logger
18
+
19
+ from .constants import ASV_REGEX, GIT_CACHE_DIR
20
+
21
# Per-(repo_name, sha) locks handed out by _get_worktree_lock.
_worktree_lock_registry: dict[tuple[str, str], threading.Lock] = {}
# Guards reads and writes of _worktree_lock_registry itself.
_worktree_registry_lock = threading.Lock()
# Held by cleanup_worktree_cache while it scans and removes worktree directories.
_worktree_cleanup_lock = threading.Lock()
24
+
25
+
26
+ def _env_non_negative_int(var: str, default: int) -> int:
27
+ raw = os.getenv(var)
28
+ if raw is None:
29
+ return default
30
+ try:
31
+ value = int(float(raw))
32
+ except ValueError:
33
+ return default
34
+ return max(0, value)
35
+
36
+
37
+ def _env_non_negative_float(var: str, default: float) -> float:
38
+ raw = os.getenv(var)
39
+ if raw is None:
40
+ return default
41
+ try:
42
+ value = float(raw)
43
+ except ValueError:
44
+ return default
45
+ return max(0.0, value)
46
+
47
+
48
+ def _default_worktree_ttl_seconds() -> int | None:
49
+ raw = os.getenv("DATASMITH_GIT_WORKTREE_TTL_SECONDS")
50
+ if raw is None:
51
+ return 24 * 3600
52
+ try:
53
+ value = int(float(raw))
54
+ except ValueError:
55
+ return 24 * 3600
56
+ return value if value > 0 else None
57
+
58
+
59
# Defaults used by cleanup_worktree_cache when the caller passes None.
# Each is read from the environment once, when this module is imported.
DEFAULT_MAX_WORKTREES_PER_REPO = _env_non_negative_int("DATASMITH_GIT_MAX_WORKTREES_PER_REPO", 128)
DEFAULT_WORKTREE_TTL_SECONDS = _default_worktree_ttl_seconds()
DEFAULT_WORKTREE_MIN_FREE_GB = _env_non_negative_float("DATASMITH_GIT_WORKTREE_MIN_FREE_GB", 256.0)

# Module-level logger for this file.
logger = get_logger("resolution.git_utils")
64
+
65
+
66
def _get_worktree_lock(repo_name: str, sha: str) -> threading.Lock:
    """Fetch, creating on first use, the lock registered for ``(repo_name, sha)``."""
    with _worktree_registry_lock:
        try:
            return _worktree_lock_registry[(repo_name, sha)]
        except KeyError:
            created = threading.Lock()
            _worktree_lock_registry[(repo_name, sha)] = created
            return created
75
+
76
+
77
def repo_key(repo_name: str) -> str:
    """Turn ``owner/name`` into a single path component by replacing ``/`` with ``__``."""
    pieces = repo_name.split("/")
    return "__".join(pieces)
80
+
81
+
82
def base_clone_path(repo_name: str) -> Path:
    """Location of the base clone for ``repo_name`` under ``GIT_CACHE_DIR/base_clones``."""
    clones_dir = GIT_CACHE_DIR / "base_clones"
    return clones_dir / repo_key(repo_name)
85
+
86
+
87
def mirror_path(repo_name: str) -> Path:
    """Location of the bare mirror for ``repo_name`` under ``GIT_CACHE_DIR/mirrors``."""
    bare_name = f"{repo_key(repo_name)}.git"
    mirrors_dir = GIT_CACHE_DIR / "mirrors"
    return mirrors_dir / bare_name
90
+
91
+
92
def worktree_root(repo_name: str) -> Path:
    """Directory inside the base clone that holds the repository's worktrees."""
    clone_dir = base_clone_path(repo_name)
    return clone_dir / "worktrees"
95
+
96
+
97
+ def _free_gb(path: Path) -> float:
98
+ """Calculate the free space at ``path`` in gigabytes."""
99
+ try:
100
+ usage = shutil.disk_usage(path)
101
+ except FileNotFoundError:
102
+ return float("inf")
103
+ return usage.free / (1024**3)
104
+
105
+
106
+ def _remove_worktree_dir(base_repo: Repo | None, worktree_dir: Path) -> None:
107
+ """Remove a worktree directory, falling back to direct removal on failure."""
108
+ with suppress(Exception):
109
+ if base_repo is not None:
110
+ base_repo.git.worktree("remove", str(worktree_dir), "--force")
111
+ with suppress(Exception):
112
+ if worktree_dir.exists():
113
+ shutil.rmtree(worktree_dir)
114
+
115
+
116
def cleanup_worktree_cache(  # noqa: C901
    repo_name: str,
    *,
    base_repo: Repo | None = None,
    active_shas: Iterable[str] | None = None,
    max_worktrees: int | None = None,
    max_age_seconds: int | None = None,
    min_free_gb: float | None = None,
) -> list[Path]:
    """Remove stale or excess worktrees for ``repo_name``.

    Eviction runs in three passes over the directories under
    ``worktree_root(repo_name)``: entries older than the TTL, entries beyond
    the keep limit (newest are kept), and finally oldest-first removal while
    free space at ``GIT_CACHE_DIR`` is below the minimum.

    Args:
        repo_name: Repository whose worktree directory is scanned.
        base_repo: Repo used for ``git worktree`` commands; when omitted, one
            is opened from ``base_clone_path(repo_name)`` if possible.
        active_shas: Directory names (SHAs) that must never be removed.
        max_worktrees: Keep limit; defaults to ``DEFAULT_MAX_WORKTREES_PER_REPO``.
        max_age_seconds: TTL; defaults to ``DEFAULT_WORKTREE_TTL_SECONDS``.
            A non-positive value disables the TTL pass.
        min_free_gb: Free-space floor; defaults to
            ``DEFAULT_WORKTREE_MIN_FREE_GB``. A non-positive value disables
            the free-space pass.

    Returns:
        Paths that were selected for removal (removal itself is best-effort).
    """
    protected = set(active_shas or ())
    # SHAs whose per-worktree lock is currently held are protected as well.
    with _worktree_registry_lock:
        for (name, locked_sha), lock in _worktree_lock_registry.items():
            if name == repo_name and lock.locked():
                protected.add(locked_sha)
    ttl_seconds = max_age_seconds if max_age_seconds is not None else DEFAULT_WORKTREE_TTL_SECONDS
    ttl_seconds = ttl_seconds if ttl_seconds and ttl_seconds > 0 else None
    keep_limit = max_worktrees if max_worktrees is not None else DEFAULT_MAX_WORKTREES_PER_REPO
    if keep_limit is not None:
        keep_limit = max(0, keep_limit)
    min_free: float | None = min_free_gb if min_free_gb is not None else DEFAULT_WORKTREE_MIN_FREE_GB
    min_free = min_free if min_free and min_free > 0 else None

    wroot = worktree_root(repo_name)
    if not wroot.exists():
        return []

    repo = base_repo
    if repo is None:
        # Best-effort: without a Repo, removal falls back to plain rmtree.
        with suppress(Exception):
            repo = Repo(base_clone_path(repo_name))

    removed: list[Path] = []

    with _worktree_cleanup_lock:
        # Each entry is (directory, mtime, directory name used as the SHA).
        entries: list[tuple[Path, float, str]] = []
        protected_entries: list[tuple[Path, float, str]] = []
        for child in wroot.iterdir():
            if not child.is_dir():
                continue
            sha = child.name
            try:
                mtime = child.stat().st_mtime
            except FileNotFoundError:
                # Directory vanished between iterdir() and stat().
                continue
            entry = (child, mtime, sha)
            if sha in protected:
                protected_entries.append(entry)
            else:
                entries.append(entry)

        # Pass 1: drop unprotected entries older than the TTL.
        now = time.time()
        if ttl_seconds is not None:
            cutoff = now - ttl_seconds
            fresh_entries: list[tuple[Path, float, str]] = []
            for path, mtime, sha in entries:
                if mtime < cutoff:
                    removed.append(path)
                    _remove_worktree_dir(repo, path)
                else:
                    fresh_entries.append((path, mtime, sha))
            entries = fresh_entries

        # Pass 2: enforce the keep limit, newest first. Protected entries count
        # against the limit but are always kept.
        entries.sort(key=lambda item: item[1], reverse=True)
        kept_entries: list[tuple[Path, float, str]] = list(protected_entries)

        if keep_limit is not None:
            available = keep_limit - len(protected_entries)
            if available <= 0:
                to_remove = entries
            else:
                kept_entries.extend(entries[:available])
                to_remove = entries[available:]
        else:
            kept_entries.extend(entries)
            to_remove = []

        for entry in to_remove:
            path = entry[0]
            removed.append(path)
            _remove_worktree_dir(repo, path)

        # Pass 3: while free space is below the floor, remove the oldest
        # remaining unprotected entries one at a time.
        if min_free is not None:
            removable = [entry for entry in kept_entries if entry[2] not in protected]
            removable.sort(key=lambda item: item[1])
            free_space = _free_gb(GIT_CACHE_DIR)
            idx = 0
            while free_space < min_free and idx < len(removable):
                entry = removable[idx]
                path = entry[0]
                idx += 1
                removed.append(path)
                _remove_worktree_dir(repo, path)
                free_space = _free_gb(GIT_CACHE_DIR)

        # Ask git to prune its worktree records after any removal.
        if repo is not None and removed:
            with suppress(Exception):
                repo.git.worktree("prune", "--expire=now")

    if removed:
        logger.debug("Removed %d worktree(s) for %s", len(removed), repo_name)

    return removed
219
+
220
+
221
def ensure_base_clone(repo_name: str) -> Repo:
    """Open the cached base clone of ``repo_name``, cloning it with ``--filter=blob:none`` if absent."""
    url = f"https://github.com/{repo_name}.git"
    path = base_clone_path(repo_name)
    path.parent.mkdir(parents=True, exist_ok=True)
    if path.exists():
        return Repo(path)
    return Repo.clone_from(url, path, multi_options=["--filter=blob:none"])
228
+
229
+
230
def ensure_mirror(repo_name: str) -> Path:
    """Return the path of the local bare mirror, creating or refreshing it.

    A missing mirror is cloned from GitHub; an existing one gets a
    best-effort remote update with pruning (errors are ignored).
    """
    url = f"https://github.com/{repo_name}.git"
    mpath = mirror_path(repo_name)
    mpath.parent.mkdir(parents=True, exist_ok=True)
    if mpath.exists():
        with suppress(Exception):
            existing = Repo(mpath)
            existing.remote().update(prune=True)
        return mpath
    Repo.clone_from(url, mpath, mirror=True, multi_options=["--filter=blob:none"])
    return mpath
241
+
242
+
243
def ensure_commit_available(repo: Repo, sha: str) -> None:
    """Fetch ``sha`` from ``origin`` unless ``repo`` can already resolve it."""
    try:
        repo.commit(sha)
    except Exception:
        already_present = False
    else:
        already_present = True
    if not already_present:
        repo.git.fetch("origin", sha)
249
+
250
+
251
def prepare_repo_checkout(repo_name: str, sha: str, tmp_root: Path) -> tuple[Repo, Path, Callable[[], None]]:
    """Prefer a worktree from a cached base clone; fall back to a reference clone.

    Args:
        repo_name: GitHub ``owner/name`` of the repository.
        sha: Commit to check out.
        tmp_root: Directory under which the fallback clone is created
            (as ``tmp_root / "repo"``).

    Returns:
        ``(repo, checkout_dir, cleanup)``. ``cleanup`` removes the worktree
        when the worktree path was used and is a no-op for the fallback clone.

    Checking out ``sha`` is best-effort in both paths: a failed reset or
    checkout is logged, not raised.
    """
    persistent_root = worktree_root(repo_name)
    worktree_dir = persistent_root / sha
    lock = _get_worktree_lock(repo_name, sha)
    with lock:
        try:
            base_repo = ensure_base_clone(repo_name)
            cleanup_worktree_cache(repo_name, base_repo=base_repo, active_shas={sha})
            ensure_commit_available(base_repo, sha)
            worktree_dir.parent.mkdir(parents=True, exist_ok=True)
            git_dir = worktree_dir / ".git"
            if git_dir.exists():
                # Reuse the existing worktree, moving it to ``sha`` if needed.
                wt_repo = Repo(worktree_dir)
                try:
                    current = wt_repo.head.commit.hexsha
                except Exception:
                    current = None
                if current != sha:
                    try:
                        wt_repo.git.reset("--hard", sha)
                    except Exception as exc:
                        logger.warning(
                            "Could not reset worktree %s to %s (currently %s): %s",
                            worktree_dir,
                            sha,
                            current,
                            exc,
                        )
            else:
                # Clear any leftover directory and stale git bookkeeping
                # before adding a fresh detached worktree.
                if worktree_dir.exists():
                    shutil.rmtree(worktree_dir)
                with suppress(Exception):
                    base_repo.git.worktree("prune", "--expire=now")
                base_repo.git.worktree("add", "--detach", str(worktree_dir), sha)
                wt_repo = Repo(worktree_dir)

            def _cleanup_worktree() -> None:
                with suppress(Exception):
                    base_repo.git.worktree("remove", str(worktree_dir), "--force")
                with suppress(Exception):
                    if worktree_dir.exists():
                        shutil.rmtree(worktree_dir)
                with suppress(Exception):
                    base_repo.git.worktree("prune")

            return wt_repo, worktree_dir, _cleanup_worktree
        except Exception as exc:
            # Record why the worktree path failed instead of dropping it silently.
            logger.debug(
                "Worktree checkout of %s@%s failed; falling back to reference clone: %s",
                repo_name,
                sha,
                exc,
            )

    # Reference clone fallback
    repo_dir = tmp_root / "repo"
    mirror = ensure_mirror(repo_name)
    url = f"https://github.com/{repo_name}.git"
    repo = Repo.clone_from(
        url,
        to_path=repo_dir,
        reference=str(mirror),
        multi_options=["--filter=blob:none", "--no-tags"],
    )
    ensure_commit_available(repo, sha)
    try:
        repo.git.checkout(sha)
    except Exception as exc:
        logger.warning("Could not check out %s in fallback clone %s: %s", sha, repo_dir, exc)

    def _cleanup_refclone() -> None:
        return None

    return repo, repo_dir, _cleanup_refclone
311
+
312
+
313
def base_tmp_for_commit(commit: Commit) -> Path:
    """Return the working-tree directory of the repository that owns ``commit``.

    Raises:
        ValueError: If that repository has no working tree directory.
    """
    tree_dir = commit.repo.working_tree_dir
    if tree_dir is not None:
        return Path(tree_dir)
    raise ValueError("Commit repository has no working tree directory")
319
+
320
+
321
def materialize_blobs(
    commit: Commit,
    predicate: Callable[[str], bool],
    out_dirname: str,
) -> dict[str, Path]:
    """Copy matching blobs from <commit> into a workspace-local folder.

    Args:
        commit: Commit whose tree is traversed.
        predicate: Called with each blob's repository-relative path; the blob
            is written out when it returns true.
        out_dirname: Folder name, created under ``base_tmp_for_commit(commit)``,
            that receives the files.

    Returns:
        Mapping of repository-relative path to the written file's location.
    """
    base = base_tmp_for_commit(commit) / out_dirname
    base.mkdir(parents=True, exist_ok=True)
    out: dict[str, Path] = {}
    for raw_item in commit.tree.traverse():
        item = cast(Any, raw_item)
        if getattr(item, "type", None) != "blob":
            continue
        relpath = cast(str, getattr(item, "path", ""))
        if not predicate(relpath):
            continue
        dst = base / relpath
        parent = dst.parent
        # A stray file where the parent directory should be blocks mkdir.
        try:
            if parent.exists() and parent.is_file():
                parent.unlink()
        except Exception as e:
            logger.debug("Failed to remove parent file %s: %s", parent, e)
        parent.mkdir(parents=True, exist_ok=True)
        # Likewise, a stray directory at the destination blocks the write.
        try:
            if dst.exists() and dst.is_dir():
                shutil.rmtree(dst)
        except Exception as e:
            logger.debug("Failed to remove directory %s: %s", dst, e)
        data_stream = getattr(item, "data_stream", None)
        if data_stream is None:
            continue
        # Write the blob bytes directly; no intermediate in-memory buffer.
        dst.write_bytes(data_stream.read())
        out[relpath] = dst
    return out
356
+
357
+
358
def read_blob_text(commit: Commit, relpath: str, default: str | None = None) -> str | None:
    """Return the UTF-8 text of ``relpath`` in ``commit``, or ``default``.

    ``default`` is returned when the path is missing, is not a blob, has no
    readable byte stream, or any exception occurs. Undecodable bytes are
    replaced, not raised.
    """
    try:
        node = cast(Any, commit.tree / relpath)
        is_blob = getattr(node, "type", None) == "blob"
        stream = getattr(node, "data_stream", None) if is_blob else None
        if stream is None:
            return default
        payload = stream.read()
        if isinstance(payload, (bytes, bytearray)):
            return bytes(payload).decode("utf-8", errors="replace")
        return default
    except Exception:
        return default
373
+
374
+
375
def asv_finder(commit: Commit) -> list[Path]:
    """Write every blob whose path matches ``ASV_REGEX`` into ``_asv_blobs`` and list the files."""

    def _matches_asv(rel: str) -> bool:
        return bool(ASV_REGEX.search(rel))

    written = materialize_blobs(commit, _matches_asv, out_dirname="_asv_blobs")
    return [*written.values()]
@@ -0,0 +1,66 @@
1
+ """Analyzing Python imports to infer runtime dependencies."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import ast
6
+ from pathlib import Path
7
+
8
+ from datasmith.utils import get_logger
9
+
10
+ from .constants import NOT_REQUIREMENTS, SPECIAL_IMPORT_TO_PYPI, STDLIB
11
+
12
+ logger = get_logger("resolution.import_analyzer")
13
+
14
+
15
def top_level_imports_under(root: Path) -> set[str]:  # noqa: C901
    """Parse all .py files under root and return top-level imported module names.

    Files located in a test, docs, benchmark or build directory *below*
    ``root`` are skipped. Relative imports (``from . import x``,
    ``from .mod import y``) are ignored because they refer to the project's
    own modules, not to installable packages. Files that cannot be read or
    parsed are skipped with a debug log entry.

    Args:
        root: Directory that is searched recursively for ``*.py`` files.

    Returns:
        The first dotted component of every absolute import found.
    """
    skip_dirs = {"tests", "test", "testing", "benchmarks", "doc", "docs", ".eggs", ".tox", "build", "dist"}
    names: set[str] = set()
    for path in root.rglob("*.py"):
        # Match only components below root, so a checkout that itself lives
        # under e.g. ".../build/..." is not skipped wholesale.
        rel_parts = set(path.relative_to(root).parts)
        if skip_dirs & rel_parts:
            continue
        try:
            src = path.read_text(encoding="utf-8", errors="replace")
        except Exception as e:
            logger.debug("Failed to read %s: %s", path, e)
            continue
        try:
            tree = ast.parse(src, filename=str(path))
        except Exception as e:
            logger.debug("Failed to parse %s: %s", path, e)
            continue
        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                for alias in node.names:
                    mod = (alias.name or "").split(".", 1)[0]
                    if mod:
                        names.add(mod)
            elif isinstance(node, ast.ImportFrom):
                if node.level:
                    # Any relative import targets the package itself.
                    continue
                mod = (node.module or "").split(".", 1)[0]
                if mod:
                    names.add(mod)
    return names
46
+
47
+
48
def infer_runtime_from_imports(project_dir: Path, own_import_name: str | None) -> set[str]:
    """Map the project's top-level imports to candidate PyPI package names.

    Modules found in ``STDLIB`` (compared lower-cased), the project's own name
    (including its ``-``/``_`` spelling variants) and anything listed in
    ``NOT_REQUIREMENTS`` are dropped. The rest are translated through
    ``SPECIAL_IMPORT_TO_PYPI``, falling back to the module name itself.
    """
    own_names: set[str] = set()
    if own_import_name:
        own_names = {
            own_import_name,
            own_import_name.replace("-", "_"),
            own_import_name.replace("_", "-"),
        }
    return {
        SPECIAL_IMPORT_TO_PYPI.get(mod, mod)
        for mod in top_level_imports_under(project_dir)
        if mod.lower() not in STDLIB and mod not in own_names and mod not in NOT_REQUIREMENTS
    }
66
+ return out