fc-data 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. datasmith/__init__.py +330 -0
  2. datasmith/__init__.pyi +194 -0
  3. datasmith/agents/__init__.py +31 -0
  4. datasmith/agents/classifiers.py +272 -0
  5. datasmith/agents/codex.py +25 -0
  6. datasmith/agents/config.py +108 -0
  7. datasmith/agents/extractors.py +197 -0
  8. datasmith/agents/installed/README.md +52 -0
  9. datasmith/agents/installed/__init__.py +22 -0
  10. datasmith/agents/installed/base.py +240 -0
  11. datasmith/agents/installed/claude.py +134 -0
  12. datasmith/agents/installed/codex.py +91 -0
  13. datasmith/agents/installed/gemini.py +118 -0
  14. datasmith/agents/installed/none.py +27 -0
  15. datasmith/agents/sandbox.py +547 -0
  16. datasmith/agents/synthesizer.py +439 -0
  17. datasmith/agents/templates/AGENTS.md.j2 +150 -0
  18. datasmith/agents/templates/sandbox_verify.py +428 -0
  19. datasmith/docker/__init__.py +31 -0
  20. datasmith/docker/context.py +112 -0
  21. datasmith/docker/images.py +158 -0
  22. datasmith/docker/publish.py +56 -0
  23. datasmith/docker/templates/Dockerfile.base +26 -0
  24. datasmith/docker/templates/Dockerfile.pr +42 -0
  25. datasmith/docker/templates/Dockerfile.repo +11 -0
  26. datasmith/docker/templates/docker_build_base.sh +780 -0
  27. datasmith/docker/templates/docker_build_env.sh +309 -0
  28. datasmith/docker/templates/docker_build_final.sh +106 -0
  29. datasmith/docker/templates/docker_build_pkg.sh +99 -0
  30. datasmith/docker/templates/docker_build_run.sh +124 -0
  31. datasmith/docker/templates/entrypoint.sh +62 -0
  32. datasmith/docker/templates/parser.py +1405 -0
  33. datasmith/docker/templates/profile.sh +199 -0
  34. datasmith/docker/templates/pytest_runner.py +692 -0
  35. datasmith/docker/templates/run-tests.sh +197 -0
  36. datasmith/docker/verifiers.py +131 -0
  37. datasmith/filters.py +154 -0
  38. datasmith/github/__init__.py +22 -0
  39. datasmith/github/client.py +333 -0
  40. datasmith/github/hooks.py +50 -0
  41. datasmith/github/links.py +110 -0
  42. datasmith/github/models.py +206 -0
  43. datasmith/github/render.py +173 -0
  44. datasmith/github/search.py +66 -0
  45. datasmith/github/templates/comment.md.j2 +5 -0
  46. datasmith/github/templates/final.md.j2 +66 -0
  47. datasmith/github/templates/issues.md.j2 +21 -0
  48. datasmith/github/templates/repo.md.j2 +1 -0
  49. datasmith/preflight.py +162 -0
  50. datasmith/publish/__init__.py +13 -0
  51. datasmith/publish/huggingface.py +104 -0
  52. datasmith/publish/pipeline.py +60 -0
  53. datasmith/publish/records.py +91 -0
  54. datasmith/py.typed +1 -0
  55. datasmith/resolution/__init__.py +14 -0
  56. datasmith/resolution/blocklist.py +145 -0
  57. datasmith/resolution/cache.py +120 -0
  58. datasmith/resolution/constants.py +277 -0
  59. datasmith/resolution/dependency_resolver.py +174 -0
  60. datasmith/resolution/git_utils.py +378 -0
  61. datasmith/resolution/import_analyzer.py +66 -0
  62. datasmith/resolution/metadata_parser.py +412 -0
  63. datasmith/resolution/models.py +41 -0
  64. datasmith/resolution/orchestrator.py +522 -0
  65. datasmith/resolution/package_filters.py +312 -0
  66. datasmith/resolution/python_manager.py +110 -0
  67. datasmith/runners/__init__.py +15 -0
  68. datasmith/runners/base.py +112 -0
  69. datasmith/runners/classify_prs.py +48 -0
  70. datasmith/runners/render_problems.py +113 -0
  71. datasmith/runners/resolve_packages.py +66 -0
  72. datasmith/runners/scrape_commits.py +166 -0
  73. datasmith/runners/scrape_repos.py +44 -0
  74. datasmith/runners/synthesize_images.py +310 -0
  75. datasmith/update/__init__.py +5 -0
  76. datasmith/update/cli.py +169 -0
  77. datasmith/update/offline.py +173 -0
  78. datasmith/update/pipeline.py +497 -0
  79. datasmith/utils/__init__.py +18 -0
  80. datasmith/utils/core.py +67 -0
  81. datasmith/utils/db.py +156 -0
  82. datasmith/utils/tokens.py +65 -0
  83. fc_data-0.2.0.dist-info/METADATA +441 -0
  84. fc_data-0.2.0.dist-info/RECORD +87 -0
  85. fc_data-0.2.0.dist-info/WHEEL +4 -0
  86. fc_data-0.2.0.dist-info/entry_points.txt +2 -0
  87. fc_data-0.2.0.dist-info/licenses/LICENSE +28 -0
@@ -0,0 +1,113 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ from typing import Any
5
+
6
+ from datasmith.runners.base import BaseRunner
7
+ from datasmith.utils import get_client, get_logger
8
+
9
+ logger = get_logger("runners.render_problems")
10
+
11
+
12
class RenderProblemsRunner(BaseRunner):
    """Scrape linked issues, run ProblemExtractor, and persist deconstructed context.

    For each PR this runner:
      1. BFS-scrapes linked GitHub issues via the GitHub API.
      2. Runs :class:`~datasmith.agents.extractors.ProblemExtractor` (DSPy) once to
         split the PR body into four structured fields.
      3. Renders the full Jinja2 problem-statement template.
      4. Upserts everything into the ``candidate_prs`` table (raw components + rendered).
      5. Updates ``pull_requests.rendered_problem`` and the new
         ``pull_requests.problem_description`` column (problem-only text).
    """

    def __init__(self, gh: Any, n_concurrent: int = 5) -> None:
        """Keep a reference to the GitHub client and configure the base runner.

        Args:
            gh: Client exposing ``get_issue_expanded`` (handed to ``scrape_links``).
            n_concurrent: Concurrency setting forwarded to :class:`BaseRunner`.
        """
        super().__init__(name="render_problems", n_concurrent=n_concurrent)
        self._gh = gh

    async def _process_item(self, item: Any) -> None:
        """Render the problem statement for a single PR dict.

        Args:
            item: Mapping with required keys ``owner``, ``repo`` and
                ``issue_number``. ``title``, ``body``, ``created_at``,
                ``merge_commit_sha`` and ``repo_description`` are optional.

        Raises:
            KeyError: If a required key is missing from ``item``.
        """
        owner: str = item["owner"]
        repo: str = item["repo"]
        issue_number: int = item["issue_number"]
        # ``.get(key, "")`` still returns None when the key is present with a
        # None value, so every optional text field is coalesced the same way.
        merge_commit_sha: str = item.get("merge_commit_sha", "") or ""
        repo_description: str = item.get("repo_description", "") or ""
        title: str = item.get("title", "") or ""
        body: str = item.get("body", "") or ""

        from datasmith.agents.extractors import ProblemExtractor
        from datasmith.github.links import scrape_links
        from datasmith.github.models import PR
        from datasmith.github.render import render_problem_statement

        pr = PR(
            repository=f"{owner}/{repo}",
            issue_number=issue_number,
            title=title,
            body=body,
            created_at=item.get("created_at"),
        )

        # BFS-scrape linked issues (async GitHub API calls)
        issues = await scrape_links(
            pr,
            self._gh.get_issue_expanded,
            depth=2,
            only_issues=True,
            limit=6,
        )

        logger.info(
            "Scraped %d linked issues for %s/%s#%d",
            len(issues),
            owner,
            repo,
            issue_number,
        )

        # Run ProblemExtractor once (DSPy LLM call — run in thread)
        extraction = await asyncio.to_thread(
            ProblemExtractor().extract_problem,
            title,
            body,
        )
        problem_description = extraction.to_problem_markdown()

        # Render the full problem statement using the pre-extracted observations
        # (pass initial_observations to avoid a second ProblemExtractor call)
        rendered = await asyncio.to_thread(
            render_problem_statement,
            pr,
            issues=issues,
            repo_description=repo_description,
            anonymize=True,
            extract=False,
            initial_observations=problem_description or getattr(pr, "body", ""),
        )

        # Serialize linked issues for storage (mode="json" converts datetime → ISO string)
        issues_json = [issue.model_dump(mode="json") for issue in issues]

        # Upsert all raw components + rendered output into candidate_prs
        client = get_client()
        client.table("candidate_prs").upsert({
            "owner": owner,
            "repo": repo,
            "issue_number": issue_number,
            "merge_commit_sha": merge_commit_sha,
            "repo_description": repo_description,
            "issues_json": issues_json,
            "initial_observations": extraction.initial_observations,
            "triage_attempts": extraction.triage_attempts,
            "solution_overview": extraction.solution_overview,
            "solution_observations": extraction.solution_observations,
            "rendered_problem": rendered,
        }).execute()

        # Keep pull_requests in sync: rendered_problem (used by synthesize_images)
        # and problem_description (problem-only extracted text)
        client.table("pull_requests").update({
            "rendered_problem": rendered,
            "problem_description": problem_description,
        }).eq("owner", owner).eq("repo", repo).eq("issue_number", issue_number).execute()

        logger.info("Rendered problem context for %s/%s#%d", owner, repo, issue_number)
@@ -0,0 +1,66 @@
1
+ """Runner for resolving Python dependencies for classified PRs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import functools
7
+ import json
8
+ from typing import Any
9
+
10
+ from datasmith.runners.base import BaseRunner
11
+ from datasmith.utils import get_client, get_logger
12
+
13
+ logger = get_logger("runners.resolve_packages")
14
+
15
+
16
class ResolvePackagesRunner(BaseRunner):
    """Resolve dependencies for classified PRs and persist to the packages table."""

    def __init__(self, n_concurrent: int = 16) -> None:
        """Configure the base runner under the name ``resolve_packages``."""
        super().__init__(name="resolve_packages", n_concurrent=n_concurrent)

    async def _process_item(self, item: Any) -> None:
        """Resolve one commit and upsert the outcome into ``packages``.

        Args:
            item: Mapping with the keys ``owner``, ``repo`` and ``sha``.

        Raises:
            KeyError: If one of the required keys is missing from ``item``.
        """
        owner = item["owner"]
        repo = item["repo"]
        sha = item["sha"]

        from datasmith.resolution import analyze_commit

        # analyze_commit is a blocking call, so it runs on the default executor.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, functools.partial(analyze_commit, sha, f"{owner}/{repo}"))

        if result is None:
            logger.info("Resolution returned None for %s/%s@%s", owner, repo, sha[:8])
            return

        # A key that is present with a None value bypasses the ``.get`` default;
        # coalesce once so the stored payload is always a JSON list and the
        # summary log below cannot fail on ``len(None)`` after the upsert.
        dependencies = result.get("final_dependencies") or []
        can_install = result.get("can_install", False)

        row = {
            "owner": owner,
            "repo": repo,
            "sha": sha,
            "package_name": result.get("package_name"),
            "package_version": result.get("package_version"),
            "python_version": result.get("python_version", ""),
            "env_payload": json.dumps(dependencies),
            "build_commands": result.get("build_command"),
            "install_commands": result.get("install_command"),
            "primary_root": result.get("primary_root"),
            "resolution_strategy": result.get("resolution_strategy"),
            "can_install": can_install,
            "requires_python": None,
        }

        # The client is only needed once there is a row to write.
        get_client().table("packages").upsert(row).execute()
        logger.info(
            "Resolved %s/%s@%s: python=%s can_install=%s deps=%d",
            owner,
            repo,
            sha[:8],
            result.get("python_version"),
            can_install,
            len(dependencies),
        )
@@ -0,0 +1,166 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime, timezone
4
+ from typing import Any
5
+
6
+ from datasmith.filters import symbolic_compliance
7
+ from datasmith.runners.base import BaseRunner
8
+ from datasmith.utils import get_client, get_logger
9
+ from datasmith.utils.db import fetch_all
10
+
11
+ logger = get_logger("runners.scrape_commits")
12
+
13
+
14
+ def _parse_iso(value: str | None) -> datetime | None:
15
+ """Parse an ISO-8601 date or datetime string to a timezone-aware datetime."""
16
+ if not value:
17
+ return None
18
+ # Handle date-only strings like "2024-01-01"
19
+ if "T" not in value:
20
+ return datetime.fromisoformat(value).replace(tzinfo=timezone.utc)
21
+ # Handle full ISO datetime strings (with or without trailing Z)
22
+ cleaned = value.replace("Z", "+00:00")
23
+ return datetime.fromisoformat(cleaned)
24
+
25
+
26
+ def _should_skip_pr(
27
+ pr_data: dict[str, Any],
28
+ since: datetime | None,
29
+ until: datetime | None,
30
+ seen_shas: set[str],
31
+ ) -> bool | str:
32
+ """Return False if the PR should be processed, 'skip' to skip, 'stop' to halt pagination."""
33
+ if since:
34
+ created = _parse_iso(pr_data.get("created_at"))
35
+ if created and created < since:
36
+ return "stop"
37
+
38
+ if not pr_data.get("merged_at"):
39
+ return "skip"
40
+
41
+ merged = _parse_iso(pr_data.get("merged_at"))
42
+ if merged:
43
+ if since and merged < since:
44
+ return "skip"
45
+ if until and merged >= until:
46
+ return "skip"
47
+
48
+ sha = pr_data.get("merge_commit_sha", "")
49
+ if sha:
50
+ if sha in seen_shas:
51
+ return "skip"
52
+ seen_shas.add(sha)
53
+
54
+ return False
55
+
56
+
57
+ def _sanitize_text(value: str) -> str:
58
+ """Strip Postgres-illegal null bytes from text values."""
59
+ return value.replace("\u0000", "")
60
+
61
+
62
async def _build_record(
    gh: Any,
    owner: str,
    repo: str,
    pr_data: dict[str, Any],
) -> dict[str, Any]:
    """Fetch diff/files and build the upsert record for a single PR.

    Args:
        gh: Client exposing awaitable ``get_diff`` and ``get_files``.
        owner: Repository owner.
        repo: Repository name.
        pr_data: Raw PR payload; ``number`` is required, all other keys optional.

    Returns:
        The row to upsert. ``patch`` and ``file_changes`` are included only when
        the corresponding fetch returned something.

    Raises:
        KeyError: If ``pr_data`` has no ``number`` or a label lacks ``name``.
    """
    issue_number = pr_data["number"]
    # A key that is present with a None value bypasses the ``.get`` default, so
    # coalesce explicitly — the same guard ``body`` already uses below.
    title = _sanitize_text(pr_data.get("title") or "")

    diff = await gh.get_diff(owner, repo, issue_number)
    if diff:
        diff = _sanitize_text(diff)
    files = await gh.get_files(owner, repo, issue_number)

    file_changes: list[dict[str, Any]] | None = None
    if files:
        file_changes = [
            {
                "filename": f.get("filename", ""),
                "additions": f.get("additions", 0),
                "deletions": f.get("deletions", 0),
            }
            for f in files
        ]

    record: dict[str, Any] = {
        "owner": owner,
        "repo": repo,
        "issue_number": issue_number,
        "title": title,
        "body": _sanitize_text(pr_data.get("body", "") or ""),
        "state": pr_data.get("state", ""),
        "created_at": pr_data.get("created_at"),
        "merged_at": pr_data.get("merged_at"),
        "closed_at": pr_data.get("closed_at"),
        "merge_commit_sha": pr_data.get("merge_commit_sha", ""),
        # ``base``/``head``/``labels`` get the same None-safe treatment.
        "base_sha": (pr_data.get("base") or {}).get("sha", ""),
        "head_sha": (pr_data.get("head") or {}).get("sha", ""),
        "labels": [label["name"] for label in pr_data.get("labels") or []],
        "is_performance_commit_symbolic": symbolic_compliance(
            title=title,
            patch=diff or None,
            file_changes=file_changes,
        ),
    }
    # Optional columns are left out of the upsert entirely rather than sent empty.
    if diff:
        record["patch"] = diff
    if file_changes:
        record["file_changes"] = file_changes
    return record
113
+
114
+
115
class ScrapeCommitsRunner(BaseRunner):
    """Scrape PRs for each repo, run compliance hooks, store in pull_requests table."""

    def __init__(
        self,
        github_client: Any,
        n_concurrent: int = 5,
        since: str | None = None,
        until: str | None = None,
    ) -> None:
        """Store the GitHub client and parse the optional ISO window bounds."""
        super().__init__(name="scrape_commits", n_concurrent=n_concurrent)
        self._gh = github_client
        self._since = _parse_iso(since)
        self._until = _parse_iso(until)

    async def _process_item(self, item: Any) -> None:
        """Scrape the merged PRs of one repo, given as ``(owner, repo)`` or ``"owner/repo"``."""
        if isinstance(item, tuple):
            owner, repo = item
        else:
            owner, repo = item.split("/")

        # PRs already stored for this repo are skipped before any diff/file fetch.
        stored_rows = fetch_all(
            "pull_requests",
            select="issue_number",
            filters={"owner": owner, "repo": repo},
        )
        known_numbers: set[int] = {row["issue_number"] for row in stored_rows}

        seen_shas: set[str] = set()
        client = get_client()
        count = 0
        halted = False

        async for page in self._gh.paginate_merged_prs(owner, repo):
            for pr_data in page:
                verdict = _should_skip_pr(pr_data, self._since, self._until, seen_shas)
                if verdict == "stop":
                    halted = True
                    break
                if verdict == "skip" or pr_data["number"] in known_numbers:
                    continue

                record = await _build_record(self._gh, owner, repo, pr_data)
                client.table("pull_requests").upsert(record).execute()
                count += 1

            # A "stop" verdict ends pagination, not just the current page.
            if halted:
                break

        logger.info("Scraped %d merged PRs for %s/%s", count, owner, repo)
@@ -0,0 +1,44 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from datasmith.runners.base import BaseRunner
6
+ from datasmith.utils import get_client, get_logger
7
+
8
+ logger = get_logger("runners.scrape_repos")
9
+
10
+
11
class ScrapeReposRunner(BaseRunner):
    """Fetch metadata for GitHub repositories and store it in the ``repositories`` table."""

    def __init__(self, github_client: Any, n_concurrent: int = 5) -> None:
        """Store the GitHub client and configure the base runner."""
        super().__init__(name="scrape_repos", n_concurrent=n_concurrent)
        self._gh = github_client

    async def _process_item(self, item: Any) -> None:
        """Store one repo, given as ``(owner, repo)`` or ``"owner/repo"``, unless already present."""
        if isinstance(item, tuple):
            owner, repo = item
        else:
            owner, repo = item.split("/")

        # Repos that already have a row are left untouched.
        client = get_client()
        existing = client.table("repositories").select("owner").eq("owner", owner).eq("repo", repo).execute()
        if existing.data:
            logger.debug("Skipping existing repo: %s/%s", owner, repo)
            return

        # A None response from the client means there is nothing to store.
        response = await self._gh._request("GET", f"/repos/{owner}/{repo}")
        if response is None:
            return

        payload = response.json()
        row = {
            "owner": owner,
            "repo": repo,
            "url": payload.get("html_url", ""),
            "language": payload.get("language", ""),
            "stars": payload.get("stargazers_count", 0),
            "topics": payload.get("topics", []),
            "description": payload.get("description", ""),
        }
        client.table("repositories").upsert(row).execute()
        logger.info("Stored repo: %s/%s", owner, repo)
@@ -0,0 +1,310 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import tempfile
5
+ import threading
6
+ from typing import Any
7
+
8
+ from datasmith.agents.synthesizer import Synthesizer
9
+ from datasmith.runners.base import BaseRunner
10
+ from datasmith.utils import get_client, get_logger
11
+
12
+ logger = get_logger("runners.synthesize_images")
13
+
14
+
15
def _ensure_prerequisite_images(owner: str, repo: str, py_version: str = "") -> None:
    """Build the base image and the repo image when either is not present.

    Presence is checked through ``ImageManager.image_exists``; the base image
    is handled first, then the repo image for ``owner``/``repo``.
    """
    from datasmith.docker.images import ImageManager, get_base_image_name, get_repo_image_name

    manager = ImageManager()
    base_image = get_base_image_name()
    repo_image = get_repo_image_name(owner, repo)

    base_present = manager.image_exists(base_image)
    if not base_present:
        logger.info("Building missing base image: %s", base_image)
        manager.build_base_image(py_version=py_version)

    repo_present = manager.image_exists(repo_image)
    if not repo_present:
        logger.info("Building missing repo image: %s", repo_image)
        manager.build_repo_image(owner, repo, py_version=py_version)
34
+
35
+
36
def _build_and_push_pr_image(
    owner: str,
    repo: str,
    issue_number: int,
    sha: str,
    env_payload: str,
    docker_context: Any | None = None,
    python_version: str = "",
) -> str:
    """Build the final PR image and push it (and, best-effort, the repo image).

    Args:
        owner: Repository owner.
        repo: Repository name.
        issue_number: PR number used to derive the image tag.
        sha: Commit to build; an empty value falls back to ``"HEAD"``.
        env_payload: Dependency payload; an empty value falls back to ``"[]"``.
        docker_context: Optional synthesized context exposing ``to_directory``.
            When given, it is written to a temporary directory, backfilled with
            any missing built-in scripts, and used as the build context.
        python_version: Python version forwarded to the image build.

    Returns:
        The pushed PR image tag.

    Raises:
        Exception: Whatever the PR image build or the PR image push raises.
            A failure to push the *repo* image is logged and ignored.
    """
    from datasmith.docker.images import ImageManager, get_pr_image_name, get_repo_image_name
    from datasmith.docker.publish import DockerHubPublisher

    mgr = ImageManager()
    pr_tag = get_pr_image_name(owner, repo, issue_number)

    # Shared by both build paths so the two calls cannot drift apart.
    build_kwargs = {
        "commit_sha": sha or "HEAD",
        "env_payload": env_payload or "[]",
        "py_version": python_version,
    }

    if docker_context is not None:
        # The build must finish inside the ``with``: the directory is removed on exit.
        with tempfile.TemporaryDirectory(prefix="docker-ctx-") as tmpdir:
            docker_context.to_directory(tmpdir)
            _fill_missing_scripts(tmpdir, base_commit=sha)
            mgr.build_pr_image(owner, repo, issue_number, context=tmpdir, **build_kwargs)
    else:
        mgr.build_pr_image(owner, repo, issue_number, **build_kwargs)

    publisher = DockerHubPublisher()
    repo_tag = get_repo_image_name(owner, repo)

    try:
        publisher.push(repo_tag)
    except Exception:
        # Deliberately best-effort, but keep the traceback so the cause is diagnosable.
        logger.warning("Failed to push repo image %s (non-fatal)", repo_tag, exc_info=True)

    publisher.push(pr_tag)
    logger.info("Pushed PR image: %s", pr_tag)
    return pr_tag
93
+
94
+
95
def _render_run_tests_sh(docker_templates: Any, base_commit: str) -> str:
    """Render the run-tests.sh Jinja2 template with embedded scripts.

    Args:
        docker_templates: Directory (path-like) holding ``run-tests.sh``,
            ``pytest_runner.py`` and ``parser.py``.
        base_commit: Value exposed to the template as ``base_commit``.

    Returns:
        The rendered script text.

    Raises:
        FileNotFoundError: If ``pytest_runner.py`` or ``parser.py`` is missing.
        jinja2.TemplateNotFound: If ``run-tests.sh`` is missing.
    """
    from pathlib import Path

    from jinja2 import Environment, FileSystemLoader

    template_dir = Path(docker_templates)
    env = Environment(
        loader=FileSystemLoader(str(template_dir)),
        keep_trailing_newline=True,
        # The output is a shell script, not markup, so nothing must be escaped.
        autoescape=False,
    )
    template = env.get_template("run-tests.sh")

    # Read the embedded scripts as UTF-8 explicitly: the loader decodes the
    # template as UTF-8 by default, whereas a bare ``read_text()`` would use
    # the locale encoding and could decode the same tree differently.
    pytest_runner = (template_dir / "pytest_runner.py").read_text(encoding="utf-8")
    parser = (template_dir / "parser.py").read_text(encoding="utf-8")

    return template.render(
        base_commit=base_commit,
        pytest_runner=pytest_runner,
        parser=parser,
        run_pytest=True,
    )
118
+
119
+
120
def _fill_missing_scripts(context_dir: str, base_commit: str = "") -> None:
    """Backfill a build context with any required file it does not already contain.

    Synthesized contexts may only contain a subset of the expected files
    (e.g. only ``build_pkg_sh``). The Dockerfile.pr ``COPY`` directives
    require every file to be present, so anything the synthesizer didn't
    produce is taken from the built-in templates. Files already present in
    ``context_dir`` are never overwritten.

    ``run-tests.sh`` is a Jinja2 template that requires rendering with
    ``base_commit`` and embedded Python scripts before it can be used, so it
    is rendered rather than copied.

    Args:
        context_dir: Directory holding the Docker build context.
        base_commit: Commit passed to the ``run-tests.sh`` template.
    """
    import shutil
    from pathlib import Path

    templates = Path(__file__).parents[1] / "docker" / "templates"
    context = Path(context_dir)

    # Dockerfile.pr itself plus every file it references via COPY
    required = (
        "Dockerfile.pr",
        "docker_build_env.sh",
        "docker_build_pkg.sh",
        "docker_build_run.sh",
        "docker_build_final.sh",
        "profile.sh",
        "run-tests.sh",
        "entrypoint.sh",
    )

    for fname in required:
        target = context / fname
        if target.exists():
            continue
        if fname == "run-tests.sh":
            # run-tests.sh is a Jinja2 template — render it instead of copying raw.
            # Written as UTF-8 explicitly so the result does not depend on the locale.
            rendered = _render_run_tests_sh(templates, base_commit=base_commit)
            target.write_text(rendered, encoding="utf-8")
            continue
        src = templates / fname
        if src.exists():
            shutil.copy2(src, target)
        else:
            # Still non-fatal, but surface it: the later image build is where
            # the absence would otherwise first show up.
            logger.warning("Built-in template %s not found; %s was not backfilled", src, fname)
162
+
163
+
164
# Lock to serialize prerequisite image builds (base + repo) across threads.
# Building these is expensive and they're shared, so we avoid duplicate work.
_prereq_lock = threading.Lock()
# Track repos whose prerequisite images are confirmed present.
_prereq_done: set[tuple[str, str]] = set()


class SynthesizeImagesRunner(BaseRunner):
    """Run Synthesizer for each PR to produce Docker build contexts."""

    def __init__(
        self,
        synthesizer: Synthesizer,
        gh: Any | None = None,
        n_concurrent: int = 3,
    ) -> None:
        """Store the synthesizer and the optional GitHub client.

        Args:
            synthesizer: Object whose ``run`` method produces the build context.
            gh: Optional GitHub client; without it problem rendering is skipped.
            n_concurrent: Concurrency setting forwarded to :class:`BaseRunner`.
        """
        super().__init__(name="synthesize_images", n_concurrent=n_concurrent)
        self._synthesizer = synthesizer
        self._gh = gh  # GitHubClient, optional — needed for rendering problem statements

    async def _render_problem(self, item: dict[str, Any]) -> str | None:
        """Render the problem statement for a PR, scraping linked issues.

        Returns the rendered markdown, or ``None`` when no GitHub client was
        supplied. Errors raised while scraping, rendering or persisting are
        not caught here and propagate to the caller.
        """
        if self._gh is None:
            return None

        owner: str = item["owner"]
        repo: str = item["repo"]
        issue_number: int = item["issue_number"]

        from datasmith.github.links import scrape_links
        from datasmith.github.models import PR
        from datasmith.github.render import render_problem_statement

        # Build a PR object for scrape_links and render_problem_statement.
        # ``.get(key, "")`` still yields None for a key stored as None, so the
        # optional text fields are coalesced to "".
        pr = PR(
            repository=f"{owner}/{repo}",
            issue_number=issue_number,
            title=item.get("title", "") or "",
            body=item.get("body", "") or "",
            created_at=item.get("created_at"),
        )

        # BFS-scrape linked issues (async GitHub API calls)
        issues = await scrape_links(
            pr,
            self._gh.get_issue_expanded,
            depth=2,
            only_issues=True,
            limit=6,
        )

        logger.info(
            "Scraped %d linked issues for %s/%s#%d",
            len(issues),
            owner,
            repo,
            issue_number,
        )

        # Render the problem statement (may invoke ProblemExtractor LLM — run in thread)
        repo_description: str = item.get("repo_description", "") or ""
        rendered = await asyncio.to_thread(
            render_problem_statement,
            pr,
            issues=issues,
            repo_description=repo_description,
            anonymize=True,
            extract=True,
        )

        # Persist to DB
        client = get_client()
        client.table("pull_requests").update({"rendered_problem": rendered}).eq("owner", owner).eq("repo", repo).eq(
            "issue_number", issue_number
        ).execute()

        logger.info("Rendered problem statement for %s/%s#%d", owner, repo, issue_number)
        return rendered

    async def _process_item(self, item: Any) -> None:
        """Synthesize, build and push the image for one PR dict.

        Args:
            item: Mapping with required keys ``owner``, ``repo`` and
                ``issue_number``. ``pr_context``, ``python_version``, ``sha``
                and ``env_payload`` are optional.

        Raises:
            KeyError: If a required key is missing from ``item``.
            RuntimeError: If the synthesizer returns ``None``.
        """
        owner = item["owner"]
        repo = item["repo"]
        issue_number = item["issue_number"]
        # Coalesce None to "" so downstream calls always receive strings.
        pr_context = item.get("pr_context", "") or ""
        py_version = item.get("python_version", "") or ""

        # Ensure base and repo images exist before synthesis needs them
        await asyncio.to_thread(self._ensure_prereqs, owner, repo, py_version)

        # Render the problem statement before synthesis (skip if already rendered)
        if not pr_context:
            rendered = await self._render_problem(item)
            if rendered:
                pr_context = rendered

        sha = item.get("sha", "") or ""
        env_payload = item.get("env_payload", "") or ""

        from datasmith.docker.images import get_repo_image_name

        repo_image = get_repo_image_name(owner, repo)

        # Run synthesizer in thread (Docker operations are blocking)
        ctx = await asyncio.to_thread(
            self._synthesizer.run,
            owner,
            repo,
            issue_number,
            pr_context,
            sha,
            repo_image=repo_image,
            env_payload=env_payload,
            python_version=py_version,
        )

        if ctx is None:
            raise RuntimeError(f"Synthesis failed for {owner}/{repo}#{issue_number}")

        logger.info("Successfully synthesized image for %s/%s#%d", owner, repo, issue_number)

        # Build the final PR image and push to DockerHub
        pr_tag = await asyncio.to_thread(
            _build_and_push_pr_image, owner, repo, issue_number, sha, env_payload, ctx, py_version
        )

        # Record the container name in Supabase
        client = get_client()
        client.table("pull_requests").update({"container_name": pr_tag}).eq("owner", owner).eq("repo", repo).eq(
            "issue_number", issue_number
        ).execute()

    @staticmethod
    def _ensure_prereqs(owner: str, repo: str, py_version: str) -> None:
        """Build base/repo images if missing, with dedup across threads."""
        key = (owner, repo)
        # Cheap check first so repos already handled never wait on the lock.
        if key in _prereq_done:
            return
        with _prereq_lock:
            # Re-check: another thread may have finished this repo while we waited.
            if key in _prereq_done:
                return
            _ensure_prerequisite_images(owner, repo, py_version)
            _prereq_done.add(key)