fc-data 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datasmith/__init__.py +330 -0
- datasmith/__init__.pyi +194 -0
- datasmith/agents/__init__.py +31 -0
- datasmith/agents/classifiers.py +272 -0
- datasmith/agents/codex.py +25 -0
- datasmith/agents/config.py +108 -0
- datasmith/agents/extractors.py +197 -0
- datasmith/agents/installed/README.md +52 -0
- datasmith/agents/installed/__init__.py +22 -0
- datasmith/agents/installed/base.py +240 -0
- datasmith/agents/installed/claude.py +134 -0
- datasmith/agents/installed/codex.py +91 -0
- datasmith/agents/installed/gemini.py +118 -0
- datasmith/agents/installed/none.py +27 -0
- datasmith/agents/sandbox.py +547 -0
- datasmith/agents/synthesizer.py +439 -0
- datasmith/agents/templates/AGENTS.md.j2 +150 -0
- datasmith/agents/templates/sandbox_verify.py +428 -0
- datasmith/docker/__init__.py +31 -0
- datasmith/docker/context.py +112 -0
- datasmith/docker/images.py +158 -0
- datasmith/docker/publish.py +56 -0
- datasmith/docker/templates/Dockerfile.base +26 -0
- datasmith/docker/templates/Dockerfile.pr +42 -0
- datasmith/docker/templates/Dockerfile.repo +11 -0
- datasmith/docker/templates/docker_build_base.sh +780 -0
- datasmith/docker/templates/docker_build_env.sh +309 -0
- datasmith/docker/templates/docker_build_final.sh +106 -0
- datasmith/docker/templates/docker_build_pkg.sh +99 -0
- datasmith/docker/templates/docker_build_run.sh +124 -0
- datasmith/docker/templates/entrypoint.sh +62 -0
- datasmith/docker/templates/parser.py +1405 -0
- datasmith/docker/templates/profile.sh +199 -0
- datasmith/docker/templates/pytest_runner.py +692 -0
- datasmith/docker/templates/run-tests.sh +197 -0
- datasmith/docker/verifiers.py +131 -0
- datasmith/filters.py +154 -0
- datasmith/github/__init__.py +22 -0
- datasmith/github/client.py +333 -0
- datasmith/github/hooks.py +50 -0
- datasmith/github/links.py +110 -0
- datasmith/github/models.py +206 -0
- datasmith/github/render.py +173 -0
- datasmith/github/search.py +66 -0
- datasmith/github/templates/comment.md.j2 +5 -0
- datasmith/github/templates/final.md.j2 +66 -0
- datasmith/github/templates/issues.md.j2 +21 -0
- datasmith/github/templates/repo.md.j2 +1 -0
- datasmith/preflight.py +162 -0
- datasmith/publish/__init__.py +13 -0
- datasmith/publish/huggingface.py +104 -0
- datasmith/publish/pipeline.py +60 -0
- datasmith/publish/records.py +91 -0
- datasmith/py.typed +1 -0
- datasmith/resolution/__init__.py +14 -0
- datasmith/resolution/blocklist.py +145 -0
- datasmith/resolution/cache.py +120 -0
- datasmith/resolution/constants.py +277 -0
- datasmith/resolution/dependency_resolver.py +174 -0
- datasmith/resolution/git_utils.py +378 -0
- datasmith/resolution/import_analyzer.py +66 -0
- datasmith/resolution/metadata_parser.py +412 -0
- datasmith/resolution/models.py +41 -0
- datasmith/resolution/orchestrator.py +522 -0
- datasmith/resolution/package_filters.py +312 -0
- datasmith/resolution/python_manager.py +110 -0
- datasmith/runners/__init__.py +15 -0
- datasmith/runners/base.py +112 -0
- datasmith/runners/classify_prs.py +48 -0
- datasmith/runners/render_problems.py +113 -0
- datasmith/runners/resolve_packages.py +66 -0
- datasmith/runners/scrape_commits.py +166 -0
- datasmith/runners/scrape_repos.py +44 -0
- datasmith/runners/synthesize_images.py +310 -0
- datasmith/update/__init__.py +5 -0
- datasmith/update/cli.py +169 -0
- datasmith/update/offline.py +173 -0
- datasmith/update/pipeline.py +497 -0
- datasmith/utils/__init__.py +18 -0
- datasmith/utils/core.py +67 -0
- datasmith/utils/db.py +156 -0
- datasmith/utils/tokens.py +65 -0
- fc_data-0.2.0.dist-info/METADATA +441 -0
- fc_data-0.2.0.dist-info/RECORD +87 -0
- fc_data-0.2.0.dist-info/WHEEL +4 -0
- fc_data-0.2.0.dist-info/entry_points.txt +2 -0
- fc_data-0.2.0.dist-info/licenses/LICENSE +28 -0
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from datasmith.runners.base import BaseRunner
|
|
7
|
+
from datasmith.utils import get_client, get_logger
|
|
8
|
+
|
|
9
|
+
logger = get_logger("runners.render_problems")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class RenderProblemsRunner(BaseRunner):
    """Scrape linked issues, run ProblemExtractor, and persist deconstructed context.

    For each PR this runner:

    1. BFS-scrapes linked GitHub issues via the GitHub API.
    2. Runs :class:`~datasmith.agents.extractors.ProblemExtractor` (DSPy) once to
       split the PR body into four structured fields.
    3. Renders the full Jinja2 problem-statement template.
    4. Upserts everything into the ``candidate_prs`` table (raw components + rendered).
    5. Updates ``pull_requests.rendered_problem`` and the new
       ``pull_requests.problem_description`` column (problem-only text).
    """

    def __init__(self, gh: Any, n_concurrent: int = 5) -> None:
        """Store the GitHub client and configure the base runner.

        Args:
            gh: GitHub client; its ``get_issue_expanded`` attribute is handed
                to ``scrape_links`` as the issue fetcher.
            n_concurrent: Concurrency setting forwarded to ``BaseRunner``.
        """
        super().__init__(name="render_problems", n_concurrent=n_concurrent)
        self._gh = gh

    async def _process_item(self, item: Any) -> None:
        """Render the problem statement for a single PR dict.

        Args:
            item: Mapping with required keys ``owner``, ``repo`` and
                ``issue_number``; ``title``, ``body``, ``created_at``,
                ``merge_commit_sha`` and ``repo_description`` are optional.

        Raises:
            KeyError: If one of the required keys is missing.
        """
        owner: str = item["owner"]
        repo: str = item["repo"]
        issue_number: int = item["issue_number"]
        # Optional text fields may be present but explicitly None; normalise
        # every one of them to "" so downstream code always receives a str.
        merge_commit_sha: str = item.get("merge_commit_sha", "") or ""
        repo_description: str = item.get("repo_description", "") or ""
        title: str = item.get("title") or ""
        body: str = item.get("body") or ""

        from datasmith.agents.extractors import ProblemExtractor
        from datasmith.github.links import scrape_links
        from datasmith.github.models import PR
        from datasmith.github.render import render_problem_statement

        pr = PR(
            repository=f"{owner}/{repo}",
            issue_number=issue_number,
            title=title,
            body=body,
            created_at=item.get("created_at"),
        )

        # BFS-scrape linked issues (async GitHub API calls)
        issues = await scrape_links(
            pr,
            self._gh.get_issue_expanded,
            depth=2,
            only_issues=True,
            limit=6,
        )

        logger.info(
            "Scraped %d linked issues for %s/%s#%d",
            len(issues),
            owner,
            repo,
            issue_number,
        )

        # Run ProblemExtractor once (DSPy LLM call — run in thread)
        extraction = await asyncio.to_thread(
            ProblemExtractor().extract_problem,
            title,
            body,
        )
        problem_description = extraction.to_problem_markdown()

        # Render the full problem statement using the pre-extracted observations
        # (pass initial_observations to avoid a second ProblemExtractor call)
        rendered = await asyncio.to_thread(
            render_problem_statement,
            pr,
            issues=issues,
            repo_description=repo_description,
            anonymize=True,
            extract=False,
            initial_observations=problem_description or getattr(pr, "body", ""),
        )

        # Serialize linked issues for storage (mode="json" converts datetime → ISO string)
        issues_json = [issue.model_dump(mode="json") for issue in issues]

        # Upsert all raw components + rendered output into candidate_prs
        client = get_client()
        client.table("candidate_prs").upsert({
            "owner": owner,
            "repo": repo,
            "issue_number": issue_number,
            "merge_commit_sha": merge_commit_sha,
            "repo_description": repo_description,
            "issues_json": issues_json,
            "initial_observations": extraction.initial_observations,
            "triage_attempts": extraction.triage_attempts,
            "solution_overview": extraction.solution_overview,
            "solution_observations": extraction.solution_observations,
            "rendered_problem": rendered,
        }).execute()

        # Keep pull_requests in sync: rendered_problem (used by synthesize_images)
        # and problem_description (problem-only extracted text)
        client.table("pull_requests").update({
            "rendered_problem": rendered,
            "problem_description": problem_description,
        }).eq("owner", owner).eq("repo", repo).eq("issue_number", issue_number).execute()

        logger.info("Rendered problem context for %s/%s#%d", owner, repo, issue_number)
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Runner for resolving Python dependencies for classified PRs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import functools
|
|
7
|
+
import json
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from datasmith.runners.base import BaseRunner
|
|
11
|
+
from datasmith.utils import get_client, get_logger
|
|
12
|
+
|
|
13
|
+
logger = get_logger("runners.resolve_packages")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ResolvePackagesRunner(BaseRunner):
    """Resolve dependencies for classified PRs and persist to the packages table."""

    def __init__(self, n_concurrent: int = 16) -> None:
        """Configure the base runner under the name ``resolve_packages``."""
        super().__init__(name="resolve_packages", n_concurrent=n_concurrent)

    async def _process_item(self, item: Any) -> None:
        """Resolve one commit and upsert the outcome into ``packages``.

        Args:
            item: Mapping with the keys ``owner``, ``repo`` and ``sha``.

        Raises:
            KeyError: If one of those keys is missing.
        """
        owner = item["owner"]
        repo = item["repo"]
        sha = item["sha"]

        from datasmith.resolution import analyze_commit

        # analyze_commit is a blocking call, so run it on the default executor.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, functools.partial(analyze_commit, sha, f"{owner}/{repo}"))

        if result is None:
            logger.info("Resolution returned None for %s/%s@%s", owner, repo, sha[:8])
            return

        # Look the dependency list up once; ``or []`` also covers a key that is
        # present but None, which would otherwise make len() below raise after
        # the row has already been written.
        dependencies = result.get("final_dependencies") or []
        env_payload = json.dumps(dependencies)

        row = {
            "owner": owner,
            "repo": repo,
            "sha": sha,
            "package_name": result.get("package_name"),
            "package_version": result.get("package_version"),
            "python_version": result.get("python_version", ""),
            "env_payload": env_payload,
            "build_commands": result.get("build_command"),
            "install_commands": result.get("install_command"),
            "primary_root": result.get("primary_root"),
            "resolution_strategy": result.get("resolution_strategy"),
            "can_install": result.get("can_install", False),
            "requires_python": None,
        }

        # The client is only needed once there is something to persist.
        client = get_client()
        client.table("packages").upsert(row).execute()
        logger.info(
            "Resolved %s/%s@%s: python=%s can_install=%s deps=%d",
            owner,
            repo,
            sha[:8],
            result.get("python_version"),
            result.get("can_install"),
            len(dependencies),
        )
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from datasmith.filters import symbolic_compliance
|
|
7
|
+
from datasmith.runners.base import BaseRunner
|
|
8
|
+
from datasmith.utils import get_client, get_logger
|
|
9
|
+
from datasmith.utils.db import fetch_all
|
|
10
|
+
|
|
11
|
+
logger = get_logger("runners.scrape_commits")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _parse_iso(value: str | None) -> datetime | None:
|
|
15
|
+
"""Parse an ISO-8601 date or datetime string to a timezone-aware datetime."""
|
|
16
|
+
if not value:
|
|
17
|
+
return None
|
|
18
|
+
# Handle date-only strings like "2024-01-01"
|
|
19
|
+
if "T" not in value:
|
|
20
|
+
return datetime.fromisoformat(value).replace(tzinfo=timezone.utc)
|
|
21
|
+
# Handle full ISO datetime strings (with or without trailing Z)
|
|
22
|
+
cleaned = value.replace("Z", "+00:00")
|
|
23
|
+
return datetime.fromisoformat(cleaned)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _should_skip_pr(
|
|
27
|
+
pr_data: dict[str, Any],
|
|
28
|
+
since: datetime | None,
|
|
29
|
+
until: datetime | None,
|
|
30
|
+
seen_shas: set[str],
|
|
31
|
+
) -> bool | str:
|
|
32
|
+
"""Return False if the PR should be processed, 'skip' to skip, 'stop' to halt pagination."""
|
|
33
|
+
if since:
|
|
34
|
+
created = _parse_iso(pr_data.get("created_at"))
|
|
35
|
+
if created and created < since:
|
|
36
|
+
return "stop"
|
|
37
|
+
|
|
38
|
+
if not pr_data.get("merged_at"):
|
|
39
|
+
return "skip"
|
|
40
|
+
|
|
41
|
+
merged = _parse_iso(pr_data.get("merged_at"))
|
|
42
|
+
if merged:
|
|
43
|
+
if since and merged < since:
|
|
44
|
+
return "skip"
|
|
45
|
+
if until and merged >= until:
|
|
46
|
+
return "skip"
|
|
47
|
+
|
|
48
|
+
sha = pr_data.get("merge_commit_sha", "")
|
|
49
|
+
if sha:
|
|
50
|
+
if sha in seen_shas:
|
|
51
|
+
return "skip"
|
|
52
|
+
seen_shas.add(sha)
|
|
53
|
+
|
|
54
|
+
return False
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _sanitize_text(value: str) -> str:
|
|
58
|
+
"""Strip Postgres-illegal null bytes from text values."""
|
|
59
|
+
return value.replace("\u0000", "")
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
async def _build_record(
    gh: Any,
    owner: str,
    repo: str,
    pr_data: dict[str, Any],
) -> dict[str, Any]:
    """Fetch diff/files and build the upsert record for a single PR.

    Args:
        gh: Client whose ``get_diff`` and ``get_files`` are awaited with
            ``(owner, repo, issue_number)``.
        owner: Repository owner.
        repo: Repository name.
        pr_data: PR payload; ``number`` is required, every other field is
            optional and may be absent or ``None``.

    Returns:
        The row to upsert into ``pull_requests``. ``patch`` and
        ``file_changes`` are only included when non-empty.

    Raises:
        KeyError: If ``pr_data`` has no ``number`` or a label has no ``name``.
    """
    issue_number = pr_data["number"]
    # ``or`` rather than a .get() default: a key that is present but None must
    # be treated like a missing one, exactly as is done for ``body`` below.
    title = _sanitize_text(pr_data.get("title") or "")

    diff = await gh.get_diff(owner, repo, issue_number)
    if diff:
        diff = _sanitize_text(diff)
    files = await gh.get_files(owner, repo, issue_number)

    file_changes: list[dict[str, Any]] | None = None
    if files:
        file_changes = [
            {
                "filename": f.get("filename", ""),
                "additions": f.get("additions", 0),
                "deletions": f.get("deletions", 0),
            }
            for f in files
        ]

    base = pr_data.get("base") or {}
    head = pr_data.get("head") or {}
    labels = pr_data.get("labels") or []

    record: dict[str, Any] = {
        "owner": owner,
        "repo": repo,
        "issue_number": issue_number,
        "title": title,
        "body": _sanitize_text(pr_data.get("body", "") or ""),
        "state": pr_data.get("state", ""),
        "created_at": pr_data.get("created_at"),
        "merged_at": pr_data.get("merged_at"),
        "closed_at": pr_data.get("closed_at"),
        "merge_commit_sha": pr_data.get("merge_commit_sha", ""),
        "base_sha": base.get("sha", ""),
        "head_sha": head.get("sha", ""),
        "labels": [label["name"] for label in labels],
        "is_performance_commit_symbolic": symbolic_compliance(
            title=title,
            patch=diff or None,
            file_changes=file_changes,
        ),
    }
    if diff:
        record["patch"] = diff
    if file_changes:
        record["file_changes"] = file_changes
    return record
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class ScrapeCommitsRunner(BaseRunner):
    """Walk each repository's merged PRs and store new ones in ``pull_requests``."""

    def __init__(
        self,
        github_client: Any,
        n_concurrent: int = 5,
        since: str | None = None,
        until: str | None = None,
    ) -> None:
        """Keep the GitHub client and parse the optional ISO window bounds."""
        super().__init__(name="scrape_commits", n_concurrent=n_concurrent)
        self._gh = github_client
        self._since = _parse_iso(since)
        self._until = _parse_iso(until)

    async def _process_item(self, item: Any) -> None:
        """Scrape one repository, given as an ``(owner, repo)`` tuple or ``"owner/repo"``."""
        if isinstance(item, tuple):
            owner, repo = item
        else:
            owner, repo = item.split("/")

        # PR numbers already stored for this repo; those are not fetched again.
        known_numbers: set[int] = set()
        for row in fetch_all(
            "pull_requests",
            select="issue_number",
            filters={"owner": owner, "repo": repo},
        ):
            known_numbers.add(row["issue_number"])

        seen_shas: set[str] = set()
        client = get_client()
        stored = 0

        async for page in self._gh.paginate_merged_prs(owner, repo):
            for pr_data in page:
                verdict = _should_skip_pr(pr_data, self._since, self._until, seen_shas)
                if verdict == "stop":
                    break
                if verdict == "skip" or pr_data["number"] in known_numbers:
                    continue

                record = await _build_record(self._gh, owner, repo, pr_data)
                client.table("pull_requests").upsert(record).execute()
                stored += 1
            else:
                # Page finished without a "stop" verdict: move on to the next page.
                continue
            break

        logger.info("Scraped %d merged PRs for %s/%s", stored, owner, repo)
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from datasmith.runners.base import BaseRunner
|
|
6
|
+
from datasmith.utils import get_client, get_logger
|
|
7
|
+
|
|
8
|
+
logger = get_logger("runners.scrape_repos")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ScrapeReposRunner(BaseRunner):
    """Look repositories up on GitHub and store them in the ``repositories`` table."""

    def __init__(self, github_client: Any, n_concurrent: int = 5) -> None:
        """Keep the GitHub client and register the runner as ``scrape_repos``."""
        super().__init__(name="scrape_repos", n_concurrent=n_concurrent)
        self._gh = github_client

    async def _process_item(self, item: Any) -> None:
        """Store one repository, given as an ``(owner, repo)`` tuple or ``"owner/repo"``.

        Nothing is fetched when a matching row already exists, and nothing is
        written when the GitHub request yields ``None``.
        """
        if isinstance(item, tuple):
            owner, repo = item
        else:
            owner, repo = item.split("/")

        db = get_client()
        lookup = db.table("repositories").select("owner").eq("owner", owner).eq("repo", repo)
        if lookup.execute().data:
            logger.debug("Skipping existing repo: %s/%s", owner, repo)
            return

        response = await self._gh._request("GET", f"/repos/{owner}/{repo}")
        if response is None:
            return

        payload = response.json()
        row = {
            "owner": owner,
            "repo": repo,
            "url": payload.get("html_url", ""),
            "language": payload.get("language", ""),
            "stars": payload.get("stargazers_count", 0),
            "topics": payload.get("topics", []),
            "description": payload.get("description", ""),
        }
        db.table("repositories").upsert(row).execute()
        logger.info("Stored repo: %s/%s", owner, repo)
|
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import tempfile
|
|
5
|
+
import threading
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from datasmith.agents.synthesizer import Synthesizer
|
|
9
|
+
from datasmith.runners.base import BaseRunner
|
|
10
|
+
from datasmith.utils import get_client, get_logger
|
|
11
|
+
|
|
12
|
+
logger = get_logger("runners.synthesize_images")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _ensure_prerequisite_images(owner: str, repo: str, py_version: str = "") -> None:
    """Make sure the base image and the repo image are available locally.

    Images form a base → repo → PR chain, so both parents have to exist in
    the local daemon before a PR image can be built. Each one is built only
    when ``ImageManager.image_exists`` reports it missing.
    """
    from datasmith.docker.images import ImageManager, get_base_image_name, get_repo_image_name

    manager = ImageManager()
    base_image = get_base_image_name()
    repo_image = get_repo_image_name(owner, repo)

    # (tag, log message, build action) — evaluated in order, base first.
    build_steps = (
        (
            base_image,
            "Building missing base image: %s",
            lambda: manager.build_base_image(py_version=py_version),
        ),
        (
            repo_image,
            "Building missing repo image: %s",
            lambda: manager.build_repo_image(owner, repo, py_version=py_version),
        ),
    )
    for tag, message, build in build_steps:
        if manager.image_exists(tag):
            continue
        logger.info(message, tag)
        build()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _build_and_push_pr_image(
    owner: str,
    repo: str,
    issue_number: int,
    sha: str,
    env_payload: str,
    docker_context: Any | None = None,
    python_version: str = "",
) -> str:
    """Build the final PR image from synthesized context and push to DockerHub.

    The three-tier Dockerfiles (base/repo/pr) use the same shell scripts as
    the synthesized single-Dockerfile flow, so the build result is identical.

    Args:
        owner: Repository owner.
        repo: Repository name.
        issue_number: PR number; part of the image tag.
        sha: Commit to build; an empty value falls back to ``"HEAD"``.
        env_payload: Dependency payload; an empty value falls back to ``"[]"``.
        docker_context: Optional synthesized context providing
            ``to_directory(path)``. When given, it is written to a temporary
            directory, completed with the built-in templates and used as the
            build context.
        python_version: Forwarded to the build as ``py_version``.

    Returns:
        The pushed image tag.

    Raises:
        Exception: Whatever the build or the PR-image push raises. A failed
            push of the repo image is logged and ignored.
    """
    from datasmith.docker.images import ImageManager, get_pr_image_name, get_repo_image_name
    from datasmith.docker.publish import DockerHubPublisher

    mgr = ImageManager()
    pr_tag = get_pr_image_name(owner, repo, issue_number)

    # Arguments shared by both build paths, spelled out once.
    build_kwargs: dict[str, Any] = {
        "commit_sha": sha or "HEAD",
        "env_payload": env_payload or "[]",
        "py_version": python_version,
    }

    if docker_context is not None:
        with tempfile.TemporaryDirectory(prefix="docker-ctx-") as tmpdir:
            docker_context.to_directory(tmpdir)
            _fill_missing_scripts(tmpdir, base_commit=sha)
            mgr.build_pr_image(owner, repo, issue_number, context=tmpdir, **build_kwargs)
    else:
        mgr.build_pr_image(owner, repo, issue_number, **build_kwargs)

    publisher = DockerHubPublisher()
    repo_tag = get_repo_image_name(owner, repo)

    try:
        publisher.push(repo_tag)
    except Exception:
        # Best-effort: a repo-image push failure must not fail the PR, but the
        # cause is kept in the log instead of being discarded.
        logger.warning("Failed to push repo image %s (non-fatal)", repo_tag, exc_info=True)

    publisher.push(pr_tag)
    logger.info("Pushed PR image: %s", pr_tag)
    return pr_tag
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _render_run_tests_sh(docker_templates: Any, base_commit: str) -> str:
    """Render the run-tests.sh Jinja2 template with embedded scripts.

    Args:
        docker_templates: Directory (``str`` or path-like) holding
            ``run-tests.sh``, ``pytest_runner.py`` and ``parser.py``.
        base_commit: Value handed to the template as ``base_commit``.

    Returns:
        The rendered script text.
    """
    from pathlib import Path

    from jinja2 import Environment, FileSystemLoader

    docker_templates = Path(docker_templates)
    env = Environment(
        loader=FileSystemLoader(str(docker_templates)),
        keep_trailing_newline=True,
        # The output is a shell script, not markup, so nothing is HTML-escaped.
        autoescape=False,
    )
    template = env.get_template("run-tests.sh")

    # Read the embedded scripts as UTF-8 explicitly so the result does not
    # depend on the locale of the machine doing the rendering.
    pytest_runner = (docker_templates / "pytest_runner.py").read_text(encoding="utf-8")
    parser = (docker_templates / "parser.py").read_text(encoding="utf-8")

    return template.render(
        base_commit=base_commit,
        pytest_runner=pytest_runner,
        parser=parser,
        run_pytest=True,
    )
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _fill_missing_scripts(context_dir: str, base_commit: str = "") -> None:
|
|
121
|
+
"""Copy any missing shell scripts and Dockerfile.pr from the templates directory.
|
|
122
|
+
|
|
123
|
+
Synthesized contexts may only contain a subset of the 9 expected files
|
|
124
|
+
(e.g. only ``build_pkg_sh``). The Dockerfile.pr ``COPY`` directives
|
|
125
|
+
require every file to be present, so we backfill from the built-in
|
|
126
|
+
templates for anything the synthesizer didn't produce.
|
|
127
|
+
|
|
128
|
+
``run-tests.sh`` is a Jinja2 template that requires rendering with
|
|
129
|
+
``base_commit`` and embedded Python scripts before it can be used.
|
|
130
|
+
"""
|
|
131
|
+
import os
|
|
132
|
+
import shutil
|
|
133
|
+
from pathlib import Path
|
|
134
|
+
|
|
135
|
+
templates = Path(__file__).parents[1] / "docker" / "templates"
|
|
136
|
+
|
|
137
|
+
# Every file that Dockerfile.pr references via COPY
|
|
138
|
+
required = [
|
|
139
|
+
"Dockerfile.pr",
|
|
140
|
+
"docker_build_env.sh",
|
|
141
|
+
"docker_build_pkg.sh",
|
|
142
|
+
"docker_build_run.sh",
|
|
143
|
+
"docker_build_final.sh",
|
|
144
|
+
"profile.sh",
|
|
145
|
+
"run-tests.sh",
|
|
146
|
+
"entrypoint.sh",
|
|
147
|
+
]
|
|
148
|
+
|
|
149
|
+
for fname in required:
|
|
150
|
+
target = os.path.join(context_dir, fname)
|
|
151
|
+
if os.path.exists(target):
|
|
152
|
+
continue
|
|
153
|
+
if fname == "run-tests.sh":
|
|
154
|
+
# run-tests.sh is a Jinja2 template — render it instead of copying raw
|
|
155
|
+
rendered = _render_run_tests_sh(templates, base_commit=base_commit)
|
|
156
|
+
with open(target, "w") as f:
|
|
157
|
+
f.write(rendered)
|
|
158
|
+
else:
|
|
159
|
+
src = templates / fname
|
|
160
|
+
if src.exists():
|
|
161
|
+
shutil.copy2(str(src), target)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
# Lock to serialize prerequisite image builds (base + repo) across threads.
|
|
165
|
+
# Building these is expensive and they're shared, so we avoid duplicate work.
|
|
166
|
+
_prereq_lock = threading.Lock()
|
|
167
|
+
# Track repos whose prerequisite images are confirmed present.
|
|
168
|
+
_prereq_done: set[tuple[str, str]] = set()
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class SynthesizeImagesRunner(BaseRunner):
    """Run Synthesizer for each PR to produce Docker build contexts."""

    def __init__(
        self,
        synthesizer: Synthesizer,
        gh: Any | None = None,
        n_concurrent: int = 3,
    ) -> None:
        """Store collaborators and configure the base runner.

        Args:
            synthesizer: Object whose blocking ``run`` produces the build context.
            gh: Optional GitHub client; without it problem statements are not rendered.
            n_concurrent: Concurrency setting forwarded to ``BaseRunner``.
        """
        super().__init__(name="synthesize_images", n_concurrent=n_concurrent)
        self._synthesizer = synthesizer
        self._gh = gh  # GitHubClient, optional — needed for rendering problem statements

    async def _render_problem(self, item: dict[str, Any]) -> str | None:
        """Render the problem statement for a PR, scraping linked issues.

        The rendered text is also written to ``pull_requests.rendered_problem``.

        Returns:
            The rendered markdown, or ``None`` when no GitHub client was
            configured. Errors from scraping, rendering or the database
            update are not caught here and propagate to the caller.
        """
        if self._gh is None:
            return None

        owner: str = item["owner"]
        repo: str = item["repo"]
        issue_number: int = item["issue_number"]
        # Optional text fields may be present but None; normalise them to ""
        # so the PR model and the renderer always receive strings.
        title: str = item.get("title") or ""
        body: str = item.get("body") or ""
        repo_description: str = item.get("repo_description") or ""

        from datasmith.github.links import scrape_links
        from datasmith.github.models import PR
        from datasmith.github.render import render_problem_statement

        # Build a PR object for scrape_links and render_problem_statement
        pr = PR(
            repository=f"{owner}/{repo}",
            issue_number=issue_number,
            title=title,
            body=body,
            created_at=item.get("created_at"),
        )

        # BFS-scrape linked issues (async GitHub API calls)
        issues = await scrape_links(
            pr,
            self._gh.get_issue_expanded,
            depth=2,
            only_issues=True,
            limit=6,
        )

        logger.info(
            "Scraped %d linked issues for %s/%s#%d",
            len(issues),
            owner,
            repo,
            issue_number,
        )

        # Render the problem statement (may invoke ProblemExtractor LLM — run in thread)
        rendered = await asyncio.to_thread(
            render_problem_statement,
            pr,
            issues=issues,
            repo_description=repo_description,
            anonymize=True,
            extract=True,
        )

        # Persist to DB
        client = get_client()
        client.table("pull_requests").update({"rendered_problem": rendered}).eq("owner", owner).eq("repo", repo).eq(
            "issue_number", issue_number
        ).execute()

        logger.info("Rendered problem statement for %s/%s#%d", owner, repo, issue_number)
        return rendered

    async def _process_item(self, item: Any) -> None:
        """Process a PR dict with owner, repo, issue_number, pr_context.

        Steps: ensure the parent images exist, render the problem statement
        when ``pr_context`` is empty, run the synthesizer, build and push the
        PR image, then record the tag in ``pull_requests.container_name``.

        Raises:
            KeyError: If ``owner``, ``repo`` or ``issue_number`` is missing.
            RuntimeError: If the synthesizer returns ``None``.
        """
        owner = item["owner"]
        repo = item["repo"]
        issue_number = item["issue_number"]
        pr_context = item.get("pr_context", "")
        py_version = item.get("python_version", "")

        # Ensure base and repo images exist before synthesis needs them
        await asyncio.to_thread(self._ensure_prereqs, owner, repo, py_version)

        # Render the problem statement before synthesis (skip if already rendered)
        if not pr_context:
            rendered = await self._render_problem(item)
            if rendered:
                pr_context = rendered

        sha = item.get("sha", "")
        env_payload = item.get("env_payload", "")

        from datasmith.docker.images import get_repo_image_name

        repo_image = get_repo_image_name(owner, repo)

        # Run synthesizer in thread (Docker operations are blocking)
        ctx = await asyncio.to_thread(
            self._synthesizer.run,
            owner,
            repo,
            issue_number,
            pr_context,
            sha,
            repo_image=repo_image,
            env_payload=env_payload,
            python_version=py_version,
        )

        if ctx is None:
            raise RuntimeError(f"Synthesis failed for {owner}/{repo}#{issue_number}")

        logger.info("Successfully synthesized image for %s/%s#%d", owner, repo, issue_number)

        # Build the final PR image and push to DockerHub
        pr_tag = await asyncio.to_thread(
            _build_and_push_pr_image, owner, repo, issue_number, sha, env_payload, ctx, py_version
        )

        # Record the container name in Supabase
        client = get_client()
        client.table("pull_requests").update({"container_name": pr_tag}).eq("owner", owner).eq("repo", repo).eq(
            "issue_number", issue_number
        ).execute()

    @staticmethod
    def _ensure_prereqs(owner: str, repo: str, py_version: str) -> None:
        """Build base/repo images if missing, with dedup across threads.

        The ``(owner, repo)`` key is looked up once without the lock as a
        fast path and once more while holding it, so a thread that waited on
        the lock does not repeat a check another thread just completed.
        """
        key = (owner, repo)
        if key in _prereq_done:
            return
        with _prereq_lock:
            if key in _prereq_done:
                return
            _ensure_prerequisite_images(owner, repo, py_version)
            # Only marked done after the check/build returned without raising.
            _prereq_done.add(key)
|