fc-data 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datasmith/__init__.py +330 -0
- datasmith/__init__.pyi +194 -0
- datasmith/agents/__init__.py +31 -0
- datasmith/agents/classifiers.py +272 -0
- datasmith/agents/codex.py +25 -0
- datasmith/agents/config.py +108 -0
- datasmith/agents/extractors.py +197 -0
- datasmith/agents/installed/README.md +52 -0
- datasmith/agents/installed/__init__.py +22 -0
- datasmith/agents/installed/base.py +240 -0
- datasmith/agents/installed/claude.py +134 -0
- datasmith/agents/installed/codex.py +91 -0
- datasmith/agents/installed/gemini.py +118 -0
- datasmith/agents/installed/none.py +27 -0
- datasmith/agents/sandbox.py +547 -0
- datasmith/agents/synthesizer.py +439 -0
- datasmith/agents/templates/AGENTS.md.j2 +150 -0
- datasmith/agents/templates/sandbox_verify.py +428 -0
- datasmith/docker/__init__.py +31 -0
- datasmith/docker/context.py +112 -0
- datasmith/docker/images.py +158 -0
- datasmith/docker/publish.py +56 -0
- datasmith/docker/templates/Dockerfile.base +26 -0
- datasmith/docker/templates/Dockerfile.pr +42 -0
- datasmith/docker/templates/Dockerfile.repo +11 -0
- datasmith/docker/templates/docker_build_base.sh +780 -0
- datasmith/docker/templates/docker_build_env.sh +309 -0
- datasmith/docker/templates/docker_build_final.sh +106 -0
- datasmith/docker/templates/docker_build_pkg.sh +99 -0
- datasmith/docker/templates/docker_build_run.sh +124 -0
- datasmith/docker/templates/entrypoint.sh +62 -0
- datasmith/docker/templates/parser.py +1405 -0
- datasmith/docker/templates/profile.sh +199 -0
- datasmith/docker/templates/pytest_runner.py +692 -0
- datasmith/docker/templates/run-tests.sh +197 -0
- datasmith/docker/verifiers.py +131 -0
- datasmith/filters.py +154 -0
- datasmith/github/__init__.py +22 -0
- datasmith/github/client.py +333 -0
- datasmith/github/hooks.py +50 -0
- datasmith/github/links.py +110 -0
- datasmith/github/models.py +206 -0
- datasmith/github/render.py +173 -0
- datasmith/github/search.py +66 -0
- datasmith/github/templates/comment.md.j2 +5 -0
- datasmith/github/templates/final.md.j2 +66 -0
- datasmith/github/templates/issues.md.j2 +21 -0
- datasmith/github/templates/repo.md.j2 +1 -0
- datasmith/preflight.py +162 -0
- datasmith/publish/__init__.py +13 -0
- datasmith/publish/huggingface.py +104 -0
- datasmith/publish/pipeline.py +60 -0
- datasmith/publish/records.py +91 -0
- datasmith/py.typed +1 -0
- datasmith/resolution/__init__.py +14 -0
- datasmith/resolution/blocklist.py +145 -0
- datasmith/resolution/cache.py +120 -0
- datasmith/resolution/constants.py +277 -0
- datasmith/resolution/dependency_resolver.py +174 -0
- datasmith/resolution/git_utils.py +378 -0
- datasmith/resolution/import_analyzer.py +66 -0
- datasmith/resolution/metadata_parser.py +412 -0
- datasmith/resolution/models.py +41 -0
- datasmith/resolution/orchestrator.py +522 -0
- datasmith/resolution/package_filters.py +312 -0
- datasmith/resolution/python_manager.py +110 -0
- datasmith/runners/__init__.py +15 -0
- datasmith/runners/base.py +112 -0
- datasmith/runners/classify_prs.py +48 -0
- datasmith/runners/render_problems.py +113 -0
- datasmith/runners/resolve_packages.py +66 -0
- datasmith/runners/scrape_commits.py +166 -0
- datasmith/runners/scrape_repos.py +44 -0
- datasmith/runners/synthesize_images.py +310 -0
- datasmith/update/__init__.py +5 -0
- datasmith/update/cli.py +169 -0
- datasmith/update/offline.py +173 -0
- datasmith/update/pipeline.py +497 -0
- datasmith/utils/__init__.py +18 -0
- datasmith/utils/core.py +67 -0
- datasmith/utils/db.py +156 -0
- datasmith/utils/tokens.py +65 -0
- fc_data-0.2.0.dist-info/METADATA +441 -0
- fc_data-0.2.0.dist-info/RECORD +87 -0
- fc_data-0.2.0.dist-info/WHEEL +4 -0
- fc_data-0.2.0.dist-info/entry_points.txt +2 -0
- fc_data-0.2.0.dist-info/licenses/LICENSE +28 -0
|
@@ -0,0 +1,497 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from datasmith.utils import get_client, get_logger
|
|
6
|
+
from datasmith.utils.db import fetch_all
|
|
7
|
+
|
|
8
|
+
# Module-level logger for the update pipeline, obtained via the package's get_logger helper.
logger = get_logger("update.pipeline")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _cap_per_repo(items: list[dict[str, Any]], limit: int) -> list[dict[str, Any]]:
    """Randomly keep no more than *limit* items for each (owner, repo) pair.

    Args:
        items: Work items; each must carry ``"owner"`` and ``"repo"`` keys.
        limit: Maximum number of items retained per repository.

    Returns:
        A new list with the sampled items, grouped by repository in order of
        first appearance. The order inside a group is whatever
        ``random.sample`` yields.
    """
    import random
    from collections import defaultdict

    # Bucket the items by repository, preserving first-seen order of the buckets.
    buckets: defaultdict[tuple[str, str], list[dict[str, Any]]] = defaultdict(list)
    for entry in items:
        repo_key = (entry["owner"], entry["repo"])
        buckets[repo_key].append(entry)

    # Sample each bucket; a bucket smaller than the limit is kept whole (shuffled).
    capped: list[dict[str, Any]] = [
        chosen for bucket in buckets.values() for chosen in random.sample(bucket, min(limit, len(bucket)))
    ]

    logger.info("Capped to %d tasks (%d per repo) from %d total", len(capped), limit, len(items))
    return capped
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _fetch_repo_descriptions(rows: list[dict[str, Any]]) -> dict[tuple[str, str], str]:
|
|
28
|
+
"""Batch-fetch repo descriptions for a set of rows with owner/repo keys."""
|
|
29
|
+
repo_keys = {(r["owner"], r["repo"]) for r in rows}
|
|
30
|
+
descriptions: dict[tuple[str, str], str] = {}
|
|
31
|
+
if repo_keys:
|
|
32
|
+
desc_rows = fetch_all("repositories", select="owner, repo, description")
|
|
33
|
+
for rd in desc_rows:
|
|
34
|
+
key = (rd["owner"], rd["repo"])
|
|
35
|
+
if key in repo_keys:
|
|
36
|
+
descriptions[key] = rd.get("description") or ""
|
|
37
|
+
return descriptions
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# Ordered pipeline stage names. Pipeline.run executes them in this order and
# validates 1-based ``stage`` indices against this list; Pipeline._run_stage
# dispatches on these exact strings.
STAGES = [
    "scrape_repos",
    "scrape_commits",
    "classify_prs",
    "resolve_packages",
    "render_problems",
    "synthesize_images",
    "publish",
]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _build_file_change_summary(file_changes: list[dict[str, Any]] | None) -> str:
|
|
52
|
+
if not file_changes:
|
|
53
|
+
return ""
|
|
54
|
+
lines = [
|
|
55
|
+
"| File | Lines Added | Lines Removed |",
|
|
56
|
+
"|------|-------------|----------------|",
|
|
57
|
+
]
|
|
58
|
+
for f in file_changes:
|
|
59
|
+
lines.append(f"| {f.get('filename', '')} | {f.get('additions', 0)} | {f.get('deletions', 0)} |")
|
|
60
|
+
return "\n".join(lines)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _format_description(title: str, body: str) -> str:
|
|
64
|
+
parts = [p for p in (title.strip(), body.strip()) if p]
|
|
65
|
+
return "\n\n".join(parts)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class Pipeline:
    """Orchestrate the full FormulaCode update pipeline.

    The pipeline is the ordered list of stage names in ``STAGES``. Each stage
    is implemented by a private coroutine on this class and dispatched by
    :meth:`_run_stage`. Stage-specific dependencies are imported inside the
    stage that needs them.
    """

    def __init__(
        self,
        dry_run: bool = False,
        n_concurrent: int | None = None,
        tasks_per_repo: int | None = None,
        agent: str | None = None,
        force: bool = False,
        offline_source: str | None = None,
        min_stars: int = 500,
    ) -> None:
        """Store run options; no work happens until :meth:`run` is awaited.

        Args:
            dry_run: If True, :meth:`run` only logs the stages it would run.
            n_concurrent: Forwarded to each runner as ``n_concurrent``. When
                falsy (``None`` or ``0``) the runner's own default applies.
            tasks_per_repo: If not None, the render and synthesize stages
                keep at most this many randomly sampled PRs per repository.
            agent: Forwarded to ``Synthesizer`` in the synthesize stage.
            force: If True, stages do not skip rows they would otherwise
                treat as already processed. Also forwarded to ``Synthesizer``.
            offline_source: Optional offline data source (described as
                parquet in the stage comments) used by the scrape stages.
            min_stars: Minimum star count passed to the repository search.
        """
        self._dry_run = dry_run
        self._n_concurrent = n_concurrent
        self._tasks_per_repo = tasks_per_repo
        self._agent = agent
        self._force = force
        self._offline_source = offline_source
        self._min_stars = min_stars
        # Names of stages that finished successfully during this instance's run().
        self._completed_stages: list[str] = []

    def _runner_kwargs(self) -> dict[str, Any]:
        """Return the optional ``n_concurrent`` keyword shared by all runners."""
        return {"n_concurrent": self._n_concurrent} if self._n_concurrent else {}

    async def run(
        self,
        start_date: str,
        end_date: str,
        resume: bool = False,
        stage: int | list[int] | None = None,
    ) -> None:
        """Execute pipeline stages in order.

        Args:
            start_date: ISO date string (YYYY-MM-DD)
            end_date: ISO date string (YYYY-MM-DD)
            resume: If True, skip already-completed stages. Ignored when
                *stage* is given.
            stage: If set, run only these stages (1-based indices). Indices
                are de-duplicated and run in pipeline order.

        Raises:
            ValueError: If a stage index is outside ``1..len(STAGES)``.
            Exception: Any error raised by a stage is logged and re-raised;
                later stages are not run.
        """
        stages_to_run = STAGES

        if stage is not None:
            indices = [stage] if isinstance(stage, int) else list(stage)
            for s in indices:
                if s < 1 or s > len(STAGES):
                    raise ValueError(f"Stage must be 1-{len(STAGES)}, got {s}")
            # set() so a repeated index does not execute the same stage twice.
            stages_to_run = [STAGES[s - 1] for s in sorted(set(indices))]
        elif resume:
            completed = self._get_completed_stages()
            stages_to_run = [s for s in STAGES if s not in completed]
            if not stages_to_run:
                logger.info("All stages already completed")
                return

        logger.info(
            "Running pipeline stages: %s (dry_run=%s)",
            ", ".join(stages_to_run),
            self._dry_run,
        )

        for stage_name in stages_to_run:
            if self._dry_run:
                logger.info("[DRY RUN] Would run stage: %s", stage_name)
                continue

            logger.info("Starting stage: %s", stage_name)
            try:
                await self._run_stage(stage_name, start_date, end_date)
                self._completed_stages.append(stage_name)
                self._mark_stage_completed(stage_name)
                logger.info("Completed stage: %s", stage_name)
            except Exception:
                logger.exception("Stage %s failed", stage_name)
                raise

    async def _run_stage(self, stage_name: str, start_date: str, end_date: str) -> None:
        """Dispatch *stage_name* to the coroutine that implements it.

        Raises:
            ValueError: If *stage_name* is not a known stage. Without this an
                unknown name would be a silent no-op that the caller then
                records as completed.
        """
        if stage_name == "scrape_repos":
            await self._scrape_repos()
        elif stage_name == "scrape_commits":
            await self._scrape_commits(start_date, end_date)
        elif stage_name == "classify_prs":
            await self._classify_prs()
        elif stage_name == "resolve_packages":
            await self._resolve_packages(start_date, end_date)
        elif stage_name == "render_problems":
            await self._render_problems()
        elif stage_name == "synthesize_images":
            await self._synthesize_images()
        elif stage_name == "publish":
            await self._publish(start_date, end_date)
        else:
            raise ValueError(f"Unknown pipeline stage: {stage_name!r}")

    async def _scrape_repos(self) -> None:
        """Collect repositories from search, offline source and DB, then scrape them."""
        from datasmith.github.client import GitHubClient
        from datasmith.github.search import search_repos_by_file
        from datasmith.runners.scrape_repos import ScrapeReposRunner
        from datasmith.utils.tokens import TokenPool

        pool = TokenPool()
        gh = GitHubClient(pool)
        try:
            runner = ScrapeReposRunner(gh, **self._runner_kwargs())

            seen: set[tuple[str, str]] = set()
            items: list[tuple[str, str]] = []

            def add(pair: tuple[str, str]) -> None:
                """Append *pair* to the work list unless it was already added."""
                if pair not in seen:
                    seen.add(pair)
                    items.append(pair)

            # 1. Discover repos via GitHub code search
            discovered = await search_repos_by_file(gh, filename="asv.conf.json", min_stars=self._min_stars)
            for pair in discovered:
                add(pair)

            # 2. Import repos from offline source (parquet) if provided
            if self._offline_source:
                from datasmith.update.offline import load_offline_repo_names

                # Count against the de-duplicated list so the "new" figure stays
                # correct even if the search results contained duplicates.
                before = len(items)
                for pair in load_offline_repo_names(self._offline_source):
                    add(pair)
                logger.info(
                    "Imported repos from offline source: %d new (total %d)",
                    len(items) - before,
                    len(items),
                )

            # 3. Also include repos already in the DB (metadata refresh)
            rows = fetch_all("repositories", select="owner, repo")
            for r in rows:
                add((r["owner"], r["repo"]))

            logger.info("Total repos to process: %d", len(items))
            await runner.run(items)
        finally:
            # Close the client even when search or the runner raises.
            await gh.close()

    async def _scrape_commits(self, start_date: str, end_date: str) -> None:
        """Scrape commits for every known repository in the date window.

        When an offline source is configured, its pull-request records for the
        same window are additionally upserted into ``pull_requests``.
        """
        from datasmith.github.client import GitHubClient
        from datasmith.runners.scrape_commits import ScrapeCommitsRunner
        from datasmith.utils.tokens import TokenPool

        pool = TokenPool()
        gh = GitHubClient(pool)
        try:
            kwargs: dict[str, Any] = {"since": start_date, "until": end_date}
            kwargs.update(self._runner_kwargs())
            runner = ScrapeCommitsRunner(gh, **kwargs)

            rows = fetch_all("repositories", select="owner, repo")
            items = [(r["owner"], r["repo"]) for r in rows]
            await runner.run(items)
        finally:
            # Close the client even when the runner raises.
            await gh.close()

        # Bulk-import from offline source (parquet) if provided
        if self._offline_source:
            from datasmith.update.offline import load_offline_pull_requests
            from datasmith.utils.db import batch_upsert

            records = load_offline_pull_requests(self._offline_source, start_date, end_date)
            n = batch_upsert("pull_requests", records)
            logger.info("Imported %d pull request records from offline source", n)

    async def _classify_prs(self) -> None:
        """Run the performance classifier over symbolically flagged pull requests.

        Unless ``force`` is set, only rows whose ``is_performance_commit`` is
        still null are fetched.
        """
        from datasmith.agents.classifiers import ClassifyJudge, PerfClassifier
        from datasmith.agents.config import AgentConfig, configure_dspy
        from datasmith.runners.classify_prs import ClassifyPRsRunner

        configure_dspy(AgentConfig.from_env())

        classifier = PerfClassifier()
        judge = ClassifyJudge()
        runner = ClassifyPRsRunner(classifier, judge, **self._runner_kwargs())

        classify_kwargs: dict[str, Any] = {
            "select": "owner, repo, issue_number, title, body, patch, file_changes",
            "filters": {"is_performance_commit_symbolic": True},
        }
        if not self._force:
            classify_kwargs["is_null"] = ["is_performance_commit"]
        rows = fetch_all("pull_requests", **classify_kwargs)
        items = [
            {
                "owner": r["owner"],
                "repo": r["repo"],
                "issue_number": r["issue_number"],
                # `or ""` rather than a .get default: a null column is present
                # in the row as None, which would otherwise reach str.strip().
                "description": _format_description(r.get("title") or "", r.get("body") or ""),
                "patch": r.get("patch") or "",
                "file_change_summary": _build_file_change_summary(r.get("file_changes")),
            }
            for r in rows
        ]
        await runner.run(items)

    async def _resolve_packages(self, start_date: str, end_date: str) -> None:
        """Resolve packages for merge commits of performance PRs in the date window."""
        from datasmith.runners.resolve_packages import ResolvePackagesRunner

        runner = ResolvePackagesRunner(**self._runner_kwargs())

        # Get performance-classified PRs within the date range
        rows = fetch_all(
            "pull_requests",
            select="owner, repo, merge_commit_sha",
            filters={"is_performance_commit": True},
            gte_filters={"created_at": start_date},
            lte_filters={"created_at": end_date},
        )

        # Deduplicate by (owner, repo, sha) — multiple PRs may share the same commit
        seen: set[tuple[str, str, str]] = set()
        items: list[dict[str, Any]] = []
        for r in rows:
            sha = r.get("merge_commit_sha", "")
            if not sha:
                continue
            key = (r["owner"], r["repo"], sha)
            if key in seen:
                continue
            seen.add(key)
            items.append({"owner": r["owner"], "repo": r["repo"], "sha": sha})

        # Skip items already in the packages table (unless --force)
        if items and not self._force:
            existing_rows = fetch_all("packages", select="owner, repo, sha")
            existing_keys = {(e["owner"], e["repo"], e["sha"]) for e in existing_rows}
            items = [it for it in items if (it["owner"], it["repo"], it["sha"]) not in existing_keys]

        logger.info("Resolving packages for %d commits", len(items))
        await runner.run(items)

    async def _render_problems(self) -> None:
        """Render problem contexts for performance PRs with installable packages."""
        from datasmith.agents.config import AgentConfig, configure_dspy
        from datasmith.github.client import GitHubClient
        from datasmith.runners.render_problems import RenderProblemsRunner
        from datasmith.utils.tokens import TokenPool

        configure_dspy(AgentConfig.from_env())

        pool = TokenPool()
        gh = GitHubClient(pool)
        try:
            runner = RenderProblemsRunner(gh=gh, **self._runner_kwargs())

            # Fetch performance-classified PRs
            rows = fetch_all(
                "pull_requests",
                select="owner, repo, issue_number, merge_commit_sha, title, body, created_at",
                filters={"is_performance_commit": True, "is_performance_commit_symbolic": True},
                neq_filters={"merge_commit_sha": ""},
            )

            # Only process PRs whose commit has can_install=True resolved packages
            pkg_rows = fetch_all(
                "packages",
                select="owner, repo, sha",
                filters={"can_install": True},
            )
            installable: set[tuple[str, str, str]] = {(p["owner"], p["repo"], p["sha"]) for p in pkg_rows}

            repo_descriptions = _fetch_repo_descriptions(rows)

            # Skip PRs already processed (have a candidate_prs row) unless --force
            existing_keys: set[tuple[str, str, int]] = set()
            if not self._force:
                existing_rows = fetch_all("candidate_prs", select="owner, repo, issue_number")
                existing_keys = {(e["owner"], e["repo"], e["issue_number"]) for e in existing_rows}

            items = []
            for r in rows:
                sha = r.get("merge_commit_sha", "")
                if not sha:
                    continue
                if (r["owner"], r["repo"], sha) not in installable:
                    logger.debug(
                        "Skipping %s/%s#%d: no can_install package for sha %s",
                        r["owner"],
                        r["repo"],
                        r["issue_number"],
                        sha[:8],
                    )
                    continue
                if (r["owner"], r["repo"], r["issue_number"]) in existing_keys:
                    continue
                items.append({
                    "owner": r["owner"],
                    "repo": r["repo"],
                    "issue_number": r["issue_number"],
                    "merge_commit_sha": sha,
                    "title": r.get("title", ""),
                    "body": r.get("body", ""),
                    "created_at": r.get("created_at"),
                    "repo_description": repo_descriptions.get((r["owner"], r["repo"]), ""),
                })

            if self._tasks_per_repo is not None:
                items = _cap_per_repo(items, self._tasks_per_repo)

            logger.info("Rendering problem contexts for %d PRs", len(items))
            await runner.run(items)
        finally:
            # Close the client even when a query or the runner raises.
            await gh.close()

    async def _synthesize_images(self) -> None:
        """Synthesize images for performance PRs that have packages and a rendered context.

        Unless ``force`` is set, only PRs whose ``container_name`` is still
        null are fetched.
        """
        from datasmith.agents.synthesizer import Synthesizer
        from datasmith.github.client import GitHubClient
        from datasmith.runners.synthesize_images import SynthesizeImagesRunner
        from datasmith.utils.tokens import TokenPool

        pool = TokenPool()
        gh = GitHubClient(pool)
        try:
            synth = Synthesizer(agent=self._agent, force=self._force)
            runner = SynthesizeImagesRunner(synth, gh=gh, **self._runner_kwargs())

            query_kwargs: dict[str, Any] = {
                "select": "owner, repo, issue_number, merge_commit_sha, title, body, created_at, rendered_problem",
                "filters": {"is_performance_commit": True, "is_performance_commit_symbolic": True},
                "neq_filters": {"merge_commit_sha": ""},
            }
            if not self._force:
                query_kwargs["is_null"] = ["container_name"]
            rows = fetch_all("pull_requests", **query_kwargs)

            # Join with packages table for env_payload and python_version
            pkg_rows = fetch_all(
                "packages",
                select="owner, repo, sha, env_payload, python_version",
                filters={"can_install": True},
            )
            pkg_lookup: dict[tuple[str, str, str], dict[str, Any]] = {
                (p["owner"], p["repo"], p["sha"]): p for p in pkg_rows
            }

            repo_descriptions = _fetch_repo_descriptions(rows)

            # Only synthesize PRs that have a candidate_prs row with a non-empty
            # issues_json or a non-empty initial_observations (from stage 5).
            # NOTE(review): an earlier comment described this as requiring both;
            # the condition below accepts either — confirm which is intended.
            ctx_rows = fetch_all(
                "candidate_prs",
                select="owner, repo, issue_number, issues_json, initial_observations",
            )
            eligible_prs: set[tuple[str, str, int]] = {
                (c["owner"], c["repo"], c["issue_number"])
                for c in ctx_rows
                if c.get("issues_json") or c.get("initial_observations")
            }

            items = []
            for r in rows:
                sha = r.get("merge_commit_sha", "")
                pkg = pkg_lookup.get((r["owner"], r["repo"], sha), {})
                # Skip PRs without resolved packages
                if not pkg:
                    logger.debug(
                        "Skipping %s/%s#%d: no resolved packages for sha %s",
                        r["owner"],
                        r["repo"],
                        r["issue_number"],
                        sha[:8] if sha else "?",
                    )
                    continue
                # Skip PRs without an eligible rendered context
                if (r["owner"], r["repo"], r["issue_number"]) not in eligible_prs:
                    logger.debug(
                        "Skipping %s/%s#%d: no eligible pr_context (empty issues_json or initial_observations)",
                        r["owner"],
                        r["repo"],
                        r["issue_number"],
                    )
                    continue
                items.append({
                    "owner": r["owner"],
                    "repo": r["repo"],
                    "issue_number": r["issue_number"],
                    "sha": sha,
                    "title": r.get("title", ""),
                    "body": r.get("body", ""),
                    "created_at": r.get("created_at"),
                    # Fall back to the raw PR body when no rendered problem exists.
                    "pr_context": r.get("rendered_problem") or r.get("body", ""),
                    "repo_description": repo_descriptions.get((r["owner"], r["repo"]), ""),
                    "env_payload": pkg.get("env_payload", ""),
                    "python_version": pkg.get("python_version", ""),
                })
            if self._tasks_per_repo is not None:
                items = _cap_per_repo(items, self._tasks_per_repo)

            logger.info("Synthesizing images for %d PRs", len(items))
            await runner.run(items)
        finally:
            # Close the client even when a query or the runner raises.
            await gh.close()

    async def _publish(self, start_date: str, end_date: str) -> None:
        """Delegate to ``publish_pipeline`` for the given date window."""
        from datasmith.publish.pipeline import publish_pipeline

        await publish_pipeline(start_date, end_date)

    def _get_completed_stages(self) -> list[str]:
        """Return runner names whose ``runner_progress`` row is fully completed.

        A row counts as completed when ``total > 0`` and
        ``completed >= total``. Any failure while reading the table is logged
        and treated as "nothing completed", so a resumed run re-executes every
        stage rather than aborting.
        """
        try:
            client = get_client()
            resp = client.table("runner_progress").select("runner_name, completed, total").execute()
            rows: list[dict[str, Any]] = resp.data  # type: ignore[assignment]
            completed: list[str] = [r["runner_name"] for r in rows if r["total"] > 0 and r["completed"] >= r["total"]]
        except Exception:
            # Best effort by design, but say so: otherwise --resume silently reruns everything.
            logger.warning("Failed to read completed stages; assuming none are completed", exc_info=True)
            return []
        else:
            return completed

    def _mark_stage_completed(self, stage_name: str) -> None:
        """Upsert a completed ``runner_progress`` row for *stage_name*.

        Best effort: a failure is logged with its traceback and does not fail
        the stage.
        """
        try:
            client = get_client()
            client.table("runner_progress").upsert({
                "runner_id": f"pipeline-{stage_name}",
                "runner_name": stage_name,
                "total": 1,
                "completed": 1,
                "failed": 0,
            }).execute()
        except Exception:
            logger.warning("Failed to mark stage %s as completed", stage_name, exc_info=True)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""ds.utils — Foundation layer: Supabase client, caching, token pool, config."""
|
|
2
|
+
|
|
3
|
+
from datasmith.utils.core import Settings, get_logger, with_backoff
|
|
4
|
+
from datasmith.utils.db import batch_upsert, fetch_all, get_async_client, get_client, stable_hash, supabase_cached
|
|
5
|
+
from datasmith.utils.tokens import TokenPool
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"Settings",
|
|
9
|
+
"TokenPool",
|
|
10
|
+
"batch_upsert",
|
|
11
|
+
"fetch_all",
|
|
12
|
+
"get_async_client",
|
|
13
|
+
"get_client",
|
|
14
|
+
"get_logger",
|
|
15
|
+
"stable_hash",
|
|
16
|
+
"supabase_cached",
|
|
17
|
+
"with_backoff",
|
|
18
|
+
]
|
datasmith/utils/core.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Core configuration, logging, and retry utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import functools
|
|
6
|
+
import logging
|
|
7
|
+
import sys
|
|
8
|
+
import time
|
|
9
|
+
from typing import Any, Callable, TypeVar
|
|
10
|
+
|
|
11
|
+
from pydantic_settings import BaseSettings
|
|
12
|
+
|
|
13
|
+
F = TypeVar("F", bound=Callable[..., Any])
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Settings(BaseSettings):
    """Application settings loaded from environment / tokens.env.

    Every field has a default, so no setting is required. The meaning of each
    value is defined by the code that consumes it; the grouping comments below
    are inferred from the field names.
    """

    # Supabase connection — presumably project URL and API key; verify against the client factory.
    supabase_url: str = ""
    supabase_key: str = ""
    # GitHub token(s) as one string — NOTE(review): delimiter is defined by the token pool, not here.
    gh_tokens: str = ""
    # DSPy / LLM configuration.
    dspy_model: str = ""
    dspy_api_key: str = ""
    dspy_api_base: str = ""
    dspy_max_tokens: int = 16000
    # Docker Hub credentials.
    dockerhub_username: str = ""
    dockerhub_token: str = ""
    # Path-like setting for a Hugging Face token — presumably a file path; confirm with the publisher.
    hf_token_path: str = ""

    # Read values from a UTF-8 ``tokens.env`` file; ``extra`` is set to "ignore".
    model_config = {"env_file": "tokens.env", "env_file_encoding": "utf-8", "extra": "ignore"}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def get_logger(name: str | None = None) -> logging.Logger:
    """Return a logger under the ``datasmith`` namespace.

    On first use the ``datasmith`` logger is given a stderr handler with a
    timestamped format and its level is set to INFO. Later calls leave the
    existing handlers untouched.

    Args:
        name: Suffix appended as ``datasmith.<name>``. When falsy, the
            ``datasmith`` logger itself is returned.
    """
    namespace = logging.getLogger("datasmith")

    # Configure the namespace logger once; children reach its handler by propagation.
    if not namespace.handlers:
        stream = logging.StreamHandler(sys.stderr)
        formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s", datefmt="%H:%M:%S")
        stream.setFormatter(formatter)
        namespace.addHandler(stream)
        namespace.setLevel(logging.INFO)

    if not name:
        return namespace
    return logging.getLogger(f"datasmith.{name}")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def with_backoff(max_retries: int = 3, base_delay: float = 1.0) -> Callable[[F], F]:
    """Decorator: retry with exponential backoff on failures.

    Works for both plain functions and ``async def`` coroutine functions. A
    coroutine function is awaited inside the retry loop and the wait between
    attempts uses ``asyncio.sleep``, so the event loop is not blocked.

    Args:
        max_retries: Number of retries after the first attempt; the wrapped
            callable runs at most ``max_retries + 1`` times.
        base_delay: Seconds to wait before the first retry; doubled after
            each failed attempt.

    Returns:
        A decorator that wraps a callable with the retry loop.

    Raises:
        ValueError: If *max_retries* is negative.

    Note:
        Any ``Exception`` subclass triggers a retry, not only transient ones.
        The last exception is re-raised once retries are exhausted.
    """
    import asyncio
    import inspect

    if max_retries < 0:
        # A negative count would skip the loop entirely and never call func.
        raise ValueError(f"max_retries must be >= 0, got {max_retries}")

    def decorator(func: F) -> F:
        if inspect.iscoroutinefunction(func):
            # Calling a coroutine function never raises by itself; the failure
            # only surfaces on await, so the await must sit inside the try.
            @functools.wraps(func)
            async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
                delay = base_delay
                for attempt in range(max_retries + 1):
                    try:
                        return await func(*args, **kwargs)
                    except Exception:
                        if attempt == max_retries:
                            raise
                        await asyncio.sleep(delay)
                        delay *= 2
                raise AssertionError("unreachable")  # pragma: no cover

            return async_wrapper  # type: ignore[return-value]

        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            delay = base_delay
            for attempt in range(max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except Exception:
                    if attempt == max_retries:
                        raise
                    time.sleep(delay)
                    delay *= 2
            raise AssertionError("unreachable")  # pragma: no cover

        return wrapper  # type: ignore[return-value]

    return decorator
|