fc-data 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datasmith/__init__.py +330 -0
- datasmith/__init__.pyi +194 -0
- datasmith/agents/__init__.py +31 -0
- datasmith/agents/classifiers.py +272 -0
- datasmith/agents/codex.py +25 -0
- datasmith/agents/config.py +108 -0
- datasmith/agents/extractors.py +197 -0
- datasmith/agents/installed/README.md +52 -0
- datasmith/agents/installed/__init__.py +22 -0
- datasmith/agents/installed/base.py +240 -0
- datasmith/agents/installed/claude.py +134 -0
- datasmith/agents/installed/codex.py +91 -0
- datasmith/agents/installed/gemini.py +118 -0
- datasmith/agents/installed/none.py +27 -0
- datasmith/agents/sandbox.py +547 -0
- datasmith/agents/synthesizer.py +439 -0
- datasmith/agents/templates/AGENTS.md.j2 +150 -0
- datasmith/agents/templates/sandbox_verify.py +428 -0
- datasmith/docker/__init__.py +31 -0
- datasmith/docker/context.py +112 -0
- datasmith/docker/images.py +158 -0
- datasmith/docker/publish.py +56 -0
- datasmith/docker/templates/Dockerfile.base +26 -0
- datasmith/docker/templates/Dockerfile.pr +42 -0
- datasmith/docker/templates/Dockerfile.repo +11 -0
- datasmith/docker/templates/docker_build_base.sh +780 -0
- datasmith/docker/templates/docker_build_env.sh +309 -0
- datasmith/docker/templates/docker_build_final.sh +106 -0
- datasmith/docker/templates/docker_build_pkg.sh +99 -0
- datasmith/docker/templates/docker_build_run.sh +124 -0
- datasmith/docker/templates/entrypoint.sh +62 -0
- datasmith/docker/templates/parser.py +1405 -0
- datasmith/docker/templates/profile.sh +199 -0
- datasmith/docker/templates/pytest_runner.py +692 -0
- datasmith/docker/templates/run-tests.sh +197 -0
- datasmith/docker/verifiers.py +131 -0
- datasmith/filters.py +154 -0
- datasmith/github/__init__.py +22 -0
- datasmith/github/client.py +333 -0
- datasmith/github/hooks.py +50 -0
- datasmith/github/links.py +110 -0
- datasmith/github/models.py +206 -0
- datasmith/github/render.py +173 -0
- datasmith/github/search.py +66 -0
- datasmith/github/templates/comment.md.j2 +5 -0
- datasmith/github/templates/final.md.j2 +66 -0
- datasmith/github/templates/issues.md.j2 +21 -0
- datasmith/github/templates/repo.md.j2 +1 -0
- datasmith/preflight.py +162 -0
- datasmith/publish/__init__.py +13 -0
- datasmith/publish/huggingface.py +104 -0
- datasmith/publish/pipeline.py +60 -0
- datasmith/publish/records.py +91 -0
- datasmith/py.typed +1 -0
- datasmith/resolution/__init__.py +14 -0
- datasmith/resolution/blocklist.py +145 -0
- datasmith/resolution/cache.py +120 -0
- datasmith/resolution/constants.py +277 -0
- datasmith/resolution/dependency_resolver.py +174 -0
- datasmith/resolution/git_utils.py +378 -0
- datasmith/resolution/import_analyzer.py +66 -0
- datasmith/resolution/metadata_parser.py +412 -0
- datasmith/resolution/models.py +41 -0
- datasmith/resolution/orchestrator.py +522 -0
- datasmith/resolution/package_filters.py +312 -0
- datasmith/resolution/python_manager.py +110 -0
- datasmith/runners/__init__.py +15 -0
- datasmith/runners/base.py +112 -0
- datasmith/runners/classify_prs.py +48 -0
- datasmith/runners/render_problems.py +113 -0
- datasmith/runners/resolve_packages.py +66 -0
- datasmith/runners/scrape_commits.py +166 -0
- datasmith/runners/scrape_repos.py +44 -0
- datasmith/runners/synthesize_images.py +310 -0
- datasmith/update/__init__.py +5 -0
- datasmith/update/cli.py +169 -0
- datasmith/update/offline.py +173 -0
- datasmith/update/pipeline.py +497 -0
- datasmith/utils/__init__.py +18 -0
- datasmith/utils/core.py +67 -0
- datasmith/utils/db.py +156 -0
- datasmith/utils/tokens.py +65 -0
- fc_data-0.2.0.dist-info/METADATA +441 -0
- fc_data-0.2.0.dist-info/RECORD +87 -0
- fc_data-0.2.0.dist-info/WHEEL +4 -0
- fc_data-0.2.0.dist-info/entry_points.txt +2 -0
- fc_data-0.2.0.dist-info/licenses/LICENSE +28 -0
datasmith/utils/db.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""Supabase client, ``@supabase_cached`` decorator, and batch upsert helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import functools
|
|
6
|
+
import hashlib
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
from typing import Any, Callable, TypeVar, cast
|
|
10
|
+
|
|
11
|
+
from supabase import Client, create_client
|
|
12
|
+
|
|
13
|
+
F = TypeVar("F", bound=Callable[..., Any])
|
|
14
|
+
|
|
15
|
+
_client: Client | None = None
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def get_client() -> Client:
    """Return the shared Supabase client, creating it on first use.

    The client is built from the ``SUPABASE_URL`` and ``SUPABASE_KEY``
    environment variables and stored in the module-level ``_client`` so
    later calls reuse the same instance.

    Raises:
        ValueError: If either environment variable is missing or empty.
    """
    global _client

    # Fast path: a client was already created by an earlier call.
    if _client is not None:
        return _client

    supabase_url = os.environ.get("SUPABASE_URL", "")
    supabase_key = os.environ.get("SUPABASE_KEY", "")
    if not (supabase_url and supabase_key):
        raise ValueError("SUPABASE_URL and SUPABASE_KEY must be set")

    _client = create_client(supabase_url, supabase_key)
    return _client
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
async def get_async_client() -> Any:
    """Create and return a new asynchronous Supabase client.

    ``acreate_client`` is imported inside the function so that importing
    this module does not fail when the async entry point is unavailable.
    A fresh client is created on every call; nothing is memoised here.

    Raises:
        ValueError: If ``SUPABASE_URL`` or ``SUPABASE_KEY`` is missing or empty.
    """
    from supabase import acreate_client

    credentials = [os.environ.get(name, "") for name in ("SUPABASE_URL", "SUPABASE_KEY")]
    if not all(credentials):
        raise ValueError("SUPABASE_URL and SUPABASE_KEY must be set")

    supabase_url, supabase_key = credentials
    return await acreate_client(supabase_url, supabase_key)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def stable_hash(*args: Any) -> str:
    """Return a deterministic SHA-256 hex digest of *args*.

    The positional arguments are encoded as one compact JSON array with
    dictionary keys sorted. Values that JSON cannot encode natively are
    converted with ``str()`` (``default=str``) before hashing.
    """
    canonical = json.dumps(
        args,
        separators=(",", ":"),
        sort_keys=True,
        default=str,
    )
    digest = hashlib.sha256()
    digest.update(canonical.encode())
    return digest.hexdigest()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def batch_upsert(table: str, rows: list[dict[str, Any]], chunk_size: int = 100) -> int:
    """Upsert *rows* into *table* in chunks of at most *chunk_size* rows.

    Args:
        table: Name of the Supabase table to write to.
        rows: Row dictionaries to upsert. An empty list is a no-op.
        chunk_size: Maximum number of rows sent per request; must be >= 1.

    Returns:
        The total number of rows sent (``len(rows)`` on success).

    Raises:
        ValueError: If *rows* is non-empty and *chunk_size* is smaller than 1.
    """
    if not rows:
        return 0

    # A non-positive step would either make ``range`` raise (0) or yield
    # nothing (negative), silently dropping every row while reporting 0.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    client = get_client()
    sent = 0
    for start in range(0, len(rows), chunk_size):
        chunk = rows[start : start + chunk_size]
        client.table(table).upsert(chunk).execute()
        sent += len(chunk)
    return sent
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def fetch_all(
    table: str,
    select: str = "*",
    filters: dict[str, Any] | None = None,
    is_null: list[str] | None = None,
    gte_filters: dict[str, Any] | None = None,
    lte_filters: dict[str, Any] | None = None,
    neq_filters: dict[str, Any] | None = None,
    page_size: int = 1000,
) -> list[dict[str, Any]]:
    """Fetch every row of *table* matching the given filters, page by page.

    Supabase/PostgREST caps responses at 1 000 rows by default, so this
    helper requests successive ``range()`` windows until a page comes back
    with fewer than *page_size* rows.

    Args:
        table: Name of the table to query.
        select: Column selection passed to ``select()``.
        filters: ``column -> value`` pairs applied with ``eq``.
        is_null: Columns that must be ``null``.
        gte_filters: ``column -> value`` pairs applied with ``gte``.
        lte_filters: ``column -> value`` pairs applied with ``lte``.
        neq_filters: ``column -> value`` pairs applied with ``neq``.
        page_size: Number of rows requested per page; must be >= 1.

    Returns:
        All matching rows, concatenated in the order the pages were returned.

    Raises:
        ValueError: If *page_size* is smaller than 1.
    """
    # With page_size <= 0 the stop condition ``len(page) < page_size`` can
    # never be true, so the loop below would never terminate.
    if page_size < 1:
        raise ValueError(f"page_size must be a positive integer, got {page_size!r}")

    # NOTE(review): no explicit ordering is applied, so page boundaries rely
    # on the server returning rows in a stable order between requests; confirm.
    # NOTE(review): if the server caps responses below *page_size*, the first
    # page looks "short" and pagination stops early; confirm the configured cap.
    client = get_client()
    rows: list[dict[str, Any]] = []
    offset = 0
    while True:
        # The query builder is rebuilt from scratch for every page.
        query = client.table(table).select(select)
        for col, val in (filters or {}).items():
            query = query.eq(col, val)
        for col in is_null or []:
            query = query.is_(col, "null")
        for col, val in (gte_filters or {}).items():
            query = query.gte(col, val)
        for col, val in (lte_filters or {}).items():
            query = query.lte(col, val)
        for col, val in (neq_filters or {}).items():
            query = query.neq(col, val)

        # ``range`` bounds are inclusive, hence the ``- 1``.
        resp = query.range(offset, offset + page_size - 1).execute()
        page = cast(list[dict[str, Any]], resp.data or [])
        rows.extend(page)
        if len(page) < page_size:
            break
        offset += page_size
    return rows
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def supabase_cached(func: F) -> F:
    """Decorator that caches function results in the Supabase ``hook_cache`` table.

    Cache rows are keyed by ``(entity_key, hook_name, args_hash)``:

    * ``entity_key`` is the ``cache_key`` attribute of the first positional
      argument (e.g. a Pydantic model with that property).
    * ``hook_name`` is the decorated function's ``__name__``.
    * ``args_hash`` is ``stable_hash`` of the remaining positional arguments
      and the keyword arguments.

    If the first positional argument has no ``cache_key`` attribute,
    ``entity_key`` falls back to ``"unknown"`` and that argument is included
    in ``args_hash`` so that different entities do not share one cache row.

    Special kwarg ``force=True`` bypasses the cache lookup and overwrites
    the stored value. ``force`` is removed from ``kwargs`` before the wrapped
    function is called and before the hash is computed.

    Note that a cache hit returns the stored ``result_json`` value as-is,
    whereas a cache miss returns the wrapped function's own return value
    (which is serialised with ``model_dump(mode="json")`` only for storage).
    """
    missing = object()  # sentinel: distinguishes "no cache_key" from any real value

    @functools.wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        force = kwargs.pop("force", False)

        # Build cache key components
        entity = args[0] if args else None
        entity_key = getattr(entity, "cache_key", missing)
        if entity_key is missing:
            # Without a cache_key the entity itself is the only thing that
            # tells two calls apart, so it must take part in the hash.
            entity_key = "unknown"
            hashed_positional = args
        else:
            hashed_positional = args[1:]
        hook_name = func.__name__
        args_hash = stable_hash(hashed_positional, kwargs)

        client = get_client()

        if not force:
            resp = (
                client.table("hook_cache")
                .select("result_json")
                .eq("entity_key", entity_key)
                .eq("hook_name", hook_name)
                .eq("args_hash", args_hash)
                .execute()
            )
            if resp.data:
                first = cast(dict[str, Any], resp.data[0])
                return first["result_json"]

        result = func(*args, **kwargs)

        # Serialize: Pydantic models → dict, everything else → as-is
        stored = result.model_dump(mode="json") if hasattr(result, "model_dump") else result

        client.table("hook_cache").upsert({
            "entity_key": entity_key,
            "hook_name": hook_name,
            "args_hash": args_hash,
            "result_json": stored,
        }).execute()

        return result

    return wrapper  # type: ignore[return-value]
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""GitHub token pool with rotation and rate-limit awareness."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import threading
|
|
7
|
+
import time
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class _RateLimit:
    # Requests believed to be left for the token in its current window.
    remaining: int = 5000
    # Unix timestamp (seconds, compared against ``time.time()``) at which the
    # window resets; the default 0.0 is always in the past.
    reset_at: float = 0.0


class TokenPool:
    """Thread-safe pool of GitHub tokens with automatic rotation.

    Tokens are read from the ``GH_TOKENS`` environment variable (comma-separated),
    falling back to ``GH_TOKEN``, or can be passed directly.

    Raises:
        ValueError: If no tokens are supplied and none are found in the environment.
    """

    # Quota assigned to a token once its reset time has passed.
    _DEFAULT_QUOTA = 5000
    # Cool-down (seconds) used when a token is reported exhausted without a
    # reset time. NOTE(review): 60 s follows GitHub's "wait at least one
    # minute" guidance for rate-limit responses without timing headers; confirm.
    _DEFAULT_COOLDOWN_S = 60.0

    def __init__(self, tokens: list[str] | None = None) -> None:
        if tokens is None:
            raw = os.environ.get("GH_TOKENS", os.environ.get("GH_TOKEN", ""))
            tokens = [t.strip() for t in raw.split(",") if t.strip()]
        if not tokens:
            raise ValueError("No GitHub tokens provided (set GH_TOKENS env var)")
        # Copy so later mutation of the caller's list cannot desynchronise
        # ``_tokens`` from the keys of ``_rate_limits``.
        self._tokens = list(tokens)
        self._lock = threading.Lock()
        self._rate_limits: dict[str, _RateLimit] = {t: _RateLimit() for t in self._tokens}

    @property
    def size(self) -> int:
        """Number of tokens in the pool."""
        return len(self._tokens)

    def get_token(self) -> str:
        """Return a token that is not currently rate-limited.

        Tokens are checked in the order they were supplied; the first one
        with quota left, or whose reset time has passed, is returned.

        Blocks if all tokens are exhausted until the earliest reset time.
        """
        while True:
            with self._lock:
                now = time.time()
                for token in self._tokens:
                    state = self._rate_limits[token]
                    window_passed = state.reset_at <= now
                    if window_passed:
                        # Reset if window has passed
                        state.remaining = self._DEFAULT_QUOTA
                    if window_passed or state.remaining > 0:
                        return token

                # All exhausted — find earliest reset
                earliest = min(s.reset_at for s in self._rate_limits.values())

            # Sleep outside the lock so report_rate_limit() is never blocked.
            wait = max(0.1, earliest - time.time())
            time.sleep(min(wait, 5.0))  # cap sleep to re-check periodically

    def report_rate_limit(self, token: str, remaining: int = 0, reset_at: float = 0.0) -> None:
        """Update rate-limit state for a token (called on 429/403).

        Args:
            token: The token the response belongs to; unknown tokens are ignored.
            remaining: Requests left in the current window.
            reset_at: Unix timestamp at which the window resets. When the token
                is reported exhausted (``remaining <= 0``) and no reset time is
                given (``reset_at <= 0``), a default cool-down is applied;
                otherwise ``get_token`` would treat the window as already over
                and hand the same token straight back.
        """
        with self._lock:
            state = self._rate_limits.get(token)
            if state is None:
                return
            if remaining <= 0 and reset_at <= 0:
                reset_at = time.time() + self._DEFAULT_COOLDOWN_S
            state.remaining = remaining
            state.reset_at = reset_at
|
|
@@ -0,0 +1,441 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fc-data
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Python toolchain for building and maintaining FormulaCode benchmark tasks.
|
|
5
|
+
Project-URL: Homepage, https://formula-code.github.io/datasmith/
|
|
6
|
+
Project-URL: Repository, https://github.com/formula-code/datasmith
|
|
7
|
+
Project-URL: Documentation, https://formula-code.github.io/datasmith/
|
|
8
|
+
Author-email: Atharva Sehgal <atharva.sehgal@gmail.com>
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: python
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Programming Language :: Python
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
17
|
+
Requires-Python: <4.0,>=3.9
|
|
18
|
+
Requires-Dist: dspy>=2.6.27
|
|
19
|
+
Requires-Dist: gitpython
|
|
20
|
+
Requires-Dist: httpx>=0.27
|
|
21
|
+
Requires-Dist: huggingface-hub>=0.20
|
|
22
|
+
Requires-Dist: jinja2>=3.1.6
|
|
23
|
+
Requires-Dist: json5>=0.9
|
|
24
|
+
Requires-Dist: portkey-ai>=1.14.3
|
|
25
|
+
Requires-Dist: pyarrow>=14.0
|
|
26
|
+
Requires-Dist: pydantic-settings>=2.0
|
|
27
|
+
Requires-Dist: pydantic>=2.0
|
|
28
|
+
Requires-Dist: python-dotenv>=1.1.1
|
|
29
|
+
Requires-Dist: python-on-whales>=0.60
|
|
30
|
+
Requires-Dist: supabase>=2.0
|
|
31
|
+
Requires-Dist: tiktoken
|
|
32
|
+
Requires-Dist: tomli>=2.0; python_version < '3.11'
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+

|
|
36
|
+
|
|
37
|
+
<p align="center">
|
|
38
|
+
<a href="https://formula-code.github.io/">
|
|
39
|
+
<img src="https://img.shields.io/badge/%F0%9F%8C%90%20Website-0A7A5E?style=for-the-badge" alt="FormulaCode Website">
|
|
40
|
+
</a>
|
|
41
|
+
<a href="https://huggingface.co/papers/2603.16011">
|
|
42
|
+
<img src="https://img.shields.io/badge/Paper-1F6FEB?style=for-the-badge&logo=arxiv&logoColor=white" alt="FormulaCode Paper">
|
|
43
|
+
</a>
|
|
44
|
+
<a href="https://formula-code.github.io/leaderboard/">
|
|
45
|
+
<img src="https://img.shields.io/badge/%F0%9F%93%88%20Leaderboard-EA580C?style=for-the-badge&logoColor=white" alt="FormulaCode Leaderboard">
|
|
46
|
+
</a>
|
|
47
|
+
<a href="https://formula-code.github.io/registry/">
|
|
48
|
+
<img src="https://img.shields.io/badge/%F0%9F%93%88%20Live%20Task%20Registry-7F1D1D?style=for-the-badge&logoColor=white" alt="FormulaCode Live Task Registry">
|
|
49
|
+
</a>
|
|
50
|
+
</p>
|
|
51
|
+
|
|
52
|
+
[FormulaCode](https://formula-code.github.io/) is a *continually updating* benchmark for evaluating the holistic ability of LLM agents to optimize codebases. FormulaCode consists of two parts: a [pipeline](https://github.com/formula-code/datasmith) to construct performance optimization tasks, and an [execution harness](https://github.com/formula-code/terminal-bench) that connects a language model to our terminal sandbox. _This repository contains the task generation pipeline._
|
|
53
|
+
|
|
54
|
+
`fc-data` is a python package for automatically curating and managing [FormulaCode](https://formula-code.github.io/) tasks. After installation, fc-data is designed to run as a monthly CRON job that updates the FormulaCode dataset with new commits and repositories.
|
|
55
|
+
|
|
56
|
+
## High level overview
|
|
57
|
+
|
|
58
|
+
```mermaid
|
|
59
|
+
graph LR
|
|
60
|
+
A --->|scrape| B
|
|
61
|
+
A2 <-->|sync| B
|
|
62
|
+
B -->|publish| C
|
|
63
|
+
B -->|publish| D
|
|
64
|
+
|
|
65
|
+
A[Github]
|
|
66
|
+
A2[Supabase]
|
|
67
|
+
B["`fc-data
|
|
68
|
+
(This repository)`"]
|
|
69
|
+
C[DockerHub]
|
|
70
|
+
D[HuggingFace]
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Use cases
|
|
74
|
+
|
|
75
|
+
`fc-data` is designed primarily to enable continual dataset updates for FormulaCode. After [installation](#installation), the monthly update is a single command:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
$ pip install fc-data
|
|
79
|
+
$ fc-data --start-date 2026-02-01 --end-date 2026-03-01
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
This runs six stages in order: scrape repos, scrape commits, classify PRs, resolve packages, synthesize Docker images, and publish the Docker images to DockerHub and the PRs to HuggingFace. The dataset is versioned by month (e.g. `formulacode@2026-03`). On our servers, this command runs as a monthly CRON job.
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
However, this isn't the only use case for `fc-data`. We've designed `fc-data` to help you manage your own custom GitHub-centric benchmark. Each benchmark contains a task which revolves around a GitHub issue (or pull request, which is just an issue with extra details). We include some helpful properties to start off:
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from datasmith.github import PR, GitHubClient
|
|
89
|
+
from datasmith.utils import TokenPool
|
|
90
|
+
|
|
91
|
+
# Every task starts with a PR.
|
|
92
|
+
pr = PR(repository="astropy/astropy", issue_number=16222)
|
|
93
|
+
|
|
94
|
+
# PRs are frozen Pydantic v2 models — immutable after creation.
|
|
95
|
+
pr.merge_commit_sha # the merge commit sha
|
|
96
|
+
pr.base_sha # base branch commit
|
|
97
|
+
pr.cache_key # "astropy/astropy:16222" — used for Supabase caching
|
|
98
|
+
|
|
99
|
+
# Or fetch a fully-hydrated PR (tries Supabase first, then GitHub API):
|
|
100
|
+
pr = await PR.fetch("astropy/astropy", 16222)
|
|
101
|
+
pr.merge_commit_sha # now populated from the database or API
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
You can also fetch live data from GitHub using the async client directly:
|
|
105
|
+
```python
|
|
106
|
+
pool = TokenPool() # reads GH_TOKENS env var, rotates tokens on rate-limit
|
|
107
|
+
gh = GitHubClient(pool)
|
|
108
|
+
|
|
109
|
+
# Fetch a PR from the GitHub API.
|
|
110
|
+
pr = await gh.get_pr("pandas-dev", "pandas", 16222)
|
|
111
|
+
|
|
112
|
+
# Fetch the diff as a string.
|
|
113
|
+
diff = await gh.get_diff("pandas-dev", "pandas", 16222)
|
|
114
|
+
|
|
115
|
+
# Fetch the timeline of events.
|
|
116
|
+
events = await gh.get_timeline("pandas-dev", "pandas", 16222)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Want to extract structured information from the PR? Use our built-in agents or define your own!
|
|
120
|
+
```python
|
|
121
|
+
from datasmith.github import render_problem_statement, scrape_links
|
|
122
|
+
|
|
123
|
+
# Render a problem statement from the PR and its linked issues.
|
|
124
|
+
statement = render_problem_statement(pr, anonymize=True)
|
|
125
|
+
|
|
126
|
+
# You can also scrape for linked issues via BFS.
|
|
127
|
+
issues = await scrape_links(pr, gh.get_issue, depth=2, only_issues=True, limit=6)
|
|
128
|
+
|
|
129
|
+
# Then pass them into the renderer for richer context.
|
|
130
|
+
statement = render_problem_statement(pr, issues=issues, repo_description="pandas is a data analysis library")
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
Don't like the current set of operations? Define your own!
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
# You can register custom hooks for dataset-specific operations.
|
|
138
|
+
from datasmith.github import HookRegistry
|
|
139
|
+
|
|
140
|
+
from dspy import ChainOfThought
|
|
141
|
+
summarizer = ChainOfThought("document -> summary")
|
|
142
|
+
|
|
143
|
+
def summarize(pr):
|
|
144
|
+
doc = render_problem_statement(pr, anonymize=True)
|
|
145
|
+
return summarizer(doc).summary
|
|
146
|
+
|
|
147
|
+
HookRegistry.register("summarize", summarize) # auto-wrapped with @supabase_cached
|
|
148
|
+
|
|
149
|
+
# Now use it:
|
|
150
|
+
pr = PR(repository="astropy/astropy", issue_number=16222)
|
|
151
|
+
HookRegistry.call("summarize", pr) # first call: hits LLM
|
|
152
|
+
HookRegistry.call("summarize", pr) # second call: reads from Supabase cache. No cost!
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
Almost all our supported operations can be run asynchronously. Here's how to run some FormulaCode-specific operations at scale:
|
|
156
|
+
```python
|
|
157
|
+
from datasmith.runners import ClassifyPRsRunner
|
|
158
|
+
from datasmith.agents import PerfClassifier, ClassifyJudge
|
|
159
|
+
|
|
160
|
+
runner = ClassifyPRsRunner(PerfClassifier(), ClassifyJudge(), n_concurrent=64)
|
|
161
|
+
await runner.run(pr_items)
|
|
162
|
+
# Progress tracked in Supabase runner_progress table.
|
|
163
|
+
# Per-item failures logged in runner_failures — the runner never aborts.
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
By default, each operation is cached in Supabase so you don't keep hitting expensive hooks.
|
|
167
|
+
|
|
168
|
+
A pull request is useless if you cannot build a reproducible environment for it. fc-data supports building docker images for any pull request using a three-tier hierarchy:
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
from datasmith.docker import ImageManager, MultiObjVerifier, SmokeVerifier, ProfileVerifier
|
|
172
|
+
|
|
173
|
+
mgr = ImageManager()
|
|
174
|
+
mgr.build_base_image() # formulacode/base:latest (uses the default Dockerfile.base)
|
|
175
|
+
mgr.build_repo_image("pandas-dev", "pandas",) # formulacode/pandas-dev-pandas:latest (Look up Dockerfile.repo for pandas-dev/pandas that should be stored in supabase or fallback to the default Dockerfile.repo)
|
|
176
|
+
mgr.build_pr_image("pandas-dev", "pandas", 16222,) # formulacode/pandas-dev-pandas:16222 (Look up Dockerfile.pr for pandas-dev/pandas:16222 that should be stored in supabase or fallback to the default Dockerfile.pr)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
# Alternatively, if the user wants to use a custom Dockerfile, they can do so by:
|
|
180
|
+
|
|
181
|
+
mgr.build_base_image(context="path/to/custom/context")
|
|
182
|
+
mgr.build_repo_image("pandas-dev", "pandas", context="path/to/custom/context")
|
|
183
|
+
mgr.build_pr_image("pandas-dev", "pandas", 16222, context="path/to/custom/context")
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
# Verify an image with a chain of verifiers — short-circuits on first failure.
|
|
187
|
+
verifier = MultiObjVerifier(verifiers=[
|
|
188
|
+
SmokeVerifier("pandas"), # can we import the package?
|
|
189
|
+
ProfileVerifier(timeout=300), # can we discover and run ASV benchmarks?
|
|
190
|
+
])
|
|
191
|
+
result = verifier.verify("formulacode/pandas-dev-pandas:16222")
|
|
192
|
+
# result.ok, result.rc, result.stdout, result.stderr, result.duration_s
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
One of the main features of `fc-data` is the ability to automatically synthesize docker containers for a pull request. The synthesizer is a state machine that checks Supabase for cached contexts, tries similar build scripts, then falls back to an installed CLI agent (Claude Code, Codex, or Gemini — auto-detected):
|
|
196
|
+
|
|
197
|
+
```python
|
|
198
|
+
from datasmith.agents import Synthesizer
|
|
199
|
+
from datasmith.docker import MultiObjVerifier, SmokeVerifier, ProfileVerifier
|
|
200
|
+
from datasmith.docker.context import DockerContext
|
|
201
|
+
|
|
202
|
+
# The verifier chain validates each synthesis attempt.
|
|
203
|
+
verifier = MultiObjVerifier(verifiers=[
|
|
204
|
+
SmokeVerifier("pandas"), # can we import the package?
|
|
205
|
+
ProfileVerifier(timeout=300), # can we discover and run ASV benchmarks?
|
|
206
|
+
])
|
|
207
|
+
|
|
208
|
+
# Load a base Docker build context (Dockerfile + shell scripts) to iterate on.
|
|
209
|
+
base_context = DockerContext.from_directory("dataset/formulacode_verified/pandas-dev_pandas/abc123")
|
|
210
|
+
|
|
211
|
+
synth = Synthesizer(max_attempts=3)
|
|
212
|
+
ctx = synth.run(
|
|
213
|
+
owner="pandas-dev",
|
|
214
|
+
repo="pandas",
|
|
215
|
+
issue_number=16222,
|
|
216
|
+
pr_context="This PR optimizes groupby performance by ...",
|
|
217
|
+
verifier=verifier,
|
|
218
|
+
sha="abc123def456",
|
|
219
|
+
base_context=base_context,
|
|
220
|
+
env_payload='{"dependencies": ["numpy==1.26.0", "cython==3.0.0"]}',
|
|
221
|
+
python_version="3.10",
|
|
222
|
+
)
|
|
223
|
+
# Checking cache for pandas-dev/pandas@abc123def456... [MISS]
|
|
224
|
+
# Found 4 similar scripts from pandas-dev/pandas
|
|
225
|
+
# Attempt 1/4 with similar script... [FAIL]
|
|
226
|
+
# Launching claude agent sandbox in /tmp/synthesis-xxx...
|
|
227
|
+
# Sandbox synthesis succeeded [PASS]
|
|
228
|
+
# Saved context for pandas-dev/pandas@abc123def456
|
|
229
|
+
#
|
|
230
|
+
# On success, the DockerContext is persisted to Supabase's candidate_containers table.
|
|
231
|
+
# ctx is a DockerContext with the working build scripts, or None if all attempts failed.
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
If ALL attempts fail, `synthesize` logs every attempt (stderr, stdout, model, script used) to Supabase's `build_attempts` table and returns `None`. Failed PRs can be retried later — the logged attempts provide context for debugging or a future synthesis run.
|
|
235
|
+
|
|
236
|
+
This can be run asynchronously as well for multiple tasks (WARNING: Might be expensive!):
|
|
237
|
+
```python
|
|
238
|
+
from datasmith.runners import SynthesizeImagesRunner
|
|
239
|
+
|
|
240
|
+
runner = SynthesizeImagesRunner(synth, verifier, n_concurrent=8)
|
|
241
|
+
await runner.run(pr_items)
|
|
242
|
+
# Returns None entries for PRs where synthesis failed.
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
How do we make a dataset out of this? Query Supabase directly and publish:
|
|
246
|
+
```python
|
|
247
|
+
from datasmith.utils.db import get_client
|
|
248
|
+
from datasmith.publish import records_from_supabase, HuggingFacePublisher
|
|
249
|
+
|
|
250
|
+
# Query all verified, unpublished perf PRs from the last month.
|
|
251
|
+
records = records_from_supabase(start_date="2026-02-01", end_date="2026-03-01")
|
|
252
|
+
|
|
253
|
+
# Or query Supabase directly for more control.
|
|
254
|
+
sb = get_client()
|
|
255
|
+
rows = sb.table("pull_requests") \
|
|
256
|
+
.select("*") \
|
|
257
|
+
.eq("is_performance_commit", True) \
|
|
258
|
+
.not_.is_("container_name", "null") \
|
|
259
|
+
.execute()
|
|
260
|
+
|
|
261
|
+
# Publish to HuggingFace as a versioned Parquet dataset.
|
|
262
|
+
hf = HuggingFacePublisher()
|
|
263
|
+
hf.publish(records, version="formulacode@2026-03")
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
We define tasks using `terminal-bench`'s formulacode adapter for evaluation:
|
|
267
|
+
```python
|
|
268
|
+
from terminal_bench.adapters.formulacode import FormulaCodeAdapter
|
|
269
|
+
from terminal_bench.harness.harness import Harness
|
|
270
|
+
|
|
271
|
+
adapter = FormulaCodeAdapter(task_dir="fctasks/", force=True)
|
|
272
|
+
adapter.generate_task(pr.to_record())
|
|
273
|
+
|
|
274
|
+
run = Harness(
|
|
275
|
+
output_path="fcevals/",
|
|
276
|
+
dataset_path="dataset_path",
|
|
277
|
+
task_ids=[pr.to_record().task_id],
|
|
278
|
+
agent_configs=[
|
|
279
|
+
{"agent_name": "nop", "model_name": "nop"},
|
|
280
|
+
{"agent_name": "oracle", "model_name": "oracle"},
|
|
281
|
+
],
|
|
282
|
+
)
|
|
283
|
+
|
|
284
|
+
print(run.results[0].is_resolved) # Did the oracle get a speedup > 1.00 over baseline?
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
## Database schema
|
|
288
|
+
|
|
289
|
+
There are six tables in Supabase (Postgres):
|
|
290
|
+
|
|
291
|
+
| Table | Primary key | Purpose |
|
|
292
|
+
|-------|-------------|---------|
|
|
293
|
+
| `repositories` | `(owner, repo)` | Scraped GitHub repos (language, stars, topics, description) |
|
|
294
|
+
| `pull_requests` | `(owner, repo, issue_number)` | PR metadata, classification, rendered problems, publish status |
|
|
295
|
+
| `hook_cache` | `(entity_key, hook_name, args_hash)` | Deterministic cache for `@supabase_cached` |
|
|
296
|
+
| `build_attempts` | `id` (serial) | Every Docker build attempt (model, script, ok, stderr/stdout tails) |
|
|
297
|
+
| `runner_progress` | `runner_id` | Per-runner progress (total, completed, failed) |
|
|
298
|
+
| `runner_failures` | `id` (serial) | Per-item failure details (error message, traceback) |
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
## Installation
|
|
302
|
+
|
|
303
|
+
Install [uv](https://astral.sh/uv/) and [Node.js](https://nodejs.org/) (for Supabase CLI), then set up the development environment:
|
|
304
|
+
|
|
305
|
+
```bash
|
|
306
|
+
# Install uv
|
|
307
|
+
$ curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
308
|
+
# Install npm (for Supabase CLI)
|
|
309
|
+
$ curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.7/install.sh | bash
|
|
310
|
+
$ nvm install --lts
|
|
311
|
+
$ nvm use --lts
|
|
312
|
+
# Install dev environment and pre-commit hooks
|
|
313
|
+
$ make install
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
Create a `tokens.env` file in the repo root:
|
|
317
|
+
```bash
|
|
318
|
+
# Supabase (required)
|
|
319
|
+
SUPABASE_URL=http://127.0.0.1:54321
|
|
320
|
+
SUPABASE_KEY=your-service-role-key
|
|
321
|
+
|
|
322
|
+
# GitHub (required — comma-separated for multiple tokens)
|
|
323
|
+
GH_TOKENS=github_pat_xxx,github_pat_yyy
|
|
324
|
+
|
|
325
|
+
# LLM backends (for classification and synthesis)
|
|
326
|
+
DSPY_MODEL=openai/gpt-oss-120b
|
|
327
|
+
DSPY_API_BASE=http://localhost:30000/v1
|
|
328
|
+
DSPY_API_KEY=local
|
|
329
|
+
DSPY_MAX_TOKENS=16000
|
|
330
|
+
|
|
331
|
+
# DockerHub (for publishing)
|
|
332
|
+
DOCKERHUB_USERNAME=formulacode
|
|
333
|
+
DOCKERHUB_TOKEN=dckr_pat_xxxxx
|
|
334
|
+
|
|
335
|
+
# HuggingFace (for dataset publishing)
|
|
336
|
+
HF_TOKEN_PATH=/path/to/huggingface/token
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
### Supabase
|
|
340
|
+
|
|
341
|
+
Start the local Supabase instance and apply all migrations:
|
|
342
|
+
```bash
|
|
343
|
+
$ npx supabase start # starts Postgres, Auth, Storage, Studio, etc.
|
|
344
|
+
$ npx supabase migration up --local # apply migrations in supabase/migrations/
|
|
345
|
+
```
|
|
346
|
+
|
|
347
|
+
Common commands:
|
|
348
|
+
```bash
|
|
349
|
+
$ npx supabase status # show URLs, ports, and service health
|
|
350
|
+
$ npx supabase migration list --local # list applied / pending migrations
|
|
351
|
+
$ npx supabase db reset # wipe and recreate from migrations (destructive)
|
|
352
|
+
$ npx supabase stop # stop all containers
|
|
353
|
+
```
|
|
354
|
+
|
|
355
|
+
Studio is available at the URL printed by `supabase status` (default `http://127.0.0.1:54323`) — use it to browse tables, run SQL, and inspect data.
|
|
356
|
+
|
|
357
|
+
Running `preflight` ensures that all the variables are properly defined:
|
|
358
|
+
```bash
|
|
359
|
+
$ python -m datasmith.preflight
|
|
360
|
+
|
|
361
|
+
== Environment ==
|
|
362
|
+
[OK] SUPABASE_URL — http://127.0.0.1:54...
|
|
363
|
+
[OK] SUPABASE_KEY — ***
|
|
364
|
+
[OK] GH_TOKENS — 3 token(s)
|
|
365
|
+
[OK] HF_TOKEN — /path/to/huggingface/token
|
|
366
|
+
|
|
367
|
+
== Supabase ==
|
|
368
|
+
[OK] Connection
|
|
369
|
+
|
|
370
|
+
== Docker ==
|
|
371
|
+
[OK] Docker daemon
|
|
372
|
+
|
|
373
|
+
== GitHub ==
|
|
374
|
+
[OK] API access — remaining=4998
|
|
375
|
+
|
|
376
|
+
========================================
|
|
377
|
+
All checks passed!
|
|
378
|
+
```
|
|
379
|
+
|
|
380
|
+
After that works, run the tests locally. Each new functionality MUST have a test:
|
|
381
|
+
```bash
|
|
382
|
+
$ make check # ruff lint + mypy type check
|
|
383
|
+
$ make test # pytest
|
|
384
|
+
```
|
|
385
|
+
|
|
386
|
+
## Updating FormulaCode
|
|
387
|
+
|
|
388
|
+
The monthly update is a single command:
|
|
389
|
+
```bash
|
|
390
|
+
$ fc-data --start-date 2026-02-01 --end-date 2026-03-01
|
|
391
|
+
```
|
|
392
|
+
|
|
393
|
+
This runs six stages in order: scrape repos, scrape commits, classify PRs, resolve packages, synthesize Docker images, and publish to DockerHub + HuggingFace. Options:
|
|
394
|
+
|
|
395
|
+
```bash
|
|
396
|
+
$ fc-data --start-date 2026-02-01 --end-date 2026-03-01 --resume # skip completed stages
|
|
397
|
+
$ fc-data --start-date 2026-02-01 --end-date 2026-03-01 --stage 4 # run only package resolution
|
|
398
|
+
$ fc-data --start-date 2026-02-01 --end-date 2026-03-01 --dry-run # log without executing
|
|
399
|
+
$ fc-data --start-date 2026-02-01 --end-date 2026-03-01 --stage 5 \
|
|
400
|
+
--agent codex --n-concurrent 5 --tasks-per-repo 5 # synthesis with codex, capped
|
|
401
|
+
$ fc-data --start-date 2026-02-01 --end-date 2026-03-01 --stage 5 \
|
|
402
|
+
--force # re-run synthesis for all tasks
|
|
403
|
+
```
|
|
404
|
+
|
|
405
|
+
| Flag | Description |
|
|
406
|
+
|------|-------------|
|
|
407
|
+
| `--resume` | Skip stages already marked complete and resume from the next pending stage |
|
|
408
|
+
| `--stage N` | Run only stage N (1–6) |
|
|
409
|
+
| `--dry-run` | Log what each stage would do without executing |
|
|
410
|
+
| `--n-concurrent N` | Max concurrent items per runner stage |
|
|
411
|
+
| `--tasks-per-repo N` | Cap tasks per repository for stage 5 (synthesize_images) |
|
|
412
|
+
| `--agent {claude,codex,gemini}` | CLI agent for stage 5 synthesis (default: auto-detect first available) |
|
|
413
|
+
| `--force` | Re-run synthesis even for tasks that already have a container or cached context (stage 5 only) |
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
## Dataset verification
|
|
417
|
+
|
|
418
|
+
Each task lives in `dataset/formulacode_verified/<owner_repo>/<sha>/` with a multi-stage Dockerfile and shell build scripts. The verification loop:
|
|
419
|
+
|
|
420
|
+
```bash
|
|
421
|
+
$ python dataset/verify.py --task dataset/formulacode_verified/<owner_repo>/<sha>
|
|
422
|
+
# Check failure.json for errors -> edit docker_build_pkg.sh / docker_build_run.sh -> rerun
|
|
423
|
+
# Done when verification_success.json appears
|
|
424
|
+
```
|
|
425
|
+
|
|
426
|
+
Only modify `docker_build_pkg.sh` and `docker_build_run.sh` during verification fixes.
|
|
427
|
+
|
|
428
|
+
```bash
|
|
429
|
+
$ python scratch/scripts/prepare_formulacode_dataset.py \
|
|
430
|
+
--input scratch/artifacts/pipeflush/perfonly_commits_master.parquet \
|
|
431
|
+
--output scratch/artifacts/pipeflush/perfonly_enriched.parquet \
|
|
432
|
+
--dockerhub-repository formulacode/all \
|
|
433
|
+
--upload-to-hf formulacode/formulacode-all \
|
|
434
|
+
--hf-verified-filter /path/to/valid_tasks.json
|
|
435
|
+
```
|
|
436
|
+
|
|
437
|
+
> Requires `HF_TOKEN` in `tokens.env`. The upload creates `default`, `verified`, and per-month (`YYYY-MM`) configs on Hugging Face.
|
|
438
|
+
|
|
439
|
+
### Evaluation
|
|
440
|
+
|
|
441
|
+
Evaluation is done in FormulaCode's fork of the [terminal-bench](https://github.com/formula-code/fc-eval) evaluation framework.
|