fc-data 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. datasmith/__init__.py +330 -0
  2. datasmith/__init__.pyi +194 -0
  3. datasmith/agents/__init__.py +31 -0
  4. datasmith/agents/classifiers.py +272 -0
  5. datasmith/agents/codex.py +25 -0
  6. datasmith/agents/config.py +108 -0
  7. datasmith/agents/extractors.py +197 -0
  8. datasmith/agents/installed/README.md +52 -0
  9. datasmith/agents/installed/__init__.py +22 -0
  10. datasmith/agents/installed/base.py +240 -0
  11. datasmith/agents/installed/claude.py +134 -0
  12. datasmith/agents/installed/codex.py +91 -0
  13. datasmith/agents/installed/gemini.py +118 -0
  14. datasmith/agents/installed/none.py +27 -0
  15. datasmith/agents/sandbox.py +547 -0
  16. datasmith/agents/synthesizer.py +439 -0
  17. datasmith/agents/templates/AGENTS.md.j2 +150 -0
  18. datasmith/agents/templates/sandbox_verify.py +428 -0
  19. datasmith/docker/__init__.py +31 -0
  20. datasmith/docker/context.py +112 -0
  21. datasmith/docker/images.py +158 -0
  22. datasmith/docker/publish.py +56 -0
  23. datasmith/docker/templates/Dockerfile.base +26 -0
  24. datasmith/docker/templates/Dockerfile.pr +42 -0
  25. datasmith/docker/templates/Dockerfile.repo +11 -0
  26. datasmith/docker/templates/docker_build_base.sh +780 -0
  27. datasmith/docker/templates/docker_build_env.sh +309 -0
  28. datasmith/docker/templates/docker_build_final.sh +106 -0
  29. datasmith/docker/templates/docker_build_pkg.sh +99 -0
  30. datasmith/docker/templates/docker_build_run.sh +124 -0
  31. datasmith/docker/templates/entrypoint.sh +62 -0
  32. datasmith/docker/templates/parser.py +1405 -0
  33. datasmith/docker/templates/profile.sh +199 -0
  34. datasmith/docker/templates/pytest_runner.py +692 -0
  35. datasmith/docker/templates/run-tests.sh +197 -0
  36. datasmith/docker/verifiers.py +131 -0
  37. datasmith/filters.py +154 -0
  38. datasmith/github/__init__.py +22 -0
  39. datasmith/github/client.py +333 -0
  40. datasmith/github/hooks.py +50 -0
  41. datasmith/github/links.py +110 -0
  42. datasmith/github/models.py +206 -0
  43. datasmith/github/render.py +173 -0
  44. datasmith/github/search.py +66 -0
  45. datasmith/github/templates/comment.md.j2 +5 -0
  46. datasmith/github/templates/final.md.j2 +66 -0
  47. datasmith/github/templates/issues.md.j2 +21 -0
  48. datasmith/github/templates/repo.md.j2 +1 -0
  49. datasmith/preflight.py +162 -0
  50. datasmith/publish/__init__.py +13 -0
  51. datasmith/publish/huggingface.py +104 -0
  52. datasmith/publish/pipeline.py +60 -0
  53. datasmith/publish/records.py +91 -0
  54. datasmith/py.typed +1 -0
  55. datasmith/resolution/__init__.py +14 -0
  56. datasmith/resolution/blocklist.py +145 -0
  57. datasmith/resolution/cache.py +120 -0
  58. datasmith/resolution/constants.py +277 -0
  59. datasmith/resolution/dependency_resolver.py +174 -0
  60. datasmith/resolution/git_utils.py +378 -0
  61. datasmith/resolution/import_analyzer.py +66 -0
  62. datasmith/resolution/metadata_parser.py +412 -0
  63. datasmith/resolution/models.py +41 -0
  64. datasmith/resolution/orchestrator.py +522 -0
  65. datasmith/resolution/package_filters.py +312 -0
  66. datasmith/resolution/python_manager.py +110 -0
  67. datasmith/runners/__init__.py +15 -0
  68. datasmith/runners/base.py +112 -0
  69. datasmith/runners/classify_prs.py +48 -0
  70. datasmith/runners/render_problems.py +113 -0
  71. datasmith/runners/resolve_packages.py +66 -0
  72. datasmith/runners/scrape_commits.py +166 -0
  73. datasmith/runners/scrape_repos.py +44 -0
  74. datasmith/runners/synthesize_images.py +310 -0
  75. datasmith/update/__init__.py +5 -0
  76. datasmith/update/cli.py +169 -0
  77. datasmith/update/offline.py +173 -0
  78. datasmith/update/pipeline.py +497 -0
  79. datasmith/utils/__init__.py +18 -0
  80. datasmith/utils/core.py +67 -0
  81. datasmith/utils/db.py +156 -0
  82. datasmith/utils/tokens.py +65 -0
  83. fc_data-0.2.0.dist-info/METADATA +441 -0
  84. fc_data-0.2.0.dist-info/RECORD +87 -0
  85. fc_data-0.2.0.dist-info/WHEEL +4 -0
  86. fc_data-0.2.0.dist-info/entry_points.txt +2 -0
  87. fc_data-0.2.0.dist-info/licenses/LICENSE +28 -0
datasmith/utils/db.py ADDED
@@ -0,0 +1,156 @@
1
+ """Supabase client, ``@supabase_cached`` decorator, and batch upsert helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import functools
6
+ import hashlib
7
+ import json
8
+ import os
9
+ from typing import Any, Callable, TypeVar, cast
10
+
11
+ from supabase import Client, create_client
12
+
13
+ F = TypeVar("F", bound=Callable[..., Any])
14
+
15
+ _client: Client | None = None
16
+
17
+
18
def get_client() -> Client:
    """Return the module-wide Supabase client, creating it on first use.

    The connection settings are read from the ``SUPABASE_URL`` and
    ``SUPABASE_KEY`` environment variables the first time a client is
    needed; afterwards the stored instance is returned unchanged.

    Raises:
        ValueError: If either variable is unset or empty when the client
            has to be created.
    """
    global _client
    if _client is not None:
        return _client

    url, key = (os.environ.get(name, "") for name in ("SUPABASE_URL", "SUPABASE_KEY"))
    if not (url and key):
        raise ValueError("SUPABASE_URL and SUPABASE_KEY must be set")
    _client = create_client(url, key)
    return _client
28
+
29
+
30
async def get_async_client() -> Any:
    """Create and return a new async Supabase client.

    ``acreate_client`` is imported inside the function so that merely
    importing this module does not fail when the async entry point is
    unavailable. Nothing is cached here: each call builds a new client
    from the ``SUPABASE_URL`` and ``SUPABASE_KEY`` environment variables.

    Raises:
        ValueError: If either environment variable is unset or empty.
    """
    from supabase import acreate_client

    credentials = [os.environ.get(name, "") for name in ("SUPABASE_URL", "SUPABASE_KEY")]
    if not all(credentials):
        raise ValueError("SUPABASE_URL and SUPABASE_KEY must be set")
    return await acreate_client(*credentials)
43
+
44
+
45
def stable_hash(*args: Any) -> str:
    """Return a SHA-256 hex digest of the canonical JSON form of *args*.

    The positional arguments are serialized together as one JSON array with
    sorted object keys and no insignificant whitespace. Values the JSON
    encoder cannot handle natively are converted with ``str``.
    """
    canonical = json.dumps(args, separators=(",", ":"), default=str, sort_keys=True)
    digest = hashlib.sha256()
    digest.update(canonical.encode())
    return digest.hexdigest()
52
+
53
+
54
def batch_upsert(table: str, rows: list[dict[str, Any]], chunk_size: int = 100) -> int:
    """Upsert *rows* into *table* in chunks of at most *chunk_size* rows.

    Args:
        table: Name of the target table.
        rows: Row dictionaries to insert or update.
        chunk_size: Maximum number of rows sent per request; must be >= 1.

    Returns:
        The number of rows submitted (``0`` when *rows* is empty, in which
        case no client is created and no request is made).

    Raises:
        ValueError: If *rows* is non-empty and *chunk_size* is less than 1.
    """
    if not rows:
        return 0
    # A negative step would make range() empty and silently skip every row;
    # zero would surface as an unrelated range() error. Reject both up front,
    # before touching the client.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    client = get_client()
    for start in range(0, len(rows), chunk_size):
        client.table(table).upsert(rows[start : start + chunk_size]).execute()
    return len(rows)
65
+
66
+
67
def fetch_all(
    table: str,
    select: str = "*",
    filters: dict[str, Any] | None = None,
    is_null: list[str] | None = None,
    gte_filters: dict[str, Any] | None = None,
    lte_filters: dict[str, Any] | None = None,
    neq_filters: dict[str, Any] | None = None,
    page_size: int = 1000,
) -> list[dict[str, Any]]:
    """Fetch every row of *table* matching the filters, one page at a time.

    Successive pages are requested with ``range()`` and accumulated until a
    page comes back with fewer than *page_size* rows.

    Args:
        table: Name of the table to query.
        select: Column selection passed to ``select()``.
        filters: ``column -> value`` equality filters.
        is_null: Columns that must be null.
        gte_filters: ``column -> value`` lower bounds (inclusive).
        lte_filters: ``column -> value`` upper bounds (inclusive).
        neq_filters: ``column -> value`` inequality filters.
        page_size: Rows requested per page; must be >= 1.

    Returns:
        All matching rows, in the order the pages were returned.

    Raises:
        ValueError: If *page_size* is less than 1.

    NOTE(review): the loop stops at the first page shorter than
    *page_size*, so if the server caps responses below *page_size* the
    result would be truncated — confirm the server limit before raising
    *page_size* above its default.
    """
    # With page_size == 0 the "short page" stop condition can never be true,
    # and negative sizes request an inverted range; fail fast instead.
    if page_size < 1:
        raise ValueError(f"page_size must be a positive integer, got {page_size!r}")

    eq_items = (filters or {}).items()
    null_columns = is_null or []
    gte_items = (gte_filters or {}).items()
    lte_items = (lte_filters or {}).items()
    neq_items = (neq_filters or {}).items()

    client = get_client()
    rows: list[dict[str, Any]] = []
    offset = 0
    while True:
        # The query is rebuilt for every page, exactly as many times as
        # there are requests.
        query = client.table(table).select(select)
        for col, val in eq_items:
            query = query.eq(col, val)
        for col in null_columns:
            query = query.is_(col, "null")
        for col, val in gte_items:
            query = query.gte(col, val)
        for col, val in lte_items:
            query = query.lte(col, val)
        for col, val in neq_items:
            query = query.neq(col, val)

        # range() bounds are inclusive on both ends.
        resp = query.range(offset, offset + page_size - 1).execute()
        page = cast(list[dict[str, Any]], resp.data or [])
        rows.extend(page)
        if len(page) < page_size:
            return rows
        offset += page_size
105
+
106
+
107
def supabase_cached(func: F) -> F:
    """Cache the results of *func* in the Supabase ``hook_cache`` table.

    A cache row is identified by three values:

    * ``entity_key`` — the ``cache_key`` attribute of the first positional
      argument, or ``"unknown"`` when there is no such argument/attribute;
    * ``hook_name`` — ``func.__name__``;
    * ``args_hash`` — ``stable_hash`` of the remaining positional arguments
      and the keyword arguments.

    The keyword argument ``force`` is consumed by the wrapper and never
    forwarded to *func*. When it is true the lookup is skipped and the
    stored value is overwritten.

    On a cache hit the stored ``result_json`` value is returned as-is; on a
    miss the original return value of *func* is returned after being stored
    (objects exposing ``model_dump`` are stored as
    ``model_dump(mode="json")``).
    """

    @functools.wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        force = kwargs.pop("force", False)

        # Columns that together identify one cache row.
        entity = args[0] if args else None
        key_fields = {
            "entity_key": getattr(entity, "cache_key", "unknown"),
            "hook_name": func.__name__,
            "args_hash": stable_hash(args[1:], kwargs),
        }

        client = get_client()

        if not force:
            lookup = client.table("hook_cache").select("result_json")
            for column, value in key_fields.items():
                lookup = lookup.eq(column, value)
            cached_rows = lookup.execute().data
            if cached_rows:
                return cast(dict[str, Any], cached_rows[0])["result_json"]

        result = func(*args, **kwargs)

        # Objects with model_dump (e.g. Pydantic models) are stored as plain
        # JSON data; anything else is stored unchanged.
        if hasattr(result, "model_dump"):
            stored = result.model_dump(mode="json")
        else:
            stored = result

        client.table("hook_cache").upsert({**key_fields, "result_json": stored}).execute()

        return result

    return wrapper  # type: ignore[return-value]
@@ -0,0 +1,65 @@
1
+ """GitHub token pool with rotation and rate-limit awareness."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import threading
7
+ import time
8
+ from dataclasses import dataclass
9
+
10
+
11
# Request quota assumed for a token until a rate limit is reported for it,
# and restored once its reset time has passed.
_DEFAULT_QUOTA = 5000


@dataclass
class _RateLimit:
    """Last known rate-limit state of a single token."""

    # Requests believed to be left in the current window.
    remaining: int = _DEFAULT_QUOTA
    # Timestamp (compared against ``time.time()``) at which the window ends;
    # 0.0 means no limit has been reported yet.
    reset_at: float = 0.0


class TokenPool:
    """Pool of GitHub tokens that skips tokens reported as rate-limited.

    Tokens are read from the ``GH_TOKENS`` environment variable
    (comma-separated), falling back to ``GH_TOKEN`` when ``GH_TOKENS`` is
    unset or empty, or can be passed directly. Internal state is guarded by
    a lock.
    """

    def __init__(self, tokens: list[str] | None = None) -> None:
        """Build the pool.

        Args:
            tokens: Explicit tokens to use. When ``None`` they are parsed
                from the environment; surrounding whitespace and empty
                entries are dropped.

        Raises:
            ValueError: If no token is available.
        """
        if tokens is None:
            # ``or`` (rather than a .get() default) so that an empty
            # GH_TOKENS does not hide a valid GH_TOKEN.
            raw = os.environ.get("GH_TOKENS") or os.environ.get("GH_TOKEN", "")
            tokens = [t.strip() for t in raw.split(",") if t.strip()]
        if not tokens:
            raise ValueError("No GitHub tokens provided (set GH_TOKENS env var)")
        # Copy so later mutation of the caller's list cannot desynchronize
        # the token list from the rate-limit table.
        self._tokens = list(tokens)
        self._lock = threading.Lock()
        self._rate_limits: dict[str, _RateLimit] = {t: _RateLimit() for t in self._tokens}

    @property
    def size(self) -> int:
        """Number of tokens in the pool."""
        return len(self._tokens)

    def get_token(self) -> str:
        """Return the first token that is not currently rate-limited.

        A token is usable when it has requests remaining or its reset time
        has passed (in which case its quota is restored). If every token is
        exhausted, the call blocks, re-checking at most every 5 seconds,
        until one becomes usable.
        """
        while True:
            with self._lock:
                now = time.time()
                for token in self._tokens:
                    rl = self._rate_limits[token]
                    if rl.reset_at <= now:
                        # Window has passed: restore the quota.
                        rl.remaining = _DEFAULT_QUOTA
                        return token
                    if rl.remaining > 0:
                        return token

                # All exhausted — find the earliest reset.
                earliest = min(rl.reset_at for rl in self._rate_limits.values())

            # Sleep outside the lock so report_rate_limit() is not blocked.
            wait = max(0.1, earliest - time.time())
            time.sleep(min(wait, 5.0))  # cap sleep to re-check periodically

    def report_rate_limit(self, token: str, remaining: int = 0, reset_at: float = 0.0) -> None:
        """Record rate-limit state for *token* (called on 429/403).

        Unknown tokens are ignored. Note that ``get_token`` treats a
        ``reset_at`` that is not in the future as an expired window, so a
        report made with the default ``reset_at`` does not take the token
        out of rotation.
        """
        with self._lock:
            state = self._rate_limits.get(token)
            if state is not None:
                state.remaining = remaining
                state.reset_at = reset_at
@@ -0,0 +1,441 @@
1
+ Metadata-Version: 2.4
2
+ Name: fc-data
3
+ Version: 0.2.0
4
+ Summary: Python toolchain for building and maintaining FormulaCode benchmark tasks.
5
+ Project-URL: Homepage, https://formula-code.github.io/datasmith/
6
+ Project-URL: Repository, https://github.com/formula-code/datasmith
7
+ Project-URL: Documentation, https://formula-code.github.io/datasmith/
8
+ Author-email: Atharva Sehgal <atharva.sehgal@gmail.com>
9
+ License-File: LICENSE
10
+ Keywords: python
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Programming Language :: Python
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
17
+ Requires-Python: <4.0,>=3.9
18
+ Requires-Dist: dspy>=2.6.27
19
+ Requires-Dist: gitpython
20
+ Requires-Dist: httpx>=0.27
21
+ Requires-Dist: huggingface-hub>=0.20
22
+ Requires-Dist: jinja2>=3.1.6
23
+ Requires-Dist: json5>=0.9
24
+ Requires-Dist: portkey-ai>=1.14.3
25
+ Requires-Dist: pyarrow>=14.0
26
+ Requires-Dist: pydantic-settings>=2.0
27
+ Requires-Dist: pydantic>=2.0
28
+ Requires-Dist: python-dotenv>=1.1.1
29
+ Requires-Dist: python-on-whales>=0.60
30
+ Requires-Dist: supabase>=2.0
31
+ Requires-Dist: tiktoken
32
+ Requires-Dist: tomli>=2.0; python_version < '3.11'
33
+ Description-Content-Type: text/markdown
34
+
35
+ ![banner](static/formula-code-datasmith.png)
36
+
37
+ <p align="center">
38
+ <a href="https://formula-code.github.io/">
39
+ <img src="https://img.shields.io/badge/%F0%9F%8C%90%20Website-0A7A5E?style=for-the-badge" alt="FormulaCode Website">
40
+ </a>
41
+ <a href="https://huggingface.co/papers/2603.16011">
42
+ <img src="https://img.shields.io/badge/Paper-1F6FEB?style=for-the-badge&logo=arxiv&logoColor=white" alt="FormulaCode Paper">
43
+ </a>
44
+ <a href="https://formula-code.github.io/leaderboard/">
45
+ <img src="https://img.shields.io/badge/%F0%9F%93%88%20Leaderboard-EA580C?style=for-the-badge&logoColor=white" alt="FormulaCode Leaderboard">
46
+ </a>
47
+ <a href="https://formula-code.github.io/registry/">
48
+ <img src="https://img.shields.io/badge/%F0%9F%93%88%20Live%20Task%20Registry-7F1D1D?style=for-the-badge&logoColor=white" alt="FormulaCode Live Task Registry">
49
+ </a>
50
+ </p>
51
+
52
+ [FormulaCode](https://formula-code.github.io/) is a *continually updating* benchmark for evaluating the holistic ability of LLM agents to optimize codebases. FormulaCode consists of two parts: a [pipeline](https://github.com/formula-code/datasmith) to construct performance optimization tasks, and an [execution harness](https://github.com/formula-code/terminal-bench) that connects a language model to our terminal sandbox. _This repository contains the task generation pipeline._
53
+
54
+ `fc-data` is a python package for automatically curating and managing [FormulaCode](https://formula-code.github.io/) tasks. After installation, fc-data is designed to run as a monthly CRON job that updates the FormulaCode dataset with new commits and repositories.
55
+
56
+ ## High level overview
57
+
58
+ ```mermaid
59
+ graph LR
60
+ A --->|scrape| B
61
+ A2 <-->|sync| B
62
+ B -->|publish| C
63
+ B -->|publish| D
64
+
65
+ A[Github]
66
+ A2[Supabase]
67
+ B["`fc-data
68
+ (This repository)`"]
69
+ C[DockerHub]
70
+ D[HuggingFace]
71
+ ```
72
+
73
+ ## Use cases
74
+
75
+ `fc-data` is designed primarily to enable continual dataset updates for FormulaCode. After [installation](#installation), the monthly update is a single command:
76
+
77
+ ```bash
78
+ $ pip install fc-data
79
+ $ fc-data --start-date 2026-02-01 --end-date 2026-03-01
80
+ ```
81
+
82
+ This runs six stages in order: scrape repos, scrape commits, classify PRs, resolve packages, synthesize Docker images, and publish the Docker images to DockerHub and the PRs to HuggingFace. The dataset is versioned by month (e.g. `formulacode@2026-03`). On our servers, this command runs as a monthly CRON job.
83
+
84
+
85
+ However, this isn't the only use case for `fc-data`. We've designed `fc-data` to help you manage your own custom GitHub-centric benchmark. Each benchmark contains a task which revolves around a GitHub issue (or pull request, which is just an issue with extra details). We include some helpful properties to start off:
86
+
87
+ ```python
88
+ from datasmith.github import PR, GitHubClient
89
+ from datasmith.utils import TokenPool
90
+
91
+ # Every task starts with a PR.
92
+ pr = PR(repository="astropy/astropy", issue_number=16222)
93
+
94
+ # PRs are frozen Pydantic v2 models — immutable after creation.
95
+ pr.merge_commit_sha # the merge commit sha
96
+ pr.base_sha # base branch commit
97
+ pr.cache_key # "astropy/astropy:16222" — used for Supabase caching
98
+
99
+ # Or fetch a fully-hydrated PR (tries Supabase first, then GitHub API):
100
+ pr = await PR.fetch("astropy/astropy", 16222)
101
+ pr.merge_commit_sha # now populated from the database or API
102
+ ```
103
+
104
+ You can also fetch live data from GitHub using the async client directly:
105
+ ```python
106
+ pool = TokenPool() # reads GH_TOKENS env var, rotates tokens on rate-limit
107
+ gh = GitHubClient(pool)
108
+
109
+ # Fetch a PR from the GitHub API.
110
+ pr = await gh.get_pr("pandas-dev", "pandas", 16222)
111
+
112
+ # Fetch the diff as a string.
113
+ diff = await gh.get_diff("pandas-dev", "pandas", 16222)
114
+
115
+ # Fetch the timeline of events.
116
+ events = await gh.get_timeline("pandas-dev", "pandas", 16222)
117
+ ```
118
+
119
+ Want to extract structured information from the PR? Use our built-in agents or define your own!
120
+ ```python
121
+ from datasmith.github import render_problem_statement, scrape_links
122
+
123
+ # Render a problem statement from the PR and its linked issues.
124
+ statement = render_problem_statement(pr, anonymize=True)
125
+
126
+ # You can also scrape for linked issues via BFS.
127
+ issues = await scrape_links(pr, gh.get_issue, depth=2, only_issues=True, limit=6)
128
+
129
+ # Then pass them into the renderer for richer context.
130
+ statement = render_problem_statement(pr, issues=issues, repo_description="pandas is a data analysis library")
131
+ ```
132
+
133
+
134
+ Don't like the current set of operations? Define your own!
135
+
136
+ ```python
137
+ # You can register custom hooks for dataset-specific operations.
138
+ from datasmith.github import HookRegistry
139
+
140
+ from dspy import ChainOfThought
141
+ summarizer = ChainOfThought("document -> summary")
142
+
143
+ def summarize(pr):
144
+ doc = render_problem_statement(pr, anonymize=True)
145
+ return summarizer(doc).summary
146
+
147
+ HookRegistry.register("summarize", summarize) # auto-wrapped with @supabase_cached
148
+
149
+ # Now use it:
150
+ pr = PR(repository="astropy/astropy", issue_number=16222)
151
+ HookRegistry.call("summarize", pr) # first call: hits LLM
152
+ HookRegistry.call("summarize", pr) # second call: reads from Supabase cache. No cost!
153
+ ```
154
+
155
+ Almost all our supported operations can be run asynchronously. Here's how to run some FormulaCode-specific operations at scale:
156
+ ```python
157
+ from datasmith.runners import ClassifyPRsRunner
158
+ from datasmith.agents import PerfClassifier, ClassifyJudge
159
+
160
+ runner = ClassifyPRsRunner(PerfClassifier(), ClassifyJudge(), n_concurrent=64)
161
+ await runner.run(pr_items)
162
+ # Progress tracked in Supabase runner_progress table.
163
+ # Per-item failures logged in runner_failures — the runner never aborts.
164
+ ```
165
+
166
+ By default, each operation is cached in Supabase so you don't keep hitting expensive hooks.
167
+
168
+ A pull request is useless if you cannot build a reproducible environment for it. fc-data supports building docker images for any pull request using a three-tier hierarchy:
169
+
170
+ ```python
171
+ from datasmith.docker import ImageManager, MultiObjVerifier, SmokeVerifier, ProfileVerifier
172
+
173
+ mgr = ImageManager()
174
+ mgr.build_base_image() # formulacode/base:latest (uses the default Dockerfile.base)
175
+ mgr.build_repo_image("pandas-dev", "pandas",) # formulacode/pandas-dev-pandas:latest (Look up Dockerfile.repo for pandas-dev/pandas that should be stored in supabase or fallback to the default Dockerfile.repo)
176
+ mgr.build_pr_image("pandas-dev", "pandas", 16222,) # formulacode/pandas-dev-pandas:16222 (Look up Dockerfile.pr for pandas-dev/pandas:16222 that should be stored in supabase or fallback to the default Dockerfile.pr)
177
+
178
+
179
+ # Alternatively, if the user wants to use a custom Dockerfile, they can do so by:
180
+
181
+ mgr.build_base_image(context="path/to/custom/context")
182
+ mgr.build_repo_image("pandas-dev", "pandas", context="path/to/custom/context")
183
+ mgr.build_pr_image("pandas-dev", "pandas", 16222, context="path/to/custom/context")
184
+
185
+
186
+ # Verify an image with a chain of verifiers — short-circuits on first failure.
187
+ verifier = MultiObjVerifier(verifiers=[
188
+ SmokeVerifier("pandas"), # can we import the package?
189
+ ProfileVerifier(timeout=300), # can we discover and run ASV benchmarks?
190
+ ])
191
+ result = verifier.verify("formulacode/pandas-dev-pandas:16222")
192
+ # result.ok, result.rc, result.stdout, result.stderr, result.duration_s
193
+ ```
194
+
195
+ One of the main features of `fc-data` is the ability to automatically synthesize docker containers for a pull request. The synthesizer is a state machine that checks Supabase for cached contexts, tries similar build scripts, then falls back to an installed CLI agent (Claude Code, Codex, or Gemini — auto-detected):
196
+
197
+ ```python
198
+ from datasmith.agents import Synthesizer
199
+ from datasmith.docker import MultiObjVerifier, SmokeVerifier, ProfileVerifier
200
+ from datasmith.docker.context import DockerContext
201
+
202
+ # The verifier chain validates each synthesis attempt.
203
+ verifier = MultiObjVerifier(verifiers=[
204
+ SmokeVerifier("pandas"), # can we import the package?
205
+ ProfileVerifier(timeout=300), # can we discover and run ASV benchmarks?
206
+ ])
207
+
208
+ # Load a base Docker build context (Dockerfile + shell scripts) to iterate on.
209
+ base_context = DockerContext.from_directory("dataset/formulacode_verified/pandas-dev_pandas/abc123")
210
+
211
+ synth = Synthesizer(max_attempts=3)
212
+ ctx = synth.run(
213
+ owner="pandas-dev",
214
+ repo="pandas",
215
+ issue_number=16222,
216
+ pr_context="This PR optimizes groupby performance by ...",
217
+ verifier=verifier,
218
+ sha="abc123def456",
219
+ base_context=base_context,
220
+ env_payload='{"dependencies": ["numpy==1.26.0", "cython==3.0.0"]}',
221
+ python_version="3.10",
222
+ )
223
+ # Checking cache for pandas-dev/pandas@abc123def456... [MISS]
224
+ # Found 4 similar scripts from pandas-dev/pandas
225
+ # Attempt 1/4 with similar script... [FAIL]
226
+ # Launching claude agent sandbox in /tmp/synthesis-xxx...
227
+ # Sandbox synthesis succeeded [PASS]
228
+ # Saved context for pandas-dev/pandas@abc123def456
229
+ #
230
+ # On success, the DockerContext is persisted to Supabase's candidate_containers table.
231
+ # ctx is a DockerContext with the working build scripts, or None if all attempts failed.
232
+ ```
233
+
234
+ If ALL attempts fail, `synthesize` logs every attempt (stderr, stdout, model, script used) to Supabase's `build_attempts` table and returns `None`. Failed PRs can be retried later — the logged attempts provide context for debugging or a future synthesis run.
235
+
236
+ This can be run asynchronously as well for multiple tasks (WARNING: Might be expensive!):
237
+ ```python
238
+ from datasmith.runners import SynthesizeImagesRunner
239
+
240
+ runner = SynthesizeImagesRunner(synth, verifier, n_concurrent=8)
241
+ await runner.run(pr_items)
242
+ # Returns None entries for PRs where synthesis failed.
243
+ ```
244
+
245
+ How do we make a dataset out of this? Query Supabase directly and publish:
246
+ ```python
247
+ from datasmith.utils.db import get_client
248
+ from datasmith.publish import records_from_supabase, HuggingFacePublisher
249
+
250
+ # Query all verified, unpublished perf PRs from the last month.
251
+ records = records_from_supabase(start_date="2026-02-01", end_date="2026-03-01")
252
+
253
+ # Or query Supabase directly for more control.
254
+ sb = get_client()
255
+ rows = sb.table("pull_requests") \
256
+ .select("*") \
257
+ .eq("is_performance_commit", True) \
258
+ .not_.is_("container_name", "null") \
259
+ .execute()
260
+
261
+ # Publish to HuggingFace as a versioned Parquet dataset.
262
+ hf = HuggingFacePublisher()
263
+ hf.publish(records, version="formulacode@2026-03")
264
+ ```
265
+
266
+ We define tasks using `terminal-bench`'s formulacode adapter for evaluation:
267
+ ```python
268
+ from terminal_bench.adapters.formulacode import FormulaCodeAdapter
269
+ from terminal_bench.harness.harness import Harness
270
+
271
+ adapter = FormulaCodeAdapter(task_dir="fctasks/", force=True)
272
+ adapter.generate_task(pr.to_record())
273
+
274
+ run = Harness(
275
+ output_path="fcevals/",
276
+ dataset_path="dataset_path",
277
+ task_ids=[pr.to_record().task_id],
278
+ agent_configs=[
279
+ {"agent_name": "nop", "model_name": "nop"},
280
+ {"agent_name": "oracle", "model_name": "oracle"},
281
+ ],
282
+ )
283
+
284
+ print(run.results[0].is_resolved) # Did the oracle get a speedup > 1.00 over baseline?
285
+ ```
286
+
287
+ ## Database schema
288
+
289
+ There are six tables in Supabase (Postgres):
290
+
291
+ | Table | Primary key | Purpose |
292
+ |-------|-------------|---------|
293
+ | `repositories` | `(owner, repo)` | Scraped GitHub repos (language, stars, topics, description) |
294
+ | `pull_requests` | `(owner, repo, issue_number)` | PR metadata, classification, rendered problems, publish status |
295
+ | `hook_cache` | `(entity_key, hook_name, args_hash)` | Deterministic cache for `@supabase_cached` |
296
+ | `build_attempts` | `id` (serial) | Every Docker build attempt (model, script, ok, stderr/stdout tails) |
297
+ | `runner_progress` | `runner_id` | Per-runner progress (total, completed, failed) |
298
+ | `runner_failures` | `id` (serial) | Per-item failure details (error message, traceback) |
299
+
300
+
301
+ ## Installation
302
+
303
+ Install [uv](https://astral.sh/uv/) and [Node.js](https://nodejs.org/) (for Supabase CLI), then set up the development environment:
304
+
305
+ ```bash
306
+ # Install uv
307
+ $ curl -LsSf https://astral.sh/uv/install.sh | sh
308
+ # Install npm (for Supabase CLI)
309
+ $ curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.7/install.sh | bash
310
+ $ nvm install --lts
311
+ $ nvm use --lts
312
+ # Install dev environment and pre-commit hooks
313
+ $ make install
314
+ ```
315
+
316
+ Create a `tokens.env` file in the repo root:
317
+ ```bash
318
+ # Supabase (required)
319
+ SUPABASE_URL=http://127.0.0.1:54321
320
+ SUPABASE_KEY=your-service-role-key
321
+
322
+ # GitHub (required — comma-separated for multiple tokens)
323
+ GH_TOKENS=github_pat_xxx,github_pat_yyy
324
+
325
+ # LLM backends (for classification and synthesis)
326
+ DSPY_MODEL=openai/gpt-oss-120b
327
+ DSPY_API_BASE=http://localhost:30000/v1
328
+ DSPY_API_KEY=local
329
+ DSPY_MAX_TOKENS=16000
330
+
331
+ # DockerHub (for publishing)
332
+ DOCKERHUB_USERNAME=formulacode
333
+ DOCKERHUB_TOKEN=dckr_pat_xxxxx
334
+
335
+ # HuggingFace (for dataset publishing)
336
+ HF_TOKEN_PATH=/path/to/huggingface/token
337
+ ```
338
+
339
+ ### Supabase
340
+
341
+ Start the local Supabase instance and apply all migrations:
342
+ ```bash
343
+ $ npx supabase start # starts Postgres, Auth, Storage, Studio, etc.
344
+ $ npx supabase migration up --local # apply migrations in supabase/migrations/
345
+ ```
346
+
347
+ Common commands:
348
+ ```bash
349
+ $ npx supabase status # show URLs, ports, and service health
350
+ $ npx supabase migration list --local # list applied / pending migrations
351
+ $ npx supabase db reset # wipe and recreate from migrations (destructive)
352
+ $ npx supabase stop # stop all containers
353
+ ```
354
+
355
+ Studio is available at the URL printed by `supabase status` (default `http://127.0.0.1:54323`) — use it to browse tables, run SQL, and inspect data.
356
+
357
+ Running `preflight` ensures that all the variables are properly defined:
358
+ ```bash
359
+ $ python -m datasmith.preflight
360
+
361
+ == Environment ==
362
+ [OK] SUPABASE_URL — http://127.0.0.1:54...
363
+ [OK] SUPABASE_KEY — ***
364
+ [OK] GH_TOKENS — 3 token(s)
365
+ [OK] HF_TOKEN — /path/to/huggingface/token
366
+
367
+ == Supabase ==
368
+ [OK] Connection
369
+
370
+ == Docker ==
371
+ [OK] Docker daemon
372
+
373
+ == GitHub ==
374
+ [OK] API access — remaining=4998
375
+
376
+ ========================================
377
+ All checks passed!
378
+ ```
379
+
380
+ After that works, run the tests locally. Each new functionality MUST have a test:
381
+ ```bash
382
+ $ make check # ruff lint + mypy type check
383
+ $ make test # pytest
384
+ ```
385
+
386
+ ## Updating FormulaCode
387
+
388
+ The monthly update is a single command:
389
+ ```bash
390
+ $ fc-data --start-date 2026-02-01 --end-date 2026-03-01
391
+ ```
392
+
393
+ This runs six stages in order: scrape repos, scrape commits, classify PRs, resolve packages, synthesize Docker images, and publish to DockerHub + HuggingFace. Options:
394
+
395
+ ```bash
396
+ $ fc-data --start-date 2026-02-01 --end-date 2026-03-01 --resume # skip completed stages
397
+ $ fc-data --start-date 2026-02-01 --end-date 2026-03-01 --stage 4 # run only package resolution
398
+ $ fc-data --start-date 2026-02-01 --end-date 2026-03-01 --dry-run # log without executing
399
+ $ fc-data --start-date 2026-02-01 --end-date 2026-03-01 --stage 5 \
400
+ --agent codex --n-concurrent 5 --tasks-per-repo 5 # synthesis with codex, capped
401
+ $ fc-data --start-date 2026-02-01 --end-date 2026-03-01 --stage 5 \
402
+ --force # re-run synthesis for all tasks
403
+ ```
404
+
405
+ | Flag | Description |
406
+ |------|-------------|
407
+ | `--resume` | Skip stages already marked complete and resume from the next pending stage |
408
+ | `--stage N` | Run only stage N (1–6) |
409
+ | `--dry-run` | Log what each stage would do without executing |
410
+ | `--n-concurrent N` | Max concurrent items per runner stage |
411
+ | `--tasks-per-repo N` | Cap tasks per repository for stage 5 (synthesize_images) |
412
+ | `--agent {claude,codex,gemini}` | CLI agent for stage 5 synthesis (default: auto-detect first available) |
413
+ | `--force` | Re-run synthesis even for tasks that already have a container or cached context (stage 5 only) |
414
+
415
+
416
+ ## Dataset verification
417
+
418
+ Each task lives in `dataset/formulacode_verified/<owner_repo>/<sha>/` with a multi-stage Dockerfile and shell build scripts. The verification loop:
419
+
420
+ ```bash
421
+ $ python dataset/verify.py --task dataset/formulacode_verified/<owner_repo>/<sha>
422
+ # Check failure.json for errors -> edit docker_build_pkg.sh / docker_build_run.sh -> rerun
423
+ # Done when verification_success.json appears
424
+ ```
425
+
426
+ Only modify `docker_build_pkg.sh` and `docker_build_run.sh` during verification fixes.
427
+
428
+ ```bash
429
+ $ python scratch/scripts/prepare_formulacode_dataset.py \
430
+ --input scratch/artifacts/pipeflush/perfonly_commits_master.parquet \
431
+ --output scratch/artifacts/pipeflush/perfonly_enriched.parquet \
432
+ --dockerhub-repository formulacode/all \
433
+ --upload-to-hf formulacode/formulacode-all \
434
+ --hf-verified-filter /path/to/valid_tasks.json
435
+ ```
436
+
437
+ > Requires `HF_TOKEN` in `tokens.env`. The upload creates `default`, `verified`, and per-month (`YYYY-MM`) configs on Hugging Face.
438
+
439
+ ### Evaluation
440
+
441
+ Evaluation is done in FormulaCode's fork of the [terminal-bench](https://github.com/formula-code/fc-eval) evaluation framework.