fc-data 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datasmith/__init__.py +330 -0
- datasmith/__init__.pyi +194 -0
- datasmith/agents/__init__.py +31 -0
- datasmith/agents/classifiers.py +272 -0
- datasmith/agents/codex.py +25 -0
- datasmith/agents/config.py +108 -0
- datasmith/agents/extractors.py +197 -0
- datasmith/agents/installed/README.md +52 -0
- datasmith/agents/installed/__init__.py +22 -0
- datasmith/agents/installed/base.py +240 -0
- datasmith/agents/installed/claude.py +134 -0
- datasmith/agents/installed/codex.py +91 -0
- datasmith/agents/installed/gemini.py +118 -0
- datasmith/agents/installed/none.py +27 -0
- datasmith/agents/sandbox.py +547 -0
- datasmith/agents/synthesizer.py +439 -0
- datasmith/agents/templates/AGENTS.md.j2 +150 -0
- datasmith/agents/templates/sandbox_verify.py +428 -0
- datasmith/docker/__init__.py +31 -0
- datasmith/docker/context.py +112 -0
- datasmith/docker/images.py +158 -0
- datasmith/docker/publish.py +56 -0
- datasmith/docker/templates/Dockerfile.base +26 -0
- datasmith/docker/templates/Dockerfile.pr +42 -0
- datasmith/docker/templates/Dockerfile.repo +11 -0
- datasmith/docker/templates/docker_build_base.sh +780 -0
- datasmith/docker/templates/docker_build_env.sh +309 -0
- datasmith/docker/templates/docker_build_final.sh +106 -0
- datasmith/docker/templates/docker_build_pkg.sh +99 -0
- datasmith/docker/templates/docker_build_run.sh +124 -0
- datasmith/docker/templates/entrypoint.sh +62 -0
- datasmith/docker/templates/parser.py +1405 -0
- datasmith/docker/templates/profile.sh +199 -0
- datasmith/docker/templates/pytest_runner.py +692 -0
- datasmith/docker/templates/run-tests.sh +197 -0
- datasmith/docker/verifiers.py +131 -0
- datasmith/filters.py +154 -0
- datasmith/github/__init__.py +22 -0
- datasmith/github/client.py +333 -0
- datasmith/github/hooks.py +50 -0
- datasmith/github/links.py +110 -0
- datasmith/github/models.py +206 -0
- datasmith/github/render.py +173 -0
- datasmith/github/search.py +66 -0
- datasmith/github/templates/comment.md.j2 +5 -0
- datasmith/github/templates/final.md.j2 +66 -0
- datasmith/github/templates/issues.md.j2 +21 -0
- datasmith/github/templates/repo.md.j2 +1 -0
- datasmith/preflight.py +162 -0
- datasmith/publish/__init__.py +13 -0
- datasmith/publish/huggingface.py +104 -0
- datasmith/publish/pipeline.py +60 -0
- datasmith/publish/records.py +91 -0
- datasmith/py.typed +1 -0
- datasmith/resolution/__init__.py +14 -0
- datasmith/resolution/blocklist.py +145 -0
- datasmith/resolution/cache.py +120 -0
- datasmith/resolution/constants.py +277 -0
- datasmith/resolution/dependency_resolver.py +174 -0
- datasmith/resolution/git_utils.py +378 -0
- datasmith/resolution/import_analyzer.py +66 -0
- datasmith/resolution/metadata_parser.py +412 -0
- datasmith/resolution/models.py +41 -0
- datasmith/resolution/orchestrator.py +522 -0
- datasmith/resolution/package_filters.py +312 -0
- datasmith/resolution/python_manager.py +110 -0
- datasmith/runners/__init__.py +15 -0
- datasmith/runners/base.py +112 -0
- datasmith/runners/classify_prs.py +48 -0
- datasmith/runners/render_problems.py +113 -0
- datasmith/runners/resolve_packages.py +66 -0
- datasmith/runners/scrape_commits.py +166 -0
- datasmith/runners/scrape_repos.py +44 -0
- datasmith/runners/synthesize_images.py +310 -0
- datasmith/update/__init__.py +5 -0
- datasmith/update/cli.py +169 -0
- datasmith/update/offline.py +173 -0
- datasmith/update/pipeline.py +497 -0
- datasmith/utils/__init__.py +18 -0
- datasmith/utils/core.py +67 -0
- datasmith/utils/db.py +156 -0
- datasmith/utils/tokens.py +65 -0
- fc_data-0.2.0.dist-info/METADATA +441 -0
- fc_data-0.2.0.dist-info/RECORD +87 -0
- fc_data-0.2.0.dist-info/WHEEL +4 -0
- fc_data-0.2.0.dist-info/entry_points.txt +2 -0
- fc_data-0.2.0.dist-info/licenses/LICENSE +28 -0
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""Problem statement rendering with Jinja2 templates."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
from jinja2 import Environment, FileSystemLoader
|
|
9
|
+
|
|
10
|
+
from datasmith.github.models import PR, IssueExpanded
|
|
11
|
+
from datasmith.utils import get_logger
|
|
12
|
+
|
|
13
|
+
logger = get_logger("github.render")
|
|
14
|
+
|
|
15
|
+
_TEMPLATES_DIR = os.path.join(os.path.dirname(__file__), "templates")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _get_env() -> Environment:
    """Build the Jinja2 environment used to load this package's templates.

    Templates are read from ``_TEMPLATES_DIR``. Autoescaping is disabled, and
    block tags have their trailing newline and leading whitespace trimmed.
    """
    rendering_options = {
        "autoescape": False,
        "trim_blocks": True,
        "lstrip_blocks": True,
    }
    template_loader = FileSystemLoader(_TEMPLATES_DIR)
    return Environment(loader=template_loader, **rendering_options)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class Anonymizer:
    """Replace usernames, emails, and other PII with deterministic placeholders.

    Each distinct username (compared case-insensitively) is mapped to a
    ``user_N`` placeholder, numbered in order of first appearance. The mapping
    lives on the instance, so repeated :meth:`anonymize` calls on the same
    instance reuse the same placeholders.
    """

    # "Signed-off-by: Name <email>" / "Co-authored-by: Name <email>" trailers.
    _SIGNOFF = re.compile(
        r"((?:Signed-off-by|Co-authored-by)\s*:\s*)(.*?)(\s*<[^>]+>)",
        re.IGNORECASE,
    )
    _EMAIL = re.compile(r"[\w.+-]+@[\w-]+\.[\w.]+")
    _MENTION = re.compile(r"@([\w-]+)")
    # Only the first path segment (user or organisation) is captured.
    _GITHUB_URL = re.compile(r"(https?://github\.com/)([\w.-]+)")
    _USER_IMAGES = re.compile(r"https?://user-images\.githubusercontent\.com/[^\s)>\]]+")
    _HOME_PATH = re.compile(r"(/(?:home|Users)/|[Cc]:\\Users\\)([\w.-]+)")
    _IP_ADDR = re.compile(r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b")

    def __init__(self, known_usernames: set[str] | None = None) -> None:
        """Create an anonymizer.

        Parameters
        ----------
        known_usernames:
            Usernames to scrub even when they appear bare (without ``@``).
        """
        self._map: dict[str, str] = {}
        self._counter = 0
        self._known_usernames = known_usernames or set()

    def _placeholder(self, username: str) -> str:
        """Return a deterministic placeholder for *username* (case-insensitive)."""
        key = username.lower()
        if key not in self._map:
            self._counter += 1
            self._map[key] = f"user_{self._counter}"
        return self._map[key]

    def anonymize(self, text: str) -> str:
        """Strip PII and replace identifiers with deterministic placeholders.

        Processing order:

        1. Signed-off-by / Co-authored-by lines (before email stripping)
        2. Email addresses → ``[email]``
        3. ``@user`` mentions → ``@user_N``
        4. GitHub profile/repo URLs (username segment)
        5. ``user-images.githubusercontent.com`` URLs → ``[image]``
        6. Home-directory paths (``/home/x``, ``/Users/x``, ``C:\\Users\\x``)
        7. IP addresses → ``[ip]``
        8. Known bare usernames (word-boundary, case-insensitive); empty or
           whitespace-only entries are ignored

        Returns the anonymized text.
        """
        # 1. Sign-off lines (must precede email stripping so we can match the <email>)
        text = self._SIGNOFF.sub(lambda m: f"{m.group(1)}[name] <[email]>", text)

        # 2. Emails
        text = self._EMAIL.sub("[email]", text)

        # 3. @mentions
        text = self._MENTION.sub(lambda m: f"@{self._placeholder(m.group(1))}", text)

        # 4. GitHub user/org URLs
        text = self._GITHUB_URL.sub(lambda m: f"{m.group(1)}{self._placeholder(m.group(2))}", text)

        # 5. User-uploaded image URLs
        text = self._USER_IMAGES.sub("[image]", text)

        # 6. Home-directory paths
        text = self._HOME_PATH.sub(lambda m: f"{m.group(1)}{self._placeholder(m.group(2))}", text)

        # 7. IP addresses
        text = self._IP_ADDR.sub("[ip]", text)

        # 8. Known bare usernames (longest first to avoid partial replacement)
        for uname in sorted(self._known_usernames, key=len, reverse=True):
            if not uname.strip():
                # An empty name yields a pattern that matches at every word
                # boundary, and a whitespace-only one matches the gaps between
                # words; either would scatter placeholders through the text.
                continue
            placeholder = self._placeholder(uname)
            text = re.sub(rf"\b{re.escape(uname)}\b", placeholder, text, flags=re.IGNORECASE)

        return text
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def render_problem_statement(
    pr: PR,
    issues: list[IssueExpanded] | None = None,
    repo_description: str = "",
    anonymize: bool = False,
    known_usernames: set[str] | None = None,
    extract: bool = True,
    initial_observations: str | None = None,
) -> str:
    """Render the full problem statement for a FormulaCode task.

    Parameters
    ----------
    pr:
        The pull request providing the initial observations.
    issues:
        Optional list of linked issues to include.
    repo_description:
        A short description of the repository.
    anonymize:
        If ``True``, replace usernames and emails with placeholders.
    known_usernames:
        Additional usernames to scrub even without an ``@`` prefix.
        Only used when *anonymize* is ``True``.
    extract:
        If ``True`` (default), use :class:`ProblemExtractor` to separate
        problem observations from solution details, preventing information
        leakage. Falls back to raw ``pr.body`` on failure.
        Ignored when *initial_observations* is provided.
    initial_observations:
        If provided, use this text directly as the problem observations and
        skip the :class:`ProblemExtractor` call entirely. Pass the output of
        ``ProblemExtraction.to_problem_markdown()`` here when the extraction
        has already been performed outside this function.

    Returns
    -------
    str
        The rendered ``final.md.j2`` template, anonymized when requested.
    """
    env = _get_env()
    anon = Anonymizer(known_usernames=known_usernames) if anonymize else None

    # ``pr.body`` may be missing or None rather than a string
    # (NOTE(review): confirm against the PR model). Normalise it once so the
    # template never renders the literal text "None".
    raw_body = getattr(pr, "body", "") or ""

    # Extract problem observations (strip solution details) or use raw body.
    # When the caller supplied ``initial_observations`` it is used untouched.
    if initial_observations is None:
        if extract:
            try:
                # Imported lazily so the extractor is only loaded when needed.
                from datasmith.agents.extractors import ProblemExtractor

                extraction = ProblemExtractor().extract_problem(
                    pr_title=getattr(pr, "title", ""),
                    pr_body=raw_body,
                )
                initial_observations = extraction.to_problem_markdown() or raw_body
            except Exception:
                # Best effort: keep rendering, but record why extraction failed.
                logger.warning("ProblemExtractor failed, falling back to raw PR body", exc_info=True)
                initial_observations = raw_body
        else:
            initial_observations = raw_body

    # Render issues section
    issues_text = ""
    if issues:
        issues_text = env.get_template("issues.md.j2").render(issues=issues)

    # Render final
    rendered = env.get_template("final.md.j2").render(
        repo_description=repo_description,
        initial_observations=initial_observations,
        issues=issues_text,
    )

    # Anonymize the complete document so PR text and issue text share one
    # placeholder mapping.
    if anon:
        rendered = anon.anonymize(rendered)

    return rendered
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""GitHub Code Search helpers for repo discovery."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datasmith.utils import get_logger
|
|
6
|
+
|
|
7
|
+
logger = get_logger("github.search")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
async def search_repos_by_file(
    gh: object,
    filename: str = "asv.conf.json",
    min_stars: int = 100,
) -> list[tuple[str, str]]:
    """Find repositories that contain *filename* using GitHub code search.

    Search hits are deduplicated by repository; forks and the
    ``airspeed-velocity/asv`` repository are dropped. Each remaining
    repository's metadata is then fetched, and archived, disabled, or
    under-starred (fewer than *min_stars*) repositories are discarded.

    Returns the surviving ``(owner, repo)`` tuples in discovery order.
    Raises ``TypeError`` when *gh* is not a ``GitHubClient``.
    """
    from datasmith.github.client import GitHubClient

    if not isinstance(gh, GitHubClient):
        raise TypeError(f"Expected GitHubClient, got {type(gh).__name__}")

    visited: set[str] = set()
    candidates: list[tuple[str, str]] = []

    async for hit in gh.search_code(f"filename:{filename}"):
        repo_info = hit.get("repository", {})
        slug: str = repo_info.get("full_name", "")
        if not slug or slug in visited:
            continue
        visited.add(slug)

        # Forks are flagged in the search response; the ASV tool itself is
        # excluded by name.
        is_fork = repo_info.get("fork", False)
        is_asv_itself = slug == "airspeed-velocity/asv"
        if is_fork or is_asv_itself:
            continue

        owner, repo = slug.split("/", 1)
        candidates.append((owner, repo))

    logger.info("Code search found %d candidate repos for filename:%s", len(candidates), filename)

    # The search payload is not used for archived/disabled/stars, so each
    # candidate's repository metadata is requested individually.
    kept: list[tuple[str, str]] = []
    for owner, repo in candidates:
        resp = await gh._request("GET", f"/repos/{owner}/{repo}")
        if resp is None:
            continue
        meta = resp.json()
        if meta.get("archived", False) or meta.get("disabled", False):
            logger.debug("%s/%s: archived/disabled — skipped", owner, repo)
            continue
        stars = meta.get("stargazers_count", 0)
        if stars >= min_stars:
            kept.append((owner, repo))
            logger.debug("%s/%s: %d stars — included", owner, repo, stars)
        else:
            logger.debug("%s/%s: %d stars — below threshold", owner, repo, stars)

    logger.info("Filtered to %d repos with >= %d stars", len(kept), min_stars)
    return kept
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
**Objective:**
|
|
2
|
+
You are a performance optimization expert. Speed up the repository **while maintaining correctness**.
|
|
3
|
+
|
|
4
|
+
**Tooling:**
|
|
5
|
+
The micromamba environment includes **Pytest** for testing and **Airspeed Velocity (ASV)** for benchmarking measurements and profiling.
|
|
6
|
+
|
|
7
|
+
**Process:**
|
|
8
|
+
|
|
9
|
+
**1. Scan & Baseline**
|
|
10
|
+
|
|
11
|
+
- Read the code and any hints.
|
|
12
|
+
- Map likely bottlenecks.
|
|
13
|
+
- Establish a **baseline** by running the **relevant** ASV benchmarks.
|
|
14
|
+
|
|
15
|
+
**2. Benchmark (ASV)**
|
|
16
|
+
- Read through relevant benchmarks.
|
|
17
|
+
- Prefer targeted runs using `--bench=<regex>`; full-suite runs are too time-consuming and are discouraged.
|
|
18
|
+
- Commands:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
# Always pin to current interpreter
|
|
22
|
+
asv run --python=same --bench="<regex>"
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
* Find benchmarks via `asv_benchmarks.txt` or in the directory containing the ASV benchmarks.
|
|
26
|
+
* You may run multiple benchmarks at once using regexes.
|
|
27
|
+
|
|
28
|
+
**3. Profile Hotspots**
|
|
29
|
+
|
|
30
|
+
* Profile **relevant** benchmarks to locate hot paths.
|
|
31
|
+
* Use ASV's built-in profiling support.
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
asv profile --python=same --config=<path-to-asv.*.json> <benchmark_name>
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
**4. Optimize**
|
|
38
|
+
|
|
39
|
+
* Make targeted changes that address the hot paths while maintaining correctness.
|
|
40
|
+
* Always follow the **Operating Principles** below.
|
|
41
|
+
|
|
42
|
+
**Operating Principles**
|
|
43
|
+
|
|
44
|
+
* **One change/command at a time** (code edit, ASV run, profiling).
|
|
45
|
+
* **Baseline first**, then iterate.
|
|
46
|
+
* **Target the hot paths** shown by profiling.
|
|
47
|
+
* **Evidence-driven**: justify changes with benchmark/profile data.
|
|
48
|
+
* **Correctness first**: never trade correctness for speed.
|
|
49
|
+
{% if repo_description %}
|
|
50
|
+
|
|
51
|
+
**Repository Description**
|
|
52
|
+
|
|
53
|
+
{{ repo_description }}
|
|
54
|
+
{% endif %}
|
|
55
|
+
|
|
56
|
+
**Task Description**
|
|
57
|
+
|
|
58
|
+
Your main goal is to optimize the code to run as fast as possible. Use the following information if needed to understand the problem:
|
|
59
|
+
|
|
60
|
+
{{ initial_observations }}
|
|
61
|
+
|
|
62
|
+
{% if issues %}
|
|
63
|
+
**Relevant Issues**
|
|
64
|
+
|
|
65
|
+
{{ issues }}
|
|
66
|
+
{% endif %}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
{% for it in issues %}
|
|
2
|
+
# Issue #{{ it.number }}: {{ it.title }}
|
|
3
|
+
{% if it.description %}
|
|
4
|
+
Description:
|
|
5
|
+
{{ it.description }}
|
|
6
|
+
{% endif %}
|
|
7
|
+
|
|
8
|
+
{% if it.comments %}
|
|
9
|
+
Comments:
|
|
10
|
+
{% for c in it.comments %}
|
|
11
|
+
- {{ c }}
|
|
12
|
+
{% endfor %}
|
|
13
|
+
{% endif %}
|
|
14
|
+
|
|
15
|
+
{% if it.cross_references %}
|
|
16
|
+
Cross-references:
|
|
17
|
+
{% for x in it.cross_references %}
|
|
18
|
+
- {{ x }}
|
|
19
|
+
{% endfor %}
|
|
20
|
+
{% endif %}
|
|
21
|
+
{% endfor %}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
This repository is called {{ repo_name }}. {{ repo_name }} is written primarily in {{ repo_language }} and is described as a "{{ repo_description }}"{% if repo_topics %}, with topics including {{ repo_topics }}{% endif %}.
|
datasmith/preflight.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from datasmith.utils import get_logger
|
|
8
|
+
|
|
9
|
+
logger = get_logger("preflight")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _check(name: str, condition: bool, detail: str = "") -> bool:
|
|
13
|
+
status = "OK" if condition else "FAIL"
|
|
14
|
+
msg = f" [{status}] {name}"
|
|
15
|
+
if detail:
|
|
16
|
+
msg += f" — {detail}"
|
|
17
|
+
print(msg)
|
|
18
|
+
return condition
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def run_preflight() -> bool:  # noqa: C901
    """Verify all prerequisites for the DataSmith pipeline.

    Prints a report covering environment variables, the LLM backend, the
    Supabase connection, the Docker daemon, and GitHub API access. Every
    individual failure is reported; none of them aborts the remaining checks.

    Returns
    -------
    bool
        ``True`` only when every check passed.
    """
    print("DataSmith Preflight Check")
    print("=" * 40)

    all_ok = True

    # 1. Environment variables
    print("\n== Environment ==")
    supabase_url = os.environ.get("SUPABASE_URL", "")
    all_ok &= _check(
        "SUPABASE_URL", bool(supabase_url), supabase_url[:30] + "..." if len(supabase_url) > 30 else supabase_url
    )

    supabase_key = os.environ.get("SUPABASE_KEY", "")
    # Never echo the key itself; only show that one is present.
    all_ok &= _check("SUPABASE_KEY", bool(supabase_key), "***" if supabase_key else "")

    gh_tokens = os.environ.get("GH_TOKENS", os.environ.get("GH_TOKEN", ""))
    token_count = len([t for t in gh_tokens.split(",") if t.strip()]) if gh_tokens else 0
    all_ok &= _check("GH_TOKENS", token_count > 0, f"{token_count} token(s)")

    hf_token_path = os.environ.get("HF_TOKEN_PATH", "/mnt/sdd3/llama_atharvas/huggingface/token")
    if Path(hf_token_path).exists():
        all_ok &= _check("HF_TOKEN", True, hf_token_path)
    else:
        # HuggingFacePublisher falls back to the HF_TOKEN environment variable
        # when the token file is absent, so accept that here as well.
        has_env_token = bool(os.environ.get("HF_TOKEN", ""))
        all_ok &= _check("HF_TOKEN", has_env_token, "HF_TOKEN env var" if has_env_token else hf_token_path)

    # 2. LLM Backend
    print("\n== LLM Backend ==")
    dspy_model = os.environ.get("DSPY_MODEL", "openai/gpt-oss-120b")
    dspy_api_base = os.environ.get("DSPY_API_BASE", "http://localhost:30001/v1")
    all_ok &= _check("DSPY_MODEL", bool(dspy_model), dspy_model)
    all_ok &= _check("DSPY_API_BASE", bool(dspy_api_base), dspy_api_base)

    if dspy_model and dspy_api_base:
        try:
            import httpx

            # Strip "openai/" prefix for model matching
            model_id = dspy_model.removeprefix("openai/")
            # Tolerate a trailing slash on the configured base URL so the
            # endpoint paths below never contain "//".
            api_base = dspy_api_base.rstrip("/")
            models_url = f"{api_base}/models"
            resp = httpx.get(models_url, timeout=10)
            resp.raise_for_status()
            models_data = resp.json().get("data", [])
            model_ids = [m.get("id", "") for m in models_data]
            found = model_id in model_ids
            all_ok &= _check("Model available", found, f"{model_id} in {len(model_ids)} model(s)")

            if found:
                # Test completion
                chat_url = f"{api_base}/chat/completions"
                payload = {
                    "model": model_id,
                    "messages": [{"role": "user", "content": "Say OK"}],
                    "max_tokens": 64,
                }
                comp_resp = httpx.post(chat_url, json=payload, timeout=60)
                comp_resp.raise_for_status()
                choices = comp_resp.json().get("choices", [])
                content = ""
                if choices:
                    msg = choices[0].get("message") or {}
                    if isinstance(msg, dict):
                        content = msg.get("content") or msg.get("reasoning") or ""
                    finish = choices[0].get("finish_reason", "")
                else:
                    finish = ""
                # A successful HTTP round trip counts as a pass; the detail
                # shows the reply text, or the finish reason if it was empty.
                detail = repr(content[:40]) if content else f"finish_reason={finish}"
                all_ok &= _check("Test completion", True, detail)
        except Exception as e:
            all_ok &= _check("LLM server", False, str(e)[:80])

    # 3. Supabase connection
    print("\n== Supabase ==")
    try:
        import datasmith.utils.db as db_mod
        from datasmith.utils.db import get_client

        db_mod._client = None  # Force fresh connection
        client = get_client()
        # Try a simple query
        client.table("repositories").select("owner").limit(1).execute()
        all_ok &= _check("Connection", True)
        db_mod._client = None
    except Exception as e:
        all_ok &= _check("Connection", False, str(e)[:80])

    # 4. Docker
    print("\n== Docker ==")
    try:
        from python_on_whales import DockerClient

        docker = DockerClient()
        docker.version()
        all_ok &= _check("Docker daemon", True)
    except Exception as e:
        all_ok &= _check("Docker daemon", False, str(e)[:80])

    # 5. GitHub tokens
    print("\n== GitHub ==")
    if token_count > 0:
        try:
            import httpx

            total_remaining = 0
            failed_tokens = 0
            tokens = [t.strip() for t in gh_tokens.split(",") if t.strip()]
            for token in tokens:
                resp = httpx.get(
                    "https://api.github.com/rate_limit",
                    headers={"Authorization": f"Bearer {token}"},
                    timeout=10,
                )
                if resp.status_code == 200:
                    total_remaining += resp.json()["rate"]["remaining"]
                else:
                    failed_tokens += 1

            # The check only fails when no token works at all.
            if failed_tokens == len(tokens):
                all_ok &= _check("API access", False, "all tokens failed")
            else:
                detail = f"remaining={total_remaining} across {len(tokens) - failed_tokens}/{len(tokens)} token(s)"
                all_ok &= _check("API access", True, detail)
        except Exception as e:
            all_ok &= _check("API access", False, str(e)[:80])
    else:
        all_ok &= _check("API access", False, "no tokens")

    # Summary
    print("\n" + "=" * 40)
    if all_ok:
        print("All checks passed!")
    else:
        print("Some checks failed. Fix issues above before running the pipeline.")

    return all_ok
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
if __name__ == "__main__":
    # Script entry point: prepare the environment, run every check, and
    # report the outcome through the process exit status (0 = all passed).
    from datasmith import setup_environment

    setup_environment()
    sys.exit(0 if run_preflight() else 1)
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""ds.publish — DockerHub + HuggingFace publishing pipeline."""
|
|
2
|
+
|
|
3
|
+
from datasmith.publish.huggingface import HuggingFacePublisher
|
|
4
|
+
from datasmith.publish.pipeline import publish_pipeline
|
|
5
|
+
from datasmith.publish.records import records_from_parquet, records_from_supabase, records_to_parquet
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"HuggingFacePublisher",
|
|
9
|
+
"publish_pipeline",
|
|
10
|
+
"records_from_parquet",
|
|
11
|
+
"records_from_supabase",
|
|
12
|
+
"records_to_parquet",
|
|
13
|
+
]
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import tempfile
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from datasmith.github.models import FormulaCodeRecord
|
|
8
|
+
from datasmith.publish.records import records_to_parquet
|
|
9
|
+
from datasmith.utils import get_logger
|
|
10
|
+
|
|
11
|
+
logger = get_logger("publish.huggingface")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class HuggingFacePublisher:
    """Publish FormulaCode records to HuggingFace as versioned Parquet datasets."""

    def __init__(
        self,
        repo_id: str = "formulacode/formulacode",
        token_path: str = "",
    ) -> None:
        """Configure the target dataset repository and the token location.

        Parameters
        ----------
        repo_id:
            HuggingFace dataset repository to upload to.
        token_path:
            File containing the access token. When empty, the
            ``HF_TOKEN_PATH`` environment variable (or its built-in default)
            is used instead.
        """
        self._repo_id = repo_id
        self._token_path = token_path or os.environ.get("HF_TOKEN_PATH", "/mnt/sdd3/llama_atharvas/huggingface/token")
        # Resolved lazily by _get_token() and cached after the first success.
        self._token: str | None = None

    def _get_token(self) -> str:
        """Return the access token, reading it on first use.

        The token file is tried first; if it is missing or empty, the
        ``HF_TOKEN`` environment variable is used.

        Raises
        ------
        ValueError
            If neither source yields a non-empty token.
        """
        if not self._token:
            token = ""
            path = Path(self._token_path)
            if path.is_file():
                token = path.read_text(encoding="utf-8").strip()
            if not token:
                # Also covers a token file that exists but is blank.
                token = os.environ.get("HF_TOKEN", "")
            if not token:
                raise ValueError(f"HuggingFace token not found at {self._token_path} or HF_TOKEN env var")
            self._token = token
        return self._token

    def publish(self, records: list[FormulaCodeRecord], version: str) -> None:
        """Upload records as Parquet to HuggingFace Hub.

        The records are serialised with ``records_to_parquet`` and uploaded
        as ``data/<version>.parquet`` to the configured dataset repository.
        Nothing is uploaded when serialisation yields no data.
        """
        from huggingface_hub import HfApi

        token = self._get_token()
        parquet_bytes = records_to_parquet(records)
        if not parquet_bytes:
            logger.warning("No records to publish")
            return

        api = HfApi(token=token)

        # Stage the bytes in a temporary file that is removed once the upload
        # call returns.
        with tempfile.TemporaryDirectory() as tmpdir:
            parquet_path = os.path.join(tmpdir, f"{version}.parquet")
            with open(parquet_path, "wb") as f:
                f.write(parquet_bytes)

            api.upload_file(
                path_or_fileobj=parquet_path,
                path_in_repo=f"data/{version}.parquet",
                repo_id=self._repo_id,
                repo_type="dataset",
                commit_message=f"Add {version} data ({len(records)} records)",
            )

        logger.info("Published %d records as %s to %s", len(records), version, self._repo_id)

    def create_dataset_card(self, version: str) -> str:
        """Generate the dataset card: YAML front matter followed by Markdown.

        *version* is interpolated into the "Version" heading; the rest of the
        card is static.
        """
        card = f"""---
language:
- en
license: apache-2.0
tags:
- code
- benchmarks
- performance
- optimization
pretty_name: FormulaCode
size_categories:
- 1K<n<10K
---

# FormulaCode Dataset

Performance optimization benchmark dataset.

## Version: {version}

## Schema

| Field | Type | Description |
|-------|------|-------------|
| task_id | string | Unique task identifier (owner__repo-issue_number) |
| owner | string | Repository owner |
| repo | string | Repository name |
| issue_number | int | PR number |
| gt_hash | string | Ground truth merge commit SHA |
| base_commit | string | Base commit SHA |
| date | string | Merge date |
| instructions | string | Task instructions / problem statement |
| classification | string | Optimization type category |
| difficulty | string | easy / medium / hard |
| container_name | string | Docker container name |
| patch | string | Ground truth patch |
| image_name | string | Docker image name |
"""
        return card
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
|
|
5
|
+
from datasmith.publish.huggingface import HuggingFacePublisher
|
|
6
|
+
from datasmith.publish.records import records_from_supabase
|
|
7
|
+
from datasmith.utils import get_client, get_logger
|
|
8
|
+
|
|
9
|
+
logger = get_logger("publish.pipeline")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
async def publish_pipeline(
    start_date: str,
    end_date: str,
    dockerhub_push: bool = True,
    hf_publish: bool = True,
) -> int:
    """Full publishing pipeline: query DB -> push DockerHub -> upload HuggingFace -> mark published.

    Parameters
    ----------
    start_date, end_date:
        Date range passed to ``records_from_supabase`` to select records.
    dockerhub_push:
        If ``True``, push each record's container image to DockerHub. A
        failed push is logged and does not stop the pipeline.
    hf_publish:
        If ``True``, upload all records to HuggingFace under a
        ``formulacode@YYYY-MM`` version derived from the current UTC month.

    Returns the number of records published (``0`` when none were found).
    The count is the number of selected records; it is not reduced when an
    individual push or "mark as published" update fails.
    """
    records = records_from_supabase(start_date=start_date, end_date=end_date)
    if not records:
        logger.info("No unpublished records found for %s to %s", start_date, end_date)
        return 0

    logger.info("Found %d unpublished records", len(records))

    version = f"formulacode@{datetime.now(tz=timezone.utc).strftime('%Y-%m')}"

    # DockerHub push (optional)
    if dockerhub_push:
        from datasmith.docker.publish import DockerHubPublisher

        publisher = DockerHubPublisher()
        for record in records:
            if not record.container_name:
                continue
            try:
                publisher.push(record.container_name)
            except Exception:
                # Best effort: keep going, but log the cause of the failure.
                logger.warning("Failed to push %s", record.container_name, exc_info=True)

    # HuggingFace publish (optional)
    if hf_publish:
        hf = HuggingFacePublisher()
        hf.publish(records, version)

    # Mark as published in Supabase
    client = get_client()
    now = datetime.now(tz=timezone.utc).isoformat()
    for record in records:
        try:
            client.table("pull_requests").update({"published_at": now}).eq("owner", record.owner).eq(
                "repo", record.repo
            ).eq("issue_number", record.issue_number).execute()
        except Exception:
            # Best effort: one failed update must not block the others.
            logger.warning("Failed to mark %s as published", record.task_id, exc_info=True)

    logger.info("Published %d records as %s", len(records), version)
    return len(records)
|