fc-data 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. datasmith/__init__.py +330 -0
  2. datasmith/__init__.pyi +194 -0
  3. datasmith/agents/__init__.py +31 -0
  4. datasmith/agents/classifiers.py +272 -0
  5. datasmith/agents/codex.py +25 -0
  6. datasmith/agents/config.py +108 -0
  7. datasmith/agents/extractors.py +197 -0
  8. datasmith/agents/installed/README.md +52 -0
  9. datasmith/agents/installed/__init__.py +22 -0
  10. datasmith/agents/installed/base.py +240 -0
  11. datasmith/agents/installed/claude.py +134 -0
  12. datasmith/agents/installed/codex.py +91 -0
  13. datasmith/agents/installed/gemini.py +118 -0
  14. datasmith/agents/installed/none.py +27 -0
  15. datasmith/agents/sandbox.py +547 -0
  16. datasmith/agents/synthesizer.py +439 -0
  17. datasmith/agents/templates/AGENTS.md.j2 +150 -0
  18. datasmith/agents/templates/sandbox_verify.py +428 -0
  19. datasmith/docker/__init__.py +31 -0
  20. datasmith/docker/context.py +112 -0
  21. datasmith/docker/images.py +158 -0
  22. datasmith/docker/publish.py +56 -0
  23. datasmith/docker/templates/Dockerfile.base +26 -0
  24. datasmith/docker/templates/Dockerfile.pr +42 -0
  25. datasmith/docker/templates/Dockerfile.repo +11 -0
  26. datasmith/docker/templates/docker_build_base.sh +780 -0
  27. datasmith/docker/templates/docker_build_env.sh +309 -0
  28. datasmith/docker/templates/docker_build_final.sh +106 -0
  29. datasmith/docker/templates/docker_build_pkg.sh +99 -0
  30. datasmith/docker/templates/docker_build_run.sh +124 -0
  31. datasmith/docker/templates/entrypoint.sh +62 -0
  32. datasmith/docker/templates/parser.py +1405 -0
  33. datasmith/docker/templates/profile.sh +199 -0
  34. datasmith/docker/templates/pytest_runner.py +692 -0
  35. datasmith/docker/templates/run-tests.sh +197 -0
  36. datasmith/docker/verifiers.py +131 -0
  37. datasmith/filters.py +154 -0
  38. datasmith/github/__init__.py +22 -0
  39. datasmith/github/client.py +333 -0
  40. datasmith/github/hooks.py +50 -0
  41. datasmith/github/links.py +110 -0
  42. datasmith/github/models.py +206 -0
  43. datasmith/github/render.py +173 -0
  44. datasmith/github/search.py +66 -0
  45. datasmith/github/templates/comment.md.j2 +5 -0
  46. datasmith/github/templates/final.md.j2 +66 -0
  47. datasmith/github/templates/issues.md.j2 +21 -0
  48. datasmith/github/templates/repo.md.j2 +1 -0
  49. datasmith/preflight.py +162 -0
  50. datasmith/publish/__init__.py +13 -0
  51. datasmith/publish/huggingface.py +104 -0
  52. datasmith/publish/pipeline.py +60 -0
  53. datasmith/publish/records.py +91 -0
  54. datasmith/py.typed +1 -0
  55. datasmith/resolution/__init__.py +14 -0
  56. datasmith/resolution/blocklist.py +145 -0
  57. datasmith/resolution/cache.py +120 -0
  58. datasmith/resolution/constants.py +277 -0
  59. datasmith/resolution/dependency_resolver.py +174 -0
  60. datasmith/resolution/git_utils.py +378 -0
  61. datasmith/resolution/import_analyzer.py +66 -0
  62. datasmith/resolution/metadata_parser.py +412 -0
  63. datasmith/resolution/models.py +41 -0
  64. datasmith/resolution/orchestrator.py +522 -0
  65. datasmith/resolution/package_filters.py +312 -0
  66. datasmith/resolution/python_manager.py +110 -0
  67. datasmith/runners/__init__.py +15 -0
  68. datasmith/runners/base.py +112 -0
  69. datasmith/runners/classify_prs.py +48 -0
  70. datasmith/runners/render_problems.py +113 -0
  71. datasmith/runners/resolve_packages.py +66 -0
  72. datasmith/runners/scrape_commits.py +166 -0
  73. datasmith/runners/scrape_repos.py +44 -0
  74. datasmith/runners/synthesize_images.py +310 -0
  75. datasmith/update/__init__.py +5 -0
  76. datasmith/update/cli.py +169 -0
  77. datasmith/update/offline.py +173 -0
  78. datasmith/update/pipeline.py +497 -0
  79. datasmith/utils/__init__.py +18 -0
  80. datasmith/utils/core.py +67 -0
  81. datasmith/utils/db.py +156 -0
  82. datasmith/utils/tokens.py +65 -0
  83. fc_data-0.2.0.dist-info/METADATA +441 -0
  84. fc_data-0.2.0.dist-info/RECORD +87 -0
  85. fc_data-0.2.0.dist-info/WHEEL +4 -0
  86. fc_data-0.2.0.dist-info/entry_points.txt +2 -0
  87. fc_data-0.2.0.dist-info/licenses/LICENSE +28 -0
@@ -0,0 +1,173 @@
1
+ """Problem statement rendering with Jinja2 templates."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+
8
+ from jinja2 import Environment, FileSystemLoader
9
+
10
+ from datasmith.github.models import PR, IssueExpanded
11
+ from datasmith.utils import get_logger
12
+
13
+ logger = get_logger("github.render")
14
+
15
+ _TEMPLATES_DIR = os.path.join(os.path.dirname(__file__), "templates")
16
+
17
+
18
def _get_env() -> Environment:
    """Build the Jinja2 environment used to render the bundled templates.

    Templates are loaded from ``_TEMPLATES_DIR``. Autoescaping is disabled
    because the templates produce Markdown rather than HTML, and block tags
    are trimmed so they do not leave stray whitespace in the output.
    """
    options = {
        "loader": FileSystemLoader(_TEMPLATES_DIR),
        "autoescape": False,
        "trim_blocks": True,
        "lstrip_blocks": True,
    }
    return Environment(**options)
26
+
27
+
28
class Anonymizer:
    """Replace usernames, emails, and other PII with deterministic placeholders.

    A single instance keeps a case-insensitive ``username -> user_N`` map, so
    the same identifier always receives the same placeholder across every
    ``anonymize`` call made on that instance.
    """

    # "Signed-off-by: Name <email>" / "Co-authored-by: Name <email>" trailers.
    _SIGNOFF = re.compile(
        r"((?:Signed-off-by|Co-authored-by)\s*:\s*)(.*?)(\s*<[^>]+>)",
        re.IGNORECASE,
    )
    _EMAIL = re.compile(r"[\w.+-]+@[\w-]+\.[\w.]+")
    _MENTION = re.compile(r"@([\w-]+)")
    # Only the first path segment (user or organisation) is captured.
    _GITHUB_URL = re.compile(r"(https?://github\.com/)([\w.-]+)")
    _USER_IMAGES = re.compile(r"https?://user-images\.githubusercontent\.com/[^\s)>\]]+")
    _HOME_PATH = re.compile(r"(/(?:home|Users)/|[Cc]:\\Users\\)([\w.-]+)")
    _IP_ADDR = re.compile(r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b")

    def __init__(self, known_usernames: set[str] | None = None) -> None:
        """Create an anonymizer.

        Parameters
        ----------
        known_usernames:
            Usernames to scrub even when they appear without an ``@`` prefix.
        """
        self._map: dict[str, str] = {}
        self._counter = 0
        self._known_usernames = known_usernames or set()

    def _placeholder(self, username: str) -> str:
        """Return a deterministic placeholder for *username* (case-insensitive)."""
        key = username.lower()
        if key not in self._map:
            self._counter += 1
            self._map[key] = f"user_{self._counter}"
        return self._map[key]

    def anonymize(self, text: str) -> str:
        """Strip PII and replace identifiers with deterministic placeholders.

        Processing order:

        1. Signed-off-by / Co-authored-by lines (before email stripping)
        2. Email addresses → ``[email]``
        3. ``@user`` mentions → ``@user_N``
        4. GitHub profile/repo URLs (username segment)
        5. ``user-images.githubusercontent.com`` URLs → ``[image]``
        6. Home-directory paths (``/home/x``, ``/Users/x``, ``C:\\Users\\x``)
        7. IP addresses → ``[ip]``
        8. Known bare usernames (word-boundary, case-insensitive)

        Blank or whitespace-only entries in the known-username set are
        ignored: they would otherwise produce a pattern that matches at every
        word boundary and scatter placeholders through the whole text.
        """
        # 1. Sign-off lines (must precede email stripping so we can match the <email>)
        text = self._SIGNOFF.sub(lambda m: f"{m.group(1)}[name] <[email]>", text)

        # 2. Emails
        text = self._EMAIL.sub("[email]", text)

        # 3. @mentions
        text = self._MENTION.sub(lambda m: f"@{self._placeholder(m.group(1))}", text)

        # 4. GitHub user/org URLs
        text = self._GITHUB_URL.sub(lambda m: f"{m.group(1)}{self._placeholder(m.group(2))}", text)

        # 5. User-uploaded image URLs
        text = self._USER_IMAGES.sub("[image]", text)

        # 6. Home-directory paths
        text = self._HOME_PATH.sub(lambda m: f"{m.group(1)}{self._placeholder(m.group(2))}", text)

        # 7. IP addresses
        text = self._IP_ADDR.sub("[ip]", text)

        # 8. Known bare usernames (longest first to avoid partial replacement)
        for uname in sorted(self._known_usernames, key=len, reverse=True):
            if not uname.strip():
                # r"\b\b" matches at every word boundary; never substitute on it.
                continue
            placeholder = self._placeholder(uname)
            text = re.sub(rf"\b{re.escape(uname)}\b", placeholder, text, flags=re.IGNORECASE)

        return text
96
+
97
+
98
def _text_or_empty(obj: object, attr: str) -> str:
    """Return ``obj.attr`` as text, treating a missing or ``None`` value as ``""``."""
    return getattr(obj, attr, "") or ""


def render_problem_statement(
    pr: PR,
    issues: list[IssueExpanded] | None = None,
    repo_description: str = "",
    anonymize: bool = False,
    known_usernames: set[str] | None = None,
    extract: bool = True,
    initial_observations: str | None = None,
) -> str:
    """Render the full problem statement for a FormulaCode task.

    Parameters
    ----------
    pr:
        The pull request providing the initial observations.
    issues:
        Optional list of linked issues to include.
    repo_description:
        A short description of the repository.
    anonymize:
        If ``True``, replace usernames and emails with placeholders.
    known_usernames:
        Additional usernames to scrub even without an ``@`` prefix.
        Only used when *anonymize* is ``True``.
    extract:
        If ``True`` (default), use :class:`ProblemExtractor` to separate
        problem observations from solution details, preventing information
        leakage. Falls back to raw ``pr.body`` on failure.
        Ignored when *initial_observations* is provided.
    initial_observations:
        If provided, use this text directly as the problem observations and
        skip the :class:`ProblemExtractor` call entirely. Pass the output of
        ``ProblemExtraction.to_problem_markdown()`` here when the extraction
        has already been performed outside this function.

    Returns
    -------
    str
        The rendered ``final.md.j2`` template, anonymized when requested.
    """
    env = _get_env()
    anon = Anonymizer(known_usernames=known_usernames) if anonymize else None

    # Extract problem observations (strip solution details) or use raw body.
    # When the caller supplied initial_observations it is used unchanged.
    if initial_observations is None:
        # A missing or None body must not be rendered as the literal text "None".
        raw_body = _text_or_empty(pr, "body")
        if extract:
            try:
                # Imported lazily; an import failure also triggers the fallback.
                from datasmith.agents.extractors import ProblemExtractor

                extraction = ProblemExtractor().extract_problem(
                    pr_title=_text_or_empty(pr, "title"),
                    pr_body=raw_body,
                )
                initial_observations = extraction.to_problem_markdown() or raw_body
            except Exception:
                # Best-effort step: keep rendering, but record why extraction failed.
                logger.warning("ProblemExtractor failed, falling back to raw PR body", exc_info=True)
                initial_observations = raw_body
        else:
            initial_observations = raw_body

    # Render issues section
    issues_text = ""
    if issues:
        issues_text = env.get_template("issues.md.j2").render(issues=issues)

    # Render final
    rendered = env.get_template("final.md.j2").render(
        repo_description=repo_description,
        initial_observations=initial_observations,
        issues=issues_text,
    )

    # Anonymize the fully rendered text so observations and issues are scrubbed
    # in a single pass with a shared placeholder mapping.
    if anon:
        rendered = anon.anonymize(rendered)

    return rendered
@@ -0,0 +1,66 @@
1
+ """GitHub Code Search helpers for repo discovery."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datasmith.utils import get_logger
6
+
7
+ logger = get_logger("github.search")
8
+
9
+
10
async def search_repos_by_file(
    gh: object,
    filename: str = "asv.conf.json",
    min_stars: int = 100,
) -> list[tuple[str, str]]:
    """Discover repos containing *filename* via the GitHub Code Search API.

    Returns a deduplicated list of ``(owner, repo)`` tuples for repos that are
    not forks, not archived, not disabled, and have at least *min_stars* stars.
    The ``airspeed-velocity/asv`` repository itself is always excluded.

    Raises
    ------
    TypeError
        If *gh* is not a ``GitHubClient`` instance.
    """
    from datasmith.github.client import GitHubClient

    if not isinstance(gh, GitHubClient):
        raise TypeError(f"Expected GitHubClient, got {type(gh).__name__}")

    seen: set[str] = set()
    results: list[tuple[str, str]] = []

    async for item in gh.search_code(f"filename:{filename}"):
        # ``or`` also covers an explicit null in the API payload.
        repo_data = item.get("repository") or {}
        full_name: str = repo_data.get("full_name") or ""
        if not full_name or full_name in seen:
            continue
        seen.add(full_name)

        # Skip forks (available in search response)
        if repo_data.get("fork", False):
            continue

        # Skip the ASV tool itself
        if full_name == "airspeed-velocity/asv":
            continue

        # A name without "owner/repo" shape is skipped instead of aborting the search.
        owner, sep, repo = full_name.partition("/")
        if not (sep and owner and repo):
            logger.debug("Skipping malformed repository name %r", full_name)
            continue
        results.append((owner, repo))

    logger.info("Code search found %d candidate repos for filename:%s", len(results), filename)

    # Fetch full metadata to filter by archived/disabled/stars
    filtered: list[tuple[str, str]] = []
    for owner, repo in results:
        resp = await gh._request("GET", f"/repos/{owner}/{repo}")
        if resp is None:
            continue
        data = resp.json()
        if data.get("archived", False) or data.get("disabled", False):
            logger.debug("%s/%s: archived/disabled — skipped", owner, repo)
            continue
        # Treat a missing or null star count as zero so the comparison cannot fail.
        stars = data.get("stargazers_count") or 0
        if stars < min_stars:
            logger.debug("%s/%s: %d stars — below threshold", owner, repo, stars)
            continue
        filtered.append((owner, repo))
        logger.debug("%s/%s: %d stars — included", owner, repo, stars)

    logger.info("Filtered to %d repos with >= %d stars", len(filtered), min_stars)
    return filtered
@@ -0,0 +1,5 @@
1
+ **@{{ user_login }}** — {{ timestamp }}
2
+
3
+ {{ body }}
4
+
5
+ Links mentioned: {{ links_str }}
@@ -0,0 +1,66 @@
1
+ **Objective:**
2
+ You are a performance optimization expert. Speed up the repository **while maintaining correctness**.
3
+
4
+ **Tooling:**
5
+ The micromamba environment includes **Pytest** for testing and **Airspeed Velocity (ASV)** for benchmarking measurements and profiling.
6
+
7
+ **Process:**
8
+
9
+ **1. Scan & Baseline**
10
+
11
+ - Read the code and any hints.
12
+ - Map likely bottlenecks.
13
+ - Establish a **baseline** by running the **relevant** ASV benchmarks.
14
+
15
+ **2. Benchmark (ASV)**
16
+ - Read through relevant benchmarks.
17
+ - Prefer targeted runs using `--bench=<regex>`; full-suite runs are too time-consuming and are discouraged.
18
+ - Commands:
19
+
20
+ ```bash
21
+ # Always pin to current interpreter
22
+ asv run --python=same --bench="<regex>"
23
+ ```
24
+
25
+ * Find benchmarks via `asv_benchmarks.txt` or in the directory containing the ASV benchmarks.
26
+ * You may run multiple benchmarks at once using regexes.
27
+
28
+ **3. Profile Hotspots**
29
+
30
+ * Profile **relevant** benchmarks to locate hot paths.
31
+ * Use ASV's built-in profiling support.
32
+
33
+ ```bash
34
+ asv profile --python=same --config=<path-to-asv.*.json> <benchmark_name>
35
+ ```
36
+
37
+ **4. Optimize**
38
+
39
+ * Make targeted changes that address the hot paths while maintaining correctness.
40
+ * Always follow the **Operating Principles** below.
41
+
42
+ **Operating Principles**
43
+
44
+ * **One change/command at a time** (code edit, ASV run, profiling).
45
+ * **Baseline first**, then iterate.
46
+ * **Target the hot paths** shown by profiling.
47
+ * **Evidence-driven**: justify changes with benchmark/profile data.
48
+ * **Correctness first**: never trade correctness for speed.
49
+ {% if repo_description %}
50
+
51
+ **Repository Description**
52
+
53
+ {{ repo_description }}
54
+ {% endif %}
55
+
56
+ **Task Description**
57
+
58
+ Your main goal is to optimize the code to run as fast as possible. Use the following information if needed to understand the problem:
59
+
60
+ {{ initial_observations }}
61
+
62
+ {% if issues %}
63
+ **Relevant Issues**
64
+
65
+ {{ issues }}
66
+ {% endif %}
@@ -0,0 +1,21 @@
1
+ {% for it in issues %}
2
+ # Issue #{{ it.number }}: {{ it.title }}
3
+ {% if it.description %}
4
+ Description:
5
+ {{ it.description }}
6
+ {% endif %}
7
+
8
+ {% if it.comments %}
9
+ Comments:
10
+ {% for c in it.comments %}
11
+ - {{ c }}
12
+ {% endfor %}
13
+ {% endif %}
14
+
15
+ {% if it.cross_references %}
16
+ Cross-references:
17
+ {% for x in it.cross_references %}
18
+ - {{ x }}
19
+ {% endfor %}
20
+ {% endif %}
21
+ {% endfor %}
@@ -0,0 +1 @@
1
+ This repository is called {{ repo_name }}. {{ repo_name }} is written primarily in {{ repo_language }} and is described as a "{{ repo_description }}"{% if repo_topics %}, with topics including {{ repo_topics }}{% endif %}.
datasmith/preflight.py ADDED
@@ -0,0 +1,162 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ from datasmith.utils import get_logger
8
+
9
+ logger = get_logger("preflight")
10
+
11
+
12
def _check(name: str, condition: bool, detail: str = "") -> bool:
    """Print one preflight result line and hand *condition* back to the caller.

    The line reads ``[OK] name`` or ``[FAIL] name``, followed by
    `` — detail`` when *detail* is non-empty.
    """
    label = "OK" if condition else "FAIL"
    pieces = [f" [{label}] {name}"]
    if detail:
        pieces.append(f" — {detail}")
    print("".join(pieces))
    return condition
19
+
20
+
21
def _gh_tokens_from_env() -> list[str]:
    """Return the non-empty GitHub tokens listed in GH_TOKENS (or GH_TOKEN)."""
    raw = os.environ.get("GH_TOKENS", os.environ.get("GH_TOKEN", ""))
    return [t.strip() for t in raw.split(",") if t.strip()]


def _preflight_environment(tokens: list[str]) -> bool:
    """Check that the required environment variables and HF token are present."""
    print("\n== Environment ==")
    supabase_url = os.environ.get("SUPABASE_URL", "")
    # Only a prefix of the URL is echoed; the key itself is never printed.
    shown_url = supabase_url[:30] + "..." if len(supabase_url) > 30 else supabase_url
    ok = _check("SUPABASE_URL", bool(supabase_url), shown_url)

    supabase_key = os.environ.get("SUPABASE_KEY", "")
    ok &= _check("SUPABASE_KEY", bool(supabase_key), "***" if supabase_key else "")

    ok &= _check("GH_TOKENS", len(tokens) > 0, f"{len(tokens)} token(s)")

    hf_token_path = os.environ.get("HF_TOKEN_PATH", "/mnt/sdd3/llama_atharvas/huggingface/token")
    if Path(hf_token_path).exists():
        ok &= _check("HF_TOKEN", True, hf_token_path)
    else:
        # The HuggingFace publisher falls back to the HF_TOKEN env var when the
        # token file is absent, so accept that source here as well.
        has_env_token = bool(os.environ.get("HF_TOKEN", ""))
        ok &= _check("HF_TOKEN", has_env_token, "HF_TOKEN env var" if has_env_token else hf_token_path)
    return ok


def _preflight_llm() -> bool:
    """Check that the configured LLM endpoint lists the model and answers a prompt."""
    print("\n== LLM Backend ==")
    dspy_model = os.environ.get("DSPY_MODEL", "openai/gpt-oss-120b")
    dspy_api_base = os.environ.get("DSPY_API_BASE", "http://localhost:30001/v1")
    ok = _check("DSPY_MODEL", bool(dspy_model), dspy_model)
    ok &= _check("DSPY_API_BASE", bool(dspy_api_base), dspy_api_base)

    if not (dspy_model and dspy_api_base):
        return ok

    try:
        import httpx

        # Strip "openai/" prefix for model matching
        model_id = dspy_model.removeprefix("openai/")
        # Avoid "//models" when the configured base URL ends with a slash.
        base_url = dspy_api_base.rstrip("/")
        resp = httpx.get(f"{base_url}/models", timeout=10)
        resp.raise_for_status()
        model_ids = [m.get("id", "") for m in resp.json().get("data", [])]
        found = model_id in model_ids
        ok &= _check("Model available", found, f"{model_id} in {len(model_ids)} model(s)")

        if found:
            # Test completion
            payload = {
                "model": model_id,
                "messages": [{"role": "user", "content": "Say OK"}],
                "max_tokens": 64,
            }
            comp_resp = httpx.post(f"{base_url}/chat/completions", json=payload, timeout=60)
            comp_resp.raise_for_status()
            choices = comp_resp.json().get("choices", [])
            content = ""
            finish = ""
            if choices:
                msg = choices[0].get("message") or {}
                if isinstance(msg, dict):
                    content = msg.get("content") or msg.get("reasoning") or ""
                finish = choices[0].get("finish_reason", "")
            detail = repr(content[:40]) if content else f"finish_reason={finish}"
            # Any successful HTTP response counts as a pass; the detail is informational.
            ok &= _check("Test completion", True, detail)
    except Exception as e:
        ok &= _check("LLM server", False, str(e)[:80])
    return ok


def _preflight_supabase() -> bool:
    """Check that a fresh Supabase client can run a trivial query."""
    print("\n== Supabase ==")
    try:
        import datasmith.utils.db as db_mod
        from datasmith.utils.db import get_client

        db_mod._client = None  # Force fresh connection
        client = get_client()
        # Try a simple query
        client.table("repositories").select("owner").limit(1).execute()
        ok = _check("Connection", True)
        db_mod._client = None
    except Exception as e:
        ok = _check("Connection", False, str(e)[:80])
    return ok


def _preflight_docker() -> bool:
    """Check that the Docker daemon answers a version request."""
    print("\n== Docker ==")
    try:
        from python_on_whales import DockerClient

        docker = DockerClient()
        docker.version()
        return _check("Docker daemon", True)
    except Exception as e:
        return _check("Docker daemon", False, str(e)[:80])


def _preflight_github(tokens: list[str]) -> bool:
    """Check that at least one GitHub token can query the rate-limit endpoint."""
    print("\n== GitHub ==")
    if not tokens:
        return _check("API access", False, "no tokens")

    try:
        import httpx

        total_remaining = 0
        failed_tokens = 0
        for token in tokens:
            resp = httpx.get(
                "https://api.github.com/rate_limit",
                headers={"Authorization": f"Bearer {token}"},
                timeout=10,
            )
            if resp.status_code == 200:
                total_remaining += resp.json()["rate"]["remaining"]
            else:
                failed_tokens += 1

        if failed_tokens == len(tokens):
            return _check("API access", False, "all tokens failed")
        detail = f"remaining={total_remaining} across {len(tokens) - failed_tokens}/{len(tokens)} token(s)"
        return _check("API access", True, detail)
    except Exception as e:
        return _check("API access", False, str(e)[:80])


def run_preflight() -> bool:
    """Verify all prerequisites for the DataSmith pipeline.

    Runs the environment, LLM backend, Supabase, Docker and GitHub checks in
    that order, printing one result line per check. Every section always
    runs, even after an earlier failure.

    Returns
    -------
    bool
        ``True`` only if every individual check passed.
    """
    print("DataSmith Preflight Check")
    print("=" * 40)

    tokens = _gh_tokens_from_env()

    # Collect results first so that no section is skipped by short-circuiting.
    results = [
        _preflight_environment(tokens),
        _preflight_llm(),
        _preflight_supabase(),
        _preflight_docker(),
        _preflight_github(tokens),
    ]
    all_ok = all(results)

    # Summary
    print("\n" + "=" * 40)
    if all_ok:
        print("All checks passed!")
    else:
        print("Some checks failed. Fix issues above before running the pipeline.")

    return all_ok
155
+
156
+
157
# Script entry point: run the preflight checks and exit with status 0 when
# every check passed, 1 otherwise.
if __name__ == "__main__":
    from datasmith import setup_environment

    # NOTE(review): presumably loads configuration / environment variables
    # before the checks read them — confirm against datasmith.setup_environment.
    setup_environment()
    ok = run_preflight()
    sys.exit(0 if ok else 1)
@@ -0,0 +1,13 @@
1
+ """ds.publish — DockerHub + HuggingFace publishing pipeline."""
2
+
3
+ from datasmith.publish.huggingface import HuggingFacePublisher
4
+ from datasmith.publish.pipeline import publish_pipeline
5
+ from datasmith.publish.records import records_from_parquet, records_from_supabase, records_to_parquet
6
+
7
+ __all__ = [
8
+ "HuggingFacePublisher",
9
+ "publish_pipeline",
10
+ "records_from_parquet",
11
+ "records_from_supabase",
12
+ "records_to_parquet",
13
+ ]
@@ -0,0 +1,104 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import tempfile
5
+ from pathlib import Path
6
+
7
+ from datasmith.github.models import FormulaCodeRecord
8
+ from datasmith.publish.records import records_to_parquet
9
+ from datasmith.utils import get_logger
10
+
11
+ logger = get_logger("publish.huggingface")
12
+
13
+
14
class HuggingFacePublisher:
    """Publish FormulaCode records to HuggingFace as versioned Parquet datasets."""

    def __init__(
        self,
        repo_id: str = "formulacode/formulacode",
        token_path: str = "",
    ) -> None:
        """Store the target dataset repo and where to look for the access token.

        Parameters
        ----------
        repo_id:
            HuggingFace dataset repository to upload to.
        token_path:
            File containing the token. When empty, the ``HF_TOKEN_PATH``
            environment variable (or its built-in default) is used.
        """
        self._repo_id = repo_id
        self._token_path = token_path or os.environ.get("HF_TOKEN_PATH", "/mnt/sdd3/llama_atharvas/huggingface/token")
        self._token: str | None = None

    def _get_token(self) -> str:
        """Return the HuggingFace token, reading it on first use.

        The token file is tried first; if it is missing or empty the
        ``HF_TOKEN`` environment variable is used instead. Only a non-empty
        token is cached, so a failed lookup is retried on the next call
        rather than handing back an empty string.

        Raises
        ------
        ValueError
            If neither source yields a non-empty token.
        """
        if self._token:
            return self._token

        token = ""
        path = Path(self._token_path)
        if path.is_file():
            token = path.read_text().strip()
        if not token:
            token = os.environ.get("HF_TOKEN", "").strip()
        if not token:
            raise ValueError(f"HuggingFace token not found at {self._token_path} or HF_TOKEN env var")

        self._token = token
        return token

    def publish(self, records: list[FormulaCodeRecord], version: str) -> None:
        """Upload records as Parquet to HuggingFace Hub.

        The records are serialized with ``records_to_parquet`` and uploaded
        as ``data/<version>.parquet``. Nothing is uploaded when the
        serialized payload is empty.
        """
        from huggingface_hub import HfApi

        token = self._get_token()
        parquet_bytes = records_to_parquet(records)
        if not parquet_bytes:
            logger.warning("No records to publish")
            return

        api = HfApi(token=token)

        # Stage the payload in a throwaway directory that is removed after the upload.
        with tempfile.TemporaryDirectory() as tmpdir:
            parquet_path = os.path.join(tmpdir, f"{version}.parquet")
            with open(parquet_path, "wb") as f:
                f.write(parquet_bytes)

            api.upload_file(
                path_or_fileobj=parquet_path,
                path_in_repo=f"data/{version}.parquet",
                repo_id=self._repo_id,
                repo_type="dataset",
                commit_message=f"Add {version} data ({len(records)} records)",
            )

        logger.info("Published %d records as %s to %s", len(records), version, self._repo_id)

    def create_dataset_card(self, version: str) -> str:
        """Generate a YAML dataset card string."""
        card = f"""---
language:
- en
license: apache-2.0
tags:
- code
- benchmarks
- performance
- optimization
pretty_name: FormulaCode
size_categories:
- 1K<n<10K
---

# FormulaCode Dataset

Performance optimization benchmark dataset.

## Version: {version}

## Schema

| Field | Type | Description |
|-------|------|-------------|
| task_id | string | Unique task identifier (owner__repo-issue_number) |
| owner | string | Repository owner |
| repo | string | Repository name |
| issue_number | int | PR number |
| gt_hash | string | Ground truth merge commit SHA |
| base_commit | string | Base commit SHA |
| date | string | Merge date |
| instructions | string | Task instructions / problem statement |
| classification | string | Optimization type category |
| difficulty | string | easy / medium / hard |
| container_name | string | Docker container name |
| patch | string | Ground truth patch |
| image_name | string | Docker image name |
"""
        return card
@@ -0,0 +1,60 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime, timezone
4
+
5
+ from datasmith.publish.huggingface import HuggingFacePublisher
6
+ from datasmith.publish.records import records_from_supabase
7
+ from datasmith.utils import get_client, get_logger
8
+
9
+ logger = get_logger("publish.pipeline")
10
+
11
+
12
async def publish_pipeline(
    start_date: str,
    end_date: str,
    dockerhub_push: bool = True,
    hf_publish: bool = True,
) -> int:
    """Full publishing pipeline: query DB -> push DockerHub -> upload HuggingFace -> mark published.

    Parameters
    ----------
    start_date, end_date:
        Date range passed to ``records_from_supabase`` to select records.
    dockerhub_push:
        If ``True``, push each record's container via ``DockerHubPublisher``.
        A failed push is logged and does not stop the pipeline.
    hf_publish:
        If ``True``, upload the records to HuggingFace. Errors from this step
        propagate to the caller.

    Returns the number of records published (the number of records selected;
    failures while pushing an image or marking a row are logged, not counted).
    """
    records = records_from_supabase(start_date=start_date, end_date=end_date)
    if not records:
        logger.info("No unpublished records found for %s to %s", start_date, end_date)
        return 0

    logger.info("Found %d unpublished records", len(records))

    # The version label is the current UTC year and month.
    version = f"formulacode@{datetime.now(tz=timezone.utc).strftime('%Y-%m')}"

    # DockerHub push (optional)
    if dockerhub_push:
        from datasmith.docker.publish import DockerHubPublisher

        publisher = DockerHubPublisher()
        for record in records:
            if not record.container_name:
                continue
            try:
                publisher.push(record.container_name)
            except Exception:
                # Best-effort: keep going, but keep the traceback for diagnosis.
                logger.warning("Failed to push %s", record.container_name, exc_info=True)

    # HuggingFace publish (optional)
    if hf_publish:
        hf = HuggingFacePublisher()
        hf.publish(records, version)

    # Mark as published in Supabase
    client = get_client()
    now = datetime.now(tz=timezone.utc).isoformat()
    for record in records:
        try:
            client.table("pull_requests").update({"published_at": now}).eq("owner", record.owner).eq(
                "repo", record.repo
            ).eq("issue_number", record.issue_number).execute()
        except Exception:
            logger.warning("Failed to mark %s as published", record.task_id, exc_info=True)

    logger.info("Published %d records as %s", len(records), version)
    return len(records)