fc-data 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datasmith/__init__.py +330 -0
- datasmith/__init__.pyi +194 -0
- datasmith/agents/__init__.py +31 -0
- datasmith/agents/classifiers.py +272 -0
- datasmith/agents/codex.py +25 -0
- datasmith/agents/config.py +108 -0
- datasmith/agents/extractors.py +197 -0
- datasmith/agents/installed/README.md +52 -0
- datasmith/agents/installed/__init__.py +22 -0
- datasmith/agents/installed/base.py +240 -0
- datasmith/agents/installed/claude.py +134 -0
- datasmith/agents/installed/codex.py +91 -0
- datasmith/agents/installed/gemini.py +118 -0
- datasmith/agents/installed/none.py +27 -0
- datasmith/agents/sandbox.py +547 -0
- datasmith/agents/synthesizer.py +439 -0
- datasmith/agents/templates/AGENTS.md.j2 +150 -0
- datasmith/agents/templates/sandbox_verify.py +428 -0
- datasmith/docker/__init__.py +31 -0
- datasmith/docker/context.py +112 -0
- datasmith/docker/images.py +158 -0
- datasmith/docker/publish.py +56 -0
- datasmith/docker/templates/Dockerfile.base +26 -0
- datasmith/docker/templates/Dockerfile.pr +42 -0
- datasmith/docker/templates/Dockerfile.repo +11 -0
- datasmith/docker/templates/docker_build_base.sh +780 -0
- datasmith/docker/templates/docker_build_env.sh +309 -0
- datasmith/docker/templates/docker_build_final.sh +106 -0
- datasmith/docker/templates/docker_build_pkg.sh +99 -0
- datasmith/docker/templates/docker_build_run.sh +124 -0
- datasmith/docker/templates/entrypoint.sh +62 -0
- datasmith/docker/templates/parser.py +1405 -0
- datasmith/docker/templates/profile.sh +199 -0
- datasmith/docker/templates/pytest_runner.py +692 -0
- datasmith/docker/templates/run-tests.sh +197 -0
- datasmith/docker/verifiers.py +131 -0
- datasmith/filters.py +154 -0
- datasmith/github/__init__.py +22 -0
- datasmith/github/client.py +333 -0
- datasmith/github/hooks.py +50 -0
- datasmith/github/links.py +110 -0
- datasmith/github/models.py +206 -0
- datasmith/github/render.py +173 -0
- datasmith/github/search.py +66 -0
- datasmith/github/templates/comment.md.j2 +5 -0
- datasmith/github/templates/final.md.j2 +66 -0
- datasmith/github/templates/issues.md.j2 +21 -0
- datasmith/github/templates/repo.md.j2 +1 -0
- datasmith/preflight.py +162 -0
- datasmith/publish/__init__.py +13 -0
- datasmith/publish/huggingface.py +104 -0
- datasmith/publish/pipeline.py +60 -0
- datasmith/publish/records.py +91 -0
- datasmith/py.typed +1 -0
- datasmith/resolution/__init__.py +14 -0
- datasmith/resolution/blocklist.py +145 -0
- datasmith/resolution/cache.py +120 -0
- datasmith/resolution/constants.py +277 -0
- datasmith/resolution/dependency_resolver.py +174 -0
- datasmith/resolution/git_utils.py +378 -0
- datasmith/resolution/import_analyzer.py +66 -0
- datasmith/resolution/metadata_parser.py +412 -0
- datasmith/resolution/models.py +41 -0
- datasmith/resolution/orchestrator.py +522 -0
- datasmith/resolution/package_filters.py +312 -0
- datasmith/resolution/python_manager.py +110 -0
- datasmith/runners/__init__.py +15 -0
- datasmith/runners/base.py +112 -0
- datasmith/runners/classify_prs.py +48 -0
- datasmith/runners/render_problems.py +113 -0
- datasmith/runners/resolve_packages.py +66 -0
- datasmith/runners/scrape_commits.py +166 -0
- datasmith/runners/scrape_repos.py +44 -0
- datasmith/runners/synthesize_images.py +310 -0
- datasmith/update/__init__.py +5 -0
- datasmith/update/cli.py +169 -0
- datasmith/update/offline.py +173 -0
- datasmith/update/pipeline.py +497 -0
- datasmith/utils/__init__.py +18 -0
- datasmith/utils/core.py +67 -0
- datasmith/utils/db.py +156 -0
- datasmith/utils/tokens.py +65 -0
- fc_data-0.2.0.dist-info/METADATA +441 -0
- fc_data-0.2.0.dist-info/RECORD +87 -0
- fc_data-0.2.0.dist-info/WHEEL +4 -0
- fc_data-0.2.0.dist-info/entry_points.txt +2 -0
- fc_data-0.2.0.dist-info/licenses/LICENSE +28 -0
datasmith/update/cli.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import asyncio
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
import signal
|
|
8
|
+
import sys
|
|
9
|
+
|
|
10
|
+
from datasmith.utils import get_logger
|
|
11
|
+
|
|
12
|
+
logger = get_logger("update.cli")
|
|
13
|
+
|
|
14
|
+
_DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
_STAGE_DESCRIPTIONS = {
|
|
18
|
+
1: "scrape_repos — Fetch repository metadata from GitHub for all tracked repos",
|
|
19
|
+
2: "scrape_commits — Scrape merged PR commits and patches from each repository",
|
|
20
|
+
3: "classify_prs — Use LLM agents to classify PRs as performance-related",
|
|
21
|
+
4: "resolve_packages — Resolve Python dependencies for performance commits via uv",
|
|
22
|
+
5: "render_problems — Scrape linked issues and render deconstructed problem contexts",
|
|
23
|
+
6: "synthesize_images — Generate Docker build contexts for confirmed performance commits",
|
|
24
|
+
7: "publish — Build, verify, and publish Docker images to DockerHub",
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _stages_epilog() -> str:
|
|
29
|
+
lines = ["pipeline stages (run in order by default):"]
|
|
30
|
+
for num, desc in _STAGE_DESCRIPTIONS.items():
|
|
31
|
+
lines.append(f" {num}. {desc}")
|
|
32
|
+
return "\n".join(lines)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the ``fc-data`` command line.

    Args:
        argv: Argument strings to parse. ``None`` is forwarded to argparse,
            which then reads the process arguments.

    Returns:
        Namespace with ``start_date``, ``end_date``, ``resume``, ``stage``,
        ``dry_run``, ``n_concurrent``, ``tasks_per_repo``, ``agent``,
        ``force``, ``offline_source`` and ``min_stars``. The two dates are
        returned as raw strings; their format is not checked here.
    """
    summary = (
        "Run the FormulaCode update pipeline — discovers performance-improving "
        "commits from GitHub, classifies them with LLM agents, and builds Docker images "
        "for benchmark evaluation."
    )
    cli = argparse.ArgumentParser(
        prog="fc-data",
        description=summary,
        epilog=_stages_epilog(),
        # The raw formatter keeps the epilog's one-stage-per-line layout.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    add = cli.add_argument

    # --- required date window -------------------------------------------
    add("--start-date", required=True, help="Start of the date range to scan for commits (YYYY-MM-DD)")
    add("--end-date", required=True, help="End of the date range to scan for commits (YYYY-MM-DD)")

    # --- stage selection and execution mode ------------------------------
    add(
        "--resume",
        action="store_true",
        help="Skip stages already marked complete and resume from the next pending stage",
    )
    # ``append`` collects repeated flags into a list; the value stays None when unused.
    add(
        "--stage",
        type=int,
        default=None,
        action="append",
        metavar="N",
        help="Run only stage N (1-7); repeat to run multiple stages (e.g. --stage 1 --stage 2)",
    )
    add("--dry-run", action="store_true", help="Log what each stage would do without executing")

    # --- throughput limits -------------------------------------------------
    add("--n-concurrent", type=int, default=None, metavar="N", help="Max concurrent items per runner stage")
    add(
        "--tasks-per-repo",
        type=int,
        default=None,
        metavar="N",
        help="Max tasks per repo for stages 5 (render_problems) and 6 (synthesize_images). Ignored by other stages.",
    )

    # --- stage-specific knobs ---------------------------------------------
    add(
        "--agent",
        type=str,
        default=None,
        choices=["claude", "codex", "gemini", "none"],
        help="CLI agent to use for stage 6 synthesis (default: auto-detect first available). "
        "'none' skips LLM generation and relies only on similar-context matching.",
    )
    add(
        "--force",
        action="store_true",
        help="Re-run even for tasks already processed; applies to stages 5 (render_problems) and 6 (synthesize_images)",
    )
    add(
        "--offline-source",
        type=str,
        default=None,
        metavar="PATH",
        help="Path to a parquet file with offline PR data to import into stages 1 and 2",
    )
    add(
        "--min-stars",
        type=int,
        default=500,
        metavar="N",
        help="Minimum stars for GitHub code search repo discovery in stage 1 (default: 500)",
    )

    return cli.parse_args(argv)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def validate_dates(args: argparse.Namespace) -> None:
    """Exit with status 1 unless both date arguments are real ``YYYY-MM-DD`` dates.

    A value must be exactly four, two and two ASCII digits separated by
    hyphens *and* name a date that exists on the calendar. A shape-only check
    would let values such as ``2024-13-45`` or ``2023-02-30`` through to the
    pipeline.

    Args:
        args: Parsed CLI namespace carrying ``start_date`` and ``end_date``
            strings.

    Raises:
        SystemExit: With code 1, after printing an error to stderr, for the
            first argument that is malformed or not a real date.
    """
    from datetime import date  # function-scope import: only needed for this check

    for name in ("start_date", "end_date"):
        val = getattr(args, name)
        # fullmatch (unlike ``$``) also rejects a trailing newline; re.ASCII
        # keeps ``\d`` from accepting non-ASCII digit characters.
        valid = re.fullmatch(r"\d{4}-\d{2}-\d{2}", val, re.ASCII) is not None
        if valid:
            try:
                date.fromisoformat(val)
            except ValueError:
                # Right shape, but month/day (or year 0000) is out of range.
                valid = False
        if not valid:
            print(f"Error: {name.replace('_', '-')} must be YYYY-MM-DD, got '{val}'", file=sys.stderr)
            sys.exit(1)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# Number of SIGINTs handled so far; decides between graceful and forced shutdown.
_sigint_count = 0


def _sigint_handler(signum: int, frame: object) -> None:
    """Escalating Ctrl+C handler that stops agent subprocesses.

    Agent subprocesses are launched in their own sessions
    (``start_new_session=True``), so a terminal SIGINT never reaches them.
    Unless they are stopped here, worker threads waiting in
    ``proc.communicate()`` never return and the process hangs.

    * First Ctrl+C: ask every tracked agent process to terminate, then raise
      ``KeyboardInterrupt`` so the caller can unwind.
    * Second (or later) Ctrl+C: force-kill every tracked agent process and
      leave immediately through ``os._exit(1)``.

    Args:
        signum: Signal number supplied by the ``signal`` module (unused).
        frame: Interrupted stack frame supplied by the ``signal`` module (unused).

    Raises:
        KeyboardInterrupt: On the first interrupt only.
    """
    global _sigint_count
    _sigint_count += 1

    # Resolved at signal time rather than at module import.
    from datasmith.agents.installed.base import terminate_all_agents

    if _sigint_count < 2:
        logger.info("Interrupted — terminating agent subprocesses (press Ctrl+C again to force-quit)")
        terminate_all_agents(force=False)
        raise KeyboardInterrupt

    # Repeated interrupt: the graceful path did not finish, so stop waiting.
    logger.info("Force-killing all agent subprocesses")
    terminate_all_agents(force=True)
    os._exit(1)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: parse arguments, then run the update pipeline.

    Steps, in order: parse and validate the CLI arguments, install the
    escalating SIGINT handler, build a ``Pipeline`` from the option values,
    and drive ``Pipeline.run`` to completion with ``asyncio.run``.

    Args:
        argv: Argument strings to parse; ``None`` is passed through to
            :func:`parse_args` unchanged.

    Raises:
        SystemExit: With code 1 when the run is interrupted by the user
            (date validation may also exit before the pipeline starts).
    """
    options = parse_args(argv)
    validate_dates(options)

    # Imported only after argument handling has succeeded.
    from datasmith.update.pipeline import Pipeline

    signal.signal(signal.SIGINT, _sigint_handler)

    runner = Pipeline(
        dry_run=options.dry_run,
        n_concurrent=options.n_concurrent,
        tasks_per_repo=options.tasks_per_repo,
        agent=options.agent,
        force=options.force,
        offline_source=options.offline_source,
        min_stars=options.min_stars,
    )

    try:
        job = runner.run(
            start_date=options.start_date,
            end_date=options.end_date,
            resume=options.resume,
            stage=options.stage,  # None, or a list[int] built by the "append" action
        )
        asyncio.run(job)
    except KeyboardInterrupt:
        logger.info("Pipeline interrupted by user")
        sys.exit(1)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""Convert rows from an offline parquet source into ``pull_requests`` records."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from datasmith.filters import symbolic_compliance
|
|
11
|
+
from datasmith.utils import get_logger
|
|
12
|
+
|
|
13
|
+
logger = get_logger("update.offline")
|
|
14
|
+
|
|
15
|
+
# Matches a data row in the markdown file-change table:
|
|
16
|
+
# | filename | additions | deletions | ... |
|
|
17
|
+
_TABLE_ROW_RE = re.compile(r"^\|\s*(?P<filename>[^|]+?)\s*\|\s*(?P<additions>\d+)\s*\|\s*(?P<deletions>\d+)\s*\|")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _sanitize_text(value: object) -> str:
|
|
21
|
+
"""Return a clean string, handling NaN/None and Postgres-illegal null bytes."""
|
|
22
|
+
if value is None or (isinstance(value, float) and pd.isna(value)):
|
|
23
|
+
return ""
|
|
24
|
+
return str(value).replace("\u0000", "")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def parse_file_change_summary(summary: object) -> list[dict[str, Any]] | None:
|
|
28
|
+
"""Parse a markdown file-change table into the ``file_changes`` list format.
|
|
29
|
+
|
|
30
|
+
Expected input::
|
|
31
|
+
|
|
32
|
+
| File | Lines Added | Lines Removed | Total Changes |
|
|
33
|
+
|------|-------------|----------------|----------------|
|
|
34
|
+
| foo.py | 10 | 3 | 13 |
|
|
35
|
+
|
|
36
|
+
Returns ``None`` when *summary* is empty/NaN or contains no parseable rows.
|
|
37
|
+
"""
|
|
38
|
+
text = _sanitize_text(summary)
|
|
39
|
+
if not text:
|
|
40
|
+
return None
|
|
41
|
+
changes: list[dict[str, Any]] = []
|
|
42
|
+
for line in text.splitlines():
|
|
43
|
+
# Skip header and separator rows
|
|
44
|
+
if line.startswith("|--") or "Lines Added" in line or "File" in line:
|
|
45
|
+
continue
|
|
46
|
+
m = _TABLE_ROW_RE.match(line)
|
|
47
|
+
if m:
|
|
48
|
+
changes.append({
|
|
49
|
+
"filename": m.group("filename").strip(),
|
|
50
|
+
"additions": int(m.group("additions")),
|
|
51
|
+
"deletions": int(m.group("deletions")),
|
|
52
|
+
})
|
|
53
|
+
return changes or None
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _extract_labels(raw_labels: Any) -> list[str]:
|
|
57
|
+
"""Extract label name strings from the parquet labels column.
|
|
58
|
+
|
|
59
|
+
The column stores a numpy array of label dicts (each with a ``name`` key),
|
|
60
|
+
or an empty array.
|
|
61
|
+
"""
|
|
62
|
+
if raw_labels is None:
|
|
63
|
+
return []
|
|
64
|
+
try:
|
|
65
|
+
return [label["name"] for label in raw_labels if isinstance(label, dict) and "name" in label]
|
|
66
|
+
except (TypeError, KeyError):
|
|
67
|
+
return []
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _safe_str(value: object) -> str:
|
|
71
|
+
if value is None or (isinstance(value, float) and pd.isna(value)):
|
|
72
|
+
return ""
|
|
73
|
+
return str(value)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _dict_sha(value: object) -> str:
|
|
77
|
+
"""Extract ``sha`` from a dict-like value (pr_head / pr_base columns)."""
|
|
78
|
+
if isinstance(value, dict):
|
|
79
|
+
return str(value.get("sha", ""))
|
|
80
|
+
return ""
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def load_offline_repo_names(path: str) -> list[tuple[str, str]]:
    """Return the distinct ``(owner, repo)`` pairs named in an offline parquet file.

    Only the ``repo_name`` column is read. Null entries and names without a
    ``/`` are ignored; each remaining name is stripped and split at its first
    ``/``.

    Args:
        path: Location of the parquet file.

    Returns:
        Pairs in order of first appearance, without duplicates.
    """
    frame = pd.read_parquet(path, columns=["repo_name"])

    # A dict doubles as an ordered set. Stripping can make two distinct raw
    # values collapse into the same pair, so de-duplication happens on pairs.
    ordered: dict[tuple[str, str], None] = {}
    for raw in frame["repo_name"].dropna().unique():
        owner, slash, repo = str(raw).strip().partition("/")
        if slash:
            ordered.setdefault((owner, repo), None)
    return list(ordered)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _row_to_record(row: pd.Series) -> dict[str, Any]:
    """Build one ``pull_requests`` upsert record from a parquet row.

    Args:
        row: A row providing ``repo_name`` and ``pr_number`` plus the
            optional ``pr_*``, ``original_patch`` and ``file_change_summary``
            columns read below.

    Returns:
        The record dict. ``patch`` and ``file_changes`` are included only
        when non-empty.

    Raises:
        ValueError: If ``repo_name`` has no ``/`` or ``pr_number`` is not
            convertible to ``int``.
    """
    owner, repo = str(row["repo_name"]).split("/", 1)

    title = _sanitize_text(row.get("pr_title"))
    body = _sanitize_text(row.get("pr_body"))
    patch = _sanitize_text(row.get("original_patch"))
    file_changes = parse_file_change_summary(row.get("file_change_summary"))

    record: dict[str, Any] = {
        "owner": owner,
        "repo": repo,
        "issue_number": int(row["pr_number"]),
        "title": title,
        "body": body,
        "state": _safe_str(row.get("pr_state")),
    }

    # Timestamps: an empty string means "unknown" and is stored as None.
    for field, column in (
        ("created_at", "pr_created_at"),
        ("merged_at", "pr_merged_at"),
        ("closed_at", "pr_closed_at"),
    ):
        record[field] = _safe_str(row.get(column)) or None

    record["merge_commit_sha"] = _safe_str(row.get("pr_merge_commit_sha"))
    record["base_sha"] = _dict_sha(row.get("pr_base"))
    record["head_sha"] = _dict_sha(row.get("pr_head"))
    record["labels"] = _extract_labels(row.get("pr_labels"))
    record["is_performance_commit_symbolic"] = symbolic_compliance(
        title=title,
        patch=patch or None,
        file_changes=file_changes,
    )

    # Optional payloads are omitted entirely rather than stored empty.
    if patch:
        record["patch"] = patch
    if file_changes:
        record["file_changes"] = file_changes
    return record
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def load_offline_pull_requests(
    path: str,
    since: str | None = None,
    until: str | None = None,
) -> list[dict[str, Any]]:
    """Load and convert parquet rows into ``pull_requests`` upsert records.

    Filters by ``pr_merged_at`` using the same ``[since, until)`` semantics
    as :class:`~datasmith.runners.scrape_commits.ScrapeCommitsRunner`. Rows
    whose merge time is missing or unparseable are dropped whenever a bound
    is given. Rows that fail conversion are logged and skipped instead of
    aborting the whole import.

    Args:
        path: Location of the parquet file.
        since: Inclusive lower bound on ``pr_merged_at``, interpreted as UTC.
        until: Exclusive upper bound on ``pr_merged_at``, interpreted as UTC.

    Returns:
        One record per successfully converted row, in file order.
    """
    df = pd.read_parquet(path)
    logger.info("Loaded %d rows from offline source %s", len(df), path)

    # Date filtering on pr_merged_at
    if since or until:
        merged: pd.Series[Any] = pd.to_datetime(df["pr_merged_at"], utc=True, errors="coerce")
        # Build one mask and apply it positionally. Re-aligning a mask by
        # index label between the two bounds breaks when the stored index
        # contains duplicate labels.
        keep = merged.notna()
        if since:
            keep &= merged >= pd.Timestamp(since, tz="UTC")
        if until:
            keep &= merged < pd.Timestamp(until, tz="UTC")
        df = df[keep.to_numpy()]
        logger.info("After date filtering: %d rows", len(df))

    records: list[dict[str, Any]] = []
    for _, row in df.iterrows():
        try:
            records.append(_row_to_record(row))
        except Exception:
            # Deliberate best-effort: one malformed row must not sink the batch.
            logger.warning(
                "Skipping row %s#%s (merge commit %s): conversion error",
                row.get("repo_name", "?"),
                row.get("pr_number", "?"),
                row.get("pr_merge_commit_sha", "?"),
                exc_info=True,
            )
    logger.info("Converted %d records for upsert", len(records))
    return records
|