fc-data 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. datasmith/__init__.py +330 -0
  2. datasmith/__init__.pyi +194 -0
  3. datasmith/agents/__init__.py +31 -0
  4. datasmith/agents/classifiers.py +272 -0
  5. datasmith/agents/codex.py +25 -0
  6. datasmith/agents/config.py +108 -0
  7. datasmith/agents/extractors.py +197 -0
  8. datasmith/agents/installed/README.md +52 -0
  9. datasmith/agents/installed/__init__.py +22 -0
  10. datasmith/agents/installed/base.py +240 -0
  11. datasmith/agents/installed/claude.py +134 -0
  12. datasmith/agents/installed/codex.py +91 -0
  13. datasmith/agents/installed/gemini.py +118 -0
  14. datasmith/agents/installed/none.py +27 -0
  15. datasmith/agents/sandbox.py +547 -0
  16. datasmith/agents/synthesizer.py +439 -0
  17. datasmith/agents/templates/AGENTS.md.j2 +150 -0
  18. datasmith/agents/templates/sandbox_verify.py +428 -0
  19. datasmith/docker/__init__.py +31 -0
  20. datasmith/docker/context.py +112 -0
  21. datasmith/docker/images.py +158 -0
  22. datasmith/docker/publish.py +56 -0
  23. datasmith/docker/templates/Dockerfile.base +26 -0
  24. datasmith/docker/templates/Dockerfile.pr +42 -0
  25. datasmith/docker/templates/Dockerfile.repo +11 -0
  26. datasmith/docker/templates/docker_build_base.sh +780 -0
  27. datasmith/docker/templates/docker_build_env.sh +309 -0
  28. datasmith/docker/templates/docker_build_final.sh +106 -0
  29. datasmith/docker/templates/docker_build_pkg.sh +99 -0
  30. datasmith/docker/templates/docker_build_run.sh +124 -0
  31. datasmith/docker/templates/entrypoint.sh +62 -0
  32. datasmith/docker/templates/parser.py +1405 -0
  33. datasmith/docker/templates/profile.sh +199 -0
  34. datasmith/docker/templates/pytest_runner.py +692 -0
  35. datasmith/docker/templates/run-tests.sh +197 -0
  36. datasmith/docker/verifiers.py +131 -0
  37. datasmith/filters.py +154 -0
  38. datasmith/github/__init__.py +22 -0
  39. datasmith/github/client.py +333 -0
  40. datasmith/github/hooks.py +50 -0
  41. datasmith/github/links.py +110 -0
  42. datasmith/github/models.py +206 -0
  43. datasmith/github/render.py +173 -0
  44. datasmith/github/search.py +66 -0
  45. datasmith/github/templates/comment.md.j2 +5 -0
  46. datasmith/github/templates/final.md.j2 +66 -0
  47. datasmith/github/templates/issues.md.j2 +21 -0
  48. datasmith/github/templates/repo.md.j2 +1 -0
  49. datasmith/preflight.py +162 -0
  50. datasmith/publish/__init__.py +13 -0
  51. datasmith/publish/huggingface.py +104 -0
  52. datasmith/publish/pipeline.py +60 -0
  53. datasmith/publish/records.py +91 -0
  54. datasmith/py.typed +1 -0
  55. datasmith/resolution/__init__.py +14 -0
  56. datasmith/resolution/blocklist.py +145 -0
  57. datasmith/resolution/cache.py +120 -0
  58. datasmith/resolution/constants.py +277 -0
  59. datasmith/resolution/dependency_resolver.py +174 -0
  60. datasmith/resolution/git_utils.py +378 -0
  61. datasmith/resolution/import_analyzer.py +66 -0
  62. datasmith/resolution/metadata_parser.py +412 -0
  63. datasmith/resolution/models.py +41 -0
  64. datasmith/resolution/orchestrator.py +522 -0
  65. datasmith/resolution/package_filters.py +312 -0
  66. datasmith/resolution/python_manager.py +110 -0
  67. datasmith/runners/__init__.py +15 -0
  68. datasmith/runners/base.py +112 -0
  69. datasmith/runners/classify_prs.py +48 -0
  70. datasmith/runners/render_problems.py +113 -0
  71. datasmith/runners/resolve_packages.py +66 -0
  72. datasmith/runners/scrape_commits.py +166 -0
  73. datasmith/runners/scrape_repos.py +44 -0
  74. datasmith/runners/synthesize_images.py +310 -0
  75. datasmith/update/__init__.py +5 -0
  76. datasmith/update/cli.py +169 -0
  77. datasmith/update/offline.py +173 -0
  78. datasmith/update/pipeline.py +497 -0
  79. datasmith/utils/__init__.py +18 -0
  80. datasmith/utils/core.py +67 -0
  81. datasmith/utils/db.py +156 -0
  82. datasmith/utils/tokens.py +65 -0
  83. fc_data-0.2.0.dist-info/METADATA +441 -0
  84. fc_data-0.2.0.dist-info/RECORD +87 -0
  85. fc_data-0.2.0.dist-info/WHEEL +4 -0
  86. fc_data-0.2.0.dist-info/entry_points.txt +2 -0
  87. fc_data-0.2.0.dist-info/licenses/LICENSE +28 -0
@@ -0,0 +1,312 @@
1
+ """Filtering and normalizing package requirements."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ import shlex
7
+ from collections.abc import Iterable, Mapping
8
+ from pathlib import Path
9
+
10
+ from git import Commit
11
+
12
+ from .constants import (
13
+ ALLOWLIST_COMMON_PYPI,
14
+ CONDA_SYSTEM_PACKAGES,
15
+ EXTRA_MARKER_RE,
16
+ GENERIC_LOCAL_NAMES,
17
+ NOT_REQUIREMENTS,
18
+ STDLIB,
19
+ )
20
+ from .git_utils import read_blob_text
21
+
22
+
23
def parse_extras_segment(token: str) -> list[str]:
    """Return the extras named in a token such as ``package[extra1,extra2]``.

    The token must end with ``]``; the text between the last ``[`` and that
    closing bracket is split on commas. Empty or whitespace-only entries are
    dropped. Tokens without a bracketed suffix yield an empty list.
    """
    open_idx = token.rfind("[")
    if open_idx == -1 or not token.endswith("]"):
        return []

    inner = token[open_idx + 1 : -1]
    extras: list[str] = []
    for piece in inner.split(","):
        cleaned = piece.strip()
        if cleaned:
            extras.append(cleaned)
    return extras
31
+
32
+
33
def extras_from_install_commands(install_cmds: Iterable[str], extras_available: set[str]) -> set[str]:
    """Collect the extras requested by a series of install commands.

    Each command is tokenized shell-style, and every token of the form
    ``name[extra,...]`` contributes those extras that also appear in
    ``extras_available``.

    Args:
        install_cmds: Shell command strings; empty entries are skipped.
        extras_available: Extras the project actually declares.

    Returns:
        The subset of ``extras_available`` referenced by any command.
    """
    requested: set[str] = set()
    for cmd in install_cmds:
        if not cmd:
            continue
        try:
            tokens = shlex.split(cmd)
        except ValueError:
            # shlex raises on unbalanced quotes. One malformed command should
            # not abort extraction for the others, so fall back to a plain
            # whitespace split and shed any stray quote characters.
            tokens = [piece.strip("'\"") for piece in cmd.split()]
        for token in tokens:
            requested.update(extra for extra in parse_extras_segment(token) if extra in extras_available)
    return requested
44
+
45
+
46
def extras_from_matrix(matrix: Mapping[str, set[str]] | None, extras_available: set[str]) -> set[str]:
    """Return every matrix value that is also a declared extra.

    The matrix keys are ignored; only the values of each entry are compared
    against ``extras_available``. A missing or empty matrix yields an empty
    set.
    """
    if not matrix:
        return set()
    return {value for values in matrix.values() for value in values if value in extras_available}
56
+
57
+
58
def extract_requested_extras(
    install_cmds: Iterable[str],
    matrix: Mapping[str, set[str]] | None,
    available: Iterable[str],
) -> set[str]:
    """Combine the extras requested via install commands and via the matrix.

    ``available`` is materialized into a set once and used to restrict both
    sources, so the result only ever contains declared extras.
    """
    known_extras = set(available)
    from_commands = extras_from_install_commands(install_cmds, known_extras)
    from_matrix = extras_from_matrix(matrix, known_extras)
    return from_commands | from_matrix
68
+
69
+
70
def resolve_requirements_file(commit: Commit, rel_path: str, seen: set[str]) -> set[str]:
    """Collect requirement lines from a requirements file stored in a commit.

    Blank lines and lines starting with ``#`` are skipped. A line of the form
    ``-r <path>`` / ``--requirement <path>`` is followed recursively, with the
    nested path taken relative to the directory of ``rel_path``. Every other
    line is returned verbatim (stripped).

    Args:
        commit: Commit to read the file from.
        rel_path: Repository-relative path using ``/`` separators.
        seen: Paths already visited; mutated in place to guard against
            include cycles.

    Returns:
        The set of requirement lines; empty when the path was already visited
        or the file has no readable content.
    """
    if rel_path in seen:
        return set()
    seen.add(rel_path)

    collected: set[str] = set()
    text = read_blob_text(commit, rel_path)
    if not text:
        return collected

    # Directory part of rel_path; ``has_dir`` is empty for a top-level file.
    parent_dir, has_dir, _ = rel_path.rpartition("/")

    for raw_line in text.splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith("#"):
            continue

        words = entry.split()
        is_include = len(words) >= 2 and words[0] in {"-r", "--requirement"}
        if not is_include:
            collected.add(entry)
            continue

        target = f"{parent_dir}/{words[1]}" if has_dir else words[1]
        collected.update(resolve_requirements_file(commit, target, seen))

    return collected
98
+
99
+
100
def split_shell_command(cmd: str) -> list[str]:
    """Break a shell line into its individual commands.

    The line is cut at every ``&&``, ``||`` or ``;`` (with surrounding
    whitespace); empty fragments are discarded. Quoting is not interpreted.
    """
    commands: list[str] = []
    for fragment in re.split(r"\s*(?:&&|\|\||;)\s*", cmd):
        trimmed = fragment.strip()
        if trimmed:
            commands.append(trimmed)
    return commands
104
+
105
+
106
def is_valid_direct_url(req: str) -> bool:
    """Tell whether ``req`` is a direct URL pointing at an installable archive.

    The stripped string must start with one of the recognized URL/VCS
    prefixes and, compared case-insensitively, end with a wheel or source
    archive extension.
    """
    candidate = req.strip() if req else ""
    if not candidate:
        return False

    url_prefixes = ("http://", "https://", "git+", "hg+", "svn+", "bzr+", "file://")
    if not candidate.startswith(url_prefixes):
        return False

    archive_suffixes = (
        ".whl",
        ".tar.gz",
        ".zip",
        ".tar.bz2",
        ".tar.lz",
        ".tar.lzma",
        ".tar.xz",
        ".tar.zst",
        ".tar",
        ".tbz",
        ".tgz",
        ".tlz",
        ".txz",
    )
    return candidate.lower().endswith(archive_suffixes)
137
+
138
+
139
def is_valid_pypi_requirement(req: str) -> bool:
    """Heuristically decide whether ``req`` looks like an installable requirement.

    Rejected: blank strings; anything containing shell/template characters
    (``{``, ``}``, ``$``, ``|``, ``&``, ``;;``); long options (``--...``);
    relative paths (leading ``.``); strings that do not begin with an
    alphanumeric name; and single-character names. Strings starting with a
    URL/VCS prefix are accepted without further inspection.
    """
    text = (req or "").strip()
    if not text:
        return False

    # "|" and "&" also cover the two-character operators "||" and "&&".
    if any(ch in text for ch in "{}$|&") or ";;" in text:
        return False
    if text.startswith(("--", ".")):
        return False
    if text.startswith(("http://", "https://", "git+", "hg+", "svn+", "bzr+", "file://")):
        return True

    found = re.match(r"^([A-Za-z0-9]([A-Za-z0-9._-]*[A-Za-z0-9])?)", text)
    if found is None:
        return False
    # The pattern guarantees an alphanumeric first character, so only the
    # length needs checking here.
    return len(found.group(1)) != 1
159
+
160
+
161
def fix_marker_spacing(req: str) -> str:
    """Restore missing spaces around ``and`` / ``or`` in an environment marker.

    Two steps are applied:

    1. Everything from the first ``#`` that is *not* preceded by whitespace is
       dropped (e.g. ``numpy#note`` -> ``numpy``).
    2. If a ``;`` is present, the text after the first ``;`` is treated as the
       marker. A boolean operator that directly follows a quoted operand
       (optionally after closing parentheses) is padded with single spaces
       where they are missing, e.g. ``"3.8"andsys_platform`` ->
       ``"3.8" and sys_platform``.

    Quoted values and identifiers are never rewritten, so the letters
    ``and`` / ``or`` inside ``sys_platform``, ``platform_machine`` or a value
    like ``"android"`` stay intact. An operator glued to a *variable* on its
    left (``sys_platformand...``) is ambiguous and is left untouched.

    Args:
        req: A single requirement string.

    Returns:
        The requirement with the comment removed and marker spacing repaired.
    """
    if "#" in req:
        match = re.search(r"(?<!\s)#", req)
        if match:
            req = req[: match.start()]
    if ";" not in req:
        return req

    pkg_spec, marker = req.split(";", 1)

    # Splitting on a capturing group keeps the quoted strings: even indices
    # hold text outside quotes, odd indices hold the quoted literals.
    pieces = re.split(r"""("[^"]*"|'[^']*')""", marker)
    last_idx = len(pieces) - 1

    # Only segments that come right after a quoted literal can start with a
    # boolean operator; no marker variable name begins with "and" or "or".
    for idx in range(2, len(pieces), 2):
        segment = pieces[idx]
        found = re.match(r"([\s)]*)(and|or)(\s*)", segment)
        if found is None:
            continue
        rest = segment[found.end() :]
        if not rest and idx == last_idx:
            # Dangling operator at the very end of the marker: nothing to fix.
            continue
        lead, operator, trail = found.groups()
        if not lead[-1:].isspace():
            lead += " "
        pieces[idx] = f"{lead}{operator}{trail or ' '}{rest}"

    return f"{pkg_spec};{''.join(pieces)}"
176
+
177
+
178
def normalize_requirement(req: str) -> list[str]:
    """Normalize a single token into zero or one requirement strings.

    The token is stripped and passed through ``fix_marker_spacing``. It is
    then discarded (empty list) when it:

    * is empty after comment removal,
    * contains template/shell characters (``{``, ``}``, ``$``, ``|``, ``&``,
      ``;;``),
    * is a command-line option (anything starting with ``-``, which covers
      ``-r``, ``-e``, ``-U``, ``--requirement`` and so on; a requirement name
      can never start with a dash),
    * is a URL that ``is_valid_direct_url`` rejects, or
    * is a relative path (leading ``.``).

    Args:
        req: Raw token taken from an install command or requirements line.

    Returns:
        ``[normalized]`` for an acceptable requirement, otherwise ``[]``.
    """
    if not req or not req.strip():
        return []

    # fix_marker_spacing may strip an inline comment and leave nothing (or
    # trailing whitespace) behind, so re-strip and re-check afterwards.
    req = fix_marker_spacing(req.strip()).strip()
    if not req:
        return []

    if "{" in req or "}" in req or "$" in req:
        return []
    if any(op in req for op in ["&&", "||", ";;", "|", "&"]):
        return []
    if req.startswith("-"):
        return []
    if req.startswith(("http://", "https://", "git+", "hg+", "svn+", "bzr+", "file://")):
        return [req] if is_valid_direct_url(req) else []
    if req.startswith("."):
        return []
    return [req]
195
+
196
+
197
def project_local_names(project_dir: Path) -> set[str]:
    """Collect names that look like modules or packages local to the project.

    A name is collected for:

    * every ``*.py`` entry directly in ``project_dir`` whose name does not
      start with ``_``;
    * every directory below ``project_dir`` that contains ``__init__.py`` and
      whose name does not start with ``.`` or ``_``;
    * every ``.py`` file below ``project_dir`` whose name does not start with
      ``_``.

    Anything located inside a cache/build directory (``__pycache__``,
    ``.git``, ``.eggs``, ``.tox``, ``build``, ``dist``, ``node_modules``) is
    ignored. That check looks only at the path *relative to* ``project_dir``,
    so a checkout that itself lives under e.g. ``/tmp/build/`` is still
    scanned.

    Args:
        project_dir: Root directory of the project checkout.

    Returns:
        The set of module/package names found.
    """
    names: set[str] = set()
    skip_dirs = {"__pycache__", ".git", ".eggs", ".tox", "build", "dist", "node_modules"}

    for py in project_dir.glob("*.py"):
        if not py.name.startswith("_"):
            names.add(py.stem)

    for item in project_dir.rglob("*"):
        # Use parts relative to the project root; ancestors of project_dir
        # must not influence the skip decision.
        if not skip_dirs.isdisjoint(item.relative_to(project_dir).parts):
            continue
        if item.is_dir():
            if item.name.startswith((".", "_")):
                continue
            if (item / "__init__.py").exists():
                names.add(item.name)
        elif item.suffix == ".py" and not item.name.startswith("_"):
            names.add(item.stem)
    return names
216
+
217
+
218
def clean_pinned(reqs: list[str]) -> list[str]:
    """Drop lower bounds from requirements that carry both ``>=`` and ``<=``.

    All whitespace is removed from every requirement first. When a
    requirement contains both ``>=`` and ``<=``, it is split on commas and
    only the parts containing ``<=`` are kept; a kept part that does not
    already include the package name gets the name prepended. Requirements
    without both operators are returned with whitespace removed and otherwise
    unchanged.
    """
    cleaned: list[str] = []
    for original in reqs:
        compact = re.sub(r"\s+", "", original)
        if not (">=" in compact and "<=" in compact):
            cleaned.append(compact)
            continue

        name = extract_pkg_name(compact)
        upper_bounds = [chunk for chunk in re.split(r",\s*", compact) if "<=" in chunk]
        if not upper_bounds:
            cleaned.append(compact)
            continue

        for chunk in upper_bounds:
            cleaned.append(chunk if name in chunk else f"{name}{chunk}")
    return cleaned
235
+
236
+
237
def extract_pkg_name(req: str) -> str:
    """Extract the bare package name from a requirement string.

    The name ends at the first version-operator character (``<``, ``>``,
    ``=``, ``!``, ``~``), marker separator (``;``), direct-reference ``@``,
    opening parenthesis or whitespace. Any ``[extras]`` suffix is removed.

    Args:
        req: Requirement such as ``"numpy~=1.20"`` or ``"pkg[extra]>=1"``.

    Returns:
        The package name, or ``""`` when ``req`` has no name part.
    """
    # "~" must be a delimiter too, otherwise "pkg~=1.0" yields "pkg~".
    head = re.split(r"[<>=!~;@(\s]", req.strip(), maxsplit=1)[0]
    return head.split("[", 1)[0].strip()
243
+
244
+
245
def filter_requirements_for_pypi(  # noqa: C901
    requirements: Iterable[str], *, project_dir: Path, own_import_name: str | None
) -> list[str]:
    """Drop requirement strings that are clearly not installable from PyPI.

    Each entry is stripped and passed through ``fix_marker_spacing``. URL
    entries are kept only when ``is_valid_direct_url`` accepts them. For all
    other entries the package name decides; an entry is discarded when the
    name:

    * is ``python`` itself or ``python`` followed by a digit or ``.``;
    * starts with ``_`` or is a single character;
    * is in ``STDLIB`` (lower-cased) or ``NOT_REQUIREMENTS``;
    * normalizes to an entry of the dynamic blocklist;
    * is in ``CONDA_SYSTEM_PACKAGES`` (lower-cased);
    * is in ``GENERIC_LOCAL_NAMES`` (lower-cased) and not allow-listed;
    * matches the project's own import name (``-``/``_`` variants included);
    * matches a local module/package name and is not allow-listed.

    Surviving entries have their ``extra`` marker removed (via
    ``EXTRA_MARKER_RE``) along with any trailing ``;`` that leaves behind, and
    are de-duplicated preserving first-seen order.

    Args:
        requirements: Raw requirement strings.
        project_dir: Project checkout, scanned for local module names.
        own_import_name: Import name of the project itself, if known.

    Returns:
        The filtered, de-duplicated requirement strings.
    """
    from .blocklist import get_blocklist, normalize_package_name

    url_prefixes = ("http://", "https://", "git+", "hg+", "svn+", "bzr+", "file://")

    local_names = project_local_names(project_dir)
    own_names: set[str] = set()
    if own_import_name:
        own_names = {
            own_import_name,
            own_import_name.replace("-", "_"),
            own_import_name.replace("_", "-"),
        }
    dynamic_blocklist = get_blocklist()

    kept: list[str] = []
    for entry in requirements:
        if not entry or not entry.strip():
            continue
        candidate = fix_marker_spacing(entry.strip())

        # Direct URLs bypass the name-based filters entirely.
        if candidate.startswith(url_prefixes):
            if is_valid_direct_url(candidate):
                kept.append(candidate)
            continue

        name = extract_pkg_name(candidate)
        if not name:
            continue
        lowered = name.lower()

        if lowered.startswith("python"):
            tail = lowered[6:]
            # "python", "python3", "python3.11", "python.app" style names.
            if not tail or tail[0].isdigit() or tail.startswith("."):
                continue

        if name.startswith("_") or len(name) == 1:
            continue
        if lowered in STDLIB or name in NOT_REQUIREMENTS:
            continue
        if normalize_package_name(name) in dynamic_blocklist:
            continue
        if lowered in CONDA_SYSTEM_PACKAGES:
            continue
        if lowered in GENERIC_LOCAL_NAMES and name not in ALLOWLIST_COMMON_PYPI:
            continue
        if name in own_names:
            continue
        if name in local_names and name not in ALLOWLIST_COMMON_PYPI:
            continue

        kept.append(candidate)

    # A dict keeps insertion order, giving order-preserving de-duplication.
    unique: dict[str, None] = {}
    for item in kept:
        without_extra = EXTRA_MARKER_RE.sub("", item).strip()
        unique.setdefault(re.sub(r"\s*;\s*$", "", without_extra))
    return list(unique)
@@ -0,0 +1,110 @@
1
+ """Python version management and uv interaction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import datetime as dt
6
+ import os
7
+ import subprocess
8
+ from pathlib import Path
9
+
10
+ from datasmith.utils import get_logger
11
+
12
+ logger = get_logger("resolution.python_manager")
13
+
14
+
15
def run_uv(
    args: list[str],
    *,
    input_text: str | None = None,
    cwd: Path | None = None,
    extra_env: dict[str, str] | None = None,
    check: bool = False,
) -> subprocess.CompletedProcess:
    """Run ``uv`` with the given arguments and capture its output.

    The child inherits the current environment with ``UV_COLOR=never`` and
    ``NO_COLOR=1`` filled in when not already set; ``extra_env`` is applied
    last and may override anything.

    Args:
        args: Arguments placed after the ``uv`` executable name.
        input_text: Text sent to the process on stdin (UTF-8 encoded).
        cwd: Working directory for the process.
        extra_env: Additional environment variables.
        check: When true, a non-zero exit status raises ``RuntimeError``.

    Returns:
        The completed process; ``stdout`` and ``stderr`` are bytes.

    Raises:
        RuntimeError: If ``check`` is true and uv exits with a non-zero code.
    """
    env = os.environ.copy()
    env.setdefault("UV_COLOR", "never")
    env.setdefault("NO_COLOR", "1")
    if extra_env:
        env.update(extra_env)

    cp = subprocess.run(
        ["uv", *args],
        input=input_text.encode("utf-8") if input_text is not None else None,
        capture_output=True,
        cwd=str(cwd) if cwd else None,
        env=env,
    )
    if check and cp.returncode != 0:
        # Decode leniently: undecodable bytes in the tool output must not
        # replace the real failure with a UnicodeDecodeError.
        stdout = cp.stdout.decode(errors="replace")
        stderr = cp.stderr.decode(errors="replace")
        raise RuntimeError(
            f"uv {' '.join(args)} failed with code {cp.returncode}\n"
            f"STDOUT:\n{stdout}\nSTDERR:\n{stderr}"
        )
    return cp
42
+
43
+
44
def ensure_python_version_available(version: str) -> bool:
    """Ensure uv knows the requested Python version, installing it if needed.

    The output of ``uv python list`` is searched for ``version``. If it is not
    listed (or the listing fails), ``uv python install <version>`` is run.

    Args:
        version: Version string such as ``"3.11"``.

    Returns:
        True when the version is listed or the install succeeded, False when
        the install failed.
    """
    import re  # local import: this module has no file-level ``re`` import

    list_cp = run_uv(["python", "list"])
    if list_cp.returncode == 0:
        output = list_cp.stdout.decode(errors="replace")
        # Match on digit boundaries: a bare substring test would report "3.8"
        # as present in "3.13.8" and "3.1" as present in "3.10".
        dotted = re.compile(rf"(?<![\d.]){re.escape(version)}(?!\d)")
        compact = re.compile(rf"(?<!\d){re.escape(version.replace('.', ''))}(?!\d)")
        if dotted.search(output) or compact.search(output):
            return True

    install_cp = run_uv(["python", "install", version])
    if install_cp.returncode == 0:
        logger.debug("Successfully installed Python %s", version)
        return True

    logger.debug("Failed to install Python %s: %s", version, install_cp.stderr.decode(errors="replace"))
    return False
59
+
60
+
61
def filter_python_versions_by_commit_date(  # noqa: C901
    available_versions: set[tuple[int, ...]], commit_date: dt.datetime
) -> list[tuple[int, ...]]:
    """Filter Python versions to avoid anachronistic choices.

    A version is kept when the commit is no older than 90 days before that
    minor version's release. Versions below 3.8 are always dropped (3.7 is
    EOL and not available in uv). Minor versions missing from the release
    table are assumed to be newer than every listed release and are kept only
    for commits made on or after the newest listed release date.

    If nothing survives, the newest releases (up to three) that existed by
    ``commit_date`` plus the grace period are returned instead, falling back
    to ``(3, 8)`` for commits older than all of them.

    Args:
        available_versions: Candidate versions as integer tuples, e.g.
            ``(3, 11)`` or ``(3, 11, 4)``.
        commit_date: Commit timestamp. A naive datetime is interpreted as UTC.

    Returns:
        The acceptable versions, newest first; ``[]`` when no candidate is
        3.8 or newer.
    """
    valid_versions = [v for v in available_versions if v >= (3, 8)]
    if not valid_versions:
        return []

    # The release table is timezone-aware; comparing it with a naive datetime
    # would raise TypeError.
    if commit_date.tzinfo is None:
        commit_date = commit_date.replace(tzinfo=dt.timezone.utc)

    py_releases = {
        (3, 7): dt.datetime(2018, 6, 27, tzinfo=dt.timezone.utc),
        (3, 8): dt.datetime(2019, 10, 14, tzinfo=dt.timezone.utc),
        (3, 9): dt.datetime(2020, 10, 5, tzinfo=dt.timezone.utc),
        (3, 10): dt.datetime(2021, 10, 4, tzinfo=dt.timezone.utc),
        (3, 11): dt.datetime(2022, 10, 24, tzinfo=dt.timezone.utc),
        (3, 12): dt.datetime(2023, 10, 2, tzinfo=dt.timezone.utc),
        (3, 13): dt.datetime(2024, 10, 7, tzinfo=dt.timezone.utc),
        (3, 14): dt.datetime(2025, 10, 7, tzinfo=dt.timezone.utc),
    }
    newest_known_release = max(py_releases.values())

    grace_period = dt.timedelta(days=90)
    filtered = []
    for v in valid_versions:
        release_date = py_releases.get((v[0], v[1]))

        if release_date is None:
            # Not in the table, hence newer than anything listed: it cannot
            # predate the newest listed release.
            if commit_date >= newest_known_release:
                filtered.append(v)
        elif commit_date >= release_date - grace_period:
            filtered.append(v)

    if not filtered:
        inferred = []
        for version_key, release_date in sorted(py_releases.items(), reverse=True):
            if version_key < (3, 8):
                continue
            if release_date <= commit_date + grace_period:
                matching = [v for v in valid_versions if (v[0], v[1]) == version_key]
                if matching:
                    inferred.extend(matching)
                elif len(inferred) < 3:
                    inferred.append(version_key)
                if len(inferred) >= 3:
                    break
        filtered = inferred if inferred else [(3, 8)]

    return sorted(filtered, reverse=True)
@@ -0,0 +1,15 @@
1
+ """ds.runners — Async runner infrastructure with Supabase progress tracking."""
2
+
3
+ from datasmith.runners.base import BaseRunner
4
+ from datasmith.runners.classify_prs import ClassifyPRsRunner
5
+ from datasmith.runners.scrape_commits import ScrapeCommitsRunner
6
+ from datasmith.runners.scrape_repos import ScrapeReposRunner
7
+ from datasmith.runners.synthesize_images import SynthesizeImagesRunner
8
+
9
+ __all__ = [
10
+ "BaseRunner",
11
+ "ClassifyPRsRunner",
12
+ "ScrapeCommitsRunner",
13
+ "ScrapeReposRunner",
14
+ "SynthesizeImagesRunner",
15
+ ]
@@ -0,0 +1,112 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import time
5
+ import traceback
6
+ import uuid
7
+ from abc import ABC, abstractmethod
8
+ from typing import Any, TypeVar
9
+
10
+ from datasmith.utils import get_client, get_logger
11
+
12
+ logger = get_logger("runners.base")
13
+
14
+ T = TypeVar("T")
15
+
16
+
17
class BaseRunner(ABC):
    """Abstract async runner with Supabase progress tracking.

    Subclasses implement ``_process_item``. ``run`` fans the items out as
    asyncio tasks, limits how many are processed at once, counts successes
    and failures, and mirrors those counters to the ``runner_progress`` table
    (failures additionally go to ``runner_failures``). Tracking is
    best-effort: errors while writing progress are logged and never abort the
    run.
    """

    def __init__(self, name: str, n_concurrent: int = 10) -> None:
        """Create a runner.

        Args:
            name: Human-readable runner name; also the prefix of ``runner_id``.
            n_concurrent: Maximum number of items processed concurrently.

        Raises:
            ValueError: If ``n_concurrent`` is smaller than 1. A zero-sized
                semaphore would make ``run`` wait forever.
        """
        if n_concurrent < 1:
            raise ValueError(f"n_concurrent must be >= 1, got {n_concurrent}")
        self.name = name
        # Random suffix distinguishes separate invocations of the same runner.
        self.runner_id = f"{name}-{uuid.uuid4().hex[:8]}"
        self._n_concurrent = n_concurrent
        self._completed = 0
        self._failed = 0
        self._total = 0
        self._last_progress_update = 0.0

    @abstractmethod
    async def _process_item(self, item: Any) -> None:
        """Process one item; raise to mark the item as failed."""
        ...

    async def run(self, items: list[Any]) -> None:
        """Run the runner on a list of items with bounded concurrency.

        Exceptions raised by ``_process_item`` are counted, recorded and
        logged but not propagated. On ``KeyboardInterrupt`` or cancellation
        all outstanding tasks are cancelled and awaited before re-raising. A
        final progress update is written in every case.
        """
        self._total = len(items)
        self._completed = 0
        self._failed = 0

        self._init_progress()

        sem = asyncio.Semaphore(self._n_concurrent)

        async def _wrapped(item: Any) -> None:
            async with sem:
                try:
                    await self._process_item(item)
                    self._completed += 1
                except Exception as exc:
                    self._failed += 1
                    self._log_failure(item, exc)
                    logger.exception("Failed processing item %s", self._item_id(item))
                finally:
                    self._maybe_update_progress()

        tasks = [asyncio.create_task(_wrapped(item)) for item in items]
        try:
            await asyncio.gather(*tasks)
        except (KeyboardInterrupt, asyncio.CancelledError):
            for t in tasks:
                t.cancel()
            # Let every task observe its cancellation before propagating.
            await asyncio.gather(*tasks, return_exceptions=True)
            raise
        finally:
            self._update_progress(force=True)

    def _item_id(self, item: Any) -> str:
        """Return ``str(item.cache_key)`` when present, else ``str(item)``."""
        if hasattr(item, "cache_key"):
            return str(item.cache_key)
        return str(item)

    def _init_progress(self) -> None:
        """Upsert the initial progress row (zero completed, zero failed)."""
        try:
            client = get_client()
            client.table("runner_progress").upsert({
                "runner_id": self.runner_id,
                "runner_name": self.name,
                "total": self._total,
                "completed": 0,
                "failed": 0,
            }).execute()
        except Exception:
            # Best-effort: tracking problems must not stop the run, but keep
            # the exception details so they can be diagnosed.
            logger.warning("Failed to initialize progress tracking", exc_info=True)

    def _maybe_update_progress(self) -> None:
        """Write progress after every 10th finished item or after 30 s of silence."""
        now = time.time()
        if (self._completed + self._failed) % 10 == 0 or now - self._last_progress_update > 30:
            self._update_progress()

    def _update_progress(self, force: bool = False) -> None:
        """Upsert the current counters.

        ``force`` is accepted for call-site readability; the update is always
        written regardless of its value.
        """
        self._last_progress_update = time.time()
        try:
            client = get_client()
            client.table("runner_progress").upsert({
                "runner_id": self.runner_id,
                "runner_name": self.name,
                "total": self._total,
                "completed": self._completed,
                "failed": self._failed,
            }).execute()
        except Exception:
            logger.warning("Failed to update progress", exc_info=True)

    def _log_failure(self, item: Any, exc: Exception) -> None:
        """Record a failed item with its error message and traceback.

        Must be called from within the ``except`` block handling ``exc`` so
        that ``traceback.format_exc()`` captures the right traceback.
        """
        try:
            client = get_client()
            client.table("runner_failures").insert({
                "runner_id": self.runner_id,
                "item_id": self._item_id(item),
                "error_message": str(exc),
                "traceback": traceback.format_exc(),
            }).execute()
        except Exception:
            logger.warning("Failed to log failure for %s", self._item_id(item), exc_info=True)
@@ -0,0 +1,48 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import functools
5
+ from typing import Any
6
+
7
+ from datasmith.runners.base import BaseRunner
8
+ from datasmith.utils import get_client, get_logger
9
+
10
+ logger = get_logger("runners.classify_prs")
11
+
12
+
13
class ClassifyPRsRunner(BaseRunner):
    """Batch classification of PRs via LLM agents.

    Every item is first passed to ``classifier``; only items flagged as
    performance-related are additionally sent to ``judge``. The outcome is
    written to the matching row of the ``pull_requests`` table.
    """

    def __init__(self, classifier: Any, judge: Any, n_concurrent: int = 5) -> None:
        """Create the runner.

        Args:
            classifier: Object whose ``classify(description, patch,
                file_change_summary)`` returns an ``(is_perf, reason)`` pair.
            judge: Object whose ``classify(description, patch)`` returns a
                result exposing ``category`` and ``difficulty``.
            n_concurrent: Maximum number of PRs processed concurrently.
        """
        super().__init__(name="classify_prs", n_concurrent=n_concurrent)
        self._classifier = classifier
        self._judge = judge

    async def _process_item(self, item: Any) -> None:
        """Process a PR dict with owner, repo, issue_number, description, patch.

        ``owner``, ``repo`` and ``issue_number`` are required keys;
        ``description``, ``patch`` and ``file_change_summary`` default to an
        empty string.
        """
        owner = item["owner"]
        repo = item["repo"]
        issue_number = item["issue_number"]
        description = item.get("description", "")
        patch = item.get("patch", "")
        file_change_summary = item.get("file_change_summary", "")

        loop = asyncio.get_running_loop()

        # The classifier and judge calls are synchronous; run them in the
        # default executor so they do not block the event loop.
        is_perf, _reason = await loop.run_in_executor(
            None, functools.partial(self._classifier.classify, description, patch, file_change_summary)
        )

        update: dict[str, Any] = {"is_performance_commit": is_perf}

        if is_perf:
            decision = await loop.run_in_executor(None, functools.partial(self._judge.classify, description, patch))
            update["classification"] = decision.category
            update["difficulty"] = decision.difficulty

        client = get_client()
        client.table("pull_requests").update(update).eq("owner", owner).eq("repo", repo).eq(
            "issue_number", issue_number
        ).execute()

        # %s rather than %d: the id comes from an untyped dict and may be a
        # string, which %d cannot format.
        logger.info("Classified %s/%s#%s: perf=%s", owner, repo, issue_number, is_perf)