fc-data 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. datasmith/__init__.py +330 -0
  2. datasmith/__init__.pyi +194 -0
  3. datasmith/agents/__init__.py +31 -0
  4. datasmith/agents/classifiers.py +272 -0
  5. datasmith/agents/codex.py +25 -0
  6. datasmith/agents/config.py +108 -0
  7. datasmith/agents/extractors.py +197 -0
  8. datasmith/agents/installed/README.md +52 -0
  9. datasmith/agents/installed/__init__.py +22 -0
  10. datasmith/agents/installed/base.py +240 -0
  11. datasmith/agents/installed/claude.py +134 -0
  12. datasmith/agents/installed/codex.py +91 -0
  13. datasmith/agents/installed/gemini.py +118 -0
  14. datasmith/agents/installed/none.py +27 -0
  15. datasmith/agents/sandbox.py +547 -0
  16. datasmith/agents/synthesizer.py +439 -0
  17. datasmith/agents/templates/AGENTS.md.j2 +150 -0
  18. datasmith/agents/templates/sandbox_verify.py +428 -0
  19. datasmith/docker/__init__.py +31 -0
  20. datasmith/docker/context.py +112 -0
  21. datasmith/docker/images.py +158 -0
  22. datasmith/docker/publish.py +56 -0
  23. datasmith/docker/templates/Dockerfile.base +26 -0
  24. datasmith/docker/templates/Dockerfile.pr +42 -0
  25. datasmith/docker/templates/Dockerfile.repo +11 -0
  26. datasmith/docker/templates/docker_build_base.sh +780 -0
  27. datasmith/docker/templates/docker_build_env.sh +309 -0
  28. datasmith/docker/templates/docker_build_final.sh +106 -0
  29. datasmith/docker/templates/docker_build_pkg.sh +99 -0
  30. datasmith/docker/templates/docker_build_run.sh +124 -0
  31. datasmith/docker/templates/entrypoint.sh +62 -0
  32. datasmith/docker/templates/parser.py +1405 -0
  33. datasmith/docker/templates/profile.sh +199 -0
  34. datasmith/docker/templates/pytest_runner.py +692 -0
  35. datasmith/docker/templates/run-tests.sh +197 -0
  36. datasmith/docker/verifiers.py +131 -0
  37. datasmith/filters.py +154 -0
  38. datasmith/github/__init__.py +22 -0
  39. datasmith/github/client.py +333 -0
  40. datasmith/github/hooks.py +50 -0
  41. datasmith/github/links.py +110 -0
  42. datasmith/github/models.py +206 -0
  43. datasmith/github/render.py +173 -0
  44. datasmith/github/search.py +66 -0
  45. datasmith/github/templates/comment.md.j2 +5 -0
  46. datasmith/github/templates/final.md.j2 +66 -0
  47. datasmith/github/templates/issues.md.j2 +21 -0
  48. datasmith/github/templates/repo.md.j2 +1 -0
  49. datasmith/preflight.py +162 -0
  50. datasmith/publish/__init__.py +13 -0
  51. datasmith/publish/huggingface.py +104 -0
  52. datasmith/publish/pipeline.py +60 -0
  53. datasmith/publish/records.py +91 -0
  54. datasmith/py.typed +1 -0
  55. datasmith/resolution/__init__.py +14 -0
  56. datasmith/resolution/blocklist.py +145 -0
  57. datasmith/resolution/cache.py +120 -0
  58. datasmith/resolution/constants.py +277 -0
  59. datasmith/resolution/dependency_resolver.py +174 -0
  60. datasmith/resolution/git_utils.py +378 -0
  61. datasmith/resolution/import_analyzer.py +66 -0
  62. datasmith/resolution/metadata_parser.py +412 -0
  63. datasmith/resolution/models.py +41 -0
  64. datasmith/resolution/orchestrator.py +522 -0
  65. datasmith/resolution/package_filters.py +312 -0
  66. datasmith/resolution/python_manager.py +110 -0
  67. datasmith/runners/__init__.py +15 -0
  68. datasmith/runners/base.py +112 -0
  69. datasmith/runners/classify_prs.py +48 -0
  70. datasmith/runners/render_problems.py +113 -0
  71. datasmith/runners/resolve_packages.py +66 -0
  72. datasmith/runners/scrape_commits.py +166 -0
  73. datasmith/runners/scrape_repos.py +44 -0
  74. datasmith/runners/synthesize_images.py +310 -0
  75. datasmith/update/__init__.py +5 -0
  76. datasmith/update/cli.py +169 -0
  77. datasmith/update/offline.py +173 -0
  78. datasmith/update/pipeline.py +497 -0
  79. datasmith/utils/__init__.py +18 -0
  80. datasmith/utils/core.py +67 -0
  81. datasmith/utils/db.py +156 -0
  82. datasmith/utils/tokens.py +65 -0
  83. fc_data-0.2.0.dist-info/METADATA +441 -0
  84. fc_data-0.2.0.dist-info/RECORD +87 -0
  85. fc_data-0.2.0.dist-info/WHEEL +4 -0
  86. fc_data-0.2.0.dist-info/entry_points.txt +2 -0
  87. fc_data-0.2.0.dist-info/licenses/LICENSE +28 -0
@@ -0,0 +1,497 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from datasmith.utils import get_client, get_logger
6
+ from datasmith.utils.db import fetch_all
7
+
8
+ logger = get_logger("update.pipeline")
9
+
10
+
11
def _cap_per_repo(items: list[dict[str, Any]], limit: int) -> list[dict[str, Any]]:
    """Keep a random sample of at most *limit* items for every (owner, repo) pair.

    Items are grouped by their ``"owner"`` and ``"repo"`` keys in first-seen
    order; each group contributes ``min(limit, len(group))`` randomly chosen
    members to the result.
    """
    import random

    grouped: dict[tuple[str, str], list[dict[str, Any]]] = {}
    for item in items:
        grouped.setdefault((item["owner"], item["repo"]), []).append(item)

    sampled: list[dict[str, Any]] = []
    for members in grouped.values():
        take = min(limit, len(members))
        sampled += random.sample(members, take)

    logger.info("Capped to %d tasks (%d per repo) from %d total", len(sampled), limit, len(items))
    return sampled
25
+
26
+
27
+ def _fetch_repo_descriptions(rows: list[dict[str, Any]]) -> dict[tuple[str, str], str]:
28
+ """Batch-fetch repo descriptions for a set of rows with owner/repo keys."""
29
+ repo_keys = {(r["owner"], r["repo"]) for r in rows}
30
+ descriptions: dict[tuple[str, str], str] = {}
31
+ if repo_keys:
32
+ desc_rows = fetch_all("repositories", select="owner, repo, description")
33
+ for rd in desc_rows:
34
+ key = (rd["owner"], rd["repo"])
35
+ if key in repo_keys:
36
+ descriptions[key] = rd.get("description") or ""
37
+ return descriptions
38
+
39
+
40
# Pipeline stage names in execution order.  ``Pipeline.run`` addresses them by
# 1-based position, so the order of this list is part of the interface.
STAGES: list[str] = [
    "scrape_repos",  # 1
    "scrape_commits",  # 2
    "classify_prs",  # 3
    "resolve_packages",  # 4
    "render_problems",  # 5
    "synthesize_images",  # 6
    "publish",  # 7
]
49
+
50
+
51
+ def _build_file_change_summary(file_changes: list[dict[str, Any]] | None) -> str:
52
+ if not file_changes:
53
+ return ""
54
+ lines = [
55
+ "| File | Lines Added | Lines Removed |",
56
+ "|------|-------------|----------------|",
57
+ ]
58
+ for f in file_changes:
59
+ lines.append(f"| {f.get('filename', '')} | {f.get('additions', 0)} | {f.get('deletions', 0)} |")
60
+ return "\n".join(lines)
61
+
62
+
63
+ def _format_description(title: str, body: str) -> str:
64
+ parts = [p for p in (title.strip(), body.strip()) if p]
65
+ return "\n\n".join(parts)
66
+
67
+
68
class Pipeline:
    """Orchestrate the full FormulaCode update pipeline.

    The stages listed in ``STAGES`` are executed sequentially by :meth:`run`;
    each stage is implemented by the private coroutine of the same name.

    Args:
        dry_run: Log the stages that would run without executing them.
        n_concurrent: Optional ``n_concurrent`` override forwarded to every
            runner; when falsy, no override is passed.
        tasks_per_repo: Optional cap on items per (owner, repo), applied in
            the render_problems and synthesize_images stages.
        agent: Forwarded to ``Synthesizer(agent=...)``.
        force: Re-process rows that would otherwise be skipped as already done.
        offline_source: Optional source handed to the
            ``datasmith.update.offline`` loaders.
        min_stars: Minimum star count passed to the GitHub repository search.
    """

    def __init__(
        self,
        dry_run: bool = False,
        n_concurrent: int | None = None,
        tasks_per_repo: int | None = None,
        agent: str | None = None,
        force: bool = False,
        offline_source: str | None = None,
        min_stars: int = 500,
    ) -> None:
        self._dry_run = dry_run
        self._n_concurrent = n_concurrent
        self._tasks_per_repo = tasks_per_repo
        self._agent = agent
        self._force = force
        self._offline_source = offline_source
        self._min_stars = min_stars
        # Names of the stages that finished during this process, in order.
        self._completed_stages: list[str] = []

    def _runner_kwargs(self) -> dict[str, Any]:
        """Return the optional ``n_concurrent`` override shared by all runners."""
        return {"n_concurrent": self._n_concurrent} if self._n_concurrent else {}

    async def run(
        self,
        start_date: str,
        end_date: str,
        resume: bool = False,
        stage: int | list[int] | None = None,
    ) -> None:
        """Execute pipeline stages in order.

        Args:
            start_date: ISO date string (YYYY-MM-DD)
            end_date: ISO date string (YYYY-MM-DD)
            resume: If True, skip already-completed stages
            stage: If set, run only these stages (1-based indices); takes
                precedence over *resume*

        Raises:
            ValueError: If a requested stage index is out of range.
            Exception: Whatever a failing stage raised, after it was logged.
        """
        stages_to_run = STAGES

        if stage is not None:
            indices = [stage] if isinstance(stage, int) else list(stage)
            for s in indices:
                if s < 1 or s > len(STAGES):
                    raise ValueError(f"Stage must be 1-{len(STAGES)}, got {s}")
            # De-duplicate so a repeated index does not run a stage twice.
            stages_to_run = [STAGES[s - 1] for s in sorted(set(indices))]
        elif resume:
            completed = self._get_completed_stages()
            stages_to_run = [s for s in STAGES if s not in completed]
            if not stages_to_run:
                logger.info("All stages already completed")
                return

        logger.info(
            "Running pipeline stages: %s (dry_run=%s)",
            ", ".join(stages_to_run),
            self._dry_run,
        )

        for stage_name in stages_to_run:
            if self._dry_run:
                logger.info("[DRY RUN] Would run stage: %s", stage_name)
                continue

            logger.info("Starting stage: %s", stage_name)
            try:
                await self._run_stage(stage_name, start_date, end_date)
                self._completed_stages.append(stage_name)
                self._mark_stage_completed(stage_name)
                logger.info("Completed stage: %s", stage_name)
            except Exception:
                logger.exception("Stage %s failed", stage_name)
                raise

    async def _run_stage(self, stage_name: str, start_date: str, end_date: str) -> None:
        """Dispatch *stage_name* to its implementation.

        Raises:
            ValueError: If *stage_name* is not a known stage.  Returning
                silently would let the caller record the stage as completed.
        """
        # Lambdas keep coroutine creation lazy: only the selected one is built.
        handlers = {
            "scrape_repos": lambda: self._scrape_repos(),
            "scrape_commits": lambda: self._scrape_commits(start_date, end_date),
            "classify_prs": lambda: self._classify_prs(),
            "resolve_packages": lambda: self._resolve_packages(start_date, end_date),
            "render_problems": lambda: self._render_problems(),
            "synthesize_images": lambda: self._synthesize_images(),
            "publish": lambda: self._publish(start_date, end_date),
        }
        handler = handlers.get(stage_name)
        if handler is None:
            raise ValueError(f"Unknown pipeline stage: {stage_name!r}")
        await handler()

    async def _scrape_repos(self) -> None:
        """Collect repositories from search, offline source and DB, then scrape them."""
        from datasmith.github.client import GitHubClient
        from datasmith.github.search import search_repos_by_file
        from datasmith.runners.scrape_repos import ScrapeReposRunner
        from datasmith.utils.tokens import TokenPool

        pool = TokenPool()
        gh = GitHubClient(pool)
        try:
            runner = ScrapeReposRunner(gh, **self._runner_kwargs())

            seen: set[tuple[str, str]] = set()
            items: list[tuple[str, str]] = []

            # 1. Discover repos via GitHub code search
            discovered = await search_repos_by_file(gh, filename="asv.conf.json", min_stars=self._min_stars)
            for pair in discovered:
                if pair not in seen:
                    seen.add(pair)
                    items.append(pair)

            # 2. Import repos from offline source (parquet) if provided
            if self._offline_source:
                from datasmith.update.offline import load_offline_repo_names

                # Count against the de-duplicated list so "new" stays correct
                # even if the search returned the same pair more than once.
                n_before = len(items)
                for pair in load_offline_repo_names(self._offline_source):
                    if pair not in seen:
                        seen.add(pair)
                        items.append(pair)
                logger.info(
                    "Imported repos from offline source: %d new (total %d)",
                    len(items) - n_before,
                    len(items),
                )

            # 3. Also include repos already in the DB (metadata refresh)
            rows = fetch_all("repositories", select="owner, repo")
            for r in rows:
                pair = (r["owner"], r["repo"])
                if pair not in seen:
                    seen.add(pair)
                    items.append(pair)

            logger.info("Total repos to process: %d", len(items))
            await runner.run(items)
        finally:
            # Release the client even when discovery or the runner fails.
            await gh.close()

    async def _scrape_commits(self, start_date: str, end_date: str) -> None:
        """Scrape commits for every known repository within the date range."""
        from datasmith.github.client import GitHubClient
        from datasmith.runners.scrape_commits import ScrapeCommitsRunner
        from datasmith.utils.tokens import TokenPool

        pool = TokenPool()
        gh = GitHubClient(pool)
        try:
            runner = ScrapeCommitsRunner(gh, since=start_date, until=end_date, **self._runner_kwargs())

            rows = fetch_all("repositories", select="owner, repo")
            items = [(r["owner"], r["repo"]) for r in rows]
            await runner.run(items)
        finally:
            # Release the client even when the runner fails.
            await gh.close()

        # Bulk-import from offline source (parquet) if provided
        if self._offline_source:
            from datasmith.update.offline import load_offline_pull_requests
            from datasmith.utils.db import batch_upsert

            records = load_offline_pull_requests(self._offline_source, start_date, end_date)
            n = batch_upsert("pull_requests", records)
            logger.info("Imported %d pull request records from offline source", n)

    async def _classify_prs(self) -> None:
        """Run the performance classifier over symbolically pre-selected PRs."""
        from datasmith.agents.classifiers import ClassifyJudge, PerfClassifier
        from datasmith.agents.config import AgentConfig, configure_dspy
        from datasmith.runners.classify_prs import ClassifyPRsRunner

        configure_dspy(AgentConfig.from_env())

        classifier = PerfClassifier()
        judge = ClassifyJudge()
        runner = ClassifyPRsRunner(classifier, judge, **self._runner_kwargs())

        classify_kwargs: dict[str, Any] = {
            "select": "owner, repo, issue_number, title, body, patch, file_changes",
            "filters": {"is_performance_commit_symbolic": True},
        }
        if not self._force:
            # Only rows that have not been classified yet.
            classify_kwargs["is_null"] = ["is_performance_commit"]
        rows = fetch_all("pull_requests", **classify_kwargs)
        items = [
            {
                "owner": r["owner"],
                "repo": r["repo"],
                "issue_number": r["issue_number"],
                # ``or ""`` guards against null title/body values, which
                # ``.get(key, "")`` would pass through as None.
                "description": _format_description(r.get("title") or "", r.get("body") or ""),
                "patch": r.get("patch", ""),
                "file_change_summary": _build_file_change_summary(r.get("file_changes")),
            }
            for r in rows
        ]
        await runner.run(items)

    async def _resolve_packages(self, start_date: str, end_date: str) -> None:
        """Resolve packages for the merge commits of performance-classified PRs."""
        from datasmith.runners.resolve_packages import ResolvePackagesRunner

        runner = ResolvePackagesRunner(**self._runner_kwargs())

        # Get performance-classified PRs within the date range
        rows = fetch_all(
            "pull_requests",
            select="owner, repo, merge_commit_sha",
            filters={"is_performance_commit": True},
            gte_filters={"created_at": start_date},
            lte_filters={"created_at": end_date},
        )

        # Deduplicate by (owner, repo, sha) — multiple PRs may share the same commit
        seen: set[tuple[str, str, str]] = set()
        items: list[dict[str, Any]] = []
        for r in rows:
            sha = r.get("merge_commit_sha", "")
            if not sha:
                continue
            key = (r["owner"], r["repo"], sha)
            if key in seen:
                continue
            seen.add(key)
            items.append({"owner": r["owner"], "repo": r["repo"], "sha": sha})

        # Skip items already in the packages table (unless --force)
        if items and not self._force:
            existing_rows = fetch_all("packages", select="owner, repo, sha")
            existing_keys = {(e["owner"], e["repo"], e["sha"]) for e in existing_rows}
            items = [it for it in items if (it["owner"], it["repo"], it["sha"]) not in existing_keys]

        logger.info("Resolving packages for %d commits", len(items))
        await runner.run(items)

    async def _render_problems(self) -> None:
        """Render problem contexts for installable, performance-classified PRs."""
        from datasmith.agents.config import AgentConfig, configure_dspy
        from datasmith.github.client import GitHubClient
        from datasmith.runners.render_problems import RenderProblemsRunner
        from datasmith.utils.tokens import TokenPool

        configure_dspy(AgentConfig.from_env())

        pool = TokenPool()
        gh = GitHubClient(pool)
        try:
            runner = RenderProblemsRunner(gh=gh, **self._runner_kwargs())

            # Fetch performance-classified PRs
            rows = fetch_all(
                "pull_requests",
                select="owner, repo, issue_number, merge_commit_sha, title, body, created_at",
                filters={"is_performance_commit": True, "is_performance_commit_symbolic": True},
                neq_filters={"merge_commit_sha": ""},
            )

            # Only process PRs whose commit has can_install=True resolved packages
            pkg_rows = fetch_all(
                "packages",
                select="owner, repo, sha",
                filters={"can_install": True},
            )
            installable: set[tuple[str, str, str]] = {(p["owner"], p["repo"], p["sha"]) for p in pkg_rows}

            repo_descriptions = _fetch_repo_descriptions(rows)

            # Skip PRs already processed (have a candidate_prs row) unless --force
            existing_keys: set[tuple[str, str, int]] = set()
            if not self._force:
                existing_rows = fetch_all("candidate_prs", select="owner, repo, issue_number")
                existing_keys = {(e["owner"], e["repo"], e["issue_number"]) for e in existing_rows}

            items = []
            for r in rows:
                sha = r.get("merge_commit_sha", "")
                if not sha:
                    continue
                if (r["owner"], r["repo"], sha) not in installable:
                    logger.debug(
                        "Skipping %s/%s#%d: no can_install package for sha %s",
                        r["owner"],
                        r["repo"],
                        r["issue_number"],
                        sha[:8],
                    )
                    continue
                if (r["owner"], r["repo"], r["issue_number"]) in existing_keys:
                    continue
                items.append({
                    "owner": r["owner"],
                    "repo": r["repo"],
                    "issue_number": r["issue_number"],
                    "merge_commit_sha": sha,
                    "title": r.get("title", ""),
                    "body": r.get("body", ""),
                    "created_at": r.get("created_at"),
                    "repo_description": repo_descriptions.get((r["owner"], r["repo"]), ""),
                })

            if self._tasks_per_repo is not None:
                items = _cap_per_repo(items, self._tasks_per_repo)

            logger.info("Rendering problem contexts for %d PRs", len(items))
            await runner.run(items)
        finally:
            # Release the client even when a query or the runner fails.
            await gh.close()

    async def _synthesize_images(self) -> None:
        """Synthesize images for PRs that have resolved packages and a rendered context."""
        from datasmith.agents.synthesizer import Synthesizer
        from datasmith.github.client import GitHubClient
        from datasmith.runners.synthesize_images import SynthesizeImagesRunner
        from datasmith.utils.tokens import TokenPool

        pool = TokenPool()
        gh = GitHubClient(pool)
        try:
            synth = Synthesizer(agent=self._agent, force=self._force)
            runner = SynthesizeImagesRunner(synth, gh=gh, **self._runner_kwargs())

            query_kwargs: dict[str, Any] = {
                "select": "owner, repo, issue_number, merge_commit_sha, title, body, created_at, rendered_problem",
                "filters": {"is_performance_commit": True, "is_performance_commit_symbolic": True},
                "neq_filters": {"merge_commit_sha": ""},
            }
            if not self._force:
                # Only rows that do not have a container yet.
                query_kwargs["is_null"] = ["container_name"]
            rows = fetch_all("pull_requests", **query_kwargs)

            # Join with packages table for env_payload and python_version
            pkg_rows = fetch_all(
                "packages",
                select="owner, repo, sha, env_payload, python_version",
                filters={"can_install": True},
            )
            pkg_lookup: dict[tuple[str, str, str], dict[str, Any]] = {
                (p["owner"], p["repo"], p["sha"]): p for p in pkg_rows
            }

            repo_descriptions = _fetch_repo_descriptions(rows)

            # Only synthesize PRs with a candidate_prs row carrying a non-empty
            # issues_json or initial_observations (presumably written by the
            # render_problems stage).
            ctx_rows = fetch_all(
                "candidate_prs",
                select="owner, repo, issue_number, issues_json, initial_observations",
            )
            eligible_prs: set[tuple[str, str, int]] = {
                (c["owner"], c["repo"], c["issue_number"])
                for c in ctx_rows
                if c.get("issues_json") or c.get("initial_observations")
            }

            items = []
            for r in rows:
                sha = r.get("merge_commit_sha", "")
                pkg = pkg_lookup.get((r["owner"], r["repo"], sha), {})
                # Skip PRs without resolved packages
                if not pkg:
                    logger.debug(
                        "Skipping %s/%s#%d: no resolved packages for sha %s",
                        r["owner"],
                        r["repo"],
                        r["issue_number"],
                        sha[:8] if sha else "?",
                    )
                    continue
                # Skip PRs without an eligible rendered context
                if (r["owner"], r["repo"], r["issue_number"]) not in eligible_prs:
                    logger.debug(
                        "Skipping %s/%s#%d: no eligible pr_context (empty issues_json or initial_observations)",
                        r["owner"],
                        r["repo"],
                        r["issue_number"],
                    )
                    continue
                items.append({
                    "owner": r["owner"],
                    "repo": r["repo"],
                    "issue_number": r["issue_number"],
                    "sha": sha,
                    "title": r.get("title", ""),
                    "body": r.get("body", ""),
                    "created_at": r.get("created_at"),
                    # Fall back to the raw PR body when nothing was rendered.
                    "pr_context": r.get("rendered_problem") or r.get("body", ""),
                    "repo_description": repo_descriptions.get((r["owner"], r["repo"]), ""),
                    "env_payload": pkg.get("env_payload", ""),
                    "python_version": pkg.get("python_version", ""),
                })
            if self._tasks_per_repo is not None:
                items = _cap_per_repo(items, self._tasks_per_repo)

            logger.info("Synthesizing images for %d PRs", len(items))
            await runner.run(items)
        finally:
            # Release the client even when a query or the runner fails.
            await gh.close()

    async def _publish(self, start_date: str, end_date: str) -> None:
        """Delegate to ``datasmith.publish.pipeline.publish_pipeline``."""
        from datasmith.publish.pipeline import publish_pipeline

        await publish_pipeline(start_date, end_date)

    def _get_completed_stages(self) -> list[str]:
        """Return runner names whose ``runner_progress`` row is fully completed.

        A row counts as completed when ``total > 0`` and ``completed >= total``.
        Best effort: on any failure the error is logged and ``[]`` is
        returned, so a resumed run simply executes every stage.
        """
        try:
            client = get_client()
            resp = client.table("runner_progress").select("runner_name, completed, total").execute()
            rows: list[dict[str, Any]] = resp.data  # type: ignore[assignment]
            return [r["runner_name"] for r in rows if r["total"] > 0 and r["completed"] >= r["total"]]
        except Exception:
            logger.warning(
                "Could not read completed stages from runner_progress; assuming none",
                exc_info=True,
            )
            return []

    def _mark_stage_completed(self, stage_name: str) -> None:
        """Upsert a completed ``runner_progress`` row for *stage_name* (best effort)."""
        try:
            client = get_client()
            client.table("runner_progress").upsert({
                "runner_id": f"pipeline-{stage_name}",
                "runner_name": stage_name,
                "total": 1,
                "completed": 1,
                "failed": 0,
            }).execute()
        except Exception:
            # Progress tracking must not fail the stage, but keep the traceback.
            logger.warning("Failed to mark stage %s as completed", stage_name, exc_info=True)
@@ -0,0 +1,18 @@
1
+ """ds.utils — Foundation layer: Supabase client, caching, token pool, config."""
2
+
3
+ from datasmith.utils.core import Settings, get_logger, with_backoff
4
+ from datasmith.utils.db import batch_upsert, fetch_all, get_async_client, get_client, stable_hash, supabase_cached
5
+ from datasmith.utils.tokens import TokenPool
6
+
7
+ __all__ = [
8
+ "Settings",
9
+ "TokenPool",
10
+ "batch_upsert",
11
+ "fetch_all",
12
+ "get_async_client",
13
+ "get_client",
14
+ "get_logger",
15
+ "stable_hash",
16
+ "supabase_cached",
17
+ "with_backoff",
18
+ ]
@@ -0,0 +1,67 @@
1
+ """Core configuration, logging, and retry utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import functools
6
+ import logging
7
+ import sys
8
+ import time
9
+ from typing import Any, Callable, TypeVar
10
+
11
+ from pydantic_settings import BaseSettings
12
+
13
+ F = TypeVar("F", bound=Callable[..., Any])
14
+
15
+
16
class Settings(BaseSettings):
    """Application settings read from the environment and ``tokens.env``.

    Every field has a default (empty string, or 16000 for
    ``dspy_max_tokens``), so ``Settings()`` can be built with nothing set.
    """

    # Supabase
    supabase_url: str = ""
    supabase_key: str = ""

    # GitHub — presumably several tokens packed into one string; verify format
    gh_tokens: str = ""

    # DSPy
    dspy_model: str = ""
    dspy_api_key: str = ""
    dspy_api_base: str = ""
    dspy_max_tokens: int = 16000

    # Docker Hub
    dockerhub_username: str = ""
    dockerhub_token: str = ""

    # Presumably the Hugging Face token file location — TODO confirm
    hf_token_path: str = ""

    model_config = {
        "env_file": "tokens.env",
        "env_file_encoding": "utf-8",
        "extra": "ignore",
    }
31
+
32
+
33
def get_logger(name: str | None = None) -> logging.Logger:
    """Return the ``datasmith`` logger, or its ``datasmith.<name>`` child.

    On first use the ``datasmith`` logger gets a stderr handler with a
    timestamped format and its level is set to INFO; later calls leave an
    already-configured logger untouched.
    """
    namespace_root = logging.getLogger("datasmith")
    if not namespace_root.handlers:
        stream = logging.StreamHandler(sys.stderr)
        stream.setFormatter(
            logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s", datefmt="%H:%M:%S")
        )
        namespace_root.addHandler(stream)
        namespace_root.setLevel(logging.INFO)

    if name:
        return logging.getLogger(f"datasmith.{name}")
    return namespace_root
44
+
45
+
46
def with_backoff(max_retries: int = 3, base_delay: float = 1.0) -> Callable[[F], F]:
    """Decorator: retry with exponential backoff on transient failures.

    The wrapped callable is attempted up to ``max_retries + 1`` times.  After
    each failed attempt except the last, the wrapper waits ``base_delay``
    seconds, doubling the wait every time; the final failure is re-raised
    unchanged.  Any ``Exception`` triggers a retry.

    Both plain functions and coroutine functions are supported.  Coroutine
    functions are awaited inside the retry loop and wait with
    ``asyncio.sleep``, so the event loop is not blocked.

    Raises:
        ValueError: If *max_retries* is negative.
    """
    import asyncio
    import inspect

    if max_retries < 0:
        raise ValueError(f"max_retries must be >= 0, got {max_retries}")

    def decorator(func: F) -> F:
        if inspect.iscoroutinefunction(func):
            # A sync wrapper would return the coroutine without awaiting it,
            # so failures raised inside it would never be retried.
            @functools.wraps(func)
            async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
                delay = base_delay
                for attempt in range(max_retries + 1):
                    try:
                        return await func(*args, **kwargs)
                    except Exception:
                        if attempt == max_retries:
                            raise
                        await asyncio.sleep(delay)
                        delay *= 2
                raise AssertionError("unreachable: the retry loop returns or raises")

            return async_wrapper  # type: ignore[return-value]

        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            delay = base_delay
            for attempt in range(max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except Exception:
                    if attempt == max_retries:
                        raise
                    time.sleep(delay)
                    delay *= 2
            raise AssertionError("unreachable: the retry loop returns or raises")

        return wrapper  # type: ignore[return-value]

    return decorator