fc-data 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. datasmith/__init__.py +330 -0
  2. datasmith/__init__.pyi +194 -0
  3. datasmith/agents/__init__.py +31 -0
  4. datasmith/agents/classifiers.py +272 -0
  5. datasmith/agents/codex.py +25 -0
  6. datasmith/agents/config.py +108 -0
  7. datasmith/agents/extractors.py +197 -0
  8. datasmith/agents/installed/README.md +52 -0
  9. datasmith/agents/installed/__init__.py +22 -0
  10. datasmith/agents/installed/base.py +240 -0
  11. datasmith/agents/installed/claude.py +134 -0
  12. datasmith/agents/installed/codex.py +91 -0
  13. datasmith/agents/installed/gemini.py +118 -0
  14. datasmith/agents/installed/none.py +27 -0
  15. datasmith/agents/sandbox.py +547 -0
  16. datasmith/agents/synthesizer.py +439 -0
  17. datasmith/agents/templates/AGENTS.md.j2 +150 -0
  18. datasmith/agents/templates/sandbox_verify.py +428 -0
  19. datasmith/docker/__init__.py +31 -0
  20. datasmith/docker/context.py +112 -0
  21. datasmith/docker/images.py +158 -0
  22. datasmith/docker/publish.py +56 -0
  23. datasmith/docker/templates/Dockerfile.base +26 -0
  24. datasmith/docker/templates/Dockerfile.pr +42 -0
  25. datasmith/docker/templates/Dockerfile.repo +11 -0
  26. datasmith/docker/templates/docker_build_base.sh +780 -0
  27. datasmith/docker/templates/docker_build_env.sh +309 -0
  28. datasmith/docker/templates/docker_build_final.sh +106 -0
  29. datasmith/docker/templates/docker_build_pkg.sh +99 -0
  30. datasmith/docker/templates/docker_build_run.sh +124 -0
  31. datasmith/docker/templates/entrypoint.sh +62 -0
  32. datasmith/docker/templates/parser.py +1405 -0
  33. datasmith/docker/templates/profile.sh +199 -0
  34. datasmith/docker/templates/pytest_runner.py +692 -0
  35. datasmith/docker/templates/run-tests.sh +197 -0
  36. datasmith/docker/verifiers.py +131 -0
  37. datasmith/filters.py +154 -0
  38. datasmith/github/__init__.py +22 -0
  39. datasmith/github/client.py +333 -0
  40. datasmith/github/hooks.py +50 -0
  41. datasmith/github/links.py +110 -0
  42. datasmith/github/models.py +206 -0
  43. datasmith/github/render.py +173 -0
  44. datasmith/github/search.py +66 -0
  45. datasmith/github/templates/comment.md.j2 +5 -0
  46. datasmith/github/templates/final.md.j2 +66 -0
  47. datasmith/github/templates/issues.md.j2 +21 -0
  48. datasmith/github/templates/repo.md.j2 +1 -0
  49. datasmith/preflight.py +162 -0
  50. datasmith/publish/__init__.py +13 -0
  51. datasmith/publish/huggingface.py +104 -0
  52. datasmith/publish/pipeline.py +60 -0
  53. datasmith/publish/records.py +91 -0
  54. datasmith/py.typed +1 -0
  55. datasmith/resolution/__init__.py +14 -0
  56. datasmith/resolution/blocklist.py +145 -0
  57. datasmith/resolution/cache.py +120 -0
  58. datasmith/resolution/constants.py +277 -0
  59. datasmith/resolution/dependency_resolver.py +174 -0
  60. datasmith/resolution/git_utils.py +378 -0
  61. datasmith/resolution/import_analyzer.py +66 -0
  62. datasmith/resolution/metadata_parser.py +412 -0
  63. datasmith/resolution/models.py +41 -0
  64. datasmith/resolution/orchestrator.py +522 -0
  65. datasmith/resolution/package_filters.py +312 -0
  66. datasmith/resolution/python_manager.py +110 -0
  67. datasmith/runners/__init__.py +15 -0
  68. datasmith/runners/base.py +112 -0
  69. datasmith/runners/classify_prs.py +48 -0
  70. datasmith/runners/render_problems.py +113 -0
  71. datasmith/runners/resolve_packages.py +66 -0
  72. datasmith/runners/scrape_commits.py +166 -0
  73. datasmith/runners/scrape_repos.py +44 -0
  74. datasmith/runners/synthesize_images.py +310 -0
  75. datasmith/update/__init__.py +5 -0
  76. datasmith/update/cli.py +169 -0
  77. datasmith/update/offline.py +173 -0
  78. datasmith/update/pipeline.py +497 -0
  79. datasmith/utils/__init__.py +18 -0
  80. datasmith/utils/core.py +67 -0
  81. datasmith/utils/db.py +156 -0
  82. datasmith/utils/tokens.py +65 -0
  83. fc_data-0.2.0.dist-info/METADATA +441 -0
  84. fc_data-0.2.0.dist-info/RECORD +87 -0
  85. fc_data-0.2.0.dist-info/WHEEL +4 -0
  86. fc_data-0.2.0.dist-info/entry_points.txt +2 -0
  87. fc_data-0.2.0.dist-info/licenses/LICENSE +28 -0
@@ -0,0 +1,272 @@
1
+ from __future__ import annotations
2
+
3
+ import enum
4
+ import os
5
+ from dataclasses import dataclass
6
+ from typing import Any
7
+
8
+ from datasmith.utils import get_logger
9
+
10
+ logger = get_logger("agents.classifiers")
11
+
12
+
13
class OptimizationType(str, enum.Enum):
    """Closed set of optimization-technique labels.

    Each member is also a ``str`` whose value is the snake_case label.
    Iteration order is the declaration order below.
    """

    USE_BETTER_ALGORITHM = "use_better_algorithm"
    USE_BETTER_DATA_STRUCTURE_AND_LAYOUT = "use_better_data_structure_and_layout"
    USE_LOWER_LEVEL_SYSTEM = "use_lower_level_system"
    ACCEPT_LESS_PRECISE_SOLUTION = "accept_less_precise_solution"
    USE_PARALLELIZATION = "use_parallelization"
    REMOVE_OR_REDUCE_WORK = "remove_or_reduce_work"
    CACHE_AND_REUSE = "cache_and_reuse"
    DO_IT_EARLIER_BATCH_THROTTLE = "do_it_earlier_batch_throttle"
    SCALE_PLATFORM = "scale_platform"
    DATABASE_AND_STORAGE_TUNING = "database_and_storage_tuning"
    MICRO_OPTIMIZATIONS = "micro_optimizations"
    IO_AND_LATENCY_HIDING = "io_and_latency_hiding"
    USE_HIGHER_LEVEL_SYSTEM = "use_higher_level_system"
    UNCATEGORIZED = "uncategorized"

    @property
    def description(self) -> str:
        """Return the text registered for this label, or ``""`` if there is none."""
        # The lookup table is a module-level dict defined after this class;
        # it is only resolved when the property is read.
        try:
            return _OPTIMIZATION_DESCRIPTIONS[self.value]
        except KeyError:
            return ""
32
+
33
+
34
+ _OPTIMIZATION_DESCRIPTIONS: dict[str, str] = {
35
+ "use_better_algorithm": (
36
+ "Complexity reduction or switching to a faster algorithm "
37
+ "(e.g. O(n^2) -> O(n log n), better sorting, smarter search)."
38
+ ),
39
+ "use_better_data_structure_and_layout": (
40
+ "Switching to a more efficient data structure or improving memory layout "
41
+ "(e.g. list -> set/dict for lookups, struct-of-arrays, contiguous buffers)."
42
+ ),
43
+ "use_lower_level_system": (
44
+ "Offloading work to C/Cython/Rust/Fortran extensions, NumPy vectorized ops, "
45
+ "or native SIMD intrinsics instead of pure Python."
46
+ ),
47
+ "accept_less_precise_solution": (
48
+ "Trading accuracy for speed via approximations, heuristics, sampling, "
49
+ "or reduced precision (e.g. float32 instead of float64)."
50
+ ),
51
+ "use_parallelization": (
52
+ "Using threads, multiprocessing, GPU kernels, or parallel algorithms "
53
+ "to split work across cores (not just async I/O)."
54
+ ),
55
+ "remove_or_reduce_work": (
56
+ "Eliminating unnecessary computation, short-circuiting, early exits, "
57
+ "skipping redundant steps, or simplifying requirements."
58
+ ),
59
+ "cache_and_reuse": (
60
+ "Memoization, LRU caches, materialized views, precomputed lookup tables, "
61
+ "or reusing expensive results across calls."
62
+ ),
63
+ "do_it_earlier_batch_throttle": (
64
+ "Batching small operations, lazy evaluation, deferred computation, "
65
+ "throttling, or moving work to an earlier/better time."
66
+ ),
67
+ "scale_platform": (
68
+ "Horizontal/vertical scaling, load balancing, sharding, or infrastructure-level capacity changes."
69
+ ),
70
+ "database_and_storage_tuning": (
71
+ "Adding indices, optimizing queries, denormalization, partitioning, "
72
+ "connection pooling, or storage engine configuration."
73
+ ),
74
+ "micro_optimizations": (
75
+ "Hot-path tweaks: inlining, branch reordering, avoiding temporary objects, "
76
+ "strength reduction, guard clauses, or tight-loop tuning."
77
+ ),
78
+ "io_and_latency_hiding": (
79
+ "Async/non-blocking I/O, overlapping I/O with compute, prefetching, pipelining, or reducing round-trip latency."
80
+ ),
81
+ "use_higher_level_system": (
82
+ "Replacing hand-rolled logic with an optimized library or framework "
83
+ "(e.g. pandas, polars, scipy, BLAS) that handles performance internally."
84
+ ),
85
+ "uncategorized": ("Performance-related change that does not clearly fit any of the above categories."),
86
+ }
87
+
88
+
89
class DifficultyLevel(str, enum.Enum):
    """Difficulty labels; each member is a ``str`` equal to its lowercase name."""

    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"
93
+
94
+
95
@dataclass
class ClassificationDecision:
    """Outcome of a classification call.

    ``confidence`` is clamped to the range 0..100 when the instance is created.
    """

    reason: str = ""
    category: str = ""
    difficulty: str = ""
    confidence: int = 0

    def __post_init__(self) -> None:
        # Clamp rather than reject, so out-of-range model output still yields a usable value.
        self.confidence = max(0, min(100, self.confidence))

    @staticmethod
    def _coerce_confidence(raw: Any) -> int:
        """Convert a raw confidence value to ``int``; return 0 when it cannot be parsed.

        Accepts ints, floats, and strings such as ``"85"``, ``"85.5"`` or ``"90%"``.
        Fractional values are truncated toward zero.
        """
        if raw is None:
            return 0
        if isinstance(raw, int):
            return int(raw)
        # Tolerate surrounding whitespace and a trailing percent sign.
        text = str(raw).strip().rstrip("%").strip()
        try:
            return int(text)
        except ValueError:
            pass
        try:
            # int(float(...)) raises ValueError for NaN and OverflowError for infinity.
            return int(float(text))
        except (ValueError, OverflowError):
            return 0

    @classmethod
    def from_prediction(cls, prediction: Any) -> ClassificationDecision:
        """Create a decision object from a DSPy prediction response.

        Reads the ``reasoning``, ``category``, ``difficulty`` and ``confidence``
        attributes of ``prediction``. Missing or falsy text attributes become
        ``""``, and an unparseable confidence becomes 0.
        """

        def text_of(attr: str) -> str:
            return str(getattr(prediction, attr, "") or "").strip()

        return cls(
            reason=text_of("reasoning"),
            category=text_of("category"),
            difficulty=text_of("difficulty"),
            confidence=cls._coerce_confidence(getattr(prediction, "confidence", None)),
        )
128
+
129
+
130
class PerfClassifier:
    """Binary classifier: is this PR a performance improvement?"""

    def __init__(self) -> None:
        # Created on the first call to _get_predictor().
        self._predictor: Any | None = None

    def _get_predictor(self) -> Any:
        """Return the cached predictor, building it on first use.

        Building it calls ``ensure_configured()`` and imports ``dspy`` lazily.
        """
        if self._predictor is None:
            from datasmith.agents.config import ensure_configured

            ensure_configured()
            import dspy

            class JudgeSignature(dspy.Signature):
                """Decide if this commit's PRIMARY intent is to improve product/runtime performance.

                Label YES only when there is CLEAR, EXPLICIT evidence in the description and/or patch that the
                runtime gets faster (e.g., algorithm change, fewer allocations, caching, vectorization, reduced I/O,
                async/non-blocking for throughput, latency reduction, memory footprint reduction, fix a speed regression).

                Strong positive signals (weigh these collectively):
                - PR title/body contains performance intent (e.g., "PERF:", "speed up", "faster", "performance").
                - Linked issues/comments include benchmark links or timings demonstrating impact.
                - Low-level/hot-path tweaks (e.g., reuse global context, avoid per-call init/teardown, vectorize C/NumPy).

                Hard NO (non-performance) examples: tests/ASV/harness-only changes; CI/workflows/build/packaging; coverage;
                pre-commit/format/lints (clippy/ruff/black); docs; version bumps; terminology/renames; pure refactors without
                performance claims; changes aimed at making perf tests pass but not improving runtime.

                If ambiguous, weigh the concrete code changes and problem description together. When there are
                specific performance cues (title keywords, measured timings, fewer allocations, vectorization,
                caching/reuse) lean YES; otherwise NO.
                """

                problem_description: str = dspy.InputField(desc="Problem statement and technical context from PR/issue")
                github_patch: str = dspy.InputField(desc="Git diff showing actual code changes")
                file_change_summary: str = dspy.InputField(
                    desc="A markdown table summarizing all the files changed in the commit along with lines added/removed.",
                    default="",
                )
                reasoning: str = dspy.OutputField(desc="Deductive reasoning steps leading to the classification.")
                label: str = dspy.OutputField(desc='Final label: "YES" for performance-related, "NO" otherwise.')

            self._predictor = dspy.Predict(JudgeSignature)
        return self._predictor

    def classify(
        self, problem_description: str, github_patch: str = "", file_change_summary: str = ""
    ) -> tuple[bool, str]:
        """Classify a change as performance-related or not.

        Returns ``(is_perf, reasoning)``. ``is_perf`` is true when the predicted
        label, stripped and upper-cased, starts with ``"YES"``. Any exception is
        logged and reported as ``(False, "Classification failed")``.
        """
        try:
            prediction = self._get_predictor()(
                problem_description=problem_description,
                github_patch=github_patch,
                file_change_summary=file_change_summary,
            )
            # A missing label is treated as "NO".
            verdict = str(getattr(prediction, "label", "NO")).strip().upper()
            explanation = str(getattr(prediction, "reasoning", ""))
            return verdict.startswith("YES"), explanation
        except Exception:
            logger.exception("PerfClassifier failed")
            return False, "Classification failed"
194
+
195
+
196
class ClassifyJudge:
    """Classify optimization type and difficulty."""

    def __init__(self, max_tokens: int | None = None) -> None:
        # A falsy max_tokens (None or 0) falls back to DSPY_MAX_TOKENS, default 16000.
        self._max_tokens = max_tokens or int(os.getenv("DSPY_MAX_TOKENS", "16000"))
        # Created on the first call to _get_predictor().
        self._predictor: Any | None = None

    def _get_predictor(self) -> Any:
        """Return the cached predictor, building it on first use.

        The signature's instructions are generated from ``OptimizationType``,
        so the prompt lists every category with its description.
        """
        if self._predictor is None:
            from datasmith.agents.config import ensure_configured

            ensure_configured()
            import dspy

            cat_lines = "\n".join(f"- {t.value}: {t.description}" for t in OptimizationType)
            cat_values = ", ".join(t.value for t in OptimizationType)

            class ClassifySignature(dspy.Signature):
                """Decide the PRIMARY performance optimization technique and difficulty level."""

                problem_description: str = dspy.InputField(desc="Problem statement and technical context from PR/issue")
                github_patch: str = dspy.InputField(desc="Git patch showing code changes")
                category: str = dspy.OutputField(desc=f"One of: {cat_values}")
                difficulty: str = dspy.OutputField(desc="One of: easy, medium, hard")
                reasoning: str = dspy.OutputField(desc="Brief explanation of the classification")

            # A docstring cannot be an f-string, so the full instructions
            # (including the category list) are assigned after class creation.
            ClassifySignature.__doc__ = (
                "Decide the PRIMARY performance optimization technique and difficulty level.\n\n"
                f"Category mapping (pick the single best match):\n{cat_lines}\n\n"
                "Difficulty levels:\n"
                "- easy: localized change (<50 lines), minimal risk\n"
                "- medium: module-level refactor, data structure changes\n"
                "- hard: algorithm rewrite or architectural change"
            )

            self._predictor = dspy.Predict(ClassifySignature)
        return self._predictor

    def truncate_patch(self, patch: str) -> str:
        """Return ``patch`` cut to the token budget, with a truncation marker appended.

        Tokens are counted with tiktoken's ``cl100k_base`` encoding. Ten tokens
        of the budget are held back before the marker is appended. If tiktoken
        cannot be imported or fails, the patch is returned unchanged.
        """
        try:
            import tiktoken

            enc = tiktoken.get_encoding("cl100k_base")
            tokens = enc.encode(patch)
            if len(tokens) > self._max_tokens:
                # Clamp at zero: a negative slice bound counts from the end and
                # would keep almost the whole patch when the budget is below 10.
                keep = max(0, self._max_tokens - 10)
                truncated = enc.decode(tokens[:keep])
                return truncated + "\n\n// [TRUNCATED DUE TO LENGTH]"
        except Exception:  # noqa: S110
            pass  # deliberate best-effort: tiktoken unavailable or failed, return untruncated
        return patch

    def classify(self, problem_description: str, github_patch: str = "") -> ClassificationDecision:
        """Classify a change into an optimization category and difficulty.

        The patch is truncated first. A category outside ``OptimizationType``
        becomes ``"uncategorized"`` and a difficulty outside ``DifficultyLevel``
        becomes ``"medium"``. Any exception from the predictor is logged and a
        fallback decision is returned.
        """
        github_patch = self.truncate_patch(github_patch)
        try:
            predictor = self._get_predictor()
            result = predictor(problem_description=problem_description, github_patch=github_patch)

            cat = str(getattr(result, "category", "")).strip().lower()
            if cat not in {t.value for t in OptimizationType}:
                cat = OptimizationType.UNCATEGORIZED.value

            # Validate against the enum so the accepted set cannot drift from DifficultyLevel.
            diff = str(getattr(result, "difficulty", "")).strip().lower()
            if diff not in {level.value for level in DifficultyLevel}:
                diff = DifficultyLevel.MEDIUM.value

            return ClassificationDecision(
                reason=str(getattr(result, "reasoning", "")),
                category=cat,
                difficulty=diff,
            )
        except Exception:
            logger.exception("ClassifyJudge failed")
            return ClassificationDecision(
                reason="Classification failed", category="uncategorized", difficulty="medium", confidence=0
            )
@@ -0,0 +1,25 @@
1
+ """Backward-compatibility shim — real logic lives in agents.installed.codex."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datasmith.agents.installed import AgentResult as CodexResult
6
+ from datasmith.agents.installed import CodexAgent
7
+ from datasmith.agents.installed.codex import _parse_codex_stdout
8
+
9
+ __all__ = ["CodexResult", "_parse_codex_stdout", "codex_exec"]
10
+
11
+
12
def codex_exec(
    prompt: str,
    timeout: int = 900,
    workdir: str | None = None,
    dry_run: bool = False,
    full_auto: bool = False,
    sandbox: str = "",
) -> CodexResult:
    """Execute a prompt via the Codex CLI.

    Thin wrapper around :class:`~datasmith.agents.installed.codex.CodexAgent`:
    ``full_auto`` and ``sandbox`` configure the agent, and the remaining
    arguments are forwarded to its ``exec_or_dry_run`` method.
    """
    return CodexAgent(full_auto=full_auto, sandbox=sandbox).exec_or_dry_run(
        prompt,
        timeout=timeout,
        workdir=workdir,
        dry_run=dry_run,
    )
@@ -0,0 +1,108 @@
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ import os
5
+ import threading
6
+ from dataclasses import dataclass
7
+ from typing import Any
8
+
9
+ from datasmith.utils import get_logger
10
+
11
+ logger = get_logger("agents.config")
12
+
13
+
14
@dataclass
class AgentConfig:
    """Configuration for LLM agent backends."""

    primary_model: str = ""
    fallback_model: str = ""
    api_key: str = ""
    api_base: str = ""
    max_tokens: int = 16000
    temperature: float = 0.0
    portkey_api_key: str = ""
    portkey_model_name: str = ""

    @classmethod
    def from_env(cls) -> AgentConfig:
        """Build a config from environment variables.

        Reads the ``DSPY_*`` and ``PORTKEY_*`` variables, using the defaults
        written below for any that are unset. ``DSPY_MAX_TOKENS`` and
        ``DSPY_TEMPERATURE`` are parsed with ``int()`` and ``float()``, so a
        malformed value raises ``ValueError``.
        """
        env = os.environ.get
        return cls(
            primary_model=env("DSPY_MODEL", "openai/gpt-oss-120b"),
            fallback_model=env("DSPY_FALLBACK_MODEL", ""),
            api_key=env("DSPY_API_KEY", "local"),
            api_base=env("DSPY_API_BASE", "http://localhost:30001/v1"),
            max_tokens=int(env("DSPY_MAX_TOKENS", "16000")),
            temperature=float(env("DSPY_TEMPERATURE", "0")),
            portkey_api_key=env("PORTKEY_API_KEY", ""),
            portkey_model_name=env("PORTKEY_MODEL_NAME", ""),
        )
39
+
40
+
41
+ # Module-level state for lazy DSPy configuration.
42
+ _configured = False
43
+ _lock = threading.Lock()
44
+ _lm: Any = None # Stores the dspy.LM instance for async-safe reuse
45
+
46
+
47
def configure_dspy(config: AgentConfig) -> None:
    """Configure DSPy backends from AgentConfig.

    Builds a ``dspy.LM`` and stores it in the module-level ``_lm``. The
    direct backend is used when both ``api_key`` and ``primary_model`` are
    set; otherwise the Portkey gateway is used when ``portkey_api_key`` is
    set. With neither, a warning is logged and nothing is configured.
    """
    global _lm
    import dspy

    lm_options: dict[str, Any] = {
        "temperature": config.temperature,
        "max_tokens": config.max_tokens,
    }

    if config.api_key and config.primary_model:
        model_name = config.primary_model
        _lm = dspy.LM(
            model=model_name,
            api_key=config.api_key,
            api_base=config.api_base or None,
            **lm_options,
        )
    elif config.portkey_api_key:
        from portkey_ai import PORTKEY_GATEWAY_URL

        model_name = config.portkey_model_name or "@anthropic/claude-3-5-sonnet-latest"
        # The provider header is the part of the model name before the first "/", without "@".
        provider = model_name.split("/")[0].lstrip("@")
        lm_options.update(
            api_base=PORTKEY_GATEWAY_URL,
            api_key="unused-by-portkey",
            headers={
                "x-portkey-api-key": config.portkey_api_key,
                "x-portkey-provider": provider,
            },
            custom_llm_provider="openai",
        )
        _lm = dspy.LM(model=model_name, **lm_options)
    else:
        logger.warning("No LM backend configured")
        return

    # A RuntimeError from dspy.configure() is ignored; _lm stays set either way.
    with contextlib.suppress(RuntimeError):
        dspy.configure(lm=_lm)
    logger.info("Configured DSPy with model: %s", model_name)
84
+
85
+
86
def ensure_configured() -> None:
    """Lazy-initialize DSPy on first LLM call.

    The first call builds an ``AgentConfig`` from the environment and passes
    it to ``configure_dspy()`` while holding the module lock. The flag is
    checked before and after acquiring the lock, so concurrent first calls
    configure only once.

    Later calls re-apply the stored LM with ``dspy.configure(lm=_lm)`` and
    suppress any ``RuntimeError`` it raises.
    """
    global _configured
    if not _configured:
        with _lock:
            # Re-check under the lock: another thread may have finished first.
            if not _configured:
                configure_dspy(AgentConfig.from_env())
                _configured = True
        return

    # Already configured. If no LM was built there is nothing to re-apply.
    if _lm is None:
        return

    import dspy

    # NOTE(review): presumably the RuntimeError arises when DSPy was configured
    # from a different async task — confirm against DSPy's settings behaviour.
    with contextlib.suppress(RuntimeError):
        dspy.configure(lm=_lm)
@@ -0,0 +1,197 @@
1
+ """
2
+ ProblemExtractor: Extractive-first approach for problem statement generation.
3
+
4
+ Key principles:
5
+ - 90% extractive, 10% abstractive
6
+ - Preserve code snippets verbatim (character-exact)
7
+ - Keep technical terms exactly as written
8
+ - Natural structure over imposed templates
9
+ - Preserve disagreements and different viewpoints
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import re
15
+ from dataclasses import dataclass
16
+ from typing import Any
17
+
18
+ from datasmith.utils import get_logger
19
+
20
+ logger = get_logger("agents.extractors")
21
+
22
+
23
+ @dataclass
24
+ class ProblemExtraction:
25
+ """Structured extraction from a PR description.
26
+
27
+ Captures four phases of a performance optimization or bug fix:
28
+ 1. initial_observations: Objective symptoms of the problematic behavior
29
+ 2. triage_attempts: Investigative steps and reasoning used to narrow down the issue
30
+ 3. solution_overview: Description of the change(s) made
31
+ 4. solution_observations: Observations after applying the change
32
+ """
33
+
34
+ initial_observations: str = ""
35
+ triage_attempts: str = ""
36
+ solution_overview: str = ""
37
+ solution_observations: str = ""
38
+
39
+ def to_problem_markdown(self) -> str:
40
+ """Render only the problem portion (initial observations)."""
41
+ text = (self.initial_observations or "").strip()
42
+ text = re.sub(r"^```json\s*$", "", text, flags=re.MULTILINE)
43
+ return text
44
+
45
+ def _normalise_section(self, content: str | None, header_variants: list[str]) -> str | None:
46
+ """Remove redundant headers from section content."""
47
+ if not content:
48
+ return None
49
+ content = content.strip()
50
+ low = content.lstrip().lower()
51
+ for variant in header_variants:
52
+ if low.startswith(variant.lower()):
53
+ lines = content.splitlines()
54
+ if lines:
55
+ lines = lines[1:]
56
+ content = "\n".join(lines).lstrip()
57
+ break
58
+ return content
59
+
60
+ def to_full_markdown(self) -> str:
61
+ """Render all sections with headers."""
62
+ sections: list[str] = []
63
+
64
+ initial_obs = (self.initial_observations or "").strip()
65
+ if initial_obs:
66
+ sections.append(initial_obs)
67
+
68
+ triage = self._normalise_section(
69
+ self.triage_attempts,
70
+ ["## triage attempts", "**triage attempts**"],
71
+ )
72
+ if triage:
73
+ sections.append(f"## Triage Attempts\n\n{triage}")
74
+
75
+ solution = self._normalise_section(
76
+ self.solution_overview,
77
+ ["## solution overview", "**solution overview**"],
78
+ )
79
+ if solution:
80
+ sections.append(f"## Solution Overview\n\n{solution}")
81
+
82
+ solution_obs = self._normalise_section(
83
+ self.solution_observations,
84
+ ["## solution observations", "**solution observations**"],
85
+ )
86
+ if solution_obs:
87
+ sections.append(f"## Solution Observations\n\n{solution_obs}")
88
+
89
+ text = "\n\n".join(sections).strip()
90
+ text = re.sub(r"^```json\s*$", "", text, flags=re.MULTILINE)
91
+ return text
92
+
93
+ def to_dict(self) -> dict[str, str]:
94
+ return {
95
+ "initial_observations": self.initial_observations,
96
+ "triage_attempts": self.triage_attempts,
97
+ "solution_overview": self.solution_overview,
98
+ "solution_observations": self.solution_observations,
99
+ }
100
+
101
+
102
+ class ProblemExtractor:
103
+ """Extractive problem/solution bucketizer using DSPy."""
104
+
105
+ def __init__(self) -> None:
106
+ self._predictor: Any | None = None
107
+
108
+ def _get_predictor(self) -> Any:
109
+ if self._predictor is None:
110
+ from datasmith.agents.config import ensure_configured
111
+
112
+ ensure_configured()
113
+ import dspy
114
+
115
+ class ProblemExtractorSignature(dspy.Signature):
116
+ """What problem is this Github PR trying to solve? Extract near-verbatim relevant text following the given JSON output. If no relevant context exists for a field, return an empty string for it."""
117
+
118
+ pr_title: str = dspy.InputField(desc="The GitHub PR title")
119
+ pr_body: str = dspy.InputField(desc="The GitHub PR description")
120
+ pr_comments: str = dspy.InputField(desc="Comments on the PR thread.")
121
+ initial_observations: str = dspy.OutputField(
122
+ desc="Objective symptoms of the problematic behavior, described in the present tense. Focus strictly on what is happening (metrics, user impact, frequency). Do not include causes, hypotheses, or explanations."
123
+ )
124
+ triage_attempts: str = dspy.OutputField(
125
+ desc="The investigative steps and reasoning used to narrow down contributing factors—what you checked, what you ruled out, and what evidence you gathered to understand where the issue originates."
126
+ )
127
+ solution_overview: str = dspy.OutputField(
128
+ desc="A concise description of the change(s) made and how they address the identified bottleneck or constraint."
129
+ )
130
+ solution_observations: str = dspy.OutputField(
131
+ desc="What you observe after applying the change—new measurements, behavior differences, and any regressions or trade-offs that appeared."
132
+ )
133
+
134
+ self._predictor = dspy.Predict(ProblemExtractorSignature)
135
+ return self._predictor
136
+
137
+ def extract_problem(self, pr_title: str, pr_body: str, pr_comments: str = "") -> ProblemExtraction:
138
+ try:
139
+ predictor = self._get_predictor()
140
+ result = predictor(pr_title=pr_title, pr_body=pr_body, pr_comments=pr_comments)
141
+ return self._build_extraction(result)
142
+ except Exception:
143
+ logger.exception("Problem extraction failed, returning empty")
144
+ return ProblemExtraction(initial_observations=pr_body[:500] if pr_body else "")
145
+
146
+ def _clean_text(self, value: Any | None) -> str | None:
147
+ """Clean and normalize text values from predictions."""
148
+ if value is None:
149
+ return None
150
+ if isinstance(value, list):
151
+ try:
152
+ flat: list[str] = []
153
+ for v in value:
154
+ if isinstance(v, list):
155
+ flat.extend(str(x) for x in v)
156
+ else:
157
+ flat.append(str(v))
158
+ value = "\n".join(flat)
159
+ except Exception:
160
+ value = "\n".join(str(v) for v in value)
161
+ if not isinstance(value, str):
162
+ value = str(value)
163
+ stripped = value.strip()
164
+ if stripped.lower() in {"null", "none", "undefined", "n/a", ""}:
165
+ return None
166
+ return stripped or None
167
+
168
+ def _build_extraction(self, prediction: Any) -> ProblemExtraction:
169
+ """Normalize the raw DSPy prediction into a ProblemExtraction."""
170
+ initial_obs = self._clean_text(getattr(prediction, "initial_observations", None))
171
+ triage = self._clean_text(getattr(prediction, "triage_attempts", None))
172
+ solution = self._clean_text(getattr(prediction, "solution_overview", None))
173
+ solution_obs = self._clean_text(getattr(prediction, "solution_observations", None))
174
+
175
+ def plausible(s: str | None, *, min_len: int = 20) -> bool:
176
+ if s is None:
177
+ return False
178
+ stripped = s.strip()
179
+ if len(stripped) < min_len:
180
+ return False
181
+ return bool(re.search(r"[A-Za-z]", stripped))
182
+
183
+ if not plausible(initial_obs, min_len=20):
184
+ initial_obs = None
185
+ if not plausible(triage, min_len=10):
186
+ triage = None
187
+ if not plausible(solution, min_len=10):
188
+ solution = None
189
+ if not plausible(solution_obs, min_len=10):
190
+ solution_obs = None
191
+
192
+ return ProblemExtraction(
193
+ initial_observations=initial_obs or "",
194
+ triage_attempts=triage or "",
195
+ solution_overview=solution or "",
196
+ solution_observations=solution_obs or "",
197
+ )
@@ -0,0 +1,52 @@
1
+ # Installed Agent Abstraction
2
+
3
+ An **installed agent** is a CLI coding agent installed on the host machine that
4
+ can execute prompts non-interactively, auto-approve tool calls, and return
5
+ structured output.
6
+
7
+ ## Supported agents
8
+
9
+ | Agent | CLI binary | Install |
10
+ |-------|-----------|---------|
11
+ | Claude Code | `claude` | `npm install -g @anthropic-ai/claude-code` |
12
+ | Codex | `codex` | `npm install -g @openai/codex` |
13
+ | Gemini CLI | `gemini` | `npm install -g @google/gemini-cli` |
14
+
15
+ ## Interface contract
16
+
17
+ Every `InstalledAgent` implementation must satisfy these requirements:
18
+
19
+ 1. **Non-interactive execution** — run a prompt, return when done
20
+ 2. **Auto-approve all tool calls** — no human-in-the-loop
21
+ 3. **JSON/structured output** — parseable stdout with agent messages and file changes
22
+ 4. **Working directory** — operate in a specified directory (via subprocess `cwd=`)
23
+ 5. **Ephemeral sessions** — don't persist state across runs
24
+ 6. **Shell + file editing** — can run bash and edit files in the workspace
25
+ 7. **External timeout** — can be killed via subprocess timeout
26
+
27
+ ## Auto-detection
28
+
29
+ `get_agent()` tries agents in preference order (default: `claude → codex → gemini`)
30
+ and returns the first one whose CLI binary is on `PATH`:
31
+
32
+ ```python
33
+ from datasmith.agents.installed import get_agent
34
+
35
+ agent = get_agent() # auto-detect
36
+ agent = get_agent(preference=["codex"]) # force codex
37
+ result = agent.exec("Fix the build", timeout=600, workdir="/tmp/workspace")
38
+ ```
39
+
40
+ ## Adding a new agent
41
+
42
+ 1. Create `src/datasmith/agents/installed/<name>.py`
43
+ 2. Subclass `InstalledAgent` and implement `name()`, `is_available()`, `exec()`
44
+ 3. Add a `_parse_<name>_stdout()` function to normalise CLI output
45
+ 4. Register the class in `base.py`'s `get_agent()` registry dict
46
+ 5. Re-export from `__init__.py`
47
+
48
+ ## Output parsing
49
+
50
+ Each agent's CLI emits a different JSON schema. The `_parse_*_stdout()` function
51
+ for each agent normalises the output into `(output_lines, files_changed)` which
52
+ is then wrapped in an `AgentResult`.
@@ -0,0 +1,22 @@
1
+ """Installed CLI agent abstraction.
2
+
3
+ Provides a unified interface for CLI-based coding agents (Codex, Claude Code,
4
+ Gemini CLI) with auto-detection of whichever is available on the host.
5
+ """
6
+
7
+ from datasmith.agents.installed.base import AgentResult, CodexResult, InstalledAgent, get_agent
8
+ from datasmith.agents.installed.claude import ClaudeAgent
9
+ from datasmith.agents.installed.codex import CodexAgent
10
+ from datasmith.agents.installed.gemini import GeminiAgent
11
+ from datasmith.agents.installed.none import NoneAgent
12
+
13
+ __all__ = [
14
+ "AgentResult",
15
+ "ClaudeAgent",
16
+ "CodexAgent",
17
+ "CodexResult",
18
+ "GeminiAgent",
19
+ "InstalledAgent",
20
+ "NoneAgent",
21
+ "get_agent",
22
+ ]