python-harness 0.0.11__tar.gz → 0.0.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. {python_harness-0.0.11/python_harness.egg-info → python_harness-0.0.13}/PKG-INFO +1 -1
  2. {python_harness-0.0.11 → python_harness-0.0.13}/pyproject.toml +1 -1
  3. {python_harness-0.0.11 → python_harness-0.0.13}/python_harness/__init__.py +1 -1
  4. {python_harness-0.0.11 → python_harness-0.0.13}/python_harness/cli.py +32 -46
  5. {python_harness-0.0.11 → python_harness-0.0.13}/python_harness/hard_evaluator.py +18 -2
  6. python_harness-0.0.13/python_harness/llm_client.py +32 -0
  7. python_harness-0.0.13/python_harness/python_file_inventory.py +27 -0
  8. python_harness-0.0.13/python_harness/refine_apply.py +177 -0
  9. python_harness-0.0.13/python_harness/refine_checks.py +29 -0
  10. python_harness-0.0.13/python_harness/refine_engine.py +41 -0
  11. python_harness-0.0.13/python_harness/refine_execution.py +114 -0
  12. python_harness-0.0.13/python_harness/refine_models.py +40 -0
  13. python_harness-0.0.13/python_harness/refine_rounds.py +373 -0
  14. python_harness-0.0.13/python_harness/refine_scoring.py +95 -0
  15. python_harness-0.0.13/python_harness/refine_workspace.py +57 -0
  16. python_harness-0.0.13/python_harness/soft_eval_report.py +30 -0
  17. python_harness-0.0.13/python_harness/soft_eval_report_messages.py +57 -0
  18. python_harness-0.0.13/python_harness/soft_eval_report_metrics.py +53 -0
  19. python_harness-0.0.13/python_harness/soft_eval_report_mock.py +45 -0
  20. python_harness-0.0.13/python_harness/soft_eval_report_shared.py +2 -0
  21. {python_harness-0.0.11 → python_harness-0.0.13}/python_harness/soft_evaluator.py +88 -151
  22. {python_harness-0.0.11 → python_harness-0.0.13/python_harness.egg-info}/PKG-INFO +1 -1
  23. python_harness-0.0.13/python_harness.egg-info/SOURCES.txt +39 -0
  24. {python_harness-0.0.11 → python_harness-0.0.13}/tests/test_cli.py +132 -37
  25. {python_harness-0.0.11 → python_harness-0.0.13}/tests/test_hard_evaluator.py +16 -0
  26. python_harness-0.0.13/tests/test_refine_apply.py +184 -0
  27. python_harness-0.0.13/tests/test_refine_engine.py +982 -0
  28. python_harness-0.0.13/tests/test_refine_scoring.py +177 -0
  29. python_harness-0.0.13/tests/test_refine_workspace.py +69 -0
  30. {python_harness-0.0.11 → python_harness-0.0.13}/tests/test_soft_evaluator.py +178 -1
  31. python_harness-0.0.11/python_harness.egg-info/SOURCES.txt +0 -20
  32. {python_harness-0.0.11 → python_harness-0.0.13}/LICENSE +0 -0
  33. {python_harness-0.0.11 → python_harness-0.0.13}/README.md +0 -0
  34. {python_harness-0.0.11 → python_harness-0.0.13}/python_harness/evaluator.py +0 -0
  35. {python_harness-0.0.11 → python_harness-0.0.13}/python_harness/qc_evaluator.py +0 -0
  36. {python_harness-0.0.11 → python_harness-0.0.13}/python_harness.egg-info/dependency_links.txt +0 -0
  37. {python_harness-0.0.11 → python_harness-0.0.13}/python_harness.egg-info/entry_points.txt +0 -0
  38. {python_harness-0.0.11 → python_harness-0.0.13}/python_harness.egg-info/requires.txt +0 -0
  39. {python_harness-0.0.11 → python_harness-0.0.13}/python_harness.egg-info/top_level.txt +0 -0
  40. {python_harness-0.0.11 → python_harness-0.0.13}/setup.cfg +0 -0
  41. {python_harness-0.0.11 → python_harness-0.0.13}/tests/test_evaluator.py +0 -0
  42. {python_harness-0.0.11 → python_harness-0.0.13}/tests/test_qc_evaluator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-harness
3
- Version: 0.0.11
3
+ Version: 0.0.13
4
4
  Summary: An agentic codebase evaluation and evolution tool for Python projects.
5
5
  Author-email: Mingli Yuan <mingli.yuan@gmail.com>
6
6
  License: MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "python-harness"
3
- version = "0.0.11"
3
+ version = "0.0.13"
4
4
  description = "An agentic codebase evaluation and evolution tool for Python projects."
5
5
  requires-python = ">=3.10"
6
6
  readme = "README.md"
@@ -2,4 +2,4 @@
2
2
  Python Harness - An agentic evaluation tool for codebases.
3
3
  """
4
4
 
5
- __version__ = "0.0.11"
5
+ __version__ = "0.0.13"
@@ -4,6 +4,7 @@ Command-line interface for python-harness.
4
4
 
5
5
  import os
6
6
  import sys
7
+ from pathlib import Path
7
8
  from typing import Any
8
9
 
9
10
  import typer
@@ -11,6 +12,7 @@ from dotenv import load_dotenv
11
12
  from rich.console import Console
12
13
 
13
14
  from python_harness.evaluator import Evaluator
15
+ from python_harness.refine_engine import run_refine
14
16
 
15
17
  # Try to find .env file explicitly before anything else executes
16
18
  env_path = os.path.join(os.getcwd(), '.env')
@@ -21,6 +23,8 @@ else:
21
23
 
22
24
  app = typer.Typer(help="Agentic harness tool for universal Python codebase evaluation.")
23
25
  console = Console()
26
+ MI_HEALTHY_THRESHOLD = 70.0
27
+ MI_WARNING_THRESHOLD = 40.0
24
28
 
25
29
 
26
30
  def _print_detail_block(title: str, details: str, color: str) -> None:
@@ -133,13 +137,21 @@ def _print_hard_evaluation_summary(hard_results: dict[str, Any]) -> None:
133
137
  _print_hard_failure_details(hard_results)
134
138
 
135
139
 
140
+ def _mi_scorecard_color(avg_mi: float) -> str:
141
+ if avg_mi >= MI_HEALTHY_THRESHOLD:
142
+ return "green"
143
+ if avg_mi >= MI_WARNING_THRESHOLD:
144
+ return "yellow"
145
+ return "red"
146
+
147
+
136
148
  def _print_mi_scorecard(hard_results: dict[str, Any]) -> None:
137
149
  mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
138
150
  if not mi_scores:
139
151
  return
140
152
 
141
153
  avg_mi = sum(mi_scores.values()) / len(mi_scores)
142
- color = "green" if avg_mi > 50 else "yellow" if avg_mi > 20 else "red"
154
+ color = _mi_scorecard_color(avg_mi)
143
155
  console.print(f"[{color}]Average Maintainability Index: {avg_mi:.1f}/100[/{color}]")
144
156
 
145
157
 
@@ -221,7 +233,7 @@ def _print_final_report(final_report: dict[str, Any]) -> None:
221
233
  suggestions = final_report.get("suggestions", [])
222
234
  if suggestions:
223
235
  console.print("[bold cyan]Top 3 Improvement Suggestions:[/bold cyan]")
224
- for i, sug in enumerate(suggestions, 1):
236
+ for i, sug in enumerate(suggestions[:3], 1):
225
237
  console.print(
226
238
  f" {i}. [bold]{sug.get('title', 'Suggestion')}[/bold] "
227
239
  f"(Target: [yellow]{sug.get('target_file', 'unknown')}[/yellow])"
@@ -232,56 +244,30 @@ def _print_final_report(final_report: dict[str, Any]) -> None:
232
244
  @app.command()
233
245
  def refine(
234
246
  path: str = typer.Argument(".", help="The path to evaluate and evolve"),
235
- steps: int = typer.Option(1, help="Number of evolution steps to perform"),
236
- max_retries: int = typer.Option(3, help="Maximum retries per variant if tests fail")
247
+ max_retries: int = typer.Option(3, help="Maximum retries per candidate"),
248
+ loop: bool = typer.Option(False, help="Keep refining winners across rounds"),
249
+ max_rounds: int = typer.Option(3, help="Maximum refine rounds when looping"),
237
250
  ) -> None:
238
251
  """
239
- Refine the codebase through an agentic Edit-Test-Improve loop.
240
- Generates variants based on suggestions, tests them, and picks the best.
252
+ Refine the codebase through a fixed two-level search and optional loop.
241
253
  """
242
254
  console.print(
243
- f"[bold magenta]Starting evolution loop for path:[/bold magenta] {path} "
244
- f"[dim](steps={steps}, max_retries={max_retries})[/dim]"
245
- )
246
-
247
- # 1. First, run a baseline evaluation to get suggestions
248
- evaluator = Evaluator(path)
249
- console.print("[cyan]Running baseline evaluation...[/cyan]")
250
- hard_results = evaluator.hard_evaluator.evaluate()
251
- soft_results = evaluator.soft_evaluator.evaluate()
252
- baseline_report = evaluator.soft_evaluator.generate_final_report(
253
- hard_results, {"all_passed": True, "failures": []}, soft_results
254
- )
255
-
256
- suggestions = baseline_report.get("suggestions", [])
257
- if not suggestions:
258
- console.print("[yellow]No suggestions found to evolve. Exiting.[/yellow]")
259
- return
260
-
261
- console.print(
262
- f"[green]Found {len(suggestions)} suggestions. "
263
- f"Starting evolution branches...[/green]"
255
+ f"[bold magenta]Starting refine for path:[/bold magenta] {path} "
256
+ f"[dim](loop={loop}, max_rounds={max_rounds}, "
257
+ f"max_retries={max_retries})[/dim]"
264
258
  )
265
-
266
- # TODO: Implement the Git branching and Agent modification logic here.
267
- # The loop will be:
268
- # for step in range(steps):
269
- # for suggestion in suggestions:
270
- # checkout new branch variant-X
271
- # for retry in range(max_retries):
272
- # ask LLM to apply suggestion to code
273
- # run pytest
274
- # if pytest passes:
275
- # run harness . to get new score
276
- # break
277
- # else:
278
- # feed error back to LLM for retry
279
- # compare all variants and checkout the best one
280
-
281
- console.print(
282
- "[yellow]Evolution engine skeleton ready. "
283
- "Actual git mutation logic pending.[/yellow]"
259
+ target_path = Path(path).resolve()
260
+
261
+ result = run_refine(
262
+ target_path=target_path,
263
+ max_retries=max_retries,
264
+ loop=loop,
265
+ max_rounds=max_rounds,
266
+ progress_callback=lambda message: console.print(f"[dim]{message}[/dim]"),
284
267
  )
268
+ console.print(f"[green]winner_id:[/green] {result['winner_id']}")
269
+ console.print(f"[cyan]rounds_completed:[/cyan] {result['rounds_completed']}")
270
+ console.print(f"[yellow]stop_reason:[/yellow] {result['stop_reason']}")
285
271
  @app.command()
286
272
  def measure(path: str = typer.Argument(".", help="The path to evaluate")) -> None:
287
273
  """
@@ -11,6 +11,8 @@ from typing import Any
11
11
 
12
12
  from rich.console import Console
13
13
 
14
+ from python_harness.python_file_inventory import collect_python_files
15
+
14
16
  console = Console()
15
17
  PYTEST_TIMEOUT_SECONDS = 60
16
18
 
@@ -22,6 +24,9 @@ class HardEvaluator:
22
24
  def __init__(self, target_path: str):
23
25
  self.target_path = Path(target_path).resolve()
24
26
 
27
+ def _radon_metric_targets(self) -> list[str]:
28
+ return [str(file_path) for file_path in collect_python_files(self.target_path)]
29
+
25
30
  def run_ruff(self) -> dict[str, Any]:
26
31
  """
27
32
  Run Ruff linter and return results.
@@ -112,6 +117,14 @@ class HardEvaluator:
112
117
  Flag any function/method with CC > 15 as a failure.
113
118
  """
114
119
  try:
120
+ targets = self._radon_metric_targets()
121
+ if not targets:
122
+ return {
123
+ "status": "success",
124
+ "issues": [],
125
+ "return_code": 0,
126
+ "output": "",
127
+ }
115
128
  result = subprocess.run(
116
129
  [
117
130
  sys.executable,
@@ -120,7 +133,7 @@ class HardEvaluator:
120
133
  "cc",
121
134
  "-j",
122
135
  "-a",
123
- str(self.target_path),
136
+ *targets,
124
137
  ],
125
138
  capture_output=True,
126
139
  text=True,
@@ -178,8 +191,11 @@ class HardEvaluator:
178
191
  but it contributes to the scorecard.
179
192
  """
180
193
  try:
194
+ targets = self._radon_metric_targets()
195
+ if not targets:
196
+ return {"status": "success", "mi_scores": {}, "return_code": 0}
181
197
  result = subprocess.run(
182
- [sys.executable, "-m", "radon", "mi", "-j", str(self.target_path)],
198
+ [sys.executable, "-m", "radon", "mi", "-j", *targets],
183
199
  capture_output=True,
184
200
  text=True,
185
201
  check=False
@@ -0,0 +1,32 @@
1
+ import os
2
+ from dataclasses import dataclass
3
+ from typing import Any
4
+
5
+ from openai import OpenAI
6
+
7
+
8
+ @dataclass(frozen=True)
9
+ class LLMSettings:
10
+ api_key: str | None
11
+ base_url: str
12
+ model_name: str
13
+ mini_model_name: str
14
+ request_timeout_seconds: float
15
+
16
+
17
+ def load_llm_settings() -> LLMSettings:
18
+ return LLMSettings(
19
+ api_key=os.environ.get("LLM_API_KEY"),
20
+ base_url=os.environ.get("LLM_BASE_URL", "https://api.deepseek.com/v1"),
21
+ model_name=os.environ.get("LLM_MODEL_NAME", "deepseek-reasoner"),
22
+ mini_model_name=os.environ.get("LLM_MINI_MODEL_NAME", "deepseek-chat"),
23
+ request_timeout_seconds=float(
24
+ os.environ.get("LLM_REQUEST_TIMEOUT_SECONDS", "60")
25
+ ),
26
+ )
27
+
28
+
29
+ def build_llm_client(settings: LLMSettings) -> Any | None:
30
+ if not settings.api_key:
31
+ return None
32
+ return OpenAI(api_key=settings.api_key, base_url=settings.base_url)
@@ -0,0 +1,27 @@
1
+ """
2
+ Python file discovery helpers.
3
+ """
4
+
5
+ from pathlib import Path
6
+
7
+ SKIPPED_DIRS = {"__pycache__", "env", "test", "tests", "vendors", "venv"}
8
+
9
+
10
+ def should_skip_python_path(file_path: Path, root: Path) -> bool:
11
+ if file_path.name.startswith("test_") or file_path.name.endswith("_test.py"):
12
+ return True
13
+ try:
14
+ relative_parts = file_path.relative_to(root).parts
15
+ except ValueError:
16
+ relative_parts = file_path.parts
17
+ return any(part.startswith(".") or part in SKIPPED_DIRS for part in relative_parts)
18
+
19
+
20
+ def collect_python_files(root: Path) -> list[Path]:
21
+ if root.is_file():
22
+ return [root] if root.suffix == ".py" else []
23
+ return [
24
+ file_path
25
+ for file_path in sorted(root.rglob("*.py"))
26
+ if not should_skip_python_path(file_path, root)
27
+ ]
@@ -0,0 +1,177 @@
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Any, cast
4
+
5
+ from python_harness.llm_client import build_llm_client, load_llm_settings
6
+ from python_harness.python_file_inventory import collect_python_files
7
+
8
+
9
+ class NullSuggestionApplier:
10
+ def apply(
11
+ self,
12
+ workspace: Path,
13
+ suggestion: dict[str, str],
14
+ failure_feedback: str = "",
15
+ ) -> dict[str, Any]:
16
+ return {
17
+ "ok": True,
18
+ "touched_files": [],
19
+ "failure_reason": "",
20
+ "suggestion_title": suggestion.get("title", ""),
21
+ "failure_feedback": failure_feedback,
22
+ "workspace": str(workspace),
23
+ }
24
+
25
+
26
+ class LLMSuggestionApplier:
27
+ def __init__(
28
+ self,
29
+ client: Any | None = None,
30
+ model_name: str | None = None,
31
+ ) -> None:
32
+ settings = load_llm_settings()
33
+ self.client = client if client is not None else build_llm_client(settings)
34
+ self.model_name = model_name or settings.mini_model_name
35
+ self.request_timeout_seconds = settings.request_timeout_seconds
36
+
37
+ def _select_files(self, workspace: Path, suggestion: dict[str, str]) -> list[Path]:
38
+ target_file = suggestion.get("target_file", "").strip()
39
+ if target_file and target_file != "all":
40
+ target_path = workspace / target_file
41
+ if target_path.is_file():
42
+ return [target_path]
43
+ if target_path.is_dir():
44
+ return sorted(target_path.rglob("*.py"))[:3]
45
+ return collect_python_files(workspace)[:3]
46
+
47
+ def _build_messages(
48
+ self,
49
+ workspace: Path,
50
+ suggestion: dict[str, str],
51
+ failure_feedback: str,
52
+ files: list[Path],
53
+ ) -> list[dict[str, str]]:
54
+ inventory = "\n".join(
55
+ f"- {file_path.relative_to(workspace)}"
56
+ for file_path in collect_python_files(workspace)
57
+ )
58
+ file_blocks = "\n\n".join(
59
+ (
60
+ f"FILE: {file_path.relative_to(workspace)}\n"
61
+ f"```python\n{file_path.read_text(encoding='utf-8')}\n```"
62
+ )
63
+ for file_path in files
64
+ )
65
+ system_prompt = (
66
+ "You apply a single repository improvement suggestion. "
67
+ "Return only valid JSON with schema "
68
+ '{"updates":[{"path":"relative/path.py","content":"full file content"}]}. '
69
+ "Make the smallest possible change that satisfies the suggestion "
70
+ "and preserves behavior. "
71
+ "Never write files outside the workspace."
72
+ )
73
+ user_prompt = (
74
+ f"Suggestion title: {suggestion.get('title', '')}\n"
75
+ f"Suggestion description: {suggestion.get('description', '')}\n"
76
+ f"Suggestion target_file: {suggestion.get('target_file', 'all')}\n"
77
+ f"Failure feedback from previous attempt: {failure_feedback or 'None'}\n\n"
78
+ f"Workspace python inventory:\n{inventory}\n\n"
79
+ f"Editable file contents:\n{file_blocks}"
80
+ )
81
+ return [
82
+ {"role": "system", "content": system_prompt},
83
+ {"role": "user", "content": user_prompt},
84
+ ]
85
+
86
+ def _parse_updates(self, raw_content: str) -> list[dict[str, str]]:
87
+ payload = json.loads(raw_content)
88
+ updates = payload.get("updates", [])
89
+ if not isinstance(updates, list):
90
+ raise ValueError("LLM updates payload must contain a list")
91
+ parsed: list[dict[str, str]] = []
92
+ for update in updates:
93
+ if not isinstance(update, dict):
94
+ continue
95
+ path = update.get("path")
96
+ content = update.get("content")
97
+ if isinstance(path, str) and isinstance(content, str):
98
+ parsed.append({"path": path, "content": content})
99
+ if not parsed:
100
+ raise ValueError("LLM returned no file updates")
101
+ return parsed
102
+
103
+ def apply(
104
+ self,
105
+ workspace: Path,
106
+ suggestion: dict[str, str],
107
+ failure_feedback: str = "",
108
+ ) -> dict[str, Any]:
109
+ if self.client is None:
110
+ return {
111
+ "ok": False,
112
+ "touched_files": [],
113
+ "failure_reason": "LLM_API_KEY not configured",
114
+ }
115
+ files = self._select_files(workspace, suggestion)
116
+ if not files:
117
+ return {
118
+ "ok": False,
119
+ "touched_files": [],
120
+ "failure_reason": "No editable files selected for suggestion",
121
+ }
122
+
123
+ client = cast(Any, self.client)
124
+ try:
125
+ completion = client.chat.completions.create(
126
+ model=self.model_name,
127
+ messages=self._build_messages(
128
+ workspace,
129
+ suggestion,
130
+ failure_feedback,
131
+ files,
132
+ ),
133
+ response_format={"type": "json_object"},
134
+ timeout=self.request_timeout_seconds,
135
+ )
136
+ except Exception as exc:
137
+ return {
138
+ "ok": False,
139
+ "touched_files": [],
140
+ "failure_reason": str(exc),
141
+ "retryable": False,
142
+ }
143
+ content = completion.choices[0].message.content
144
+ if not content:
145
+ return {
146
+ "ok": False,
147
+ "touched_files": [],
148
+ "failure_reason": "LLM returned empty response",
149
+ "retryable": False,
150
+ }
151
+
152
+ try:
153
+ updates = self._parse_updates(content)
154
+ touched_files: list[str] = []
155
+ for update in updates:
156
+ destination = (workspace / update["path"]).resolve()
157
+ if not destination.is_relative_to(workspace.resolve()):
158
+ raise ValueError("LLM update path is outside workspace")
159
+ destination.parent.mkdir(parents=True, exist_ok=True)
160
+ destination.write_text(update["content"], encoding="utf-8")
161
+ touched_files.append(str(destination.relative_to(workspace)))
162
+ except Exception as exc:
163
+ return {
164
+ "ok": False,
165
+ "touched_files": [],
166
+ "failure_reason": str(exc),
167
+ "retryable": False,
168
+ }
169
+
170
+ return {
171
+ "ok": True,
172
+ "touched_files": touched_files,
173
+ "failure_reason": "",
174
+ "suggestion_title": suggestion.get("title", ""),
175
+ "failure_feedback": failure_feedback,
176
+ "workspace": str(workspace),
177
+ }
@@ -0,0 +1,29 @@
1
+ import subprocess
2
+ import sys
3
+ from pathlib import Path
4
+
5
+
6
+ def run_command(path: Path, args: list[str]) -> tuple[bool, str]:
7
+ command_cwd = path if path.is_dir() else path.parent
8
+ completed = subprocess.run(
9
+ args,
10
+ cwd=command_cwd,
11
+ capture_output=True,
12
+ text=True,
13
+ check=False,
14
+ )
15
+ output = (completed.stdout + completed.stderr).strip()
16
+ return completed.returncode == 0, output
17
+
18
+
19
+ def default_self_check_runner(path: Path) -> tuple[bool, str]:
20
+ checks = [
21
+ [sys.executable, "-m", "ruff", "check", str(path)],
22
+ [sys.executable, "-m", "mypy", str(path)],
23
+ [sys.executable, "-m", "pytest", str(path)],
24
+ ]
25
+ for args in checks:
26
+ ok, output = run_command(path, args)
27
+ if not ok:
28
+ return False, output
29
+ return True, ""
@@ -0,0 +1,41 @@
1
+ from collections.abc import Callable
2
+ from pathlib import Path
3
+ from typing import Any
4
+
5
+ from python_harness.refine_checks import default_self_check_runner
6
+ from python_harness.refine_execution import (
7
+ execute_candidate as _execute_candidate,
8
+ )
9
+ from python_harness.refine_rounds import (
10
+ default_evaluator_runner,
11
+ default_workspace_root,
12
+ suggestions_from,
13
+ validate_workspace_root,
14
+ )
15
+ from python_harness.refine_rounds import (
16
+ run_refine as _run_refine,
17
+ )
18
+ from python_harness.refine_rounds import (
19
+ run_refine_round as _run_refine_round,
20
+ )
21
+
22
+ SelfCheckRunner = Callable[[Path], tuple[bool, str]]
23
+ EvaluatorRunner = Callable[[Path], dict[str, Any]]
24
+
25
+ _default_evaluator_runner = default_evaluator_runner
26
+ _default_self_check_runner = default_self_check_runner
27
+ _default_workspace_root = default_workspace_root
28
+ _suggestions_from = suggestions_from
29
+ _validate_workspace_root = validate_workspace_root
30
+
31
+
32
+ def execute_candidate(*args: Any, **kwargs: Any) -> Any:
33
+ return _execute_candidate(*args, **kwargs)
34
+
35
+
36
+ def run_refine_round(*args: Any, **kwargs: Any) -> Any:
37
+ return _run_refine_round(*args, **kwargs)
38
+
39
+
40
+ def run_refine(*args: Any, **kwargs: Any) -> Any:
41
+ return _run_refine(*args, **kwargs)
@@ -0,0 +1,114 @@
1
+ from pathlib import Path
2
+ from typing import Any
3
+
4
+ from python_harness.refine_models import Candidate, SuggestionApplier
5
+ from python_harness.refine_workspace import create_candidate_workspace
6
+
7
+
8
+ def _emit(progress_callback: Any, message: str) -> None:
9
+ if progress_callback is not None:
10
+ progress_callback(message)
11
+
12
+
13
+ def execute_candidate(
14
+ *,
15
+ parent: Candidate,
16
+ candidate_id: str,
17
+ suggestion: dict[str, str],
18
+ workspace_root: Path,
19
+ applier: SuggestionApplier,
20
+ self_check_runner: Any,
21
+ evaluator_runner: Any,
22
+ max_retries: int,
23
+ progress_callback: Any = None,
24
+ ) -> Candidate:
25
+ workspace = create_candidate_workspace(
26
+ parent.workspace,
27
+ workspace_root,
28
+ candidate_id,
29
+ )
30
+ feedback = ""
31
+ retries = 0
32
+ suggestion_title = suggestion.get("title", candidate_id)
33
+
34
+ while True:
35
+ apply_result: dict[str, Any] | None = None
36
+ _emit(
37
+ progress_callback,
38
+ f"{candidate_id} apply started: {suggestion_title}",
39
+ )
40
+ try:
41
+ apply_result = applier.apply(
42
+ workspace,
43
+ suggestion,
44
+ failure_feedback=feedback,
45
+ )
46
+ if not bool(apply_result.get("ok", False)):
47
+ feedback = str(
48
+ apply_result.get("failure_reason") or "suggestion apply failed"
49
+ )
50
+ raise RuntimeError(feedback)
51
+ _emit(progress_callback, f"{candidate_id} apply passed")
52
+ except Exception as exc:
53
+ feedback = str(exc)
54
+ retryable = True
55
+ if apply_result is not None:
56
+ retryable = bool(apply_result.get("retryable", True))
57
+ _emit(progress_callback, f"{candidate_id} apply failed: {feedback}")
58
+ if not retryable:
59
+ return Candidate(
60
+ id=candidate_id,
61
+ parent_id=parent.id,
62
+ depth=parent.depth + 1,
63
+ workspace=workspace,
64
+ suggestion_trace=parent.suggestion_trace + (suggestion_title,),
65
+ status="failed",
66
+ retry_count=retries,
67
+ selection_reason=feedback,
68
+ )
69
+ retries += 1
70
+ if retries > max_retries:
71
+ return Candidate(
72
+ id=candidate_id,
73
+ parent_id=parent.id,
74
+ depth=parent.depth + 1,
75
+ workspace=workspace,
76
+ suggestion_trace=parent.suggestion_trace + (suggestion_title,),
77
+ status="failed",
78
+ retry_count=retries - 1,
79
+ selection_reason=feedback,
80
+ )
81
+ continue
82
+
83
+ _emit(progress_callback, f"{candidate_id} guardrail 1 started")
84
+ is_ok, feedback = self_check_runner(workspace)
85
+ if is_ok:
86
+ _emit(progress_callback, f"{candidate_id} guardrail 1 passed")
87
+ _emit(progress_callback, f"{candidate_id} guardrail 2 started")
88
+ evaluation = evaluator_runner(workspace)
89
+ _emit(progress_callback, f"{candidate_id} guardrail 2 passed")
90
+ return Candidate(
91
+ id=candidate_id,
92
+ parent_id=parent.id,
93
+ depth=parent.depth + 1,
94
+ workspace=workspace,
95
+ suggestion_trace=parent.suggestion_trace + (suggestion_title,),
96
+ evaluation=evaluation,
97
+ status="measured",
98
+ retry_count=retries,
99
+ )
100
+
101
+ _emit(progress_callback, f"{candidate_id} guardrail 1 failed")
102
+ _emit(progress_callback, feedback)
103
+ retries += 1
104
+ if retries > max_retries:
105
+ return Candidate(
106
+ id=candidate_id,
107
+ parent_id=parent.id,
108
+ depth=parent.depth + 1,
109
+ workspace=workspace,
110
+ suggestion_trace=parent.suggestion_trace + (suggestion_title,),
111
+ status="failed",
112
+ retry_count=retries - 1,
113
+ selection_reason=str(feedback),
114
+ )
@@ -0,0 +1,40 @@
1
+ from dataclasses import dataclass, field
2
+ from pathlib import Path
3
+ from typing import Any, Protocol
4
+
5
+
6
+ @dataclass(slots=True)
7
+ class Candidate:
8
+ id: str
9
+ parent_id: str | None
10
+ depth: int
11
+ workspace: Path
12
+ suggestion_trace: tuple[str, ...]
13
+ evaluation: dict[str, Any] | None = None
14
+ status: str = "pending"
15
+ retry_count: int = 0
16
+ selection_reason: str = ""
17
+
18
+
19
+ @dataclass(slots=True)
20
+ class SelectionResult:
21
+ winner: Candidate
22
+ ordered_ids: list[str]
23
+ reason: str
24
+
25
+
26
+ @dataclass(slots=True)
27
+ class RefineRoundResult:
28
+ baseline: Candidate
29
+ candidates: list[Candidate] = field(default_factory=list)
30
+ winner: Candidate | None = None
31
+ stop_reason: str = ""
32
+
33
+
34
+ class SuggestionApplier(Protocol):
35
+ def apply(
36
+ self,
37
+ workspace: Path,
38
+ suggestion: dict[str, str],
39
+ failure_feedback: str = "",
40
+ ) -> dict[str, Any]: ...