python-harness 0.0.12__tar.gz → 0.0.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {python_harness-0.0.12/python_harness.egg-info → python_harness-0.0.13}/PKG-INFO +1 -1
- {python_harness-0.0.12 → python_harness-0.0.13}/pyproject.toml +1 -1
- {python_harness-0.0.12 → python_harness-0.0.13}/python_harness/cli.py +21 -45
- python_harness-0.0.13/python_harness/llm_client.py +32 -0
- python_harness-0.0.13/python_harness/refine_apply.py +177 -0
- python_harness-0.0.13/python_harness/refine_checks.py +29 -0
- python_harness-0.0.13/python_harness/refine_engine.py +41 -0
- python_harness-0.0.13/python_harness/refine_execution.py +114 -0
- python_harness-0.0.13/python_harness/refine_models.py +40 -0
- python_harness-0.0.13/python_harness/refine_rounds.py +373 -0
- python_harness-0.0.13/python_harness/refine_scoring.py +95 -0
- python_harness-0.0.13/python_harness/refine_workspace.py +57 -0
- python_harness-0.0.13/python_harness/soft_eval_report.py +30 -0
- python_harness-0.0.13/python_harness/soft_eval_report_messages.py +57 -0
- python_harness-0.0.13/python_harness/soft_eval_report_metrics.py +53 -0
- python_harness-0.0.13/python_harness/soft_eval_report_mock.py +45 -0
- python_harness-0.0.13/python_harness/soft_eval_report_shared.py +2 -0
- {python_harness-0.0.12 → python_harness-0.0.13}/python_harness/soft_evaluator.py +69 -18
- {python_harness-0.0.12 → python_harness-0.0.13/python_harness.egg-info}/PKG-INFO +1 -1
- {python_harness-0.0.12 → python_harness-0.0.13}/python_harness.egg-info/SOURCES.txt +17 -0
- {python_harness-0.0.12 → python_harness-0.0.13}/tests/test_cli.py +118 -37
- python_harness-0.0.13/tests/test_refine_apply.py +184 -0
- python_harness-0.0.13/tests/test_refine_engine.py +982 -0
- python_harness-0.0.13/tests/test_refine_scoring.py +177 -0
- python_harness-0.0.13/tests/test_refine_workspace.py +69 -0
- {python_harness-0.0.12 → python_harness-0.0.13}/tests/test_soft_evaluator.py +116 -1
- python_harness-0.0.12/python_harness/soft_eval_report.py +0 -154
- {python_harness-0.0.12 → python_harness-0.0.13}/LICENSE +0 -0
- {python_harness-0.0.12 → python_harness-0.0.13}/README.md +0 -0
- {python_harness-0.0.12 → python_harness-0.0.13}/python_harness/__init__.py +0 -0
- {python_harness-0.0.12 → python_harness-0.0.13}/python_harness/evaluator.py +0 -0
- {python_harness-0.0.12 → python_harness-0.0.13}/python_harness/hard_evaluator.py +0 -0
- {python_harness-0.0.12 → python_harness-0.0.13}/python_harness/python_file_inventory.py +0 -0
- {python_harness-0.0.12 → python_harness-0.0.13}/python_harness/qc_evaluator.py +0 -0
- {python_harness-0.0.12 → python_harness-0.0.13}/python_harness.egg-info/dependency_links.txt +0 -0
- {python_harness-0.0.12 → python_harness-0.0.13}/python_harness.egg-info/entry_points.txt +0 -0
- {python_harness-0.0.12 → python_harness-0.0.13}/python_harness.egg-info/requires.txt +0 -0
- {python_harness-0.0.12 → python_harness-0.0.13}/python_harness.egg-info/top_level.txt +0 -0
- {python_harness-0.0.12 → python_harness-0.0.13}/setup.cfg +0 -0
- {python_harness-0.0.12 → python_harness-0.0.13}/tests/test_evaluator.py +0 -0
- {python_harness-0.0.12 → python_harness-0.0.13}/tests/test_hard_evaluator.py +0 -0
- {python_harness-0.0.12 → python_harness-0.0.13}/tests/test_qc_evaluator.py +0 -0
|
@@ -4,6 +4,7 @@ Command-line interface for python-harness.
|
|
|
4
4
|
|
|
5
5
|
import os
|
|
6
6
|
import sys
|
|
7
|
+
from pathlib import Path
|
|
7
8
|
from typing import Any
|
|
8
9
|
|
|
9
10
|
import typer
|
|
@@ -11,6 +12,7 @@ from dotenv import load_dotenv
|
|
|
11
12
|
from rich.console import Console
|
|
12
13
|
|
|
13
14
|
from python_harness.evaluator import Evaluator
|
|
15
|
+
from python_harness.refine_engine import run_refine
|
|
14
16
|
|
|
15
17
|
# Try to find .env file explicitly before anything else executes
|
|
16
18
|
env_path = os.path.join(os.getcwd(), '.env')
|
|
@@ -231,7 +233,7 @@ def _print_final_report(final_report: dict[str, Any]) -> None:
|
|
|
231
233
|
suggestions = final_report.get("suggestions", [])
|
|
232
234
|
if suggestions:
|
|
233
235
|
console.print("[bold cyan]Top 3 Improvement Suggestions:[/bold cyan]")
|
|
234
|
-
for i, sug in enumerate(suggestions, 1):
|
|
236
|
+
for i, sug in enumerate(suggestions[:3], 1):
|
|
235
237
|
console.print(
|
|
236
238
|
f" {i}. [bold]{sug.get('title', 'Suggestion')}[/bold] "
|
|
237
239
|
f"(Target: [yellow]{sug.get('target_file', 'unknown')}[/yellow])"
|
|
@@ -242,56 +244,30 @@ def _print_final_report(final_report: dict[str, Any]) -> None:
|
|
|
242
244
|
@app.command()
|
|
243
245
|
def refine(
|
|
244
246
|
path: str = typer.Argument(".", help="The path to evaluate and evolve"),
|
|
245
|
-
|
|
246
|
-
|
|
247
|
+
max_retries: int = typer.Option(3, help="Maximum retries per candidate"),
|
|
248
|
+
loop: bool = typer.Option(False, help="Keep refining winners across rounds"),
|
|
249
|
+
max_rounds: int = typer.Option(3, help="Maximum refine rounds when looping"),
|
|
247
250
|
) -> None:
|
|
248
251
|
"""
|
|
249
|
-
Refine the codebase through
|
|
250
|
-
Generates variants based on suggestions, tests them, and picks the best.
|
|
252
|
+
Refine the codebase through a fixed two-level search and optional loop.
|
|
251
253
|
"""
|
|
252
254
|
console.print(
|
|
253
|
-
f"[bold magenta]Starting
|
|
254
|
-
f"[dim](
|
|
255
|
+
f"[bold magenta]Starting refine for path:[/bold magenta] {path} "
|
|
256
|
+
f"[dim](loop={loop}, max_rounds={max_rounds}, "
|
|
257
|
+
f"max_retries={max_retries})[/dim]"
|
|
255
258
|
)
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
)
|
|
265
|
-
|
|
266
|
-
suggestions = baseline_report.get("suggestions", [])
|
|
267
|
-
if not suggestions:
|
|
268
|
-
console.print("[yellow]No suggestions found to evolve. Exiting.[/yellow]")
|
|
269
|
-
return
|
|
270
|
-
|
|
271
|
-
console.print(
|
|
272
|
-
f"[green]Found {len(suggestions)} suggestions. "
|
|
273
|
-
f"Starting evolution branches...[/green]"
|
|
274
|
-
)
|
|
275
|
-
|
|
276
|
-
# TODO: Implement the Git branching and Agent modification logic here.
|
|
277
|
-
# The loop will be:
|
|
278
|
-
# for step in range(steps):
|
|
279
|
-
# for suggestion in suggestions:
|
|
280
|
-
# checkout new branch variant-X
|
|
281
|
-
# for retry in range(max_retries):
|
|
282
|
-
# ask LLM to apply suggestion to code
|
|
283
|
-
# run pytest
|
|
284
|
-
# if pytest passes:
|
|
285
|
-
# run harness . to get new score
|
|
286
|
-
# break
|
|
287
|
-
# else:
|
|
288
|
-
# feed error back to LLM for retry
|
|
289
|
-
# compare all variants and checkout the best one
|
|
290
|
-
|
|
291
|
-
console.print(
|
|
292
|
-
"[yellow]Evolution engine skeleton ready. "
|
|
293
|
-
"Actual git mutation logic pending.[/yellow]"
|
|
259
|
+
target_path = Path(path).resolve()
|
|
260
|
+
|
|
261
|
+
result = run_refine(
|
|
262
|
+
target_path=target_path,
|
|
263
|
+
max_retries=max_retries,
|
|
264
|
+
loop=loop,
|
|
265
|
+
max_rounds=max_rounds,
|
|
266
|
+
progress_callback=lambda message: console.print(f"[dim]{message}[/dim]"),
|
|
294
267
|
)
|
|
268
|
+
console.print(f"[green]winner_id:[/green] {result['winner_id']}")
|
|
269
|
+
console.print(f"[cyan]rounds_completed:[/cyan] {result['rounds_completed']}")
|
|
270
|
+
console.print(f"[yellow]stop_reason:[/yellow] {result['stop_reason']}")
|
|
295
271
|
@app.command()
|
|
296
272
|
def measure(path: str = typer.Argument(".", help="The path to evaluate")) -> None:
|
|
297
273
|
"""
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from openai import OpenAI
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass(frozen=True)
|
|
9
|
+
class LLMSettings:
|
|
10
|
+
api_key: str | None
|
|
11
|
+
base_url: str
|
|
12
|
+
model_name: str
|
|
13
|
+
mini_model_name: str
|
|
14
|
+
request_timeout_seconds: float
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def load_llm_settings() -> LLMSettings:
    """Build an ``LLMSettings`` from the ``LLM_*`` environment variables.

    Defaults are used for ``LLM_BASE_URL``, ``LLM_MODEL_NAME``,
    ``LLM_MINI_MODEL_NAME`` and ``LLM_REQUEST_TIMEOUT_SECONDS`` when the
    variable is unset *or blank* (for example ``LLM_BASE_URL=`` in a
    ``.env`` file), so an empty entry can no longer override a default with
    an empty string. ``LLM_API_KEY`` is passed through unchanged and may be
    ``None``.

    Returns:
        The populated, immutable settings object.

    Raises:
        ValueError: If ``LLM_REQUEST_TIMEOUT_SECONDS`` is set to something
            that cannot be parsed as a number. The message names the
            variable and the offending value.
    """

    def _env(name: str, default: str) -> str:
        # A blank value is treated like an unset variable; a non-blank
        # value is returned exactly as it was set.
        value = os.environ.get(name, "")
        return value if value.strip() else default

    raw_timeout = _env("LLM_REQUEST_TIMEOUT_SECONDS", "60")
    try:
        timeout_seconds = float(raw_timeout)
    except ValueError as exc:
        raise ValueError(
            f"LLM_REQUEST_TIMEOUT_SECONDS must be a number, got {raw_timeout!r}"
        ) from exc

    return LLMSettings(
        api_key=os.environ.get("LLM_API_KEY"),
        base_url=_env("LLM_BASE_URL", "https://api.deepseek.com/v1"),
        model_name=_env("LLM_MODEL_NAME", "deepseek-reasoner"),
        mini_model_name=_env("LLM_MINI_MODEL_NAME", "deepseek-chat"),
        request_timeout_seconds=timeout_seconds,
    )
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def build_llm_client(settings: LLMSettings) -> Any | None:
    """Create an ``OpenAI`` client from *settings*.

    Returns:
        A client built with the configured API key and base URL, or
        ``None`` when ``settings.api_key`` is missing or empty.
    """
    if settings.api_key:
        return OpenAI(api_key=settings.api_key, base_url=settings.base_url)
    return None
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any, cast
|
|
4
|
+
|
|
5
|
+
from python_harness.llm_client import build_llm_client, load_llm_settings
|
|
6
|
+
from python_harness.python_file_inventory import collect_python_files
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class NullSuggestionApplier:
    """Applier that reports success without touching the workspace."""

    def apply(
        self,
        workspace: Path,
        suggestion: dict[str, str],
        failure_feedback: str = "",
    ) -> dict[str, Any]:
        """Return a successful apply report that lists no touched files.

        Args:
            workspace: Workspace path; only echoed back as a string.
            suggestion: Suggestion mapping; only its ``title`` is read.
            failure_feedback: Feedback text; echoed back unchanged.

        Returns:
            A report dict with ``ok=True`` and an empty ``touched_files``.
        """
        report: dict[str, Any] = dict(ok=True, touched_files=[], failure_reason="")
        report["suggestion_title"] = suggestion.get("title", "")
        report["failure_feedback"] = failure_feedback
        report["workspace"] = str(workspace)
        return report
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class LLMSuggestionApplier:
|
|
27
|
+
def __init__(
|
|
28
|
+
self,
|
|
29
|
+
client: Any | None = None,
|
|
30
|
+
model_name: str | None = None,
|
|
31
|
+
) -> None:
|
|
32
|
+
settings = load_llm_settings()
|
|
33
|
+
self.client = client if client is not None else build_llm_client(settings)
|
|
34
|
+
self.model_name = model_name or settings.mini_model_name
|
|
35
|
+
self.request_timeout_seconds = settings.request_timeout_seconds
|
|
36
|
+
|
|
37
|
+
def _select_files(self, workspace: Path, suggestion: dict[str, str]) -> list[Path]:
|
|
38
|
+
target_file = suggestion.get("target_file", "").strip()
|
|
39
|
+
if target_file and target_file != "all":
|
|
40
|
+
target_path = workspace / target_file
|
|
41
|
+
if target_path.is_file():
|
|
42
|
+
return [target_path]
|
|
43
|
+
if target_path.is_dir():
|
|
44
|
+
return sorted(target_path.rglob("*.py"))[:3]
|
|
45
|
+
return collect_python_files(workspace)[:3]
|
|
46
|
+
|
|
47
|
+
def _build_messages(
|
|
48
|
+
self,
|
|
49
|
+
workspace: Path,
|
|
50
|
+
suggestion: dict[str, str],
|
|
51
|
+
failure_feedback: str,
|
|
52
|
+
files: list[Path],
|
|
53
|
+
) -> list[dict[str, str]]:
|
|
54
|
+
inventory = "\n".join(
|
|
55
|
+
f"- {file_path.relative_to(workspace)}"
|
|
56
|
+
for file_path in collect_python_files(workspace)
|
|
57
|
+
)
|
|
58
|
+
file_blocks = "\n\n".join(
|
|
59
|
+
(
|
|
60
|
+
f"FILE: {file_path.relative_to(workspace)}\n"
|
|
61
|
+
f"```python\n{file_path.read_text(encoding='utf-8')}\n```"
|
|
62
|
+
)
|
|
63
|
+
for file_path in files
|
|
64
|
+
)
|
|
65
|
+
system_prompt = (
|
|
66
|
+
"You apply a single repository improvement suggestion. "
|
|
67
|
+
"Return only valid JSON with schema "
|
|
68
|
+
'{"updates":[{"path":"relative/path.py","content":"full file content"}]}. '
|
|
69
|
+
"Make the smallest possible change that satisfies the suggestion "
|
|
70
|
+
"and preserves behavior. "
|
|
71
|
+
"Never write files outside the workspace."
|
|
72
|
+
)
|
|
73
|
+
user_prompt = (
|
|
74
|
+
f"Suggestion title: {suggestion.get('title', '')}\n"
|
|
75
|
+
f"Suggestion description: {suggestion.get('description', '')}\n"
|
|
76
|
+
f"Suggestion target_file: {suggestion.get('target_file', 'all')}\n"
|
|
77
|
+
f"Failure feedback from previous attempt: {failure_feedback or 'None'}\n\n"
|
|
78
|
+
f"Workspace python inventory:\n{inventory}\n\n"
|
|
79
|
+
f"Editable file contents:\n{file_blocks}"
|
|
80
|
+
)
|
|
81
|
+
return [
|
|
82
|
+
{"role": "system", "content": system_prompt},
|
|
83
|
+
{"role": "user", "content": user_prompt},
|
|
84
|
+
]
|
|
85
|
+
|
|
86
|
+
def _parse_updates(self, raw_content: str) -> list[dict[str, str]]:
|
|
87
|
+
payload = json.loads(raw_content)
|
|
88
|
+
updates = payload.get("updates", [])
|
|
89
|
+
if not isinstance(updates, list):
|
|
90
|
+
raise ValueError("LLM updates payload must contain a list")
|
|
91
|
+
parsed: list[dict[str, str]] = []
|
|
92
|
+
for update in updates:
|
|
93
|
+
if not isinstance(update, dict):
|
|
94
|
+
continue
|
|
95
|
+
path = update.get("path")
|
|
96
|
+
content = update.get("content")
|
|
97
|
+
if isinstance(path, str) and isinstance(content, str):
|
|
98
|
+
parsed.append({"path": path, "content": content})
|
|
99
|
+
if not parsed:
|
|
100
|
+
raise ValueError("LLM returned no file updates")
|
|
101
|
+
return parsed
|
|
102
|
+
|
|
103
|
+
def apply(
|
|
104
|
+
self,
|
|
105
|
+
workspace: Path,
|
|
106
|
+
suggestion: dict[str, str],
|
|
107
|
+
failure_feedback: str = "",
|
|
108
|
+
) -> dict[str, Any]:
|
|
109
|
+
if self.client is None:
|
|
110
|
+
return {
|
|
111
|
+
"ok": False,
|
|
112
|
+
"touched_files": [],
|
|
113
|
+
"failure_reason": "LLM_API_KEY not configured",
|
|
114
|
+
}
|
|
115
|
+
files = self._select_files(workspace, suggestion)
|
|
116
|
+
if not files:
|
|
117
|
+
return {
|
|
118
|
+
"ok": False,
|
|
119
|
+
"touched_files": [],
|
|
120
|
+
"failure_reason": "No editable files selected for suggestion",
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
client = cast(Any, self.client)
|
|
124
|
+
try:
|
|
125
|
+
completion = client.chat.completions.create(
|
|
126
|
+
model=self.model_name,
|
|
127
|
+
messages=self._build_messages(
|
|
128
|
+
workspace,
|
|
129
|
+
suggestion,
|
|
130
|
+
failure_feedback,
|
|
131
|
+
files,
|
|
132
|
+
),
|
|
133
|
+
response_format={"type": "json_object"},
|
|
134
|
+
timeout=self.request_timeout_seconds,
|
|
135
|
+
)
|
|
136
|
+
except Exception as exc:
|
|
137
|
+
return {
|
|
138
|
+
"ok": False,
|
|
139
|
+
"touched_files": [],
|
|
140
|
+
"failure_reason": str(exc),
|
|
141
|
+
"retryable": False,
|
|
142
|
+
}
|
|
143
|
+
content = completion.choices[0].message.content
|
|
144
|
+
if not content:
|
|
145
|
+
return {
|
|
146
|
+
"ok": False,
|
|
147
|
+
"touched_files": [],
|
|
148
|
+
"failure_reason": "LLM returned empty response",
|
|
149
|
+
"retryable": False,
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
try:
|
|
153
|
+
updates = self._parse_updates(content)
|
|
154
|
+
touched_files: list[str] = []
|
|
155
|
+
for update in updates:
|
|
156
|
+
destination = (workspace / update["path"]).resolve()
|
|
157
|
+
if not destination.is_relative_to(workspace.resolve()):
|
|
158
|
+
raise ValueError("LLM update path is outside workspace")
|
|
159
|
+
destination.parent.mkdir(parents=True, exist_ok=True)
|
|
160
|
+
destination.write_text(update["content"], encoding="utf-8")
|
|
161
|
+
touched_files.append(str(destination.relative_to(workspace)))
|
|
162
|
+
except Exception as exc:
|
|
163
|
+
return {
|
|
164
|
+
"ok": False,
|
|
165
|
+
"touched_files": [],
|
|
166
|
+
"failure_reason": str(exc),
|
|
167
|
+
"retryable": False,
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
return {
|
|
171
|
+
"ok": True,
|
|
172
|
+
"touched_files": touched_files,
|
|
173
|
+
"failure_reason": "",
|
|
174
|
+
"suggestion_title": suggestion.get("title", ""),
|
|
175
|
+
"failure_feedback": failure_feedback,
|
|
176
|
+
"workspace": str(workspace),
|
|
177
|
+
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import sys
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def run_command(path: Path, args: list[str]) -> tuple[bool, str]:
    """Run *args* as a subprocess and report whether it exited with 0.

    The working directory is *path* itself when it is a directory,
    otherwise its parent.

    Args:
        path: File or directory that determines the working directory.
        args: Command and arguments, passed to ``subprocess.run`` as a
            list (no shell).

    Returns:
        ``(ok, output)`` where ``ok`` is ``True`` for exit status 0 and
        ``output`` is stdout followed by stderr with surrounding
        whitespace stripped. If the process cannot be started at all
        (missing executable, missing working directory, ...), ``ok`` is
        ``False`` and ``output`` describes the error instead of the
        exception propagating to the caller.
    """
    command_cwd = path if path.is_dir() else path.parent
    try:
        completed = subprocess.run(
            args,
            cwd=command_cwd,
            capture_output=True,
            text=True,
            check=False,
        )
    except OSError as exc:
        # Launch failures are reported through the normal (ok, output)
        # contract so a caller's retry/feedback loop can handle them.
        return False, f"failed to run {args[0]!r}: {exc}"
    output = (completed.stdout + completed.stderr).strip()
    return completed.returncode == 0, output
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def default_self_check_runner(path: Path) -> tuple[bool, str]:
    """Run ruff, mypy and pytest against *path*, stopping at the first failure.

    Each tool is started as ``<current interpreter> -m <tool> ... <path>``.
    The path is resolved to an absolute path first: the commands run with
    their working directory set from *path*, so handing a relative path to
    the tools as well would make them look for it relative to that
    directory (for example ``ws/ws``).

    Args:
        path: File or directory to check.

    Returns:
        ``(True, "")`` when every check passes, otherwise ``(False, output)``
        with the output of the first failing check.
    """
    target = path.resolve()
    checks = [
        [sys.executable, "-m", "ruff", "check", str(target)],
        [sys.executable, "-m", "mypy", str(target)],
        [sys.executable, "-m", "pytest", str(target)],
    ]
    for args in checks:
        ok, output = run_command(target, args)
        if not ok:
            return False, output
    return True, ""
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from collections.abc import Callable
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from python_harness.refine_checks import default_self_check_runner
|
|
6
|
+
from python_harness.refine_execution import (
|
|
7
|
+
execute_candidate as _execute_candidate,
|
|
8
|
+
)
|
|
9
|
+
from python_harness.refine_rounds import (
|
|
10
|
+
default_evaluator_runner,
|
|
11
|
+
default_workspace_root,
|
|
12
|
+
suggestions_from,
|
|
13
|
+
validate_workspace_root,
|
|
14
|
+
)
|
|
15
|
+
from python_harness.refine_rounds import (
|
|
16
|
+
run_refine as _run_refine,
|
|
17
|
+
)
|
|
18
|
+
from python_harness.refine_rounds import (
|
|
19
|
+
run_refine_round as _run_refine_round,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
SelfCheckRunner = Callable[[Path], tuple[bool, str]]
|
|
23
|
+
EvaluatorRunner = Callable[[Path], dict[str, Any]]
|
|
24
|
+
|
|
25
|
+
_default_evaluator_runner = default_evaluator_runner
|
|
26
|
+
_default_self_check_runner = default_self_check_runner
|
|
27
|
+
_default_workspace_root = default_workspace_root
|
|
28
|
+
_suggestions_from = suggestions_from
|
|
29
|
+
_validate_workspace_root = validate_workspace_root
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def execute_candidate(*args: Any, **kwargs: Any) -> Any:
    """Forward every argument to ``refine_execution.execute_candidate``."""
    outcome = _execute_candidate(*args, **kwargs)
    return outcome
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def run_refine_round(*args: Any, **kwargs: Any) -> Any:
    """Forward every argument to ``refine_rounds.run_refine_round``."""
    outcome = _run_refine_round(*args, **kwargs)
    return outcome
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def run_refine(*args: Any, **kwargs: Any) -> Any:
    """Forward every argument to ``refine_rounds.run_refine``."""
    outcome = _run_refine(*args, **kwargs)
    return outcome
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from python_harness.refine_models import Candidate, SuggestionApplier
|
|
5
|
+
from python_harness.refine_workspace import create_candidate_workspace
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _emit(progress_callback: Any, message: str) -> None:
|
|
9
|
+
if progress_callback is not None:
|
|
10
|
+
progress_callback(message)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def execute_candidate(
    *,
    parent: Candidate,
    candidate_id: str,
    suggestion: dict[str, str],
    workspace_root: Path,
    applier: SuggestionApplier,
    self_check_runner: Any,
    evaluator_runner: Any,
    max_retries: int,
    progress_callback: Any = None,
) -> Candidate:
    """Apply one suggestion in a fresh workspace, check it and measure it.

    Each attempt applies the suggestion, then runs the self-check
    ("guardrail 1"). When the self-check passes, the evaluator
    ("guardrail 2") is run and a ``"measured"`` candidate is returned. A
    failed apply or self-check counts as one retry, and its feedback text
    is handed to the next apply call.

    Args:
        parent: Candidate whose workspace is the starting point.
        candidate_id: Identifier for the new candidate and its workspace.
        suggestion: Suggestion mapping; ``title`` labels the attempt.
        workspace_root: Directory under which the workspace is created.
        applier: Object whose ``apply`` performs the modification.
        self_check_runner: Callable ``workspace -> (ok, feedback)``.
        evaluator_runner: Callable ``workspace -> evaluation``.
        max_retries: Number of retries allowed after the first attempt.
        progress_callback: Optional callable that receives progress text.

    Returns:
        A ``"measured"`` candidate carrying the evaluation, or a
        ``"failed"`` candidate whose ``selection_reason`` holds the last
        feedback. A failure the applier marks ``retryable: False`` ends
        the loop immediately.
    """
    workspace = create_candidate_workspace(
        parent.workspace,
        workspace_root,
        candidate_id,
    )
    suggestion_title = suggestion.get("title", candidate_id)

    def finish(status: str, retry_count: int, **extra: Any) -> Candidate:
        # Every exit builds the same candidate skeleton; only the status,
        # retry count and evaluation / selection_reason differ.
        return Candidate(
            id=candidate_id,
            parent_id=parent.id,
            depth=parent.depth + 1,
            workspace=workspace,
            suggestion_trace=parent.suggestion_trace + (suggestion_title,),
            status=status,
            retry_count=retry_count,
            **extra,
        )

    def report(text: str) -> None:
        _emit(progress_callback, text)

    feedback = ""
    failed_attempts = 0
    while True:
        apply_result: dict[str, Any] | None = None
        report(f"{candidate_id} apply started: {suggestion_title}")
        try:
            apply_result = applier.apply(
                workspace,
                suggestion,
                failure_feedback=feedback,
            )
            if not bool(apply_result.get("ok", False)):
                feedback = str(
                    apply_result.get("failure_reason") or "suggestion apply failed"
                )
                raise RuntimeError(feedback)
            report(f"{candidate_id} apply passed")
        except Exception as exc:
            feedback = str(exc)
            # If apply() itself raised there is no result to consult, so
            # the attempt is treated as retryable.
            can_retry = (
                True
                if apply_result is None
                else bool(apply_result.get("retryable", True))
            )
            report(f"{candidate_id} apply failed: {feedback}")
            if not can_retry:
                return finish("failed", failed_attempts, selection_reason=feedback)
            failed_attempts += 1
            if failed_attempts > max_retries:
                return finish(
                    "failed",
                    failed_attempts - 1,
                    selection_reason=feedback,
                )
            continue

        report(f"{candidate_id} guardrail 1 started")
        is_ok, feedback = self_check_runner(workspace)
        if not is_ok:
            report(f"{candidate_id} guardrail 1 failed")
            report(feedback)
            failed_attempts += 1
            if failed_attempts > max_retries:
                return finish(
                    "failed",
                    failed_attempts - 1,
                    selection_reason=str(feedback),
                )
            continue

        report(f"{candidate_id} guardrail 1 passed")
        report(f"{candidate_id} guardrail 2 started")
        evaluation = evaluator_runner(workspace)
        report(f"{candidate_id} guardrail 2 passed")
        return finish("measured", failed_attempts, evaluation=evaluation)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any, Protocol
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass(slots=True)
|
|
7
|
+
class Candidate:
|
|
8
|
+
id: str
|
|
9
|
+
parent_id: str | None
|
|
10
|
+
depth: int
|
|
11
|
+
workspace: Path
|
|
12
|
+
suggestion_trace: tuple[str, ...]
|
|
13
|
+
evaluation: dict[str, Any] | None = None
|
|
14
|
+
status: str = "pending"
|
|
15
|
+
retry_count: int = 0
|
|
16
|
+
selection_reason: str = ""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(slots=True)
class SelectionResult:
    """Outcome of choosing a winner among candidates."""

    # The chosen candidate.
    winner: Candidate
    # Candidate identifiers in the order produced by the selection.
    ordered_ids: list[str]
    # Text explaining the choice.
    reason: str
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass(slots=True)
class RefineRoundResult:
    """Result record for a single refine round."""

    # Candidate the round started from.
    baseline: Candidate
    # Candidates recorded for the round; each instance gets its own list.
    candidates: list[Candidate] = field(default_factory=list)
    # Winning candidate, or ``None`` when there is none.
    winner: Candidate | None = None
    # Text describing why the round stopped (empty by default).
    stop_reason: str = ""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class SuggestionApplier(Protocol):
    """Structural interface for objects that apply a suggestion to a workspace."""

    def apply(
        self,
        workspace: Path,
        suggestion: dict[str, str],
        failure_feedback: str = "",
    ) -> dict[str, Any]:
        """Apply *suggestion* inside *workspace* and return a report dict.

        Args:
            workspace: Directory the applier may modify.
            suggestion: Suggestion mapping to apply.
            failure_feedback: Feedback text from a previous attempt, if any.
        """
        ...
|