python-harness 0.0.11__tar.gz → 0.0.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {python_harness-0.0.11/python_harness.egg-info → python_harness-0.0.13}/PKG-INFO +1 -1
- {python_harness-0.0.11 → python_harness-0.0.13}/pyproject.toml +1 -1
- {python_harness-0.0.11 → python_harness-0.0.13}/python_harness/__init__.py +1 -1
- {python_harness-0.0.11 → python_harness-0.0.13}/python_harness/cli.py +32 -46
- {python_harness-0.0.11 → python_harness-0.0.13}/python_harness/hard_evaluator.py +18 -2
- python_harness-0.0.13/python_harness/llm_client.py +32 -0
- python_harness-0.0.13/python_harness/python_file_inventory.py +27 -0
- python_harness-0.0.13/python_harness/refine_apply.py +177 -0
- python_harness-0.0.13/python_harness/refine_checks.py +29 -0
- python_harness-0.0.13/python_harness/refine_engine.py +41 -0
- python_harness-0.0.13/python_harness/refine_execution.py +114 -0
- python_harness-0.0.13/python_harness/refine_models.py +40 -0
- python_harness-0.0.13/python_harness/refine_rounds.py +373 -0
- python_harness-0.0.13/python_harness/refine_scoring.py +95 -0
- python_harness-0.0.13/python_harness/refine_workspace.py +57 -0
- python_harness-0.0.13/python_harness/soft_eval_report.py +30 -0
- python_harness-0.0.13/python_harness/soft_eval_report_messages.py +57 -0
- python_harness-0.0.13/python_harness/soft_eval_report_metrics.py +53 -0
- python_harness-0.0.13/python_harness/soft_eval_report_mock.py +45 -0
- python_harness-0.0.13/python_harness/soft_eval_report_shared.py +2 -0
- {python_harness-0.0.11 → python_harness-0.0.13}/python_harness/soft_evaluator.py +88 -151
- {python_harness-0.0.11 → python_harness-0.0.13/python_harness.egg-info}/PKG-INFO +1 -1
- python_harness-0.0.13/python_harness.egg-info/SOURCES.txt +39 -0
- {python_harness-0.0.11 → python_harness-0.0.13}/tests/test_cli.py +132 -37
- {python_harness-0.0.11 → python_harness-0.0.13}/tests/test_hard_evaluator.py +16 -0
- python_harness-0.0.13/tests/test_refine_apply.py +184 -0
- python_harness-0.0.13/tests/test_refine_engine.py +982 -0
- python_harness-0.0.13/tests/test_refine_scoring.py +177 -0
- python_harness-0.0.13/tests/test_refine_workspace.py +69 -0
- {python_harness-0.0.11 → python_harness-0.0.13}/tests/test_soft_evaluator.py +178 -1
- python_harness-0.0.11/python_harness.egg-info/SOURCES.txt +0 -20
- {python_harness-0.0.11 → python_harness-0.0.13}/LICENSE +0 -0
- {python_harness-0.0.11 → python_harness-0.0.13}/README.md +0 -0
- {python_harness-0.0.11 → python_harness-0.0.13}/python_harness/evaluator.py +0 -0
- {python_harness-0.0.11 → python_harness-0.0.13}/python_harness/qc_evaluator.py +0 -0
- {python_harness-0.0.11 → python_harness-0.0.13}/python_harness.egg-info/dependency_links.txt +0 -0
- {python_harness-0.0.11 → python_harness-0.0.13}/python_harness.egg-info/entry_points.txt +0 -0
- {python_harness-0.0.11 → python_harness-0.0.13}/python_harness.egg-info/requires.txt +0 -0
- {python_harness-0.0.11 → python_harness-0.0.13}/python_harness.egg-info/top_level.txt +0 -0
- {python_harness-0.0.11 → python_harness-0.0.13}/setup.cfg +0 -0
- {python_harness-0.0.11 → python_harness-0.0.13}/tests/test_evaluator.py +0 -0
- {python_harness-0.0.11 → python_harness-0.0.13}/tests/test_qc_evaluator.py +0 -0
|
@@ -4,6 +4,7 @@ Command-line interface for python-harness.
|
|
|
4
4
|
|
|
5
5
|
import os
|
|
6
6
|
import sys
|
|
7
|
+
from pathlib import Path
|
|
7
8
|
from typing import Any
|
|
8
9
|
|
|
9
10
|
import typer
|
|
@@ -11,6 +12,7 @@ from dotenv import load_dotenv
|
|
|
11
12
|
from rich.console import Console
|
|
12
13
|
|
|
13
14
|
from python_harness.evaluator import Evaluator
|
|
15
|
+
from python_harness.refine_engine import run_refine
|
|
14
16
|
|
|
15
17
|
# Try to find .env file explicitly before anything else executes
|
|
16
18
|
env_path = os.path.join(os.getcwd(), '.env')
|
|
@@ -21,6 +23,8 @@ else:
|
|
|
21
23
|
|
|
22
24
|
app = typer.Typer(help="Agentic harness tool for universal Python codebase evaluation.")
|
|
23
25
|
console = Console()
|
|
26
|
+
MI_HEALTHY_THRESHOLD = 70.0
|
|
27
|
+
MI_WARNING_THRESHOLD = 40.0
|
|
24
28
|
|
|
25
29
|
|
|
26
30
|
def _print_detail_block(title: str, details: str, color: str) -> None:
|
|
@@ -133,13 +137,21 @@ def _print_hard_evaluation_summary(hard_results: dict[str, Any]) -> None:
|
|
|
133
137
|
_print_hard_failure_details(hard_results)
|
|
134
138
|
|
|
135
139
|
|
|
140
|
+
def _mi_scorecard_color(avg_mi: float) -> str:
    """Pick the console color used to print an average Maintainability Index.

    Returns ``"green"`` at or above ``MI_HEALTHY_THRESHOLD``, ``"yellow"`` at
    or above ``MI_WARNING_THRESHOLD``, and ``"red"`` for anything lower.
    """
    # Bands are checked from the highest threshold down; the first match wins.
    bands = (
        (MI_HEALTHY_THRESHOLD, "green"),
        (MI_WARNING_THRESHOLD, "yellow"),
    )
    for lower_bound, band_color in bands:
        if avg_mi >= lower_bound:
            return band_color
    return "red"
|
|
146
|
+
|
|
147
|
+
|
|
136
148
|
def _print_mi_scorecard(hard_results: dict[str, Any]) -> None:
|
|
137
149
|
mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
|
|
138
150
|
if not mi_scores:
|
|
139
151
|
return
|
|
140
152
|
|
|
141
153
|
avg_mi = sum(mi_scores.values()) / len(mi_scores)
|
|
142
|
-
color =
|
|
154
|
+
color = _mi_scorecard_color(avg_mi)
|
|
143
155
|
console.print(f"[{color}]Average Maintainability Index: {avg_mi:.1f}/100[/{color}]")
|
|
144
156
|
|
|
145
157
|
|
|
@@ -221,7 +233,7 @@ def _print_final_report(final_report: dict[str, Any]) -> None:
|
|
|
221
233
|
suggestions = final_report.get("suggestions", [])
|
|
222
234
|
if suggestions:
|
|
223
235
|
console.print("[bold cyan]Top 3 Improvement Suggestions:[/bold cyan]")
|
|
224
|
-
for i, sug in enumerate(suggestions, 1):
|
|
236
|
+
for i, sug in enumerate(suggestions[:3], 1):
|
|
225
237
|
console.print(
|
|
226
238
|
f" {i}. [bold]{sug.get('title', 'Suggestion')}[/bold] "
|
|
227
239
|
f"(Target: [yellow]{sug.get('target_file', 'unknown')}[/yellow])"
|
|
@@ -232,56 +244,30 @@ def _print_final_report(final_report: dict[str, Any]) -> None:
|
|
|
232
244
|
@app.command()
|
|
233
245
|
def refine(
|
|
234
246
|
path: str = typer.Argument(".", help="The path to evaluate and evolve"),
|
|
235
|
-
|
|
236
|
-
|
|
247
|
+
max_retries: int = typer.Option(3, help="Maximum retries per candidate"),
|
|
248
|
+
loop: bool = typer.Option(False, help="Keep refining winners across rounds"),
|
|
249
|
+
max_rounds: int = typer.Option(3, help="Maximum refine rounds when looping"),
|
|
237
250
|
) -> None:
|
|
238
251
|
"""
|
|
239
|
-
Refine the codebase through
|
|
240
|
-
Generates variants based on suggestions, tests them, and picks the best.
|
|
252
|
+
Refine the codebase through a fixed two-level search and optional loop.
|
|
241
253
|
"""
|
|
242
254
|
console.print(
|
|
243
|
-
f"[bold magenta]Starting
|
|
244
|
-
f"[dim](
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
# 1. First, run a baseline evaluation to get suggestions
|
|
248
|
-
evaluator = Evaluator(path)
|
|
249
|
-
console.print("[cyan]Running baseline evaluation...[/cyan]")
|
|
250
|
-
hard_results = evaluator.hard_evaluator.evaluate()
|
|
251
|
-
soft_results = evaluator.soft_evaluator.evaluate()
|
|
252
|
-
baseline_report = evaluator.soft_evaluator.generate_final_report(
|
|
253
|
-
hard_results, {"all_passed": True, "failures": []}, soft_results
|
|
254
|
-
)
|
|
255
|
-
|
|
256
|
-
suggestions = baseline_report.get("suggestions", [])
|
|
257
|
-
if not suggestions:
|
|
258
|
-
console.print("[yellow]No suggestions found to evolve. Exiting.[/yellow]")
|
|
259
|
-
return
|
|
260
|
-
|
|
261
|
-
console.print(
|
|
262
|
-
f"[green]Found {len(suggestions)} suggestions. "
|
|
263
|
-
f"Starting evolution branches...[/green]"
|
|
255
|
+
f"[bold magenta]Starting refine for path:[/bold magenta] {path} "
|
|
256
|
+
f"[dim](loop={loop}, max_rounds={max_rounds}, "
|
|
257
|
+
f"max_retries={max_retries})[/dim]"
|
|
264
258
|
)
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
# run pytest
|
|
274
|
-
# if pytest passes:
|
|
275
|
-
# run harness . to get new score
|
|
276
|
-
# break
|
|
277
|
-
# else:
|
|
278
|
-
# feed error back to LLM for retry
|
|
279
|
-
# compare all variants and checkout the best one
|
|
280
|
-
|
|
281
|
-
console.print(
|
|
282
|
-
"[yellow]Evolution engine skeleton ready. "
|
|
283
|
-
"Actual git mutation logic pending.[/yellow]"
|
|
259
|
+
target_path = Path(path).resolve()
|
|
260
|
+
|
|
261
|
+
result = run_refine(
|
|
262
|
+
target_path=target_path,
|
|
263
|
+
max_retries=max_retries,
|
|
264
|
+
loop=loop,
|
|
265
|
+
max_rounds=max_rounds,
|
|
266
|
+
progress_callback=lambda message: console.print(f"[dim]{message}[/dim]"),
|
|
284
267
|
)
|
|
268
|
+
console.print(f"[green]winner_id:[/green] {result['winner_id']}")
|
|
269
|
+
console.print(f"[cyan]rounds_completed:[/cyan] {result['rounds_completed']}")
|
|
270
|
+
console.print(f"[yellow]stop_reason:[/yellow] {result['stop_reason']}")
|
|
285
271
|
@app.command()
|
|
286
272
|
def measure(path: str = typer.Argument(".", help="The path to evaluate")) -> None:
|
|
287
273
|
"""
|
|
@@ -11,6 +11,8 @@ from typing import Any
|
|
|
11
11
|
|
|
12
12
|
from rich.console import Console
|
|
13
13
|
|
|
14
|
+
from python_harness.python_file_inventory import collect_python_files
|
|
15
|
+
|
|
14
16
|
console = Console()
|
|
15
17
|
PYTEST_TIMEOUT_SECONDS = 60
|
|
16
18
|
|
|
@@ -22,6 +24,9 @@ class HardEvaluator:
|
|
|
22
24
|
def __init__(self, target_path: str):
|
|
23
25
|
self.target_path = Path(target_path).resolve()
|
|
24
26
|
|
|
27
|
+
def _radon_metric_targets(self) -> list[str]:
    """Return the files radon should analyse, as a list of path strings.

    The files are whatever ``collect_python_files`` reports for
    ``self.target_path``; the list is empty when it finds none.
    """
    discovered = collect_python_files(self.target_path)
    return list(map(str, discovered))
|
|
29
|
+
|
|
25
30
|
def run_ruff(self) -> dict[str, Any]:
|
|
26
31
|
"""
|
|
27
32
|
Run Ruff linter and return results.
|
|
@@ -112,6 +117,14 @@ class HardEvaluator:
|
|
|
112
117
|
Flag any function/method with CC > 15 as a failure.
|
|
113
118
|
"""
|
|
114
119
|
try:
|
|
120
|
+
targets = self._radon_metric_targets()
|
|
121
|
+
if not targets:
|
|
122
|
+
return {
|
|
123
|
+
"status": "success",
|
|
124
|
+
"issues": [],
|
|
125
|
+
"return_code": 0,
|
|
126
|
+
"output": "",
|
|
127
|
+
}
|
|
115
128
|
result = subprocess.run(
|
|
116
129
|
[
|
|
117
130
|
sys.executable,
|
|
@@ -120,7 +133,7 @@ class HardEvaluator:
|
|
|
120
133
|
"cc",
|
|
121
134
|
"-j",
|
|
122
135
|
"-a",
|
|
123
|
-
|
|
136
|
+
*targets,
|
|
124
137
|
],
|
|
125
138
|
capture_output=True,
|
|
126
139
|
text=True,
|
|
@@ -178,8 +191,11 @@ class HardEvaluator:
|
|
|
178
191
|
but it contributes to the scorecard.
|
|
179
192
|
"""
|
|
180
193
|
try:
|
|
194
|
+
targets = self._radon_metric_targets()
|
|
195
|
+
if not targets:
|
|
196
|
+
return {"status": "success", "mi_scores": {}, "return_code": 0}
|
|
181
197
|
result = subprocess.run(
|
|
182
|
-
[sys.executable, "-m", "radon", "mi", "-j",
|
|
198
|
+
[sys.executable, "-m", "radon", "mi", "-j", *targets],
|
|
183
199
|
capture_output=True,
|
|
184
200
|
text=True,
|
|
185
201
|
check=False
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from openai import OpenAI
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass(frozen=True)
|
|
9
|
+
class LLMSettings:
|
|
10
|
+
api_key: str | None
|
|
11
|
+
base_url: str
|
|
12
|
+
model_name: str
|
|
13
|
+
mini_model_name: str
|
|
14
|
+
request_timeout_seconds: float
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def load_llm_settings() -> LLMSettings:
    """Build an ``LLMSettings`` from the process environment.

    Reads ``LLM_API_KEY``, ``LLM_BASE_URL``, ``LLM_MODEL_NAME``,
    ``LLM_MINI_MODEL_NAME`` and ``LLM_REQUEST_TIMEOUT_SECONDS``. Every value
    except the API key has a built-in default.

    Raises:
        ValueError: If ``LLM_REQUEST_TIMEOUT_SECONDS`` is set to something
            ``float()`` cannot parse.
    """
    env = os.environ
    timeout_text = env.get("LLM_REQUEST_TIMEOUT_SECONDS", "60")
    return LLMSettings(
        api_key=env.get("LLM_API_KEY"),
        base_url=env.get("LLM_BASE_URL", "https://api.deepseek.com/v1"),
        model_name=env.get("LLM_MODEL_NAME", "deepseek-reasoner"),
        mini_model_name=env.get("LLM_MINI_MODEL_NAME", "deepseek-chat"),
        request_timeout_seconds=float(timeout_text),
    )
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def build_llm_client(settings: LLMSettings) -> Any | None:
    """Create an ``OpenAI`` client for *settings*, or ``None`` without an API key.

    An empty-string key is treated the same as a missing one.
    """
    key = settings.api_key
    return OpenAI(api_key=key, base_url=settings.base_url) if key else None
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Python file discovery helpers.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
SKIPPED_DIRS = {"__pycache__", "env", "test", "tests", "vendors", "venv"}


def should_skip_python_path(file_path: Path, root: Path) -> bool:
    """Tell whether *file_path* should be left out of the Python file inventory.

    A path is skipped when its file name looks like a test module
    (``test_*`` or ``*_test.py``), or when any component of the path is
    hidden (starts with ``.``) or listed in ``SKIPPED_DIRS``. Components are
    taken relative to *root* when possible, otherwise from the full path.
    """
    filename = file_path.name
    if filename.startswith("test_") or filename.endswith("_test.py"):
        return True

    try:
        components = file_path.relative_to(root).parts
    except ValueError:
        # file_path is not located under root: inspect the whole path instead.
        components = file_path.parts

    for component in components:
        if component in SKIPPED_DIRS or component.startswith("."):
            return True
    return False
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def collect_python_files(root: Path) -> list[Path]:
    """List the Python source files found at or below *root*.

    When *root* is itself a file, the result is ``[root]`` if it has a
    ``.py`` suffix and ``[]`` otherwise; the skip rules are not applied in
    that case. For a directory, every ``*.py`` path found recursively is
    returned in sorted order, minus those rejected by
    ``should_skip_python_path``.
    """
    if root.is_file():
        if root.suffix == ".py":
            return [root]
        return []

    kept: list[Path] = []
    for candidate in sorted(root.rglob("*.py")):
        if should_skip_python_path(candidate, root):
            continue
        kept.append(candidate)
    return kept
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any, cast
|
|
4
|
+
|
|
5
|
+
from python_harness.llm_client import build_llm_client, load_llm_settings
|
|
6
|
+
from python_harness.python_file_inventory import collect_python_files
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class NullSuggestionApplier:
    """Applier that changes nothing and always reports success."""

    def apply(
        self,
        workspace: Path,
        suggestion: dict[str, str],
        failure_feedback: str = "",
    ) -> dict[str, Any]:
        """Return a successful apply result without touching the workspace.

        The result echoes the suggestion title, the feedback that was passed
        in, and the workspace path, and reports no touched files.
        """
        outcome: dict[str, Any] = {
            "ok": True,
            "touched_files": [],
            "failure_reason": "",
        }
        outcome["suggestion_title"] = suggestion.get("title", "")
        outcome["failure_feedback"] = failure_feedback
        outcome["workspace"] = str(workspace)
        return outcome
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class LLMSuggestionApplier:
|
|
27
|
+
def __init__(
|
|
28
|
+
self,
|
|
29
|
+
client: Any | None = None,
|
|
30
|
+
model_name: str | None = None,
|
|
31
|
+
) -> None:
|
|
32
|
+
settings = load_llm_settings()
|
|
33
|
+
self.client = client if client is not None else build_llm_client(settings)
|
|
34
|
+
self.model_name = model_name or settings.mini_model_name
|
|
35
|
+
self.request_timeout_seconds = settings.request_timeout_seconds
|
|
36
|
+
|
|
37
|
+
def _select_files(self, workspace: Path, suggestion: dict[str, str]) -> list[Path]:
|
|
38
|
+
target_file = suggestion.get("target_file", "").strip()
|
|
39
|
+
if target_file and target_file != "all":
|
|
40
|
+
target_path = workspace / target_file
|
|
41
|
+
if target_path.is_file():
|
|
42
|
+
return [target_path]
|
|
43
|
+
if target_path.is_dir():
|
|
44
|
+
return sorted(target_path.rglob("*.py"))[:3]
|
|
45
|
+
return collect_python_files(workspace)[:3]
|
|
46
|
+
|
|
47
|
+
def _build_messages(
|
|
48
|
+
self,
|
|
49
|
+
workspace: Path,
|
|
50
|
+
suggestion: dict[str, str],
|
|
51
|
+
failure_feedback: str,
|
|
52
|
+
files: list[Path],
|
|
53
|
+
) -> list[dict[str, str]]:
|
|
54
|
+
inventory = "\n".join(
|
|
55
|
+
f"- {file_path.relative_to(workspace)}"
|
|
56
|
+
for file_path in collect_python_files(workspace)
|
|
57
|
+
)
|
|
58
|
+
file_blocks = "\n\n".join(
|
|
59
|
+
(
|
|
60
|
+
f"FILE: {file_path.relative_to(workspace)}\n"
|
|
61
|
+
f"```python\n{file_path.read_text(encoding='utf-8')}\n```"
|
|
62
|
+
)
|
|
63
|
+
for file_path in files
|
|
64
|
+
)
|
|
65
|
+
system_prompt = (
|
|
66
|
+
"You apply a single repository improvement suggestion. "
|
|
67
|
+
"Return only valid JSON with schema "
|
|
68
|
+
'{"updates":[{"path":"relative/path.py","content":"full file content"}]}. '
|
|
69
|
+
"Make the smallest possible change that satisfies the suggestion "
|
|
70
|
+
"and preserves behavior. "
|
|
71
|
+
"Never write files outside the workspace."
|
|
72
|
+
)
|
|
73
|
+
user_prompt = (
|
|
74
|
+
f"Suggestion title: {suggestion.get('title', '')}\n"
|
|
75
|
+
f"Suggestion description: {suggestion.get('description', '')}\n"
|
|
76
|
+
f"Suggestion target_file: {suggestion.get('target_file', 'all')}\n"
|
|
77
|
+
f"Failure feedback from previous attempt: {failure_feedback or 'None'}\n\n"
|
|
78
|
+
f"Workspace python inventory:\n{inventory}\n\n"
|
|
79
|
+
f"Editable file contents:\n{file_blocks}"
|
|
80
|
+
)
|
|
81
|
+
return [
|
|
82
|
+
{"role": "system", "content": system_prompt},
|
|
83
|
+
{"role": "user", "content": user_prompt},
|
|
84
|
+
]
|
|
85
|
+
|
|
86
|
+
def _parse_updates(self, raw_content: str) -> list[dict[str, str]]:
|
|
87
|
+
payload = json.loads(raw_content)
|
|
88
|
+
updates = payload.get("updates", [])
|
|
89
|
+
if not isinstance(updates, list):
|
|
90
|
+
raise ValueError("LLM updates payload must contain a list")
|
|
91
|
+
parsed: list[dict[str, str]] = []
|
|
92
|
+
for update in updates:
|
|
93
|
+
if not isinstance(update, dict):
|
|
94
|
+
continue
|
|
95
|
+
path = update.get("path")
|
|
96
|
+
content = update.get("content")
|
|
97
|
+
if isinstance(path, str) and isinstance(content, str):
|
|
98
|
+
parsed.append({"path": path, "content": content})
|
|
99
|
+
if not parsed:
|
|
100
|
+
raise ValueError("LLM returned no file updates")
|
|
101
|
+
return parsed
|
|
102
|
+
|
|
103
|
+
def apply(
|
|
104
|
+
self,
|
|
105
|
+
workspace: Path,
|
|
106
|
+
suggestion: dict[str, str],
|
|
107
|
+
failure_feedback: str = "",
|
|
108
|
+
) -> dict[str, Any]:
|
|
109
|
+
if self.client is None:
|
|
110
|
+
return {
|
|
111
|
+
"ok": False,
|
|
112
|
+
"touched_files": [],
|
|
113
|
+
"failure_reason": "LLM_API_KEY not configured",
|
|
114
|
+
}
|
|
115
|
+
files = self._select_files(workspace, suggestion)
|
|
116
|
+
if not files:
|
|
117
|
+
return {
|
|
118
|
+
"ok": False,
|
|
119
|
+
"touched_files": [],
|
|
120
|
+
"failure_reason": "No editable files selected for suggestion",
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
client = cast(Any, self.client)
|
|
124
|
+
try:
|
|
125
|
+
completion = client.chat.completions.create(
|
|
126
|
+
model=self.model_name,
|
|
127
|
+
messages=self._build_messages(
|
|
128
|
+
workspace,
|
|
129
|
+
suggestion,
|
|
130
|
+
failure_feedback,
|
|
131
|
+
files,
|
|
132
|
+
),
|
|
133
|
+
response_format={"type": "json_object"},
|
|
134
|
+
timeout=self.request_timeout_seconds,
|
|
135
|
+
)
|
|
136
|
+
except Exception as exc:
|
|
137
|
+
return {
|
|
138
|
+
"ok": False,
|
|
139
|
+
"touched_files": [],
|
|
140
|
+
"failure_reason": str(exc),
|
|
141
|
+
"retryable": False,
|
|
142
|
+
}
|
|
143
|
+
content = completion.choices[0].message.content
|
|
144
|
+
if not content:
|
|
145
|
+
return {
|
|
146
|
+
"ok": False,
|
|
147
|
+
"touched_files": [],
|
|
148
|
+
"failure_reason": "LLM returned empty response",
|
|
149
|
+
"retryable": False,
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
try:
|
|
153
|
+
updates = self._parse_updates(content)
|
|
154
|
+
touched_files: list[str] = []
|
|
155
|
+
for update in updates:
|
|
156
|
+
destination = (workspace / update["path"]).resolve()
|
|
157
|
+
if not destination.is_relative_to(workspace.resolve()):
|
|
158
|
+
raise ValueError("LLM update path is outside workspace")
|
|
159
|
+
destination.parent.mkdir(parents=True, exist_ok=True)
|
|
160
|
+
destination.write_text(update["content"], encoding="utf-8")
|
|
161
|
+
touched_files.append(str(destination.relative_to(workspace)))
|
|
162
|
+
except Exception as exc:
|
|
163
|
+
return {
|
|
164
|
+
"ok": False,
|
|
165
|
+
"touched_files": [],
|
|
166
|
+
"failure_reason": str(exc),
|
|
167
|
+
"retryable": False,
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
return {
|
|
171
|
+
"ok": True,
|
|
172
|
+
"touched_files": touched_files,
|
|
173
|
+
"failure_reason": "",
|
|
174
|
+
"suggestion_title": suggestion.get("title", ""),
|
|
175
|
+
"failure_feedback": failure_feedback,
|
|
176
|
+
"workspace": str(workspace),
|
|
177
|
+
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import sys
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def run_command(path: Path, args: list[str]) -> tuple[bool, str]:
    """Run *args* as a subprocess and report whether it exited with status 0.

    The command runs in *path* when that is a directory, otherwise in its
    parent directory.

    Returns:
        ``(succeeded, output)`` where *output* is stdout followed by stderr
        with surrounding whitespace stripped.
    """
    if path.is_dir():
        working_dir = path
    else:
        working_dir = path.parent

    proc = subprocess.run(
        args,
        cwd=working_dir,
        capture_output=True,
        text=True,
        check=False,
    )
    combined = f"{proc.stdout}{proc.stderr}".strip()
    return proc.returncode == 0, combined
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def default_self_check_runner(path: Path) -> tuple[bool, str]:
    """Run ruff, mypy and pytest against *path*, stopping at the first failure.

    Each tool is started as ``python -m <tool>`` with the current
    interpreter.

    Returns:
        ``(True, "")`` when every check passes, otherwise ``(False, output)``
        with the output of the first failing check.
    """
    target = str(path)
    tool_invocations = (("ruff", "check"), ("mypy",), ("pytest",))
    for tool_args in tool_invocations:
        passed, output = run_command(
            path,
            [sys.executable, "-m", *tool_args, target],
        )
        if not passed:
            return False, output
    return True, ""
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from collections.abc import Callable
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from python_harness.refine_checks import default_self_check_runner
|
|
6
|
+
from python_harness.refine_execution import (
|
|
7
|
+
execute_candidate as _execute_candidate,
|
|
8
|
+
)
|
|
9
|
+
from python_harness.refine_rounds import (
|
|
10
|
+
default_evaluator_runner,
|
|
11
|
+
default_workspace_root,
|
|
12
|
+
suggestions_from,
|
|
13
|
+
validate_workspace_root,
|
|
14
|
+
)
|
|
15
|
+
from python_harness.refine_rounds import (
|
|
16
|
+
run_refine as _run_refine,
|
|
17
|
+
)
|
|
18
|
+
from python_harness.refine_rounds import (
|
|
19
|
+
run_refine_round as _run_refine_round,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
SelfCheckRunner = Callable[[Path], tuple[bool, str]]
|
|
23
|
+
EvaluatorRunner = Callable[[Path], dict[str, Any]]
|
|
24
|
+
|
|
25
|
+
_default_evaluator_runner = default_evaluator_runner
|
|
26
|
+
_default_self_check_runner = default_self_check_runner
|
|
27
|
+
_default_workspace_root = default_workspace_root
|
|
28
|
+
_suggestions_from = suggestions_from
|
|
29
|
+
_validate_workspace_root = validate_workspace_root
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def execute_candidate(*args: Any, **kwargs: Any) -> Any:
    """Forward all arguments unchanged to ``refine_execution.execute_candidate``."""
    outcome = _execute_candidate(*args, **kwargs)
    return outcome
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def run_refine_round(*args: Any, **kwargs: Any) -> Any:
    """Forward all arguments unchanged to ``refine_rounds.run_refine_round``."""
    outcome = _run_refine_round(*args, **kwargs)
    return outcome
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def run_refine(*args: Any, **kwargs: Any) -> Any:
    """Forward all arguments unchanged to ``refine_rounds.run_refine``."""
    outcome = _run_refine(*args, **kwargs)
    return outcome
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from python_harness.refine_models import Candidate, SuggestionApplier
|
|
5
|
+
from python_harness.refine_workspace import create_candidate_workspace
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _emit(progress_callback: Any, message: str) -> None:
|
|
9
|
+
if progress_callback is not None:
|
|
10
|
+
progress_callback(message)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def execute_candidate(
    *,
    parent: Candidate,
    candidate_id: str,
    suggestion: dict[str, str],
    workspace_root: Path,
    applier: SuggestionApplier,
    self_check_runner: Any,
    evaluator_runner: Any,
    max_retries: int,
    progress_callback: Any = None,
) -> Candidate:
    """Apply one suggestion in a new candidate workspace, check it, measure it.

    Each attempt applies the suggestion with *applier*, then runs
    *self_check_runner* ("guardrail 1"). When the self-check passes,
    *evaluator_runner* ("guardrail 2") is run and a ``"measured"`` candidate
    is returned. A failed apply or self-check is retried in the same
    workspace, feeding the failure text back to the applier, until
    *max_retries* retries have been used.

    Args:
        parent: Candidate whose workspace is the starting point.
        candidate_id: Identifier for the new candidate and its workspace.
        suggestion: Suggestion to apply; its ``title`` (or *candidate_id* when
            absent) is appended to the suggestion trace.
        workspace_root: Directory passed to ``create_candidate_workspace``.
        applier: Object whose ``apply`` performs the change.
        self_check_runner: Callable taking the workspace and returning
            ``(ok, feedback)``.
        evaluator_runner: Callable taking the workspace and returning the
            evaluation stored on the candidate.
        max_retries: Number of retries allowed after the first attempt.
        progress_callback: Optional callable receiving progress messages.

    Returns:
        A candidate with status ``"measured"`` on success, or ``"failed"``
        with the last failure text in ``selection_reason``. An apply result
        carrying ``"retryable": False`` fails immediately.
    """
    workspace = create_candidate_workspace(
        parent.workspace,
        workspace_root,
        candidate_id,
    )
    title = suggestion.get("title", candidate_id)

    def report(message: str) -> None:
        _emit(progress_callback, message)

    def finish(status: str, retry_count: int, **extra: Any) -> Candidate:
        # Shared constructor for every exit; only status, retry count and the
        # evaluation/selection_reason extras differ between outcomes.
        return Candidate(
            id=candidate_id,
            parent_id=parent.id,
            depth=parent.depth + 1,
            workspace=workspace,
            suggestion_trace=parent.suggestion_trace + (title,),
            status=status,
            retry_count=retry_count,
            **extra,
        )

    feedback = ""
    failed_attempts = 0
    while True:
        outcome: dict[str, Any] | None = None
        report(f"{candidate_id} apply started: {title}")
        try:
            outcome = applier.apply(
                workspace,
                suggestion,
                failure_feedback=feedback,
            )
            if not bool(outcome.get("ok", False)):
                feedback = str(
                    outcome.get("failure_reason") or "suggestion apply failed"
                )
                raise RuntimeError(feedback)
            report(f"{candidate_id} apply passed")
        except Exception as exc:
            feedback = str(exc)
            # An exception raised before any result exists counts as retryable.
            can_retry = (
                True if outcome is None else bool(outcome.get("retryable", True))
            )
            report(f"{candidate_id} apply failed: {feedback}")
            if not can_retry or failed_attempts >= max_retries:
                return finish(
                    "failed",
                    failed_attempts,
                    selection_reason=feedback,
                )
            failed_attempts += 1
            continue

        report(f"{candidate_id} guardrail 1 started")
        is_ok, feedback = self_check_runner(workspace)
        if not is_ok:
            report(f"{candidate_id} guardrail 1 failed")
            report(feedback)
            if failed_attempts >= max_retries:
                return finish(
                    "failed",
                    failed_attempts,
                    selection_reason=str(feedback),
                )
            failed_attempts += 1
            continue

        report(f"{candidate_id} guardrail 1 passed")
        report(f"{candidate_id} guardrail 2 started")
        evaluation = evaluator_runner(workspace)
        report(f"{candidate_id} guardrail 2 passed")
        return finish("measured", failed_attempts, evaluation=evaluation)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any, Protocol
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass(slots=True)
|
|
7
|
+
class Candidate:
|
|
8
|
+
id: str
|
|
9
|
+
parent_id: str | None
|
|
10
|
+
depth: int
|
|
11
|
+
workspace: Path
|
|
12
|
+
suggestion_trace: tuple[str, ...]
|
|
13
|
+
evaluation: dict[str, Any] | None = None
|
|
14
|
+
status: str = "pending"
|
|
15
|
+
retry_count: int = 0
|
|
16
|
+
selection_reason: str = ""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(slots=True)
class SelectionResult:
    """Outcome of choosing a winner among candidates."""

    # The candidate that was selected.
    winner: Candidate
    # Candidate ids in the order produced by the selection.
    ordered_ids: list[str]
    # Explanation of the selection.
    reason: str
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass(slots=True)
class RefineRoundResult:
    """Result of one refine round: its baseline, candidates and winner."""

    # Candidate the round started from.
    baseline: Candidate
    # Candidates produced during the round; each instance gets its own list.
    candidates: list[Candidate] = field(default_factory=list)
    # Selected candidate, or None when there is none.
    winner: Candidate | None = None
    # Explanation of why the round stopped (empty by default).
    stop_reason: str = ""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class SuggestionApplier(Protocol):
    """Structural interface for objects that apply a suggestion to a workspace."""

    def apply(
        self,
        workspace: Path,
        suggestion: dict[str, str],
        failure_feedback: str = "",
    ) -> dict[str, Any]:
        """Apply *suggestion* inside *workspace* and return a result dict.

        *failure_feedback* carries the text of a previous failed attempt, or
        an empty string when there is none.
        """
        ...
|