python-harness 0.0.8__tar.gz → 0.0.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {python_harness-0.0.8/python_harness.egg-info → python_harness-0.0.10}/PKG-INFO +1 -1
- {python_harness-0.0.8 → python_harness-0.0.10}/pyproject.toml +1 -2
- {python_harness-0.0.8 → python_harness-0.0.10}/python_harness/__init__.py +1 -1
- python_harness-0.0.10/python_harness/cli.py +312 -0
- {python_harness-0.0.8 → python_harness-0.0.10}/python_harness/hard_evaluator.py +64 -22
- {python_harness-0.0.8 → python_harness-0.0.10}/python_harness/soft_evaluator.py +248 -153
- {python_harness-0.0.8 → python_harness-0.0.10/python_harness.egg-info}/PKG-INFO +1 -1
- {python_harness-0.0.8 → python_harness-0.0.10}/python_harness.egg-info/SOURCES.txt +1 -0
- python_harness-0.0.10/tests/test_cli.py +429 -0
- python_harness-0.0.10/tests/test_evaluator.py +37 -0
- python_harness-0.0.10/tests/test_hard_evaluator.py +358 -0
- python_harness-0.0.10/tests/test_qc_evaluator.py +65 -0
- python_harness-0.0.10/tests/test_soft_evaluator.py +354 -0
- python_harness-0.0.8/python_harness/cli.py +0 -253
- python_harness-0.0.8/tests/test_cli.py +0 -26
- python_harness-0.0.8/tests/test_evaluator.py +0 -18
- python_harness-0.0.8/tests/test_hard_evaluator.py +0 -95
- python_harness-0.0.8/tests/test_soft_evaluator.py +0 -42
- {python_harness-0.0.8 → python_harness-0.0.10}/LICENSE +0 -0
- {python_harness-0.0.8 → python_harness-0.0.10}/README.md +0 -0
- {python_harness-0.0.8 → python_harness-0.0.10}/python_harness/evaluator.py +0 -0
- {python_harness-0.0.8 → python_harness-0.0.10}/python_harness/qc_evaluator.py +0 -0
- {python_harness-0.0.8 → python_harness-0.0.10}/python_harness.egg-info/dependency_links.txt +0 -0
- {python_harness-0.0.8 → python_harness-0.0.10}/python_harness.egg-info/entry_points.txt +0 -0
- {python_harness-0.0.8 → python_harness-0.0.10}/python_harness.egg-info/requires.txt +0 -0
- {python_harness-0.0.8 → python_harness-0.0.10}/python_harness.egg-info/top_level.txt +0 -0
- {python_harness-0.0.8 → python_harness-0.0.10}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "python-harness"
|
|
3
|
-
version = "0.0.8"
|
|
3
|
+
version = "0.0.10"
|
|
4
4
|
description = "An agentic codebase evaluation and evolution tool for Python projects."
|
|
5
5
|
requires-python = ">=3.10"
|
|
6
6
|
readme = "README.md"
|
|
@@ -62,4 +62,3 @@ addopts = "-ra -q --cov=python_harness --cov-report=term-missing --cov-report=ht
|
|
|
62
62
|
testpaths = [
|
|
63
63
|
"tests",
|
|
64
64
|
]
|
|
65
|
-
|
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Command-line interface for python-harness.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import typer
|
|
10
|
+
from dotenv import load_dotenv
|
|
11
|
+
from rich.console import Console
|
|
12
|
+
|
|
13
|
+
from python_harness.evaluator import Evaluator
|
|
14
|
+
|
|
15
|
+
# Load environment variables at import time. A .env file in the current
# working directory is loaded explicitly; when there is none, fall back to
# load_dotenv()'s default search.
env_path = os.path.join(os.getcwd(), ".env")
if not os.path.exists(env_path):
    load_dotenv()
else:
    load_dotenv(dotenv_path=env_path)

# Typer application that the commands below register on, and the shared Rich
# console used by every printing helper in this module.
app = typer.Typer(help="Agentic harness tool for universal Python codebase evaluation.")
console = Console()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _print_detail_block(title: str, details: str, color: str) -> None:
    """
    Print a colored heading followed by the non-blank lines of ``details``.

    Args:
        title: Heading text, rendered as ``<title>:`` in ``color``.
        details: Multi-line text to show under the heading. Blank lines are
            dropped and trailing whitespace is stripped from the others.
        color: Rich style name wrapped around the heading.
    """
    console.print(f"[{color}]{title}:[/{color}]")
    for raw_line in details.splitlines():
        if not raw_line.strip():
            continue
        # The detail text is arbitrary tool output. With markup enabled,
        # bracketed fragments such as mypy's "[arg-type]" error codes are
        # parsed as Rich tags and vanish, and a stray closing tag raises,
        # so these lines are printed verbatim.
        console.print(f" {raw_line.rstrip()}", markup=False)
    console.print()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _print_ruff_issues(issues: list[dict[str, Any]]) -> None:
    """
    Print one line per ruff issue under a red heading.

    Args:
        issues: Issue dictionaries; ``filename``, ``location.row`` and
            ``message`` are read, each with a placeholder when absent.
    """
    console.print("[red]Ruff issues found:[/red]")
    for issue in issues:
        file = issue.get("filename", "unknown")
        # Tolerate an explicit null location as well as a missing key.
        line = (issue.get("location") or {}).get("row", "?")
        msg = issue.get("message", "unknown issue")
        # Paths and messages are external text; print them verbatim so
        # bracketed fragments are not interpreted as Rich markup.
        console.print(f" - {file}:{line} {msg}", markup=False)
    console.print()
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _print_ty_result(ty_results: dict[str, Any]) -> None:
    """
    Report the outcome of the ty check.

    A "success" status prints nothing and a "warning" status prints its
    message in yellow. Any other status prints, in red, the captured output
    if there is any, otherwise the error message, otherwise a generic notice.
    """
    outcome = ty_results.get("status")
    if outcome == "success":
        return
    if outcome == "warning":
        warning_text = str(ty_results.get("error_message", "ty not found"))
        _print_detail_block("Ty warning", warning_text, "yellow")
        return

    captured = str(ty_results.get("output", ""))
    failure_text = str(ty_results.get("error_message", ""))
    if captured:
        _print_detail_block("Ty issues found", captured, "red")
        return
    if failure_text:
        _print_detail_block("Ty error", failure_text, "red")
        return
    console.print("[red]Ty failed, but no standard output was captured.[/red]")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _print_radon_cc_result(radon_results: dict[str, Any]) -> None:
    """
    Report the outcome of the radon cyclomatic-complexity check.

    A "warning" status prints its message in yellow. Only a "failed" status
    prints anything else: the list of offending entries when present,
    otherwise the error message, otherwise a generic failure notice.
    """
    outcome = radon_results.get("status")
    if outcome == "warning":
        _print_detail_block(
            "Radon CC warning",
            str(radon_results.get("error_message", "")),
            "yellow",
        )
        return
    if outcome != "failed":
        return

    offenders = radon_results.get("issues", [])
    if not offenders:
        # Failed without parsed issues: show the error text if there is one.
        failure_text = str(radon_results.get("error_message", ""))
        if failure_text:
            _print_detail_block("Radon CC error", failure_text, "red")
        else:
            console.print(
                "[red]Radon CC failed but no specific issues were parsed.[/red]"
            )
            console.print()
        return

    console.print(
        "[red]Cyclomatic Complexity too high "
        f"({len(offenders)} functions > 15):[/red]"
    )
    for entry in offenders:
        console.print(
            f" - {entry['file']}: {entry['type']} '{entry['name']}' "
            f"has CC {entry['complexity']}"
        )
    console.print()
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _print_hard_failure_details(hard_results: dict[str, Any]) -> None:
    """
    Print the failure banner and the per-tool details of a hard evaluation.

    Args:
        hard_results: Result mapping with optional ``ruff``, ``mypy``, ``ty``,
            ``radon_cc`` and ``pytest`` entries; missing entries are treated
            as empty dictionaries.
    """
    console.print("[bold red]Hard Evaluation Failed![/bold red]")
    console.print()

    ruff_results = hard_results.get("ruff", {})
    if ruff_results.get("status") != "success":
        _print_ruff_issues(ruff_results.get("issues", []))

    mypy_results = hard_results.get("mypy", {})
    if mypy_results.get("status") != "success":
        _print_detail_block(
            "Mypy issues found",
            str(mypy_results.get("output", "")),
            "red",
        )

    _print_ty_result(hard_results.get("ty", {}))
    _print_radon_cc_result(hard_results.get("radon_cc", {}))

    pytest_results = hard_results.get("pytest", {})
    # The pytest runner reports unexpected exceptions with status "error"
    # rather than "failed"; show the message for both so the failure is
    # never left unexplained.
    if pytest_results.get("status") in ("failed", "error"):
        _print_detail_block(
            "Pytest/Coverage issues found",
            str(pytest_results.get("error_message", "")),
            "red",
        )

    console.print(
        "[yellow]Continuing to soft evaluation to generate "
        "suggestions despite hard failures...[/yellow]"
    )
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _print_hard_evaluation_summary(hard_results: dict[str, Any]) -> None:
    """
    Print the pass banner, or the detailed failure breakdown.

    Reads ``hard_results["all_passed"]`` to choose between the two.
    """
    if not hard_results["all_passed"]:
        _print_hard_failure_details(hard_results)
        return
    console.print("[bold green]Hard Evaluation Passed![/bold green]")
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _print_mi_scorecard(hard_results: dict[str, Any]) -> None:
    """
    Print the average Maintainability Index over all scored entries.

    Nothing is printed when ``radon_mi.mi_scores`` is missing or empty. The
    line is green above 50, yellow above 20 and red otherwise.
    """
    scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
    if scores:
        mean_mi = sum(scores.values()) / len(scores)
        if mean_mi > 50:
            shade = "green"
        elif mean_mi > 20:
            shade = "yellow"
        else:
            shade = "red"
        console.print(
            f"[{shade}]Average Maintainability Index: {mean_mi:.1f}/100[/{shade}]"
        )
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _print_qc_summary(qc_results: dict[str, Any]) -> None:
    """
    Print the governance QC banner and its result.

    Reads ``qc_results["all_passed"]``; on failure each entry of
    ``qc_results["failures"]`` is listed, followed by a notice that the soft
    evaluation still runs.
    """
    console.print()
    console.print("[bold blue]Running Governance QC (Second Fence)...[/bold blue]")

    if not qc_results["all_passed"]:
        console.print("[bold red]Governance QC Failed![/bold red]")
        console.print()
        console.print(
            "[red]The proposed changes violate governance constraints "
            "or lack sufficient evidence.[/red]"
        )
        for reason in qc_results["failures"]:
            console.print(f"[red]- {reason}[/red]")
        console.print()
        console.print(
            "[yellow]Continuing to soft evaluation to generate "
            "suggestions despite QC failures...[/yellow]"
        )
        console.print()
        return

    console.print(
        "[bold green]Governance QC Passed! (Change is admissible)[/bold green]"
    )
    console.print()
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _print_soft_evaluation_start() -> None:
    """Print the banner announcing the soft evaluation phase."""
    banner = (
        "[bold blue]Running Soft Evaluation "
        "(Readability & Understandability)...[/bold blue]"
    )
    console.print(banner)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _print_soft_summary(soft_results: dict[str, Any]) -> None:
    """
    Print the soft evaluation results.

    Shows the file/token totals and package understanding from
    ``package_summary``, the overall understandability score, and, when
    present, each sampled QA entity with its score (green at 80 or above,
    red otherwise) and feedback.
    """
    summary = soft_results["package_summary"]
    console.print(
        f"[green]Analyzed {summary['total_files']} files with a total of "
        f"{summary['total_tokens']} tokens.[/green]"
    )
    console.print(
        f"[magenta]Agent's Understanding of the Package:[/magenta]\n"
        f"{summary['package_understanding']}"
    )

    console.print()
    console.print(
        f"[cyan]Overall Understandability Score:[/cyan] "
        f"{soft_results['understandability_score']:.1f}/100"
    )

    sampled = soft_results.get("qa_results", {}).get("sampled_entities", [])
    if sampled:
        console.print()
        console.print("[bold yellow]Blind QA Sampling Results:[/bold yellow]")
        for item in sampled:
            if item["score"] >= 80:
                shade = "green"
            else:
                shade = "red"
            console.print(
                f" - [{shade}]{item['entity']}: Score {item['score']}[/{shade}]"
            )
            console.print(f" [dim]Feedback: {item['feedback']}[/dim]")

    console.print()
    console.print("[yellow]Evaluation completed. Generating report...[/yellow]")
    console.print()
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _print_final_report(final_report: dict[str, Any]) -> None:
    """
    Print the final verdict, the summary, and any improvement suggestions.

    The verdict line is bold green when the verdict text contains "Pass" and
    bold red otherwise. Every entry of ``suggestions`` is listed with its
    title, target file and description.
    """
    verdict = str(final_report.get("verdict", "Unknown"))
    if "Pass" in verdict:
        banner_style = "bold green"
    else:
        banner_style = "bold red"

    console.print(
        f"[{banner_style}]=== FINAL VERDICT: {verdict} ===[/{banner_style}]"
    )
    console.print(f"[bold]Summary:[/bold] {final_report.get('summary', '')}")
    console.print()

    suggestions = final_report.get("suggestions", [])
    if not suggestions:
        return

    console.print("[bold cyan]Top 3 Improvement Suggestions:[/bold cyan]")
    for position, suggestion in enumerate(suggestions, 1):
        heading = suggestion.get('title', 'Suggestion')
        target = suggestion.get('target_file', 'unknown')
        body = suggestion.get('description', '')
        console.print(
            f" {position}. [bold]{heading}[/bold] "
            f"(Target: [yellow]{target}[/yellow])"
        )
        console.print(f" [dim]{body}[/dim]")
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
@app.command()
def refine(
    path: str = typer.Argument(".", help="The path to evaluate and evolve"),
    steps: int = typer.Option(1, help="Number of evolution steps to perform"),
    max_retries: int = typer.Option(
        3, help="Maximum retries per variant if tests fail"
    ),
) -> None:
    """
    Refine the codebase through an agentic Edit-Test-Improve loop.
    Generates variants based on suggestions, tests them, and picks the best.
    """
    console.print(
        f"[bold magenta]Starting evolution loop for path:[/bold magenta] {path} "
        f"[dim](steps={steps}, max_retries={max_retries})[/dim]"
    )

    # Step 1: baseline evaluation, used only to obtain suggestions. The
    # governance QC stage is not run here; a passing placeholder is supplied
    # to the report generator instead.
    evaluator = Evaluator(path)
    console.print("[cyan]Running baseline evaluation...[/cyan]")
    hard_results = evaluator.hard_evaluator.evaluate()
    soft_results = evaluator.soft_evaluator.evaluate()
    qc_placeholder = {"all_passed": True, "failures": []}
    baseline_report = evaluator.soft_evaluator.generate_final_report(
        hard_results, qc_placeholder, soft_results
    )

    suggestions = baseline_report.get("suggestions", [])
    if not suggestions:
        console.print("[yellow]No suggestions found to evolve. Exiting.[/yellow]")
        return

    console.print(
        f"[green]Found {len(suggestions)} suggestions. "
        f"Starting evolution branches...[/green]"
    )

    # TODO: Implement the Git branching and Agent modification logic here.
    # The loop will be:
    # for step in range(steps):
    #     for suggestion in suggestions:
    #         checkout new branch variant-X
    #         for retry in range(max_retries):
    #             ask LLM to apply suggestion to code
    #             run pytest
    #             if pytest passes:
    #                 run harness . to get new score
    #                 break
    #             else:
    #                 feed error back to LLM for retry
    #     compare all variants and checkout the best one

    console.print(
        "[yellow]Evolution engine skeleton ready. "
        "Actual git mutation logic pending.[/yellow]"
    )
|
|
277
|
+
@app.command()
def measure(path: str = typer.Argument(".", help="The path to evaluate")) -> None:
    """
    Measure the codebase against hard, soft, and governance constraints.
    Outputs a final report with scores and actionable improvement suggestions.
    """
    console.print(
        f"[bold green]Starting harness measurement for path:[/bold green] {path}"
    )
    evaluator = Evaluator(path)

    # Stage 1: hard evaluation.
    console.print("[bold blue]Running Hard Evaluation (ruff, mypy)...[/bold blue]")
    hard_results = evaluator.hard_evaluator.evaluate()
    _print_hard_evaluation_summary(hard_results)
    _print_mi_scorecard(hard_results)

    # Stage 2: governance QC.
    qc_results = evaluator.qc_evaluator.evaluate()
    _print_qc_summary(qc_results)

    # Stage 3: soft evaluation. It runs regardless of the earlier outcomes.
    _print_soft_evaluation_start()
    soft_results = evaluator.soft_evaluator.evaluate()
    _print_soft_summary(soft_results)

    final_report = evaluator.soft_evaluator.generate_final_report(
        hard_results, qc_results, soft_results
    )
    if final_report:
        _print_final_report(final_report)
        verdict = str(final_report.get("verdict", "Unknown"))
        # A failing verdict becomes a non-zero process exit status.
        if "Fail" in verdict:
            sys.exit(1)
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
# Run the Typer application when this module is executed as a script.
if __name__ == "__main__":
    app()
|
|
@@ -4,12 +4,15 @@ Core module for integrating hard evaluation tools like ruff, mypy, and pytest.
|
|
|
4
4
|
|
|
5
5
|
import json
|
|
6
6
|
import subprocess
|
|
7
|
+
import sys
|
|
8
|
+
import tempfile
|
|
7
9
|
from pathlib import Path
|
|
8
10
|
from typing import Any
|
|
9
11
|
|
|
10
12
|
from rich.console import Console
|
|
11
13
|
|
|
12
14
|
console = Console()
|
|
15
|
+
PYTEST_TIMEOUT_SECONDS = 60
|
|
13
16
|
|
|
14
17
|
class HardEvaluator:
|
|
15
18
|
"""
|
|
@@ -25,7 +28,15 @@ class HardEvaluator:
|
|
|
25
28
|
"""
|
|
26
29
|
try:
|
|
27
30
|
result = subprocess.run(
|
|
28
|
-
[
|
|
31
|
+
[
|
|
32
|
+
sys.executable,
|
|
33
|
+
"-m",
|
|
34
|
+
"ruff",
|
|
35
|
+
"check",
|
|
36
|
+
str(self.target_path),
|
|
37
|
+
"--output-format",
|
|
38
|
+
"json",
|
|
39
|
+
],
|
|
29
40
|
capture_output=True,
|
|
30
41
|
text=True,
|
|
31
42
|
check=False
|
|
@@ -46,7 +57,7 @@ class HardEvaluator:
|
|
|
46
57
|
"""
|
|
47
58
|
try:
|
|
48
59
|
result = subprocess.run(
|
|
49
|
-
["mypy", str(self.target_path)],
|
|
60
|
+
[sys.executable, "-m", "mypy", str(self.target_path)],
|
|
50
61
|
capture_output=True,
|
|
51
62
|
text=True,
|
|
52
63
|
check=False
|
|
@@ -101,7 +112,15 @@ class HardEvaluator:
|
|
|
101
112
|
"""
|
|
102
113
|
try:
|
|
103
114
|
result = subprocess.run(
|
|
104
|
-
[
|
|
115
|
+
[
|
|
116
|
+
sys.executable,
|
|
117
|
+
"-m",
|
|
118
|
+
"radon",
|
|
119
|
+
"cc",
|
|
120
|
+
"-j",
|
|
121
|
+
"-a",
|
|
122
|
+
str(self.target_path),
|
|
123
|
+
],
|
|
105
124
|
capture_output=True,
|
|
106
125
|
text=True,
|
|
107
126
|
check=False
|
|
@@ -143,7 +162,7 @@ class HardEvaluator:
|
|
|
143
162
|
"error_message": "radon executable not found. Please install it."
|
|
144
163
|
}
|
|
145
164
|
except Exception as e:
|
|
146
|
-
if "No
|
|
165
|
+
if "No module named radon" in str(e) or "radon" in str(e):
|
|
147
166
|
return {
|
|
148
167
|
"status": "warning",
|
|
149
168
|
"issues": [],
|
|
@@ -159,7 +178,7 @@ class HardEvaluator:
|
|
|
159
178
|
"""
|
|
160
179
|
try:
|
|
161
180
|
result = subprocess.run(
|
|
162
|
-
["radon", "mi", "-j", str(self.target_path)],
|
|
181
|
+
[sys.executable, "-m", "radon", "mi", "-j", str(self.target_path)],
|
|
163
182
|
capture_output=True,
|
|
164
183
|
text=True,
|
|
165
184
|
check=False
|
|
@@ -183,7 +202,7 @@ class HardEvaluator:
|
|
|
183
202
|
"error_message": "radon executable not found. Please install it."
|
|
184
203
|
}
|
|
185
204
|
except Exception as e:
|
|
186
|
-
if "No
|
|
205
|
+
if "No module named radon" in str(e) or "radon" in str(e):
|
|
187
206
|
return {
|
|
188
207
|
"status": "warning",
|
|
189
208
|
"mi_scores": {},
|
|
@@ -196,22 +215,44 @@ class HardEvaluator:
|
|
|
196
215
|
Run Pytest test suite and return coverage results.
|
|
197
216
|
"""
|
|
198
217
|
try:
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
218
|
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
219
|
+
coverage_report = Path(tmp_dir) / "coverage.json"
|
|
220
|
+
result = subprocess.run(
|
|
221
|
+
[
|
|
222
|
+
sys.executable,
|
|
223
|
+
"-m",
|
|
224
|
+
"pytest",
|
|
225
|
+
str(self.target_path),
|
|
226
|
+
"--cov",
|
|
227
|
+
f"--cov-report=json:{coverage_report}",
|
|
228
|
+
],
|
|
229
|
+
capture_output=True,
|
|
230
|
+
text=True,
|
|
231
|
+
check=False,
|
|
232
|
+
timeout=PYTEST_TIMEOUT_SECONDS,
|
|
233
|
+
)
|
|
234
|
+
coverage_percentage = None
|
|
235
|
+
if coverage_report.exists():
|
|
236
|
+
coverage_data = json.loads(coverage_report.read_text())
|
|
237
|
+
coverage_percentage = coverage_data.get("totals", {}).get(
|
|
238
|
+
"percent_covered"
|
|
239
|
+
)
|
|
207
240
|
status = "success" if result.returncode == 0 else "failed"
|
|
208
241
|
return {
|
|
209
242
|
"status": status,
|
|
210
243
|
"output": result.stdout,
|
|
211
244
|
"return_code": result.returncode,
|
|
245
|
+
"coverage_percentage": coverage_percentage,
|
|
246
|
+
}
|
|
247
|
+
except subprocess.TimeoutExpired:
|
|
248
|
+
return {
|
|
249
|
+
"status": "failed",
|
|
250
|
+
"error_message": (
|
|
251
|
+
f"Pytest run timed out after {PYTEST_TIMEOUT_SECONDS} seconds."
|
|
252
|
+
),
|
|
212
253
|
}
|
|
213
254
|
except Exception as e:
|
|
214
|
-
|
|
255
|
+
return {"status": "error", "error_message": str(e)}
|
|
215
256
|
|
|
216
257
|
def evaluate(self) -> dict[str, Any]:
|
|
217
258
|
"""
|
|
@@ -226,19 +267,20 @@ class HardEvaluator:
|
|
|
226
267
|
pytest_res = self.run_pytest()
|
|
227
268
|
|
|
228
269
|
# Parse pytest coverage to check if it's < 90%
|
|
229
|
-
cov_percentage =
|
|
230
|
-
if pytest_res.get("status") == "success"
|
|
231
|
-
|
|
232
|
-
cov_data = json.loads(pytest_res["output"])
|
|
233
|
-
cov_percentage = cov_data.get("totals", {}).get("percent_covered", 0.0)
|
|
270
|
+
cov_percentage = pytest_res.get("coverage_percentage")
|
|
271
|
+
if pytest_res.get("status") == "success":
|
|
272
|
+
if isinstance(cov_percentage, (int, float)):
|
|
234
273
|
if cov_percentage < 90.0:
|
|
235
274
|
pytest_res["status"] = "failed"
|
|
236
275
|
pytest_res["error_message"] = (
|
|
237
276
|
f"Test coverage is {cov_percentage:.2f}%, "
|
|
238
277
|
f"which is below the 90% threshold."
|
|
239
278
|
)
|
|
240
|
-
|
|
241
|
-
|
|
279
|
+
else:
|
|
280
|
+
pytest_res["status"] = "failed"
|
|
281
|
+
pytest_res["error_message"] = (
|
|
282
|
+
"Coverage report was missing or unreadable."
|
|
283
|
+
)
|
|
242
284
|
|
|
243
285
|
all_passed = (
|
|
244
286
|
ruff_res.get("status") == "success" and
|