python-harness 0.0.5__tar.gz → 0.0.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {python_harness-0.0.5/python_harness.egg-info → python_harness-0.0.8}/PKG-INFO +1 -1
- {python_harness-0.0.5 → python_harness-0.0.8}/pyproject.toml +1 -1
- {python_harness-0.0.5 → python_harness-0.0.8}/python_harness/cli.py +36 -8
- {python_harness-0.0.5 → python_harness-0.0.8}/python_harness/evaluator.py +1 -1
- {python_harness-0.0.5 → python_harness-0.0.8}/python_harness/hard_evaluator.py +50 -5
- {python_harness-0.0.5 → python_harness-0.0.8}/python_harness/soft_evaluator.py +38 -7
- {python_harness-0.0.5 → python_harness-0.0.8/python_harness.egg-info}/PKG-INFO +1 -1
- python_harness-0.0.8/tests/test_hard_evaluator.py +95 -0
- python_harness-0.0.5/tests/test_hard_evaluator.py +0 -29
- {python_harness-0.0.5 → python_harness-0.0.8}/LICENSE +0 -0
- {python_harness-0.0.5 → python_harness-0.0.8}/README.md +0 -0
- {python_harness-0.0.5 → python_harness-0.0.8}/python_harness/__init__.py +0 -0
- {python_harness-0.0.5 → python_harness-0.0.8}/python_harness/qc_evaluator.py +0 -0
- {python_harness-0.0.5 → python_harness-0.0.8}/python_harness.egg-info/SOURCES.txt +0 -0
- {python_harness-0.0.5 → python_harness-0.0.8}/python_harness.egg-info/dependency_links.txt +0 -0
- {python_harness-0.0.5 → python_harness-0.0.8}/python_harness.egg-info/entry_points.txt +0 -0
- {python_harness-0.0.5 → python_harness-0.0.8}/python_harness.egg-info/requires.txt +0 -0
- {python_harness-0.0.5 → python_harness-0.0.8}/python_harness.egg-info/top_level.txt +0 -0
- {python_harness-0.0.5 → python_harness-0.0.8}/setup.cfg +0 -0
- {python_harness-0.0.5 → python_harness-0.0.8}/tests/test_cli.py +0 -0
- {python_harness-0.0.5 → python_harness-0.0.8}/tests/test_evaluator.py +0 -0
- {python_harness-0.0.5 → python_harness-0.0.8}/tests/test_soft_evaluator.py +0 -0
|
@@ -130,9 +130,32 @@ def measure(path: str = typer.Argument(".", help="The path to evaluate")) -> Non
|
|
|
130
130
|
f" - {issue['file']}: {issue['type']} '{issue['name']}' "
|
|
131
131
|
f"has CC {issue['complexity']}"
|
|
132
132
|
)
|
|
133
|
-
|
|
133
|
+
|
|
134
|
+
# If radon failed for another reason
|
|
135
|
+
# (e.g. radon not installed or syntax error)
|
|
136
|
+
if not issues and hard_results["radon_cc"].get("error_message"):
|
|
137
|
+
err_msg = hard_results['radon_cc'].get('error_message')
|
|
138
|
+
console.print(f"[red]Radon CC Error:[/red] {err_msg}")
|
|
139
|
+
elif not issues:
|
|
140
|
+
console.print(
|
|
141
|
+
"[red]Radon CC failed but no specific issues were parsed.[/red]"
|
|
142
|
+
)
|
|
143
|
+
elif hard_results["radon_cc"]["status"] == "warning":
|
|
144
|
+
err_msg = hard_results['radon_cc'].get('error_message')
|
|
145
|
+
console.print(f"[yellow]Radon CC warning:[/yellow] {err_msg}")
|
|
134
146
|
|
|
135
|
-
|
|
147
|
+
if hard_results.get("pytest", {}).get("status") == "failed":
|
|
148
|
+
error_msg = hard_results["pytest"].get("error_message", "Tests failed")
|
|
149
|
+
console.print(f"[red]Pytest/Coverage issues found:[/red] {error_msg}")
|
|
150
|
+
|
|
151
|
+
# DO NOT sys.exit(1) here anymore!
|
|
152
|
+
# We want to generate the report even if it fails.
|
|
153
|
+
console.print(
|
|
154
|
+
"[yellow]Continuing to soft evaluation to generate "
|
|
155
|
+
"suggestions despite hard failures...[/yellow]"
|
|
156
|
+
)
|
|
157
|
+
else:
|
|
158
|
+
console.print("[bold green]Hard Evaluation Passed![/bold green]")
|
|
136
159
|
|
|
137
160
|
# Print Maintainability Index scorecard
|
|
138
161
|
mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
|
|
@@ -155,11 +178,15 @@ def measure(path: str = typer.Argument(".", help="The path to evaluate")) -> Non
|
|
|
155
178
|
)
|
|
156
179
|
for failure in qc_results["failures"]:
|
|
157
180
|
console.print(f"[red]- {failure}[/red]")
|
|
158
|
-
sys.exit(1)
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
181
|
+
# DO NOT sys.exit(1) here! We want to generate suggestions for QC failures too.
|
|
182
|
+
console.print(
|
|
183
|
+
"[yellow]Continuing to soft evaluation to generate "
|
|
184
|
+
"suggestions despite QC failures...[/yellow]"
|
|
185
|
+
)
|
|
186
|
+
else:
|
|
187
|
+
console.print(
|
|
188
|
+
"[bold green]Governance QC Passed! (Change is admissible)[/bold green]"
|
|
189
|
+
)
|
|
163
190
|
|
|
164
191
|
# 3. Soft Evaluation/Readability (Third Fence)
|
|
165
192
|
console.print(
|
|
@@ -194,8 +221,9 @@ def measure(path: str = typer.Argument(".", help="The path to evaluate")) -> Non
|
|
|
194
221
|
console.print("\n[yellow]Evaluation completed. Generating report...[/yellow]\n")
|
|
195
222
|
|
|
196
223
|
# Generate Final Report
|
|
224
|
+
# Pass all results to the reporter so it knows *why* things failed
|
|
197
225
|
final_report = evaluator.soft_evaluator.generate_final_report(
|
|
198
|
-
hard_results, soft_results
|
|
226
|
+
hard_results, qc_results, soft_results
|
|
199
227
|
)
|
|
200
228
|
|
|
201
229
|
if final_report:
|
|
@@ -123,16 +123,32 @@ class HardEvaluator:
|
|
|
123
123
|
"complexity": block.get('complexity')
|
|
124
124
|
})
|
|
125
125
|
|
|
126
|
-
if
|
|
126
|
+
if result.returncode != 0:
|
|
127
|
+
# E.g. syntax error in target code preventing radon from parsing
|
|
128
|
+
status = "failed"
|
|
129
|
+
elif issues:
|
|
127
130
|
status = "failed"
|
|
128
131
|
|
|
129
132
|
return {
|
|
130
133
|
"status": status,
|
|
131
134
|
"issues": issues,
|
|
132
135
|
"return_code": result.returncode,
|
|
133
|
-
"output": result.stdout
|
|
136
|
+
"output": result.stdout,
|
|
137
|
+
"error_message": result.stderr if result.returncode != 0 else ""
|
|
138
|
+
}
|
|
139
|
+
except FileNotFoundError:
|
|
140
|
+
return {
|
|
141
|
+
"status": "warning",
|
|
142
|
+
"issues": [],
|
|
143
|
+
"error_message": "radon executable not found. Please install it."
|
|
134
144
|
}
|
|
135
145
|
except Exception as e:
|
|
146
|
+
if "No such file or directory: 'radon'" in str(e):
|
|
147
|
+
return {
|
|
148
|
+
"status": "warning",
|
|
149
|
+
"issues": [],
|
|
150
|
+
"error_message": "radon executable not found. Please install it."
|
|
151
|
+
}
|
|
136
152
|
return {"status": "error", "error_message": str(e)}
|
|
137
153
|
|
|
138
154
|
def run_radon_mi(self) -> dict[str, Any]:
|
|
@@ -160,7 +176,19 @@ class HardEvaluator:
|
|
|
160
176
|
"mi_scores": mi_scores,
|
|
161
177
|
"return_code": result.returncode,
|
|
162
178
|
}
|
|
179
|
+
except FileNotFoundError:
|
|
180
|
+
return {
|
|
181
|
+
"status": "warning",
|
|
182
|
+
"mi_scores": {},
|
|
183
|
+
"error_message": "radon executable not found. Please install it."
|
|
184
|
+
}
|
|
163
185
|
except Exception as e:
|
|
186
|
+
if "No such file or directory: 'radon'" in str(e):
|
|
187
|
+
return {
|
|
188
|
+
"status": "warning",
|
|
189
|
+
"mi_scores": {},
|
|
190
|
+
"error_message": "radon executable not found. Please install it."
|
|
191
|
+
}
|
|
164
192
|
return {"status": "error", "error_message": str(e)}
|
|
165
193
|
|
|
166
194
|
def run_pytest(self) -> dict[str, Any]:
|
|
@@ -195,13 +223,29 @@ class HardEvaluator:
|
|
|
195
223
|
ty_res = self.run_ty()
|
|
196
224
|
radon_cc_res = self.run_radon_cc()
|
|
197
225
|
radon_mi_res = self.run_radon_mi()
|
|
198
|
-
|
|
226
|
+
pytest_res = self.run_pytest()
|
|
199
227
|
|
|
228
|
+
# Parse pytest coverage to check if it's < 90%
|
|
229
|
+
cov_percentage = 0.0
|
|
230
|
+
if pytest_res.get("status") == "success" and pytest_res.get("output"):
|
|
231
|
+
try:
|
|
232
|
+
cov_data = json.loads(pytest_res["output"])
|
|
233
|
+
cov_percentage = cov_data.get("totals", {}).get("percent_covered", 0.0)
|
|
234
|
+
if cov_percentage < 90.0:
|
|
235
|
+
pytest_res["status"] = "failed"
|
|
236
|
+
pytest_res["error_message"] = (
|
|
237
|
+
f"Test coverage is {cov_percentage:.2f}%, "
|
|
238
|
+
f"which is below the 90% threshold."
|
|
239
|
+
)
|
|
240
|
+
except Exception:
|
|
241
|
+
pass
|
|
242
|
+
|
|
200
243
|
all_passed = (
|
|
201
244
|
ruff_res.get("status") == "success" and
|
|
202
245
|
mypy_res.get("status") == "success" and
|
|
203
246
|
ty_res.get("status") in ("success", "warning") and
|
|
204
|
-
radon_cc_res.get("status")
|
|
247
|
+
radon_cc_res.get("status") in ("success", "warning") and
|
|
248
|
+
pytest_res.get("status") == "success"
|
|
205
249
|
)
|
|
206
250
|
|
|
207
251
|
return {
|
|
@@ -210,5 +254,6 @@ class HardEvaluator:
|
|
|
210
254
|
"mypy": mypy_res,
|
|
211
255
|
"ty": ty_res,
|
|
212
256
|
"radon_cc": radon_cc_res,
|
|
213
|
-
"radon_mi": radon_mi_res
|
|
257
|
+
"radon_mi": radon_mi_res,
|
|
258
|
+
"pytest": pytest_res
|
|
214
259
|
}
|
|
@@ -373,7 +373,10 @@ class SoftEvaluator:
|
|
|
373
373
|
}
|
|
374
374
|
|
|
375
375
|
def generate_final_report(
|
|
376
|
-
self,
|
|
376
|
+
self,
|
|
377
|
+
hard_results: dict[str, Any],
|
|
378
|
+
qc_results: dict[str, Any],
|
|
379
|
+
soft_results: dict[str, Any]
|
|
377
380
|
) -> dict[str, Any]:
|
|
378
381
|
"""
|
|
379
382
|
Synthesize all evaluation results into a final verdict and exactly
|
|
@@ -408,25 +411,48 @@ class SoftEvaluator:
|
|
|
408
411
|
mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
|
|
409
412
|
avg_mi = sum(mi_scores.values()) / len(mi_scores) if mi_scores else 100.0
|
|
410
413
|
|
|
414
|
+
# Extract failures
|
|
415
|
+
hard_failed = not hard_results.get("all_passed", True)
|
|
416
|
+
|
|
417
|
+
hard_errors = []
|
|
418
|
+
if hard_failed:
|
|
419
|
+
if hard_results.get("ruff", {}).get("status") != "success":
|
|
420
|
+
hard_errors.append("Linter (Ruff) failed.")
|
|
421
|
+
if hard_results.get("mypy", {}).get("status") != "success":
|
|
422
|
+
hard_errors.append("Type checker (Mypy) failed.")
|
|
423
|
+
if hard_results.get("pytest", {}).get("status") != "success":
|
|
424
|
+
pytest_err = hard_results.get("pytest", {}).get(
|
|
425
|
+
"error_message", "Tests or Coverage failed."
|
|
426
|
+
)
|
|
427
|
+
hard_errors.append(pytest_err)
|
|
428
|
+
|
|
429
|
+
qc_errors = qc_results.get("failures", [])
|
|
430
|
+
|
|
411
431
|
qa_score = soft_results.get("understandability_score", 100.0)
|
|
412
432
|
qa_entities = soft_results.get("qa_results", {}).get("sampled_entities", [])
|
|
413
433
|
|
|
414
434
|
sys_prompt = (
|
|
415
435
|
"You are an elite Python Codebase Evaluator. You have just analyzed "
|
|
416
436
|
"a repository. Your task is to provide a final judgment and EXACTLY "
|
|
417
|
-
"3 concrete, actionable improvement suggestions
|
|
418
|
-
"
|
|
419
|
-
"
|
|
437
|
+
"3 concrete, actionable improvement suggestions.\n"
|
|
438
|
+
"If the codebase failed its Hard or QC evaluations (e.g. tests "
|
|
439
|
+
"failed, coverage is low, or governance violated), your suggestions "
|
|
440
|
+
"MUST prioritize fixing those issues.\n"
|
|
441
|
+
"Otherwise, focus on refactoring/quality improvements without "
|
|
442
|
+
"changing external functionality.\n\n"
|
|
420
443
|
"Output MUST be in valid JSON matching this schema:\n"
|
|
421
444
|
"{\n"
|
|
422
445
|
' "verdict": "Pass" or "Fail",\n'
|
|
423
|
-
' "summary": "One paragraph summary of codebase health
|
|
446
|
+
' "summary": "One paragraph summary of codebase health and '
|
|
447
|
+
'any critical failures",\n'
|
|
424
448
|
' "suggestions": [\n'
|
|
425
449
|
' {"title": "str", "description": "str", "target_file": "str"}\n'
|
|
426
450
|
" ]\n"
|
|
427
451
|
"}\n"
|
|
428
|
-
"Rule for Verdict:
|
|
429
|
-
"
|
|
452
|
+
"Rule for Verdict: If there are Hard Failures or QC Failures, "
|
|
453
|
+
"verdict MUST be Fail. Otherwise, Pass if Average Maintainability "
|
|
454
|
+
"> 50 and QA Score > 75 and no Critical CC issues (>15). "
|
|
455
|
+
"Otherwise Fail."
|
|
430
456
|
)
|
|
431
457
|
|
|
432
458
|
user_content = (
|
|
@@ -435,6 +461,11 @@ class SoftEvaluator:
|
|
|
435
461
|
f"- Number of functions with Cyclomatic Complexity > 15: "
|
|
436
462
|
f"{len(cc_issues)}\n"
|
|
437
463
|
f"- Agent QA Readability Score: {qa_score:.1f}/100\n\n"
|
|
464
|
+
f"Failures (Prioritize these!):\n"
|
|
465
|
+
f"- Hard Evaluation Errors: "
|
|
466
|
+
f"{hard_errors if hard_errors else 'None'}\n"
|
|
467
|
+
f"- QC/Governance Errors: "
|
|
468
|
+
f"{qc_errors if qc_errors else 'None'}\n\n"
|
|
438
469
|
f"QA Feedback Snippets:\n"
|
|
439
470
|
+ "\n".join(
|
|
440
471
|
[f" * {q['entity']}: {q['feedback']}" for q in qa_entities]
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tests for hard evaluation logic.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from python_harness.hard_evaluator import HardEvaluator
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_hard_evaluator_methods() -> None:
|
|
12
|
+
"""
|
|
13
|
+
Test methods of HardEvaluator.
|
|
14
|
+
"""
|
|
15
|
+
evaluator = HardEvaluator(".")
|
|
16
|
+
|
|
17
|
+
ruff_result = evaluator.run_ruff()
|
|
18
|
+
assert "status" in ruff_result
|
|
19
|
+
|
|
20
|
+
mypy_result = evaluator.run_mypy()
|
|
21
|
+
assert "status" in mypy_result
|
|
22
|
+
|
|
23
|
+
ty_result = evaluator.run_ty()
|
|
24
|
+
assert "status" in ty_result
|
|
25
|
+
|
|
26
|
+
# pytest_result = evaluator.run_pytest() # Causes infinite loop when run in test
|
|
27
|
+
# assert "status" in pytest_result
|
|
28
|
+
|
|
29
|
+
eval_result = evaluator.evaluate()
|
|
30
|
+
assert "ruff" in eval_result
|
|
31
|
+
assert "mypy" in eval_result
|
|
32
|
+
assert "ty" in eval_result
|
|
33
|
+
|
|
34
|
+
def test_ty_fallback_behavior(monkeypatch: Any) -> None:
|
|
35
|
+
"""
|
|
36
|
+
Test that run_ty gracefully falls back to a warning when 'ty' is not installed.
|
|
37
|
+
"""
|
|
38
|
+
# Create a mock subprocess.run that always raises FileNotFoundError
|
|
39
|
+
# to simulate 'ty' not being on the system PATH
|
|
40
|
+
def mock_run(*args: Any, **kwargs: Any) -> Any:
|
|
41
|
+
raise FileNotFoundError("[Errno 2] No such file or directory: 'ty'")
|
|
42
|
+
|
|
43
|
+
monkeypatch.setattr("subprocess.run", mock_run)
|
|
44
|
+
|
|
45
|
+
evaluator = HardEvaluator(".")
|
|
46
|
+
result = evaluator.run_ty()
|
|
47
|
+
|
|
48
|
+
assert result["status"] == "warning"
|
|
49
|
+
assert "ty executable not found" in result["error_message"]
|
|
50
|
+
|
|
51
|
+
def test_ty_fallback_behavior_oserror(monkeypatch: Any) -> None:
|
|
52
|
+
"""
|
|
53
|
+
Test that run_ty gracefully falls back to a warning when a generic Exception
|
|
54
|
+
containing the Errno 2 string is thrown.
|
|
55
|
+
"""
|
|
56
|
+
def mock_run(*args: Any, **kwargs: Any) -> Any:
|
|
57
|
+
raise Exception("[Errno 2] No such file or directory: 'ty'")
|
|
58
|
+
|
|
59
|
+
monkeypatch.setattr("subprocess.run", mock_run)
|
|
60
|
+
|
|
61
|
+
evaluator = HardEvaluator(".")
|
|
62
|
+
result = evaluator.run_ty()
|
|
63
|
+
|
|
64
|
+
assert result["status"] == "warning"
|
|
65
|
+
assert "ty executable not found" in result["error_message"]
|
|
66
|
+
|
|
67
|
+
def test_radon_cc_syntax_error(monkeypatch: Any, tmp_path: Path) -> None:
|
|
68
|
+
"""
|
|
69
|
+
Test that run_radon_cc correctly captures and reports stderr when radon
|
|
70
|
+
fails (e.g. due to syntax errors in the target codebase).
|
|
71
|
+
"""
|
|
72
|
+
# Create a mock subprocess.run that simulates radon exiting with code 1
|
|
73
|
+
# and writing an error to stderr (which happens when there are syntax errors)
|
|
74
|
+
import subprocess
|
|
75
|
+
original_run = subprocess.run
|
|
76
|
+
|
|
77
|
+
def mock_run(args: Any, **kwargs: Any) -> Any:
|
|
78
|
+
if args and args[0] == "radon" and args[1] == "cc":
|
|
79
|
+
# Simulate radon failing on syntax error
|
|
80
|
+
class MockResult:
|
|
81
|
+
returncode = 1
|
|
82
|
+
stdout = ""
|
|
83
|
+
stderr = "ERROR: SyntaxError in bad.py"
|
|
84
|
+
return MockResult()
|
|
85
|
+
return original_run(args, **kwargs)
|
|
86
|
+
|
|
87
|
+
monkeypatch.setattr("subprocess.run", mock_run)
|
|
88
|
+
|
|
89
|
+
evaluator = HardEvaluator(str(tmp_path))
|
|
90
|
+
result = evaluator.run_radon_cc()
|
|
91
|
+
|
|
92
|
+
assert result["status"] == "failed"
|
|
93
|
+
assert len(result.get("issues", [])) == 0
|
|
94
|
+
# Radon should output to stderr because of the syntax error
|
|
95
|
+
assert "SyntaxError" in result["error_message"] or result["return_code"] != 0
|
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Tests for hard evaluation logic.
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
from python_harness.hard_evaluator import HardEvaluator
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
def test_hard_evaluator_methods() -> None:
|
|
9
|
-
"""
|
|
10
|
-
Test methods of HardEvaluator.
|
|
11
|
-
"""
|
|
12
|
-
evaluator = HardEvaluator(".")
|
|
13
|
-
|
|
14
|
-
ruff_result = evaluator.run_ruff()
|
|
15
|
-
assert "status" in ruff_result
|
|
16
|
-
|
|
17
|
-
mypy_result = evaluator.run_mypy()
|
|
18
|
-
assert "status" in mypy_result
|
|
19
|
-
|
|
20
|
-
ty_result = evaluator.run_ty()
|
|
21
|
-
assert "status" in ty_result
|
|
22
|
-
|
|
23
|
-
# pytest_result = evaluator.run_pytest() # Causes infinite loop when run in test
|
|
24
|
-
# assert "status" in pytest_result
|
|
25
|
-
|
|
26
|
-
eval_result = evaluator.evaluate()
|
|
27
|
-
assert "ruff" in eval_result
|
|
28
|
-
assert "mypy" in eval_result
|
|
29
|
-
assert "ty" in eval_result
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|