python-harness 0.0.11__tar.gz → 0.0.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {python_harness-0.0.11/python_harness.egg-info → python_harness-0.0.12}/PKG-INFO +1 -1
- {python_harness-0.0.11 → python_harness-0.0.12}/pyproject.toml +1 -1
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness/__init__.py +1 -1
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness/cli.py +11 -1
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness/hard_evaluator.py +18 -2
- python_harness-0.0.12/python_harness/python_file_inventory.py +27 -0
- python_harness-0.0.12/python_harness/soft_eval_report.py +154 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness/soft_evaluator.py +19 -133
- {python_harness-0.0.11 → python_harness-0.0.12/python_harness.egg-info}/PKG-INFO +1 -1
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness.egg-info/SOURCES.txt +2 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/tests/test_cli.py +14 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/tests/test_hard_evaluator.py +16 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/tests/test_soft_evaluator.py +62 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/LICENSE +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/README.md +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness/evaluator.py +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness/qc_evaluator.py +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness.egg-info/dependency_links.txt +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness.egg-info/entry_points.txt +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness.egg-info/requires.txt +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness.egg-info/top_level.txt +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/setup.cfg +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/tests/test_evaluator.py +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/tests/test_qc_evaluator.py +0 -0
|
@@ -21,6 +21,8 @@ else:
|
|
|
21
21
|
|
|
22
22
|
app = typer.Typer(help="Agentic harness tool for universal Python codebase evaluation.")
|
|
23
23
|
console = Console()
|
|
24
|
+
MI_HEALTHY_THRESHOLD = 70.0
|
|
25
|
+
MI_WARNING_THRESHOLD = 40.0
|
|
24
26
|
|
|
25
27
|
|
|
26
28
|
def _print_detail_block(title: str, details: str, color: str) -> None:
|
|
@@ -133,13 +135,21 @@ def _print_hard_evaluation_summary(hard_results: dict[str, Any]) -> None:
|
|
|
133
135
|
_print_hard_failure_details(hard_results)
|
|
134
136
|
|
|
135
137
|
|
|
138
|
+
def _mi_scorecard_color(avg_mi: float) -> str:
    """Map an average Maintainability Index to a Rich color name.

    "green" at or above the healthy threshold, "yellow" at or above the
    warning threshold, "red" below both.
    """
    if avg_mi < MI_WARNING_THRESHOLD:
        return "red"
    return "green" if avg_mi >= MI_HEALTHY_THRESHOLD else "yellow"
|
|
144
|
+
|
|
145
|
+
|
|
136
146
|
def _print_mi_scorecard(hard_results: dict[str, Any]) -> None:
    """Print the average Maintainability Index, color-coded by health.

    Stays silent when no files were scored rather than printing a
    meaningless average.
    """
    scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
    if not scores:
        return
    average = sum(scores.values()) / len(scores)
    color = _mi_scorecard_color(average)
    console.print(f"[{color}]Average Maintainability Index: {average:.1f}/100[/{color}]")
|
|
144
154
|
|
|
145
155
|
|
|
@@ -11,6 +11,8 @@ from typing import Any
|
|
|
11
11
|
|
|
12
12
|
from rich.console import Console
|
|
13
13
|
|
|
14
|
+
from python_harness.python_file_inventory import collect_python_files
|
|
15
|
+
|
|
14
16
|
console = Console()
|
|
15
17
|
PYTEST_TIMEOUT_SECONDS = 60
|
|
16
18
|
|
|
@@ -22,6 +24,9 @@ class HardEvaluator:
|
|
|
22
24
|
def __init__(self, target_path: str):
|
|
23
25
|
self.target_path = Path(target_path).resolve()
|
|
24
26
|
|
|
27
|
+
def _radon_metric_targets(self) -> list[str]:
    """Return the Python files radon should analyze, as string paths."""
    return [str(path) for path in collect_python_files(self.target_path)]
|
|
29
|
+
|
|
25
30
|
def run_ruff(self) -> dict[str, Any]:
|
|
26
31
|
"""
|
|
27
32
|
Run Ruff linter and return results.
|
|
@@ -112,6 +117,14 @@ class HardEvaluator:
|
|
|
112
117
|
Flag any function/method with CC > 15 as a failure.
|
|
113
118
|
"""
|
|
114
119
|
try:
|
|
120
|
+
targets = self._radon_metric_targets()
|
|
121
|
+
if not targets:
|
|
122
|
+
return {
|
|
123
|
+
"status": "success",
|
|
124
|
+
"issues": [],
|
|
125
|
+
"return_code": 0,
|
|
126
|
+
"output": "",
|
|
127
|
+
}
|
|
115
128
|
result = subprocess.run(
|
|
116
129
|
[
|
|
117
130
|
sys.executable,
|
|
@@ -120,7 +133,7 @@ class HardEvaluator:
|
|
|
120
133
|
"cc",
|
|
121
134
|
"-j",
|
|
122
135
|
"-a",
|
|
123
|
-
|
|
136
|
+
*targets,
|
|
124
137
|
],
|
|
125
138
|
capture_output=True,
|
|
126
139
|
text=True,
|
|
@@ -178,8 +191,11 @@ class HardEvaluator:
|
|
|
178
191
|
but it contributes to the scorecard.
|
|
179
192
|
"""
|
|
180
193
|
try:
|
|
194
|
+
targets = self._radon_metric_targets()
|
|
195
|
+
if not targets:
|
|
196
|
+
return {"status": "success", "mi_scores": {}, "return_code": 0}
|
|
181
197
|
result = subprocess.run(
|
|
182
|
-
[sys.executable, "-m", "radon", "mi", "-j",
|
|
198
|
+
[sys.executable, "-m", "radon", "mi", "-j", *targets],
|
|
183
199
|
capture_output=True,
|
|
184
200
|
text=True,
|
|
185
201
|
check=False
|
|
"""
Python file discovery helpers.
"""

from pathlib import Path

# Directory names that never contribute to analysis targets.
SKIPPED_DIRS = {"__pycache__", "env", "test", "tests", "vendors", "venv"}


def should_skip_python_path(file_path: Path, root: Path) -> bool:
    """Return True when *file_path* should be excluded from analysis.

    A file is skipped when its name looks like a test module
    (``test_*.py`` / ``*_test.py``) or when any path component relative
    to *root* is hidden (dot-prefixed) or a conventionally ignored
    directory.
    """
    name = file_path.name
    if name.startswith("test_") or name.endswith("_test.py"):
        return True
    try:
        parts = file_path.relative_to(root).parts
    except ValueError:
        # file_path lies outside root; fall back to its own components.
        parts = file_path.parts
    for part in parts:
        if part.startswith(".") or part in SKIPPED_DIRS:
            return True
    return False


def collect_python_files(root: Path) -> list[Path]:
    """Collect the analyzable ``.py`` files under *root*, sorted by path.

    A *root* that is itself a file yields a one-element list when it is a
    Python file, otherwise an empty list.
    """
    if root.is_file():
        return [root] if root.suffix == ".py" else []
    discovered: list[Path] = []
    for candidate in sorted(root.rglob("*.py")):
        if not should_skip_python_path(candidate, root):
            discovered.append(candidate)
    return discovered
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Report-building helpers for soft evaluation.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
MI_PASS_THRESHOLD = 70.0
|
|
9
|
+
QA_PASS_THRESHOLD = 75.0
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def collect_hard_errors(hard_results: dict[str, Any]) -> list[str]:
    """Summarize failing hard-evaluation tools as human-readable strings.

    Returns an empty list when the hard gate passed overall; otherwise one
    entry per failing tool. Pytest failures surface the tool-provided
    error message when one exists.
    """
    if hard_results.get("all_passed", True):
        return []

    errors: list[str] = []
    simple_tools = (
        ("ruff", "Linter (Ruff) failed."),
        ("mypy", "Type checker (Mypy) failed."),
    )
    for tool, message in simple_tools:
        if hard_results.get(tool, {}).get("status") != "success":
            errors.append(message)
    pytest_result = hard_results.get("pytest", {})
    if pytest_result.get("status") != "success":
        errors.append(pytest_result.get("error_message", "Tests or Coverage failed."))
    return errors
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def extract_metrics(
    hard_results: dict[str, Any],
    qc_results: dict[str, Any],
    soft_results: dict[str, Any],
) -> dict[str, Any]:
    """Flatten hard/QC/soft evaluation outputs into one metrics mapping."""
    scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
    if scores:
        average_mi = sum(scores.values()) / len(scores)
    else:
        # Nothing was scored: treat the codebase as perfectly maintainable.
        average_mi = 100.0
    metrics: dict[str, Any] = {}
    metrics["avg_mi"] = average_mi
    metrics["cc_issues"] = hard_results.get("radon_cc", {}).get("issues", [])
    metrics["hard_errors"] = collect_hard_errors(hard_results)
    metrics["hard_failed"] = not hard_results.get("all_passed", True)
    metrics["qa_entities"] = soft_results.get("qa_results", {}).get("sampled_entities", [])
    metrics["qa_score"] = soft_results.get("understandability_score", 100.0)
    metrics["qc_errors"] = qc_results.get("failures", [])
    metrics["qc_failed"] = not qc_results.get("all_passed", True)
    return metrics
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def determine_verdict(metrics: dict[str, Any], mock: bool = False) -> str:
    """Compute the overall "Pass"/"Fail" verdict from aggregated metrics.

    Hard or QC failures are unconditional fails; otherwise the codebase
    must clear the MI and QA thresholds with no critical complexity
    issues. Mock runs carry a " (Mock)" suffix on the verdict.
    """
    suffix = " (Mock)" if mock else ""
    if metrics["hard_failed"] or metrics["qc_failed"]:
        return f"Fail{suffix}"
    meets_mi = metrics["avg_mi"] >= MI_PASS_THRESHOLD
    meets_qa = metrics["qa_score"] > QA_PASS_THRESHOLD
    clean_cc = not metrics["cc_issues"]
    verdict = "Pass" if (meets_mi and meets_qa and clean_cc) else "Fail"
    return f"{verdict}{suffix}"
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def build_mock_summary(
    metrics: dict[str, Any],
    hard_results: dict[str, Any],
) -> str:
    """Build a one-line summary for mock (LLM-free) evaluation runs."""
    parts: list[str] = []
    if metrics["hard_failed"]:
        pytest_error = hard_results.get("pytest", {}).get("error_message", "")
        parts.append(f"Hard evaluation failed. {pytest_error}".strip())
    if metrics["qc_failed"]:
        parts.append("Governance QC failed.")
    # Fall back to a neutral message when nothing failed.
    return " ".join(parts) if parts else "Mock evaluation completed without LLM."
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def build_mock_final_report(
    hard_results: dict[str, Any],
    metrics: dict[str, Any],
) -> dict[str, Any]:
    """Assemble the canned final report used when no LLM is available."""
    canned_suggestions = [
        {
            "title": "Mock Suggestion 1",
            "description": "Add more docstrings.",
            "target_file": "all",
        },
        {
            "title": "Mock Suggestion 2",
            "description": "Refactor large functions.",
            "target_file": "all",
        },
        {
            "title": "Mock Suggestion 3",
            "description": "Improve test coverage.",
            "target_file": "tests/",
        },
    ]
    return {
        "verdict": determine_verdict(metrics, mock=True),
        "summary": build_mock_summary(metrics, hard_results),
        "suggestions": canned_suggestions,
    }
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def build_final_report_messages(metrics: dict[str, Any]) -> list[dict[str, str]]:
    """Build the system/user chat messages for the LLM final-report call.

    The system prompt pins the required JSON output schema and the
    pass/fail rule (thresholds interpolated from module constants); the
    user message carries the concrete metrics, failure lists, and QA
    feedback snippets.
    """
    sys_prompt = (
        "You are an elite Python Codebase Evaluator. You have just analyzed "
        "a repository. Your task is to provide a final judgment and EXACTLY "
        "3 concrete, actionable improvement suggestions.\n"
        "If the codebase failed its Hard or QC evaluations (e.g. tests "
        "failed, coverage is low, or governance violated), your suggestions "
        "MUST prioritize fixing those issues.\n"
        "Otherwise, focus on refactoring/quality improvements without "
        "changing external functionality.\n\n"
        "Output MUST be in valid JSON matching this schema:\n"
        "{\n"
        ' "verdict": "Pass" or "Fail",\n'
        ' "summary": "One paragraph summary of codebase health and '
        'any critical failures",\n'
        ' "suggestions": [\n'
        ' {"title": "str", "description": "str", "target_file": "str"}\n'
        " ]\n"
        "}\n"
        "Rule for Verdict: If there are Hard Failures or QC Failures, "
        "verdict MUST be Fail. Otherwise, Pass if Average Maintainability "
        f">= {MI_PASS_THRESHOLD:.0f} and QA Score > {QA_PASS_THRESHOLD:.0f} "
        "and no Critical CC issues (>15). Otherwise Fail."
    )
    user_content = (
        f"Metrics:\n"
        f"- Average Maintainability Index (MI): {metrics['avg_mi']:.1f}/100\n"
        f"- Number of functions with Cyclomatic Complexity > 15: "
        f"{len(metrics['cc_issues'])}\n"
        f"- Agent QA Readability Score: {metrics['qa_score']:.1f}/100\n\n"
        f"Failures (Prioritize these!):\n"
        f"- Hard Evaluation Errors: "
        f"{metrics['hard_errors'] if metrics['hard_errors'] else 'None'}\n"
        f"- QC/Governance Errors: "
        f"{metrics['qc_errors'] if metrics['qc_errors'] else 'None'}\n\n"
        f"QA Feedback Snippets:\n"
        + "\n".join(
            [f" * {q['entity']}: {q['feedback']}" for q in metrics["qa_entities"]]
        )
    )
    return [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": user_content},
    ]
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def parse_final_report_response(raw_content: str) -> dict[str, Any]:
    """Parse the LLM's JSON reply, insisting on a top-level object.

    Raises:
        ValueError: if the JSON decodes to anything other than a dict.
        json.JSONDecodeError: if *raw_content* is not valid JSON.
    """
    decoded = json.loads(raw_content)
    if not isinstance(decoded, dict):
        raise ValueError("JSON response is not a dictionary.")
    return decoded
|
|
@@ -15,6 +15,17 @@ from openai import OpenAI
|
|
|
15
15
|
from pydantic import BaseModel
|
|
16
16
|
from rich.console import Console
|
|
17
17
|
|
|
18
|
+
from python_harness.python_file_inventory import collect_python_files
|
|
19
|
+
from python_harness.soft_eval_report import (
|
|
20
|
+
build_final_report_messages,
|
|
21
|
+
build_mock_final_report,
|
|
22
|
+
build_mock_summary,
|
|
23
|
+
collect_hard_errors,
|
|
24
|
+
determine_verdict,
|
|
25
|
+
extract_metrics,
|
|
26
|
+
parse_final_report_response,
|
|
27
|
+
)
|
|
28
|
+
|
|
18
29
|
console = Console()
|
|
19
30
|
|
|
20
31
|
class FileSummary(BaseModel):
|
|
@@ -57,23 +68,7 @@ class SoftEvaluator:
|
|
|
57
68
|
Recursively find all Python files in the target directory,
|
|
58
69
|
excluding hidden dirs and .venv.
|
|
59
70
|
"""
|
|
60
|
-
|
|
61
|
-
for root, dirs, files in os.walk(self.target_path):
|
|
62
|
-
# Exclude hidden directories and virtual environments
|
|
63
|
-
dirs[:] = [
|
|
64
|
-
d
|
|
65
|
-
for d in dirs
|
|
66
|
-
if not d.startswith(".") and d not in (
|
|
67
|
-
"__pycache__",
|
|
68
|
-
"venv",
|
|
69
|
-
"env",
|
|
70
|
-
"vendors",
|
|
71
|
-
)
|
|
72
|
-
]
|
|
73
|
-
for file in files:
|
|
74
|
-
if file.endswith(".py"):
|
|
75
|
-
python_files.append(Path(root) / file)
|
|
76
|
-
return python_files
|
|
71
|
+
return collect_python_files(self.target_path)
|
|
77
72
|
|
|
78
73
|
def _read_file_text(self, file_path: Path) -> str:
|
|
79
74
|
return file_path.read_text(encoding="utf-8")
|
|
@@ -164,145 +159,36 @@ class SoftEvaluator:
|
|
|
164
159
|
qc_results: dict[str, Any],
|
|
165
160
|
soft_results: dict[str, Any],
|
|
166
161
|
) -> dict[str, Any]:
|
|
167
|
-
|
|
168
|
-
mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
|
|
169
|
-
avg_mi = sum(mi_scores.values()) / len(mi_scores) if mi_scores else 100.0
|
|
170
|
-
return {
|
|
171
|
-
"cc_issues": cc_issues,
|
|
172
|
-
"avg_mi": avg_mi,
|
|
173
|
-
"hard_failed": not hard_results.get("all_passed", True),
|
|
174
|
-
"qc_failed": not qc_results.get("all_passed", True),
|
|
175
|
-
"qc_errors": qc_results.get("failures", []),
|
|
176
|
-
"qa_score": soft_results.get("understandability_score", 100.0),
|
|
177
|
-
"qa_entities": soft_results.get("qa_results", {}).get(
|
|
178
|
-
"sampled_entities", []
|
|
179
|
-
),
|
|
180
|
-
"hard_errors": self._collect_hard_errors(hard_results),
|
|
181
|
-
}
|
|
162
|
+
return extract_metrics(hard_results, qc_results, soft_results)
|
|
182
163
|
|
|
183
164
|
def _collect_hard_errors(self, hard_results: dict[str, Any]) -> list[str]:
|
|
184
|
-
|
|
185
|
-
return []
|
|
186
|
-
|
|
187
|
-
hard_errors = []
|
|
188
|
-
if hard_results.get("ruff", {}).get("status") != "success":
|
|
189
|
-
hard_errors.append("Linter (Ruff) failed.")
|
|
190
|
-
if hard_results.get("mypy", {}).get("status") != "success":
|
|
191
|
-
hard_errors.append("Type checker (Mypy) failed.")
|
|
192
|
-
if hard_results.get("pytest", {}).get("status") != "success":
|
|
193
|
-
hard_errors.append(
|
|
194
|
-
hard_results.get("pytest", {}).get(
|
|
195
|
-
"error_message", "Tests or Coverage failed."
|
|
196
|
-
)
|
|
197
|
-
)
|
|
198
|
-
return hard_errors
|
|
165
|
+
return collect_hard_errors(hard_results)
|
|
199
166
|
|
|
200
167
|
def _determine_verdict(self, metrics: dict[str, Any], mock: bool = False) -> str:
|
|
201
|
-
|
|
202
|
-
if metrics["hard_failed"] or metrics["qc_failed"]:
|
|
203
|
-
return f"Fail{suffix}"
|
|
204
|
-
passed = (
|
|
205
|
-
metrics["avg_mi"] > 50
|
|
206
|
-
and metrics["qa_score"] > 75
|
|
207
|
-
and not metrics["cc_issues"]
|
|
208
|
-
)
|
|
209
|
-
return f"Pass{suffix}" if passed else f"Fail{suffix}"
|
|
168
|
+
return determine_verdict(metrics, mock=mock)
|
|
210
169
|
|
|
211
170
|
def _build_mock_summary(
|
|
212
171
|
self,
|
|
213
172
|
metrics: dict[str, Any],
|
|
214
173
|
hard_results: dict[str, Any],
|
|
215
174
|
) -> str:
|
|
216
|
-
|
|
217
|
-
if metrics["hard_failed"]:
|
|
218
|
-
pytest_err = hard_results.get("pytest", {}).get("error_message", "")
|
|
219
|
-
summary_parts.append(f"Hard evaluation failed. {pytest_err}".strip())
|
|
220
|
-
if metrics["qc_failed"]:
|
|
221
|
-
summary_parts.append("Governance QC failed.")
|
|
222
|
-
if not summary_parts:
|
|
223
|
-
summary_parts.append("Mock evaluation completed without LLM.")
|
|
224
|
-
return " ".join(summary_parts)
|
|
175
|
+
return build_mock_summary(metrics, hard_results)
|
|
225
176
|
|
|
226
177
|
def _build_mock_final_report(
|
|
227
178
|
self,
|
|
228
179
|
hard_results: dict[str, Any],
|
|
229
180
|
metrics: dict[str, Any],
|
|
230
181
|
) -> dict[str, Any]:
|
|
231
|
-
return
|
|
232
|
-
"verdict": self._determine_verdict(metrics, mock=True),
|
|
233
|
-
"summary": self._build_mock_summary(metrics, hard_results),
|
|
234
|
-
"suggestions": [
|
|
235
|
-
{
|
|
236
|
-
"title": "Mock Suggestion 1",
|
|
237
|
-
"description": "Add more docstrings.",
|
|
238
|
-
"target_file": "all",
|
|
239
|
-
},
|
|
240
|
-
{
|
|
241
|
-
"title": "Mock Suggestion 2",
|
|
242
|
-
"description": "Refactor large functions.",
|
|
243
|
-
"target_file": "all",
|
|
244
|
-
},
|
|
245
|
-
{
|
|
246
|
-
"title": "Mock Suggestion 3",
|
|
247
|
-
"description": "Improve test coverage.",
|
|
248
|
-
"target_file": "tests/",
|
|
249
|
-
},
|
|
250
|
-
],
|
|
251
|
-
}
|
|
182
|
+
return build_mock_final_report(hard_results, metrics)
|
|
252
183
|
|
|
253
184
|
def _build_final_report_messages(
|
|
254
185
|
self,
|
|
255
186
|
metrics: dict[str, Any],
|
|
256
187
|
) -> list[dict[str, str]]:
|
|
257
|
-
|
|
258
|
-
"You are an elite Python Codebase Evaluator. You have just analyzed "
|
|
259
|
-
"a repository. Your task is to provide a final judgment and EXACTLY "
|
|
260
|
-
"3 concrete, actionable improvement suggestions.\n"
|
|
261
|
-
"If the codebase failed its Hard or QC evaluations (e.g. tests "
|
|
262
|
-
"failed, coverage is low, or governance violated), your suggestions "
|
|
263
|
-
"MUST prioritize fixing those issues.\n"
|
|
264
|
-
"Otherwise, focus on refactoring/quality improvements without "
|
|
265
|
-
"changing external functionality.\n\n"
|
|
266
|
-
"Output MUST be in valid JSON matching this schema:\n"
|
|
267
|
-
"{\n"
|
|
268
|
-
' "verdict": "Pass" or "Fail",\n'
|
|
269
|
-
' "summary": "One paragraph summary of codebase health and '
|
|
270
|
-
'any critical failures",\n'
|
|
271
|
-
' "suggestions": [\n'
|
|
272
|
-
' {"title": "str", "description": "str", "target_file": "str"}\n'
|
|
273
|
-
" ]\n"
|
|
274
|
-
"}\n"
|
|
275
|
-
"Rule for Verdict: If there are Hard Failures or QC Failures, "
|
|
276
|
-
"verdict MUST be Fail. Otherwise, Pass if Average Maintainability "
|
|
277
|
-
"> 50 and QA Score > 75 and no Critical CC issues (>15). "
|
|
278
|
-
"Otherwise Fail."
|
|
279
|
-
)
|
|
280
|
-
user_content = (
|
|
281
|
-
f"Metrics:\n"
|
|
282
|
-
f"- Average Maintainability Index (MI): {metrics['avg_mi']:.1f}/100\n"
|
|
283
|
-
f"- Number of functions with Cyclomatic Complexity > 15: "
|
|
284
|
-
f"{len(metrics['cc_issues'])}\n"
|
|
285
|
-
f"- Agent QA Readability Score: {metrics['qa_score']:.1f}/100\n\n"
|
|
286
|
-
f"Failures (Prioritize these!):\n"
|
|
287
|
-
f"- Hard Evaluation Errors: "
|
|
288
|
-
f"{metrics['hard_errors'] if metrics['hard_errors'] else 'None'}\n"
|
|
289
|
-
f"- QC/Governance Errors: "
|
|
290
|
-
f"{metrics['qc_errors'] if metrics['qc_errors'] else 'None'}\n\n"
|
|
291
|
-
f"QA Feedback Snippets:\n"
|
|
292
|
-
+ "\n".join(
|
|
293
|
-
[f" * {q['entity']}: {q['feedback']}" for q in metrics["qa_entities"]]
|
|
294
|
-
)
|
|
295
|
-
)
|
|
296
|
-
return [
|
|
297
|
-
{"role": "system", "content": sys_prompt},
|
|
298
|
-
{"role": "user", "content": user_content},
|
|
299
|
-
]
|
|
188
|
+
return build_final_report_messages(metrics)
|
|
300
189
|
|
|
301
190
|
def _parse_final_report_response(self, raw_content: str) -> dict[str, Any]:
|
|
302
|
-
|
|
303
|
-
if isinstance(parsed_json, dict):
|
|
304
|
-
return parsed_json
|
|
305
|
-
raise ValueError("JSON response is not a dictionary.")
|
|
191
|
+
return parse_final_report_response(raw_content)
|
|
306
192
|
|
|
307
193
|
def calculate_token_complexity(self, file_path: Path) -> int:
|
|
308
194
|
"""
|
|
@@ -5,7 +5,9 @@ python_harness/__init__.py
|
|
|
5
5
|
python_harness/cli.py
|
|
6
6
|
python_harness/evaluator.py
|
|
7
7
|
python_harness/hard_evaluator.py
|
|
8
|
+
python_harness/python_file_inventory.py
|
|
8
9
|
python_harness/qc_evaluator.py
|
|
10
|
+
python_harness/soft_eval_report.py
|
|
9
11
|
python_harness/soft_evaluator.py
|
|
10
12
|
python_harness.egg-info/PKG-INFO
|
|
11
13
|
python_harness.egg-info/SOURCES.txt
|
|
@@ -501,3 +501,17 @@ def test_measure_surfaces_hard_tool_errors(monkeypatch: Any) -> None:
|
|
|
501
501
|
assert "No module named mypy" in result.stdout
|
|
502
502
|
assert "Pytest/Coverage issues found" in result.stdout
|
|
503
503
|
assert "No module named pytest" in result.stdout
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
def test_mi_scorecard_uses_warning_color_below_70() -> None:
|
|
507
|
+
"""
|
|
508
|
+
Test that MI below 70 is no longer rendered as healthy green.
|
|
509
|
+
"""
|
|
510
|
+
assert cli_module._mi_scorecard_color(65.0) == "yellow"
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
def test_mi_scorecard_uses_green_at_70() -> None:
|
|
514
|
+
"""
|
|
515
|
+
Test that MI 70 is rendered at the healthy threshold.
|
|
516
|
+
"""
|
|
517
|
+
assert cli_module._mi_scorecard_color(70.0) == "green"
|
|
@@ -79,6 +79,7 @@ def test_radon_cc_syntax_error(monkeypatch: Any, tmp_path: Path) -> None:
|
|
|
79
79
|
# and writing an error to stderr (which happens when there are syntax errors)
|
|
80
80
|
import subprocess
|
|
81
81
|
original_run = subprocess.run
|
|
82
|
+
(tmp_path / "bad.py").write_text("def broken(:\n")
|
|
82
83
|
|
|
83
84
|
def mock_run(args: Any, **kwargs: Any) -> Any:
|
|
84
85
|
# Check if the command is for radon cc (sys.executable, -m, radon, cc)
|
|
@@ -372,6 +373,21 @@ def test_run_pytest_surfaces_stderr(monkeypatch: Any, tmp_path: Path) -> None:
|
|
|
372
373
|
assert result["error_message"] == "No module named pytest"
|
|
373
374
|
|
|
374
375
|
|
|
376
|
+
def test_radon_mi_targets_exclude_test_files(tmp_path: Path) -> None:
|
|
377
|
+
"""
|
|
378
|
+
Test that maintainability scoring ignores test files and directories.
|
|
379
|
+
"""
|
|
380
|
+
(tmp_path / "pkg").mkdir()
|
|
381
|
+
(tmp_path / "pkg" / "keep.py").write_text("x = 1\n")
|
|
382
|
+
(tmp_path / "tests").mkdir()
|
|
383
|
+
(tmp_path / "tests" / "test_skip.py").write_text("x = 1\n")
|
|
384
|
+
(tmp_path / "test_skip.py").write_text("x = 1\n")
|
|
385
|
+
|
|
386
|
+
evaluator = HardEvaluator(str(tmp_path))
|
|
387
|
+
|
|
388
|
+
assert evaluator._radon_metric_targets() == [str(tmp_path / "pkg" / "keep.py")]
|
|
389
|
+
|
|
390
|
+
|
|
375
391
|
def test_evaluate_fails_when_coverage_report_missing(monkeypatch: Any) -> None:
|
|
376
392
|
"""
|
|
377
393
|
Test that missing coverage data fails the hard gate even when tests pass.
|
|
@@ -111,6 +111,66 @@ def test_generate_final_report_mock_fails_on_hard_failure() -> None:
|
|
|
111
111
|
os.environ["LLM_API_KEY"] = old_key
|
|
112
112
|
|
|
113
113
|
|
|
114
|
+
def test_determine_verdict_fails_below_mi_70(tmp_path: Path) -> None:
|
|
115
|
+
"""
|
|
116
|
+
Test that MI below 70 no longer qualifies for a passing verdict.
|
|
117
|
+
"""
|
|
118
|
+
evaluator = SoftEvaluator(str(tmp_path))
|
|
119
|
+
|
|
120
|
+
verdict = evaluator._determine_verdict(
|
|
121
|
+
{
|
|
122
|
+
"hard_failed": False,
|
|
123
|
+
"qc_failed": False,
|
|
124
|
+
"avg_mi": 65.0,
|
|
125
|
+
"qa_score": 90.0,
|
|
126
|
+
"cc_issues": [],
|
|
127
|
+
}
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
assert verdict == "Fail"
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def test_determine_verdict_passes_at_mi_70(tmp_path: Path) -> None:
|
|
134
|
+
"""
|
|
135
|
+
Test that MI of 70 is sufficient for a passing verdict.
|
|
136
|
+
"""
|
|
137
|
+
evaluator = SoftEvaluator(str(tmp_path))
|
|
138
|
+
|
|
139
|
+
verdict = evaluator._determine_verdict(
|
|
140
|
+
{
|
|
141
|
+
"hard_failed": False,
|
|
142
|
+
"qc_failed": False,
|
|
143
|
+
"avg_mi": 70.0,
|
|
144
|
+
"qa_score": 90.0,
|
|
145
|
+
"cc_issues": [],
|
|
146
|
+
}
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
assert verdict == "Pass"
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def test_final_report_prompt_mentions_mi_70_threshold(tmp_path: Path) -> None:
|
|
153
|
+
"""
|
|
154
|
+
Test that the final report prompt advertises the updated MI threshold.
|
|
155
|
+
"""
|
|
156
|
+
evaluator = SoftEvaluator(str(tmp_path))
|
|
157
|
+
|
|
158
|
+
messages = evaluator._build_final_report_messages(
|
|
159
|
+
{
|
|
160
|
+
"avg_mi": 70.0,
|
|
161
|
+
"cc_issues": [],
|
|
162
|
+
"qa_score": 90.0,
|
|
163
|
+
"hard_errors": [],
|
|
164
|
+
"qc_errors": [],
|
|
165
|
+
"qa_entities": [],
|
|
166
|
+
"hard_failed": False,
|
|
167
|
+
"qc_failed": False,
|
|
168
|
+
}
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
assert "Average Maintainability >= 70" in messages[0]["content"]
|
|
172
|
+
|
|
173
|
+
|
|
114
174
|
def test_read_file_text_helper_reads_utf8_content(tmp_path: Path) -> None:
|
|
115
175
|
"""
|
|
116
176
|
Test that the file-reading helper returns UTF-8 text content.
|
|
@@ -145,6 +205,8 @@ def test_get_python_files_filters_hidden_and_virtualenv_dirs(tmp_path: Path) ->
|
|
|
145
205
|
(tmp_path / "venv" / "skip.py").write_text("x = 1\n")
|
|
146
206
|
(tmp_path / "vendors").mkdir()
|
|
147
207
|
(tmp_path / "vendors" / "skip.py").write_text("x = 1\n")
|
|
208
|
+
(tmp_path / "tests").mkdir()
|
|
209
|
+
(tmp_path / "tests" / "test_skip.py").write_text("x = 1\n")
|
|
148
210
|
|
|
149
211
|
evaluator = SoftEvaluator(str(tmp_path))
|
|
150
212
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{python_harness-0.0.11 → python_harness-0.0.12}/python_harness.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|