python-harness 0.0.10__tar.gz → 0.0.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {python_harness-0.0.10/python_harness.egg-info → python_harness-0.0.12}/PKG-INFO +6 -6
- {python_harness-0.0.10 → python_harness-0.0.12}/pyproject.toml +6 -6
- {python_harness-0.0.10 → python_harness-0.0.12}/python_harness/__init__.py +1 -1
- {python_harness-0.0.10 → python_harness-0.0.12}/python_harness/cli.py +21 -3
- {python_harness-0.0.10 → python_harness-0.0.12}/python_harness/hard_evaluator.py +21 -3
- python_harness-0.0.12/python_harness/python_file_inventory.py +27 -0
- python_harness-0.0.12/python_harness/soft_eval_report.py +154 -0
- {python_harness-0.0.10 → python_harness-0.0.12}/python_harness/soft_evaluator.py +19 -133
- {python_harness-0.0.10 → python_harness-0.0.12/python_harness.egg-info}/PKG-INFO +6 -6
- {python_harness-0.0.10 → python_harness-0.0.12}/python_harness.egg-info/SOURCES.txt +2 -0
- {python_harness-0.0.10 → python_harness-0.0.12}/tests/test_cli.py +88 -0
- {python_harness-0.0.10 → python_harness-0.0.12}/tests/test_hard_evaluator.py +80 -0
- {python_harness-0.0.10 → python_harness-0.0.12}/tests/test_soft_evaluator.py +62 -0
- {python_harness-0.0.10 → python_harness-0.0.12}/LICENSE +0 -0
- {python_harness-0.0.10 → python_harness-0.0.12}/README.md +0 -0
- {python_harness-0.0.10 → python_harness-0.0.12}/python_harness/evaluator.py +0 -0
- {python_harness-0.0.10 → python_harness-0.0.12}/python_harness/qc_evaluator.py +0 -0
- {python_harness-0.0.10 → python_harness-0.0.12}/python_harness.egg-info/dependency_links.txt +0 -0
- {python_harness-0.0.10 → python_harness-0.0.12}/python_harness.egg-info/entry_points.txt +0 -0
- {python_harness-0.0.10 → python_harness-0.0.12}/python_harness.egg-info/requires.txt +3 -3
- {python_harness-0.0.10 → python_harness-0.0.12}/python_harness.egg-info/top_level.txt +0 -0
- {python_harness-0.0.10 → python_harness-0.0.12}/setup.cfg +0 -0
- {python_harness-0.0.10 → python_harness-0.0.12}/tests/test_evaluator.py +0 -0
- {python_harness-0.0.10 → python_harness-0.0.12}/tests/test_qc_evaluator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: python-harness
|
|
3
|
-
Version: 0.0.10
|
|
3
|
+
Version: 0.0.12
|
|
4
4
|
Summary: An agentic codebase evaluation and evolution tool for Python projects.
|
|
5
5
|
Author-email: Mingli Yuan <mingli.yuan@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -15,13 +15,13 @@ Requires-Dist: anthropic>=0.18.0
|
|
|
15
15
|
Requires-Dist: tenacity>=8.2.0
|
|
16
16
|
Requires-Dist: tiktoken>=0.6.0
|
|
17
17
|
Requires-Dist: python-dotenv>=1.0.0
|
|
18
|
+
Requires-Dist: pytest>=8.0.0
|
|
19
|
+
Requires-Dist: pytest-cov>=4.1.0
|
|
20
|
+
Requires-Dist: ruff>=0.3.0
|
|
21
|
+
Requires-Dist: mypy>=1.9.0
|
|
22
|
+
Requires-Dist: radon>=6.0.1
|
|
18
23
|
Provides-Extra: dev
|
|
19
|
-
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
20
|
-
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
21
|
-
Requires-Dist: ruff>=0.3.0; extra == "dev"
|
|
22
|
-
Requires-Dist: mypy>=1.9.0; extra == "dev"
|
|
23
24
|
Requires-Dist: ty>=0.0.1; extra == "dev"
|
|
24
|
-
Requires-Dist: radon>=6.0.1; extra == "dev"
|
|
25
25
|
Dynamic: license-file
|
|
26
26
|
|
|
27
27
|
# Python Harness
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "python-harness"
|
|
3
|
-
version = "0.0.10"
|
|
3
|
+
version = "0.0.12"
|
|
4
4
|
description = "An agentic codebase evaluation and evolution tool for Python projects."
|
|
5
5
|
requires-python = ">=3.10"
|
|
6
6
|
readme = "README.md"
|
|
@@ -17,18 +17,18 @@ dependencies = [
|
|
|
17
17
|
"tenacity>=8.2.0",
|
|
18
18
|
"tiktoken>=0.6.0",
|
|
19
19
|
"python-dotenv>=1.0.0",
|
|
20
|
-
]
|
|
21
|
-
|
|
22
|
-
[project.optional-dependencies]
|
|
23
|
-
dev = [
|
|
24
20
|
"pytest>=8.0.0",
|
|
25
21
|
"pytest-cov>=4.1.0",
|
|
26
22
|
"ruff>=0.3.0",
|
|
27
23
|
"mypy>=1.9.0",
|
|
28
|
-
"ty>=0.0.1", # Assuming ty is available or will be replaced with actual LSP integration
|
|
29
24
|
"radon>=6.0.1",
|
|
30
25
|
]
|
|
31
26
|
|
|
27
|
+
[project.optional-dependencies]
|
|
28
|
+
dev = [
|
|
29
|
+
"ty>=0.0.1", # Assuming ty is available or will be replaced with actual LSP integration
|
|
30
|
+
]
|
|
31
|
+
|
|
32
32
|
[build-system]
|
|
33
33
|
requires = ["setuptools>=61.0"]
|
|
34
34
|
build-backend = "setuptools.build_meta"
|
|
@@ -21,6 +21,8 @@ else:
|
|
|
21
21
|
|
|
22
22
|
app = typer.Typer(help="Agentic harness tool for universal Python codebase evaluation.")
|
|
23
23
|
console = Console()
|
|
24
|
+
MI_HEALTHY_THRESHOLD = 70.0
|
|
25
|
+
MI_WARNING_THRESHOLD = 40.0
|
|
24
26
|
|
|
25
27
|
|
|
26
28
|
def _print_detail_block(title: str, details: str, color: str) -> None:
|
|
@@ -33,13 +35,18 @@ def _print_detail_block(title: str, details: str, color: str) -> None:
|
|
|
33
35
|
console.print()
|
|
34
36
|
|
|
35
37
|
|
|
36
|
-
def _print_ruff_issues(issues: list[dict[str, Any]]) -> None:
|
|
38
|
+
def _print_ruff_issues(
|
|
39
|
+
issues: list[dict[str, Any]],
|
|
40
|
+
error_message: str = "",
|
|
41
|
+
) -> None:
|
|
37
42
|
console.print("[red]Ruff issues found:[/red]")
|
|
38
43
|
for issue in issues:
|
|
39
44
|
file = issue.get("filename", "unknown")
|
|
40
45
|
line = issue.get("location", {}).get("row", "?")
|
|
41
46
|
msg = issue.get("message", "unknown issue")
|
|
42
47
|
console.print(f" - {file}:{line} {msg}")
|
|
48
|
+
if not issues and error_message:
|
|
49
|
+
console.print(f" {error_message}")
|
|
43
50
|
console.print()
|
|
44
51
|
|
|
45
52
|
|
|
@@ -99,7 +106,10 @@ def _print_hard_failure_details(hard_results: dict[str, Any]) -> None:
|
|
|
99
106
|
|
|
100
107
|
ruff_issues = hard_results.get("ruff", {}).get("issues", [])
|
|
101
108
|
if hard_results.get("ruff", {}).get("status") != "success":
|
|
102
|
-
_print_ruff_issues(ruff_issues)
|
|
109
|
+
_print_ruff_issues(
|
|
110
|
+
ruff_issues,
|
|
111
|
+
str(hard_results.get("ruff", {}).get("error_message", "")),
|
|
112
|
+
)
|
|
103
113
|
|
|
104
114
|
if hard_results.get("mypy", {}).get("status") != "success":
|
|
105
115
|
output = str(hard_results.get("mypy", {}).get("output", ""))
|
|
@@ -125,13 +135,21 @@ def _print_hard_evaluation_summary(hard_results: dict[str, Any]) -> None:
|
|
|
125
135
|
_print_hard_failure_details(hard_results)
|
|
126
136
|
|
|
127
137
|
|
|
138
|
+
def _mi_scorecard_color(avg_mi: float) -> str:
|
|
139
|
+
if avg_mi >= MI_HEALTHY_THRESHOLD:
|
|
140
|
+
return "green"
|
|
141
|
+
if avg_mi >= MI_WARNING_THRESHOLD:
|
|
142
|
+
return "yellow"
|
|
143
|
+
return "red"
|
|
144
|
+
|
|
145
|
+
|
|
128
146
|
def _print_mi_scorecard(hard_results: dict[str, Any]) -> None:
|
|
129
147
|
mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
|
|
130
148
|
if not mi_scores:
|
|
131
149
|
return
|
|
132
150
|
|
|
133
151
|
avg_mi = sum(mi_scores.values()) / len(mi_scores)
|
|
134
|
-
color =
|
|
152
|
+
color = _mi_scorecard_color(avg_mi)
|
|
135
153
|
console.print(f"[{color}]Average Maintainability Index: {avg_mi:.1f}/100[/{color}]")
|
|
136
154
|
|
|
137
155
|
|
|
@@ -11,6 +11,8 @@ from typing import Any
|
|
|
11
11
|
|
|
12
12
|
from rich.console import Console
|
|
13
13
|
|
|
14
|
+
from python_harness.python_file_inventory import collect_python_files
|
|
15
|
+
|
|
14
16
|
console = Console()
|
|
15
17
|
PYTEST_TIMEOUT_SECONDS = 60
|
|
16
18
|
|
|
@@ -22,6 +24,9 @@ class HardEvaluator:
|
|
|
22
24
|
def __init__(self, target_path: str):
|
|
23
25
|
self.target_path = Path(target_path).resolve()
|
|
24
26
|
|
|
27
|
+
def _radon_metric_targets(self) -> list[str]:
|
|
28
|
+
return [str(file_path) for file_path in collect_python_files(self.target_path)]
|
|
29
|
+
|
|
25
30
|
def run_ruff(self) -> dict[str, Any]:
|
|
26
31
|
"""
|
|
27
32
|
Run Ruff linter and return results.
|
|
@@ -47,6 +52,7 @@ class HardEvaluator:
|
|
|
47
52
|
"status": status,
|
|
48
53
|
"issues": issues,
|
|
49
54
|
"return_code": result.returncode,
|
|
55
|
+
"error_message": result.stderr.strip(),
|
|
50
56
|
}
|
|
51
57
|
except Exception as e:
|
|
52
58
|
return {"status": "error", "error_message": str(e)}
|
|
@@ -65,7 +71,7 @@ class HardEvaluator:
|
|
|
65
71
|
status = "success" if result.returncode == 0 else "failed"
|
|
66
72
|
return {
|
|
67
73
|
"status": status,
|
|
68
|
-
"output": result.stdout,
|
|
74
|
+
"output": result.stdout or result.stderr,
|
|
69
75
|
"return_code": result.returncode,
|
|
70
76
|
}
|
|
71
77
|
except Exception as e:
|
|
@@ -111,6 +117,14 @@ class HardEvaluator:
|
|
|
111
117
|
Flag any function/method with CC > 15 as a failure.
|
|
112
118
|
"""
|
|
113
119
|
try:
|
|
120
|
+
targets = self._radon_metric_targets()
|
|
121
|
+
if not targets:
|
|
122
|
+
return {
|
|
123
|
+
"status": "success",
|
|
124
|
+
"issues": [],
|
|
125
|
+
"return_code": 0,
|
|
126
|
+
"output": "",
|
|
127
|
+
}
|
|
114
128
|
result = subprocess.run(
|
|
115
129
|
[
|
|
116
130
|
sys.executable,
|
|
@@ -119,7 +133,7 @@ class HardEvaluator:
|
|
|
119
133
|
"cc",
|
|
120
134
|
"-j",
|
|
121
135
|
"-a",
|
|
122
|
-
|
|
136
|
+
*targets,
|
|
123
137
|
],
|
|
124
138
|
capture_output=True,
|
|
125
139
|
text=True,
|
|
@@ -177,8 +191,11 @@ class HardEvaluator:
|
|
|
177
191
|
but it contributes to the scorecard.
|
|
178
192
|
"""
|
|
179
193
|
try:
|
|
194
|
+
targets = self._radon_metric_targets()
|
|
195
|
+
if not targets:
|
|
196
|
+
return {"status": "success", "mi_scores": {}, "return_code": 0}
|
|
180
197
|
result = subprocess.run(
|
|
181
|
-
[sys.executable, "-m", "radon", "mi", "-j",
|
|
198
|
+
[sys.executable, "-m", "radon", "mi", "-j", *targets],
|
|
182
199
|
capture_output=True,
|
|
183
200
|
text=True,
|
|
184
201
|
check=False
|
|
@@ -243,6 +260,7 @@ class HardEvaluator:
|
|
|
243
260
|
"output": result.stdout,
|
|
244
261
|
"return_code": result.returncode,
|
|
245
262
|
"coverage_percentage": coverage_percentage,
|
|
263
|
+
"error_message": result.stderr.strip(),
|
|
246
264
|
}
|
|
247
265
|
except subprocess.TimeoutExpired:
|
|
248
266
|
return {
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Python file discovery helpers.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
SKIPPED_DIRS = {"__pycache__", "env", "test", "tests", "vendors", "venv"}
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def should_skip_python_path(file_path: Path, root: Path) -> bool:
|
|
11
|
+
if file_path.name.startswith("test_") or file_path.name.endswith("_test.py"):
|
|
12
|
+
return True
|
|
13
|
+
try:
|
|
14
|
+
relative_parts = file_path.relative_to(root).parts
|
|
15
|
+
except ValueError:
|
|
16
|
+
relative_parts = file_path.parts
|
|
17
|
+
return any(part.startswith(".") or part in SKIPPED_DIRS for part in relative_parts)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def collect_python_files(root: Path) -> list[Path]:
|
|
21
|
+
if root.is_file():
|
|
22
|
+
return [root] if root.suffix == ".py" else []
|
|
23
|
+
return [
|
|
24
|
+
file_path
|
|
25
|
+
for file_path in sorted(root.rglob("*.py"))
|
|
26
|
+
if not should_skip_python_path(file_path, root)
|
|
27
|
+
]
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Report-building helpers for soft evaluation.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
MI_PASS_THRESHOLD = 70.0
|
|
9
|
+
QA_PASS_THRESHOLD = 75.0
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def collect_hard_errors(hard_results: dict[str, Any]) -> list[str]:
|
|
13
|
+
if hard_results.get("all_passed", True):
|
|
14
|
+
return []
|
|
15
|
+
|
|
16
|
+
hard_errors = []
|
|
17
|
+
if hard_results.get("ruff", {}).get("status") != "success":
|
|
18
|
+
hard_errors.append("Linter (Ruff) failed.")
|
|
19
|
+
if hard_results.get("mypy", {}).get("status") != "success":
|
|
20
|
+
hard_errors.append("Type checker (Mypy) failed.")
|
|
21
|
+
if hard_results.get("pytest", {}).get("status") != "success":
|
|
22
|
+
hard_errors.append(
|
|
23
|
+
hard_results.get("pytest", {}).get(
|
|
24
|
+
"error_message",
|
|
25
|
+
"Tests or Coverage failed.",
|
|
26
|
+
)
|
|
27
|
+
)
|
|
28
|
+
return hard_errors
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def extract_metrics(
|
|
32
|
+
hard_results: dict[str, Any],
|
|
33
|
+
qc_results: dict[str, Any],
|
|
34
|
+
soft_results: dict[str, Any],
|
|
35
|
+
) -> dict[str, Any]:
|
|
36
|
+
mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
|
|
37
|
+
avg_mi = sum(mi_scores.values()) / len(mi_scores) if mi_scores else 100.0
|
|
38
|
+
return {
|
|
39
|
+
"avg_mi": avg_mi,
|
|
40
|
+
"cc_issues": hard_results.get("radon_cc", {}).get("issues", []),
|
|
41
|
+
"hard_errors": collect_hard_errors(hard_results),
|
|
42
|
+
"hard_failed": not hard_results.get("all_passed", True),
|
|
43
|
+
"qa_entities": soft_results.get("qa_results", {}).get("sampled_entities", []),
|
|
44
|
+
"qa_score": soft_results.get("understandability_score", 100.0),
|
|
45
|
+
"qc_errors": qc_results.get("failures", []),
|
|
46
|
+
"qc_failed": not qc_results.get("all_passed", True),
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def determine_verdict(metrics: dict[str, Any], mock: bool = False) -> str:
|
|
51
|
+
suffix = " (Mock)" if mock else ""
|
|
52
|
+
if metrics["hard_failed"] or metrics["qc_failed"]:
|
|
53
|
+
return f"Fail{suffix}"
|
|
54
|
+
passed = (
|
|
55
|
+
metrics["avg_mi"] >= MI_PASS_THRESHOLD
|
|
56
|
+
and metrics["qa_score"] > QA_PASS_THRESHOLD
|
|
57
|
+
and not metrics["cc_issues"]
|
|
58
|
+
)
|
|
59
|
+
return f"Pass{suffix}" if passed else f"Fail{suffix}"
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def build_mock_summary(
|
|
63
|
+
metrics: dict[str, Any],
|
|
64
|
+
hard_results: dict[str, Any],
|
|
65
|
+
) -> str:
|
|
66
|
+
summary_parts = []
|
|
67
|
+
if metrics["hard_failed"]:
|
|
68
|
+
pytest_err = hard_results.get("pytest", {}).get("error_message", "")
|
|
69
|
+
summary_parts.append(f"Hard evaluation failed. {pytest_err}".strip())
|
|
70
|
+
if metrics["qc_failed"]:
|
|
71
|
+
summary_parts.append("Governance QC failed.")
|
|
72
|
+
if not summary_parts:
|
|
73
|
+
summary_parts.append("Mock evaluation completed without LLM.")
|
|
74
|
+
return " ".join(summary_parts)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def build_mock_final_report(
|
|
78
|
+
hard_results: dict[str, Any],
|
|
79
|
+
metrics: dict[str, Any],
|
|
80
|
+
) -> dict[str, Any]:
|
|
81
|
+
return {
|
|
82
|
+
"verdict": determine_verdict(metrics, mock=True),
|
|
83
|
+
"summary": build_mock_summary(metrics, hard_results),
|
|
84
|
+
"suggestions": [
|
|
85
|
+
{
|
|
86
|
+
"title": "Mock Suggestion 1",
|
|
87
|
+
"description": "Add more docstrings.",
|
|
88
|
+
"target_file": "all",
|
|
89
|
+
},
|
|
90
|
+
{
|
|
91
|
+
"title": "Mock Suggestion 2",
|
|
92
|
+
"description": "Refactor large functions.",
|
|
93
|
+
"target_file": "all",
|
|
94
|
+
},
|
|
95
|
+
{
|
|
96
|
+
"title": "Mock Suggestion 3",
|
|
97
|
+
"description": "Improve test coverage.",
|
|
98
|
+
"target_file": "tests/",
|
|
99
|
+
},
|
|
100
|
+
],
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def build_final_report_messages(metrics: dict[str, Any]) -> list[dict[str, str]]:
|
|
105
|
+
sys_prompt = (
|
|
106
|
+
"You are an elite Python Codebase Evaluator. You have just analyzed "
|
|
107
|
+
"a repository. Your task is to provide a final judgment and EXACTLY "
|
|
108
|
+
"3 concrete, actionable improvement suggestions.\n"
|
|
109
|
+
"If the codebase failed its Hard or QC evaluations (e.g. tests "
|
|
110
|
+
"failed, coverage is low, or governance violated), your suggestions "
|
|
111
|
+
"MUST prioritize fixing those issues.\n"
|
|
112
|
+
"Otherwise, focus on refactoring/quality improvements without "
|
|
113
|
+
"changing external functionality.\n\n"
|
|
114
|
+
"Output MUST be in valid JSON matching this schema:\n"
|
|
115
|
+
"{\n"
|
|
116
|
+
' "verdict": "Pass" or "Fail",\n'
|
|
117
|
+
' "summary": "One paragraph summary of codebase health and '
|
|
118
|
+
'any critical failures",\n'
|
|
119
|
+
' "suggestions": [\n'
|
|
120
|
+
' {"title": "str", "description": "str", "target_file": "str"}\n'
|
|
121
|
+
" ]\n"
|
|
122
|
+
"}\n"
|
|
123
|
+
"Rule for Verdict: If there are Hard Failures or QC Failures, "
|
|
124
|
+
"verdict MUST be Fail. Otherwise, Pass if Average Maintainability "
|
|
125
|
+
f">= {MI_PASS_THRESHOLD:.0f} and QA Score > {QA_PASS_THRESHOLD:.0f} "
|
|
126
|
+
"and no Critical CC issues (>15). Otherwise Fail."
|
|
127
|
+
)
|
|
128
|
+
user_content = (
|
|
129
|
+
f"Metrics:\n"
|
|
130
|
+
f"- Average Maintainability Index (MI): {metrics['avg_mi']:.1f}/100\n"
|
|
131
|
+
f"- Number of functions with Cyclomatic Complexity > 15: "
|
|
132
|
+
f"{len(metrics['cc_issues'])}\n"
|
|
133
|
+
f"- Agent QA Readability Score: {metrics['qa_score']:.1f}/100\n\n"
|
|
134
|
+
f"Failures (Prioritize these!):\n"
|
|
135
|
+
f"- Hard Evaluation Errors: "
|
|
136
|
+
f"{metrics['hard_errors'] if metrics['hard_errors'] else 'None'}\n"
|
|
137
|
+
f"- QC/Governance Errors: "
|
|
138
|
+
f"{metrics['qc_errors'] if metrics['qc_errors'] else 'None'}\n\n"
|
|
139
|
+
f"QA Feedback Snippets:\n"
|
|
140
|
+
+ "\n".join(
|
|
141
|
+
[f" * {q['entity']}: {q['feedback']}" for q in metrics["qa_entities"]]
|
|
142
|
+
)
|
|
143
|
+
)
|
|
144
|
+
return [
|
|
145
|
+
{"role": "system", "content": sys_prompt},
|
|
146
|
+
{"role": "user", "content": user_content},
|
|
147
|
+
]
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def parse_final_report_response(raw_content: str) -> dict[str, Any]:
|
|
151
|
+
parsed_json = json.loads(raw_content)
|
|
152
|
+
if isinstance(parsed_json, dict):
|
|
153
|
+
return parsed_json
|
|
154
|
+
raise ValueError("JSON response is not a dictionary.")
|
|
@@ -15,6 +15,17 @@ from openai import OpenAI
|
|
|
15
15
|
from pydantic import BaseModel
|
|
16
16
|
from rich.console import Console
|
|
17
17
|
|
|
18
|
+
from python_harness.python_file_inventory import collect_python_files
|
|
19
|
+
from python_harness.soft_eval_report import (
|
|
20
|
+
build_final_report_messages,
|
|
21
|
+
build_mock_final_report,
|
|
22
|
+
build_mock_summary,
|
|
23
|
+
collect_hard_errors,
|
|
24
|
+
determine_verdict,
|
|
25
|
+
extract_metrics,
|
|
26
|
+
parse_final_report_response,
|
|
27
|
+
)
|
|
28
|
+
|
|
18
29
|
console = Console()
|
|
19
30
|
|
|
20
31
|
class FileSummary(BaseModel):
|
|
@@ -57,23 +68,7 @@ class SoftEvaluator:
|
|
|
57
68
|
Recursively find all Python files in the target directory,
|
|
58
69
|
excluding hidden dirs and .venv.
|
|
59
70
|
"""
|
|
60
|
-
|
|
61
|
-
for root, dirs, files in os.walk(self.target_path):
|
|
62
|
-
# Exclude hidden directories and virtual environments
|
|
63
|
-
dirs[:] = [
|
|
64
|
-
d
|
|
65
|
-
for d in dirs
|
|
66
|
-
if not d.startswith(".") and d not in (
|
|
67
|
-
"__pycache__",
|
|
68
|
-
"venv",
|
|
69
|
-
"env",
|
|
70
|
-
"vendors",
|
|
71
|
-
)
|
|
72
|
-
]
|
|
73
|
-
for file in files:
|
|
74
|
-
if file.endswith(".py"):
|
|
75
|
-
python_files.append(Path(root) / file)
|
|
76
|
-
return python_files
|
|
71
|
+
return collect_python_files(self.target_path)
|
|
77
72
|
|
|
78
73
|
def _read_file_text(self, file_path: Path) -> str:
|
|
79
74
|
return file_path.read_text(encoding="utf-8")
|
|
@@ -164,145 +159,36 @@ class SoftEvaluator:
|
|
|
164
159
|
qc_results: dict[str, Any],
|
|
165
160
|
soft_results: dict[str, Any],
|
|
166
161
|
) -> dict[str, Any]:
|
|
167
|
-
|
|
168
|
-
mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
|
|
169
|
-
avg_mi = sum(mi_scores.values()) / len(mi_scores) if mi_scores else 100.0
|
|
170
|
-
return {
|
|
171
|
-
"cc_issues": cc_issues,
|
|
172
|
-
"avg_mi": avg_mi,
|
|
173
|
-
"hard_failed": not hard_results.get("all_passed", True),
|
|
174
|
-
"qc_failed": not qc_results.get("all_passed", True),
|
|
175
|
-
"qc_errors": qc_results.get("failures", []),
|
|
176
|
-
"qa_score": soft_results.get("understandability_score", 100.0),
|
|
177
|
-
"qa_entities": soft_results.get("qa_results", {}).get(
|
|
178
|
-
"sampled_entities", []
|
|
179
|
-
),
|
|
180
|
-
"hard_errors": self._collect_hard_errors(hard_results),
|
|
181
|
-
}
|
|
162
|
+
return extract_metrics(hard_results, qc_results, soft_results)
|
|
182
163
|
|
|
183
164
|
def _collect_hard_errors(self, hard_results: dict[str, Any]) -> list[str]:
|
|
184
|
-
|
|
185
|
-
return []
|
|
186
|
-
|
|
187
|
-
hard_errors = []
|
|
188
|
-
if hard_results.get("ruff", {}).get("status") != "success":
|
|
189
|
-
hard_errors.append("Linter (Ruff) failed.")
|
|
190
|
-
if hard_results.get("mypy", {}).get("status") != "success":
|
|
191
|
-
hard_errors.append("Type checker (Mypy) failed.")
|
|
192
|
-
if hard_results.get("pytest", {}).get("status") != "success":
|
|
193
|
-
hard_errors.append(
|
|
194
|
-
hard_results.get("pytest", {}).get(
|
|
195
|
-
"error_message", "Tests or Coverage failed."
|
|
196
|
-
)
|
|
197
|
-
)
|
|
198
|
-
return hard_errors
|
|
165
|
+
return collect_hard_errors(hard_results)
|
|
199
166
|
|
|
200
167
|
def _determine_verdict(self, metrics: dict[str, Any], mock: bool = False) -> str:
|
|
201
|
-
|
|
202
|
-
if metrics["hard_failed"] or metrics["qc_failed"]:
|
|
203
|
-
return f"Fail{suffix}"
|
|
204
|
-
passed = (
|
|
205
|
-
metrics["avg_mi"] > 50
|
|
206
|
-
and metrics["qa_score"] > 75
|
|
207
|
-
and not metrics["cc_issues"]
|
|
208
|
-
)
|
|
209
|
-
return f"Pass{suffix}" if passed else f"Fail{suffix}"
|
|
168
|
+
return determine_verdict(metrics, mock=mock)
|
|
210
169
|
|
|
211
170
|
def _build_mock_summary(
|
|
212
171
|
self,
|
|
213
172
|
metrics: dict[str, Any],
|
|
214
173
|
hard_results: dict[str, Any],
|
|
215
174
|
) -> str:
|
|
216
|
-
|
|
217
|
-
if metrics["hard_failed"]:
|
|
218
|
-
pytest_err = hard_results.get("pytest", {}).get("error_message", "")
|
|
219
|
-
summary_parts.append(f"Hard evaluation failed. {pytest_err}".strip())
|
|
220
|
-
if metrics["qc_failed"]:
|
|
221
|
-
summary_parts.append("Governance QC failed.")
|
|
222
|
-
if not summary_parts:
|
|
223
|
-
summary_parts.append("Mock evaluation completed without LLM.")
|
|
224
|
-
return " ".join(summary_parts)
|
|
175
|
+
return build_mock_summary(metrics, hard_results)
|
|
225
176
|
|
|
226
177
|
def _build_mock_final_report(
|
|
227
178
|
self,
|
|
228
179
|
hard_results: dict[str, Any],
|
|
229
180
|
metrics: dict[str, Any],
|
|
230
181
|
) -> dict[str, Any]:
|
|
231
|
-
return {
|
|
232
|
-
"verdict": self._determine_verdict(metrics, mock=True),
|
|
233
|
-
"summary": self._build_mock_summary(metrics, hard_results),
|
|
234
|
-
"suggestions": [
|
|
235
|
-
{
|
|
236
|
-
"title": "Mock Suggestion 1",
|
|
237
|
-
"description": "Add more docstrings.",
|
|
238
|
-
"target_file": "all",
|
|
239
|
-
},
|
|
240
|
-
{
|
|
241
|
-
"title": "Mock Suggestion 2",
|
|
242
|
-
"description": "Refactor large functions.",
|
|
243
|
-
"target_file": "all",
|
|
244
|
-
},
|
|
245
|
-
{
|
|
246
|
-
"title": "Mock Suggestion 3",
|
|
247
|
-
"description": "Improve test coverage.",
|
|
248
|
-
"target_file": "tests/",
|
|
249
|
-
},
|
|
250
|
-
],
|
|
251
|
-
}
|
|
182
|
+
return build_mock_final_report(hard_results, metrics)
|
|
252
183
|
|
|
253
184
|
def _build_final_report_messages(
|
|
254
185
|
self,
|
|
255
186
|
metrics: dict[str, Any],
|
|
256
187
|
) -> list[dict[str, str]]:
|
|
257
|
-
|
|
258
|
-
"You are an elite Python Codebase Evaluator. You have just analyzed "
|
|
259
|
-
"a repository. Your task is to provide a final judgment and EXACTLY "
|
|
260
|
-
"3 concrete, actionable improvement suggestions.\n"
|
|
261
|
-
"If the codebase failed its Hard or QC evaluations (e.g. tests "
|
|
262
|
-
"failed, coverage is low, or governance violated), your suggestions "
|
|
263
|
-
"MUST prioritize fixing those issues.\n"
|
|
264
|
-
"Otherwise, focus on refactoring/quality improvements without "
|
|
265
|
-
"changing external functionality.\n\n"
|
|
266
|
-
"Output MUST be in valid JSON matching this schema:\n"
|
|
267
|
-
"{\n"
|
|
268
|
-
' "verdict": "Pass" or "Fail",\n'
|
|
269
|
-
' "summary": "One paragraph summary of codebase health and '
|
|
270
|
-
'any critical failures",\n'
|
|
271
|
-
' "suggestions": [\n'
|
|
272
|
-
' {"title": "str", "description": "str", "target_file": "str"}\n'
|
|
273
|
-
" ]\n"
|
|
274
|
-
"}\n"
|
|
275
|
-
"Rule for Verdict: If there are Hard Failures or QC Failures, "
|
|
276
|
-
"verdict MUST be Fail. Otherwise, Pass if Average Maintainability "
|
|
277
|
-
"> 50 and QA Score > 75 and no Critical CC issues (>15). "
|
|
278
|
-
"Otherwise Fail."
|
|
279
|
-
)
|
|
280
|
-
user_content = (
|
|
281
|
-
f"Metrics:\n"
|
|
282
|
-
f"- Average Maintainability Index (MI): {metrics['avg_mi']:.1f}/100\n"
|
|
283
|
-
f"- Number of functions with Cyclomatic Complexity > 15: "
|
|
284
|
-
f"{len(metrics['cc_issues'])}\n"
|
|
285
|
-
f"- Agent QA Readability Score: {metrics['qa_score']:.1f}/100\n\n"
|
|
286
|
-
f"Failures (Prioritize these!):\n"
|
|
287
|
-
f"- Hard Evaluation Errors: "
|
|
288
|
-
f"{metrics['hard_errors'] if metrics['hard_errors'] else 'None'}\n"
|
|
289
|
-
f"- QC/Governance Errors: "
|
|
290
|
-
f"{metrics['qc_errors'] if metrics['qc_errors'] else 'None'}\n\n"
|
|
291
|
-
f"QA Feedback Snippets:\n"
|
|
292
|
-
+ "\n".join(
|
|
293
|
-
[f" * {q['entity']}: {q['feedback']}" for q in metrics["qa_entities"]]
|
|
294
|
-
)
|
|
295
|
-
)
|
|
296
|
-
return [
|
|
297
|
-
{"role": "system", "content": sys_prompt},
|
|
298
|
-
{"role": "user", "content": user_content},
|
|
299
|
-
]
|
|
188
|
+
return build_final_report_messages(metrics)
|
|
300
189
|
|
|
301
190
|
def _parse_final_report_response(self, raw_content: str) -> dict[str, Any]:
|
|
302
|
-
|
|
303
|
-
if isinstance(parsed_json, dict):
|
|
304
|
-
return parsed_json
|
|
305
|
-
raise ValueError("JSON response is not a dictionary.")
|
|
191
|
+
return parse_final_report_response(raw_content)
|
|
306
192
|
|
|
307
193
|
def calculate_token_complexity(self, file_path: Path) -> int:
|
|
308
194
|
"""
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: python-harness
|
|
3
|
-
Version: 0.0.10
|
|
3
|
+
Version: 0.0.12
|
|
4
4
|
Summary: An agentic codebase evaluation and evolution tool for Python projects.
|
|
5
5
|
Author-email: Mingli Yuan <mingli.yuan@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -15,13 +15,13 @@ Requires-Dist: anthropic>=0.18.0
|
|
|
15
15
|
Requires-Dist: tenacity>=8.2.0
|
|
16
16
|
Requires-Dist: tiktoken>=0.6.0
|
|
17
17
|
Requires-Dist: python-dotenv>=1.0.0
|
|
18
|
+
Requires-Dist: pytest>=8.0.0
|
|
19
|
+
Requires-Dist: pytest-cov>=4.1.0
|
|
20
|
+
Requires-Dist: ruff>=0.3.0
|
|
21
|
+
Requires-Dist: mypy>=1.9.0
|
|
22
|
+
Requires-Dist: radon>=6.0.1
|
|
18
23
|
Provides-Extra: dev
|
|
19
|
-
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
20
|
-
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
21
|
-
Requires-Dist: ruff>=0.3.0; extra == "dev"
|
|
22
|
-
Requires-Dist: mypy>=1.9.0; extra == "dev"
|
|
23
24
|
Requires-Dist: ty>=0.0.1; extra == "dev"
|
|
24
|
-
Requires-Dist: radon>=6.0.1; extra == "dev"
|
|
25
25
|
Dynamic: license-file
|
|
26
26
|
|
|
27
27
|
# Python Harness
|
|
@@ -5,7 +5,9 @@ python_harness/__init__.py
|
|
|
5
5
|
python_harness/cli.py
|
|
6
6
|
python_harness/evaluator.py
|
|
7
7
|
python_harness/hard_evaluator.py
|
|
8
|
+
python_harness/python_file_inventory.py
|
|
8
9
|
python_harness/qc_evaluator.py
|
|
10
|
+
python_harness/soft_eval_report.py
|
|
9
11
|
python_harness/soft_evaluator.py
|
|
10
12
|
python_harness.egg-info/PKG-INFO
|
|
11
13
|
python_harness.egg-info/SOURCES.txt
|
|
@@ -427,3 +427,91 @@ def test_refine_reports_suggestions(monkeypatch: Any) -> None:
|
|
|
427
427
|
assert result.exit_code == 0
|
|
428
428
|
assert "Found 2 suggestions. Starting evolution branches..." in result.stdout
|
|
429
429
|
assert "Evolution engine skeleton ready." in result.stdout
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def test_measure_surfaces_hard_tool_errors(monkeypatch: Any) -> None:
|
|
433
|
+
"""
|
|
434
|
+
Test that measure prints hard-tool error details when tool invocations fail early.
|
|
435
|
+
"""
|
|
436
|
+
class DummyHardEvaluator:
|
|
437
|
+
def evaluate(self) -> dict[str, Any]:
|
|
438
|
+
return {
|
|
439
|
+
"all_passed": False,
|
|
440
|
+
"ruff": {
|
|
441
|
+
"status": "failed",
|
|
442
|
+
"issues": [],
|
|
443
|
+
"error_message": "No module named ruff",
|
|
444
|
+
},
|
|
445
|
+
"mypy": {"status": "failed", "output": "No module named mypy"},
|
|
446
|
+
"ty": {
|
|
447
|
+
"status": "warning",
|
|
448
|
+
"error_message": "ty executable not found. Skipping ty checks.",
|
|
449
|
+
},
|
|
450
|
+
"radon_cc": {
|
|
451
|
+
"status": "warning",
|
|
452
|
+
"issues": [],
|
|
453
|
+
"error_message": "No module named radon",
|
|
454
|
+
},
|
|
455
|
+
"radon_mi": {"status": "success", "mi_scores": {}},
|
|
456
|
+
"pytest": {
|
|
457
|
+
"status": "failed",
|
|
458
|
+
"error_message": "No module named pytest",
|
|
459
|
+
},
|
|
460
|
+
}
|
|
461
|
+
|
|
462
|
+
class DummyQcEvaluator:
|
|
463
|
+
def evaluate(self) -> dict[str, Any]:
|
|
464
|
+
return {"all_passed": True, "failures": []}
|
|
465
|
+
|
|
466
|
+
class DummySoftEvaluator:
|
|
467
|
+
def evaluate(self) -> dict[str, Any]:
|
|
468
|
+
return {
|
|
469
|
+
"package_summary": {
|
|
470
|
+
"total_files": 1,
|
|
471
|
+
"total_tokens": 1,
|
|
472
|
+
"package_understanding": "Mock understanding",
|
|
473
|
+
},
|
|
474
|
+
"understandability_score": 100.0,
|
|
475
|
+
"qa_results": {"sampled_entities": []},
|
|
476
|
+
}
|
|
477
|
+
|
|
478
|
+
def generate_final_report(
|
|
479
|
+
self,
|
|
480
|
+
hard_results: dict[str, Any],
|
|
481
|
+
qc_results: dict[str, Any],
|
|
482
|
+
soft_results: dict[str, Any],
|
|
483
|
+
) -> dict[str, Any]:
|
|
484
|
+
return {"verdict": "Fail", "summary": "Mock summary", "suggestions": []}
|
|
485
|
+
|
|
486
|
+
class DummyEvaluator:
|
|
487
|
+
def __init__(self, path: str):
|
|
488
|
+
self.path = path
|
|
489
|
+
self.hard_evaluator = DummyHardEvaluator()
|
|
490
|
+
self.qc_evaluator = DummyQcEvaluator()
|
|
491
|
+
self.soft_evaluator = DummySoftEvaluator()
|
|
492
|
+
|
|
493
|
+
monkeypatch.setattr(cli_module, "Evaluator", DummyEvaluator)
|
|
494
|
+
|
|
495
|
+
result = runner.invoke(app, ["measure", "."])
|
|
496
|
+
|
|
497
|
+
assert result.exit_code == 1
|
|
498
|
+
assert "Ruff issues found" in result.stdout
|
|
499
|
+
assert "No module named ruff" in result.stdout
|
|
500
|
+
assert "Mypy issues found" in result.stdout
|
|
501
|
+
assert "No module named mypy" in result.stdout
|
|
502
|
+
assert "Pytest/Coverage issues found" in result.stdout
|
|
503
|
+
assert "No module named pytest" in result.stdout
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
def test_mi_scorecard_uses_warning_color_below_70() -> None:
|
|
507
|
+
"""
|
|
508
|
+
Test that MI below 70 is no longer rendered as healthy green.
|
|
509
|
+
"""
|
|
510
|
+
assert cli_module._mi_scorecard_color(65.0) == "yellow"
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
def test_mi_scorecard_uses_green_at_70() -> None:
|
|
514
|
+
"""
|
|
515
|
+
Test that MI 70 is rendered at the healthy threshold.
|
|
516
|
+
"""
|
|
517
|
+
assert cli_module._mi_scorecard_color(70.0) == "green"
|
|
@@ -79,6 +79,7 @@ def test_radon_cc_syntax_error(monkeypatch: Any, tmp_path: Path) -> None:
|
|
|
79
79
|
# and writing an error to stderr (which happens when there are syntax errors)
|
|
80
80
|
import subprocess
|
|
81
81
|
original_run = subprocess.run
|
|
82
|
+
(tmp_path / "bad.py").write_text("def broken(:\n")
|
|
82
83
|
|
|
83
84
|
def mock_run(args: Any, **kwargs: Any) -> Any:
|
|
84
85
|
# Check if the command is for radon cc (sys.executable, -m, radon, cc)
|
|
@@ -287,6 +288,49 @@ def test_run_mypy_returns_stdout(monkeypatch: Any) -> None:
|
|
|
287
288
|
assert "error: nope" in result["output"]
|
|
288
289
|
|
|
289
290
|
|
|
291
|
+
def test_run_ruff_surfaces_stderr_when_no_json_issues(monkeypatch: Any) -> None:
|
|
292
|
+
"""
|
|
293
|
+
Test that run_ruff preserves stderr when Ruff fails before emitting JSON.
|
|
294
|
+
"""
|
|
295
|
+
def mock_run(args: Any, **kwargs: Any) -> Any:
|
|
296
|
+
class MockResult:
|
|
297
|
+
returncode = 1
|
|
298
|
+
stdout = ""
|
|
299
|
+
stderr = "No module named ruff"
|
|
300
|
+
|
|
301
|
+
return MockResult()
|
|
302
|
+
|
|
303
|
+
monkeypatch.setattr("subprocess.run", mock_run)
|
|
304
|
+
|
|
305
|
+
evaluator = HardEvaluator(".")
|
|
306
|
+
result = evaluator.run_ruff()
|
|
307
|
+
|
|
308
|
+
assert result["status"] == "failed"
|
|
309
|
+
assert result["issues"] == []
|
|
310
|
+
assert result["error_message"] == "No module named ruff"
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def test_run_mypy_surfaces_stderr(monkeypatch: Any) -> None:
|
|
314
|
+
"""
|
|
315
|
+
Test that run_mypy preserves stderr when mypy fails before stdout output.
|
|
316
|
+
"""
|
|
317
|
+
def mock_run(args: Any, **kwargs: Any) -> Any:
|
|
318
|
+
class MockResult:
|
|
319
|
+
returncode = 1
|
|
320
|
+
stdout = ""
|
|
321
|
+
stderr = "No module named mypy"
|
|
322
|
+
|
|
323
|
+
return MockResult()
|
|
324
|
+
|
|
325
|
+
monkeypatch.setattr("subprocess.run", mock_run)
|
|
326
|
+
|
|
327
|
+
evaluator = HardEvaluator(".")
|
|
328
|
+
result = evaluator.run_mypy()
|
|
329
|
+
|
|
330
|
+
assert result["status"] == "failed"
|
|
331
|
+
assert result["output"] == "No module named mypy"
|
|
332
|
+
|
|
333
|
+
|
|
290
334
|
def test_run_radon_mi_reads_scores(monkeypatch: Any) -> None:
|
|
291
335
|
"""
|
|
292
336
|
Test that run_radon_mi parses maintainability scores from JSON.
|
|
@@ -308,6 +352,42 @@ def test_run_radon_mi_reads_scores(monkeypatch: Any) -> None:
|
|
|
308
352
|
assert result["mi_scores"] == {"a.py": 77.0}
|
|
309
353
|
|
|
310
354
|
|
|
355
|
+
def test_run_pytest_surfaces_stderr(monkeypatch: Any, tmp_path: Path) -> None:
|
|
356
|
+
"""
|
|
357
|
+
Test that run_pytest preserves stderr when pytest fails early.
|
|
358
|
+
"""
|
|
359
|
+
def mock_run(args: Any, **kwargs: Any) -> Any:
|
|
360
|
+
class MockResult:
|
|
361
|
+
returncode = 1
|
|
362
|
+
stdout = ""
|
|
363
|
+
stderr = "No module named pytest"
|
|
364
|
+
|
|
365
|
+
return MockResult()
|
|
366
|
+
|
|
367
|
+
monkeypatch.setattr("subprocess.run", mock_run)
|
|
368
|
+
|
|
369
|
+
evaluator = HardEvaluator(str(tmp_path))
|
|
370
|
+
result = evaluator.run_pytest()
|
|
371
|
+
|
|
372
|
+
assert result["status"] == "failed"
|
|
373
|
+
assert result["error_message"] == "No module named pytest"
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
def test_radon_mi_targets_exclude_test_files(tmp_path: Path) -> None:
|
|
377
|
+
"""
|
|
378
|
+
Test that maintainability scoring ignores test files and directories.
|
|
379
|
+
"""
|
|
380
|
+
(tmp_path / "pkg").mkdir()
|
|
381
|
+
(tmp_path / "pkg" / "keep.py").write_text("x = 1\n")
|
|
382
|
+
(tmp_path / "tests").mkdir()
|
|
383
|
+
(tmp_path / "tests" / "test_skip.py").write_text("x = 1\n")
|
|
384
|
+
(tmp_path / "test_skip.py").write_text("x = 1\n")
|
|
385
|
+
|
|
386
|
+
evaluator = HardEvaluator(str(tmp_path))
|
|
387
|
+
|
|
388
|
+
assert evaluator._radon_metric_targets() == [str(tmp_path / "pkg" / "keep.py")]
|
|
389
|
+
|
|
390
|
+
|
|
311
391
|
def test_evaluate_fails_when_coverage_report_missing(monkeypatch: Any) -> None:
|
|
312
392
|
"""
|
|
313
393
|
Test that missing coverage data fails the hard gate even when tests pass.
|
|
@@ -111,6 +111,66 @@ def test_generate_final_report_mock_fails_on_hard_failure() -> None:
|
|
|
111
111
|
os.environ["LLM_API_KEY"] = old_key
|
|
112
112
|
|
|
113
113
|
|
|
114
|
+
def test_determine_verdict_fails_below_mi_70(tmp_path: Path) -> None:
|
|
115
|
+
"""
|
|
116
|
+
Test that MI below 70 no longer qualifies for a passing verdict.
|
|
117
|
+
"""
|
|
118
|
+
evaluator = SoftEvaluator(str(tmp_path))
|
|
119
|
+
|
|
120
|
+
verdict = evaluator._determine_verdict(
|
|
121
|
+
{
|
|
122
|
+
"hard_failed": False,
|
|
123
|
+
"qc_failed": False,
|
|
124
|
+
"avg_mi": 65.0,
|
|
125
|
+
"qa_score": 90.0,
|
|
126
|
+
"cc_issues": [],
|
|
127
|
+
}
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
assert verdict == "Fail"
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def test_determine_verdict_passes_at_mi_70(tmp_path: Path) -> None:
|
|
134
|
+
"""
|
|
135
|
+
Test that MI of 70 is sufficient for a passing verdict.
|
|
136
|
+
"""
|
|
137
|
+
evaluator = SoftEvaluator(str(tmp_path))
|
|
138
|
+
|
|
139
|
+
verdict = evaluator._determine_verdict(
|
|
140
|
+
{
|
|
141
|
+
"hard_failed": False,
|
|
142
|
+
"qc_failed": False,
|
|
143
|
+
"avg_mi": 70.0,
|
|
144
|
+
"qa_score": 90.0,
|
|
145
|
+
"cc_issues": [],
|
|
146
|
+
}
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
assert verdict == "Pass"
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def test_final_report_prompt_mentions_mi_70_threshold(tmp_path: Path) -> None:
|
|
153
|
+
"""
|
|
154
|
+
Test that the final report prompt advertises the updated MI threshold.
|
|
155
|
+
"""
|
|
156
|
+
evaluator = SoftEvaluator(str(tmp_path))
|
|
157
|
+
|
|
158
|
+
messages = evaluator._build_final_report_messages(
|
|
159
|
+
{
|
|
160
|
+
"avg_mi": 70.0,
|
|
161
|
+
"cc_issues": [],
|
|
162
|
+
"qa_score": 90.0,
|
|
163
|
+
"hard_errors": [],
|
|
164
|
+
"qc_errors": [],
|
|
165
|
+
"qa_entities": [],
|
|
166
|
+
"hard_failed": False,
|
|
167
|
+
"qc_failed": False,
|
|
168
|
+
}
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
assert "Average Maintainability >= 70" in messages[0]["content"]
|
|
172
|
+
|
|
173
|
+
|
|
114
174
|
def test_read_file_text_helper_reads_utf8_content(tmp_path: Path) -> None:
|
|
115
175
|
"""
|
|
116
176
|
Test that the file-reading helper returns UTF-8 text content.
|
|
@@ -145,6 +205,8 @@ def test_get_python_files_filters_hidden_and_virtualenv_dirs(tmp_path: Path) ->
|
|
|
145
205
|
(tmp_path / "venv" / "skip.py").write_text("x = 1\n")
|
|
146
206
|
(tmp_path / "vendors").mkdir()
|
|
147
207
|
(tmp_path / "vendors" / "skip.py").write_text("x = 1\n")
|
|
208
|
+
(tmp_path / "tests").mkdir()
|
|
209
|
+
(tmp_path / "tests" / "test_skip.py").write_text("x = 1\n")
|
|
148
210
|
|
|
149
211
|
evaluator = SoftEvaluator(str(tmp_path))
|
|
150
212
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{python_harness-0.0.10 → python_harness-0.0.12}/python_harness.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|