python-harness 0.0.11__tar.gz → 0.0.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {python_harness-0.0.11/python_harness.egg-info → python_harness-0.0.12}/PKG-INFO +1 -1
  2. {python_harness-0.0.11 → python_harness-0.0.12}/pyproject.toml +1 -1
  3. {python_harness-0.0.11 → python_harness-0.0.12}/python_harness/__init__.py +1 -1
  4. {python_harness-0.0.11 → python_harness-0.0.12}/python_harness/cli.py +11 -1
  5. {python_harness-0.0.11 → python_harness-0.0.12}/python_harness/hard_evaluator.py +18 -2
  6. python_harness-0.0.12/python_harness/python_file_inventory.py +27 -0
  7. python_harness-0.0.12/python_harness/soft_eval_report.py +154 -0
  8. {python_harness-0.0.11 → python_harness-0.0.12}/python_harness/soft_evaluator.py +19 -133
  9. {python_harness-0.0.11 → python_harness-0.0.12/python_harness.egg-info}/PKG-INFO +1 -1
  10. {python_harness-0.0.11 → python_harness-0.0.12}/python_harness.egg-info/SOURCES.txt +2 -0
  11. {python_harness-0.0.11 → python_harness-0.0.12}/tests/test_cli.py +14 -0
  12. {python_harness-0.0.11 → python_harness-0.0.12}/tests/test_hard_evaluator.py +16 -0
  13. {python_harness-0.0.11 → python_harness-0.0.12}/tests/test_soft_evaluator.py +62 -0
  14. {python_harness-0.0.11 → python_harness-0.0.12}/LICENSE +0 -0
  15. {python_harness-0.0.11 → python_harness-0.0.12}/README.md +0 -0
  16. {python_harness-0.0.11 → python_harness-0.0.12}/python_harness/evaluator.py +0 -0
  17. {python_harness-0.0.11 → python_harness-0.0.12}/python_harness/qc_evaluator.py +0 -0
  18. {python_harness-0.0.11 → python_harness-0.0.12}/python_harness.egg-info/dependency_links.txt +0 -0
  19. {python_harness-0.0.11 → python_harness-0.0.12}/python_harness.egg-info/entry_points.txt +0 -0
  20. {python_harness-0.0.11 → python_harness-0.0.12}/python_harness.egg-info/requires.txt +0 -0
  21. {python_harness-0.0.11 → python_harness-0.0.12}/python_harness.egg-info/top_level.txt +0 -0
  22. {python_harness-0.0.11 → python_harness-0.0.12}/setup.cfg +0 -0
  23. {python_harness-0.0.11 → python_harness-0.0.12}/tests/test_evaluator.py +0 -0
  24. {python_harness-0.0.11 → python_harness-0.0.12}/tests/test_qc_evaluator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-harness
3
- Version: 0.0.11
3
+ Version: 0.0.12
4
4
  Summary: An agentic codebase evaluation and evolution tool for Python projects.
5
5
  Author-email: Mingli Yuan <mingli.yuan@gmail.com>
6
6
  License: MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "python-harness"
3
- version = "0.0.11"
3
+ version = "0.0.12"
4
4
  description = "An agentic codebase evaluation and evolution tool for Python projects."
5
5
  requires-python = ">=3.10"
6
6
  readme = "README.md"
@@ -2,4 +2,4 @@
2
2
  Python Harness - An agentic evaluation tool for codebases.
3
3
  """
4
4
 
5
- __version__ = "0.0.11"
5
+ __version__ = "0.0.12"
@@ -21,6 +21,8 @@ else:
21
21
 
22
22
  app = typer.Typer(help="Agentic harness tool for universal Python codebase evaluation.")
23
23
  console = Console()
24
+ MI_HEALTHY_THRESHOLD = 70.0
25
+ MI_WARNING_THRESHOLD = 40.0
24
26
 
25
27
 
26
28
  def _print_detail_block(title: str, details: str, color: str) -> None:
@@ -133,13 +135,21 @@ def _print_hard_evaluation_summary(hard_results: dict[str, Any]) -> None:
133
135
  _print_hard_failure_details(hard_results)
134
136
 
135
137
 
138
+ def _mi_scorecard_color(avg_mi: float) -> str:
139
+ if avg_mi >= MI_HEALTHY_THRESHOLD:
140
+ return "green"
141
+ if avg_mi >= MI_WARNING_THRESHOLD:
142
+ return "yellow"
143
+ return "red"
144
+
145
+
136
146
  def _print_mi_scorecard(hard_results: dict[str, Any]) -> None:
137
147
  mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
138
148
  if not mi_scores:
139
149
  return
140
150
 
141
151
  avg_mi = sum(mi_scores.values()) / len(mi_scores)
142
- color = "green" if avg_mi > 50 else "yellow" if avg_mi > 20 else "red"
152
+ color = _mi_scorecard_color(avg_mi)
143
153
  console.print(f"[{color}]Average Maintainability Index: {avg_mi:.1f}/100[/{color}]")
144
154
 
145
155
 
@@ -11,6 +11,8 @@ from typing import Any
11
11
 
12
12
  from rich.console import Console
13
13
 
14
+ from python_harness.python_file_inventory import collect_python_files
15
+
14
16
  console = Console()
15
17
  PYTEST_TIMEOUT_SECONDS = 60
16
18
 
@@ -22,6 +24,9 @@ class HardEvaluator:
22
24
  def __init__(self, target_path: str):
23
25
  self.target_path = Path(target_path).resolve()
24
26
 
27
+ def _radon_metric_targets(self) -> list[str]:
28
+ return [str(file_path) for file_path in collect_python_files(self.target_path)]
29
+
25
30
  def run_ruff(self) -> dict[str, Any]:
26
31
  """
27
32
  Run Ruff linter and return results.
@@ -112,6 +117,14 @@ class HardEvaluator:
112
117
  Flag any function/method with CC > 15 as a failure.
113
118
  """
114
119
  try:
120
+ targets = self._radon_metric_targets()
121
+ if not targets:
122
+ return {
123
+ "status": "success",
124
+ "issues": [],
125
+ "return_code": 0,
126
+ "output": "",
127
+ }
115
128
  result = subprocess.run(
116
129
  [
117
130
  sys.executable,
@@ -120,7 +133,7 @@ class HardEvaluator:
120
133
  "cc",
121
134
  "-j",
122
135
  "-a",
123
- str(self.target_path),
136
+ *targets,
124
137
  ],
125
138
  capture_output=True,
126
139
  text=True,
@@ -178,8 +191,11 @@ class HardEvaluator:
178
191
  but it contributes to the scorecard.
179
192
  """
180
193
  try:
194
+ targets = self._radon_metric_targets()
195
+ if not targets:
196
+ return {"status": "success", "mi_scores": {}, "return_code": 0}
181
197
  result = subprocess.run(
182
- [sys.executable, "-m", "radon", "mi", "-j", str(self.target_path)],
198
+ [sys.executable, "-m", "radon", "mi", "-j", *targets],
183
199
  capture_output=True,
184
200
  text=True,
185
201
  check=False
@@ -0,0 +1,27 @@
1
+ """
2
+ Python file discovery helpers.
3
+ """
4
+
5
+ from pathlib import Path
6
+
7
+ SKIPPED_DIRS = {"__pycache__", "env", "test", "tests", "vendors", "venv"}
8
+
9
+
10
+ def should_skip_python_path(file_path: Path, root: Path) -> bool:
11
+ if file_path.name.startswith("test_") or file_path.name.endswith("_test.py"):
12
+ return True
13
+ try:
14
+ relative_parts = file_path.relative_to(root).parts
15
+ except ValueError:
16
+ relative_parts = file_path.parts
17
+ return any(part.startswith(".") or part in SKIPPED_DIRS for part in relative_parts)
18
+
19
+
20
+ def collect_python_files(root: Path) -> list[Path]:
21
+ if root.is_file():
22
+ return [root] if root.suffix == ".py" else []
23
+ return [
24
+ file_path
25
+ for file_path in sorted(root.rglob("*.py"))
26
+ if not should_skip_python_path(file_path, root)
27
+ ]
@@ -0,0 +1,154 @@
1
+ """
2
+ Report-building helpers for soft evaluation.
3
+ """
4
+
5
+ import json
6
+ from typing import Any
7
+
8
+ MI_PASS_THRESHOLD = 70.0
9
+ QA_PASS_THRESHOLD = 75.0
10
+
11
+
12
+ def collect_hard_errors(hard_results: dict[str, Any]) -> list[str]:
13
+ if hard_results.get("all_passed", True):
14
+ return []
15
+
16
+ hard_errors = []
17
+ if hard_results.get("ruff", {}).get("status") != "success":
18
+ hard_errors.append("Linter (Ruff) failed.")
19
+ if hard_results.get("mypy", {}).get("status") != "success":
20
+ hard_errors.append("Type checker (Mypy) failed.")
21
+ if hard_results.get("pytest", {}).get("status") != "success":
22
+ hard_errors.append(
23
+ hard_results.get("pytest", {}).get(
24
+ "error_message",
25
+ "Tests or Coverage failed.",
26
+ )
27
+ )
28
+ return hard_errors
29
+
30
+
31
+ def extract_metrics(
32
+ hard_results: dict[str, Any],
33
+ qc_results: dict[str, Any],
34
+ soft_results: dict[str, Any],
35
+ ) -> dict[str, Any]:
36
+ mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
37
+ avg_mi = sum(mi_scores.values()) / len(mi_scores) if mi_scores else 100.0
38
+ return {
39
+ "avg_mi": avg_mi,
40
+ "cc_issues": hard_results.get("radon_cc", {}).get("issues", []),
41
+ "hard_errors": collect_hard_errors(hard_results),
42
+ "hard_failed": not hard_results.get("all_passed", True),
43
+ "qa_entities": soft_results.get("qa_results", {}).get("sampled_entities", []),
44
+ "qa_score": soft_results.get("understandability_score", 100.0),
45
+ "qc_errors": qc_results.get("failures", []),
46
+ "qc_failed": not qc_results.get("all_passed", True),
47
+ }
48
+
49
+
50
+ def determine_verdict(metrics: dict[str, Any], mock: bool = False) -> str:
51
+ suffix = " (Mock)" if mock else ""
52
+ if metrics["hard_failed"] or metrics["qc_failed"]:
53
+ return f"Fail{suffix}"
54
+ passed = (
55
+ metrics["avg_mi"] >= MI_PASS_THRESHOLD
56
+ and metrics["qa_score"] > QA_PASS_THRESHOLD
57
+ and not metrics["cc_issues"]
58
+ )
59
+ return f"Pass{suffix}" if passed else f"Fail{suffix}"
60
+
61
+
62
+ def build_mock_summary(
63
+ metrics: dict[str, Any],
64
+ hard_results: dict[str, Any],
65
+ ) -> str:
66
+ summary_parts = []
67
+ if metrics["hard_failed"]:
68
+ pytest_err = hard_results.get("pytest", {}).get("error_message", "")
69
+ summary_parts.append(f"Hard evaluation failed. {pytest_err}".strip())
70
+ if metrics["qc_failed"]:
71
+ summary_parts.append("Governance QC failed.")
72
+ if not summary_parts:
73
+ summary_parts.append("Mock evaluation completed without LLM.")
74
+ return " ".join(summary_parts)
75
+
76
+
77
+ def build_mock_final_report(
78
+ hard_results: dict[str, Any],
79
+ metrics: dict[str, Any],
80
+ ) -> dict[str, Any]:
81
+ return {
82
+ "verdict": determine_verdict(metrics, mock=True),
83
+ "summary": build_mock_summary(metrics, hard_results),
84
+ "suggestions": [
85
+ {
86
+ "title": "Mock Suggestion 1",
87
+ "description": "Add more docstrings.",
88
+ "target_file": "all",
89
+ },
90
+ {
91
+ "title": "Mock Suggestion 2",
92
+ "description": "Refactor large functions.",
93
+ "target_file": "all",
94
+ },
95
+ {
96
+ "title": "Mock Suggestion 3",
97
+ "description": "Improve test coverage.",
98
+ "target_file": "tests/",
99
+ },
100
+ ],
101
+ }
102
+
103
+
104
+ def build_final_report_messages(metrics: dict[str, Any]) -> list[dict[str, str]]:
105
+ sys_prompt = (
106
+ "You are an elite Python Codebase Evaluator. You have just analyzed "
107
+ "a repository. Your task is to provide a final judgment and EXACTLY "
108
+ "3 concrete, actionable improvement suggestions.\n"
109
+ "If the codebase failed its Hard or QC evaluations (e.g. tests "
110
+ "failed, coverage is low, or governance violated), your suggestions "
111
+ "MUST prioritize fixing those issues.\n"
112
+ "Otherwise, focus on refactoring/quality improvements without "
113
+ "changing external functionality.\n\n"
114
+ "Output MUST be in valid JSON matching this schema:\n"
115
+ "{\n"
116
+ ' "verdict": "Pass" or "Fail",\n'
117
+ ' "summary": "One paragraph summary of codebase health and '
118
+ 'any critical failures",\n'
119
+ ' "suggestions": [\n'
120
+ ' {"title": "str", "description": "str", "target_file": "str"}\n'
121
+ " ]\n"
122
+ "}\n"
123
+ "Rule for Verdict: If there are Hard Failures or QC Failures, "
124
+ "verdict MUST be Fail. Otherwise, Pass if Average Maintainability "
125
+ f">= {MI_PASS_THRESHOLD:.0f} and QA Score > {QA_PASS_THRESHOLD:.0f} "
126
+ "and no Critical CC issues (>15). Otherwise Fail."
127
+ )
128
+ user_content = (
129
+ f"Metrics:\n"
130
+ f"- Average Maintainability Index (MI): {metrics['avg_mi']:.1f}/100\n"
131
+ f"- Number of functions with Cyclomatic Complexity > 15: "
132
+ f"{len(metrics['cc_issues'])}\n"
133
+ f"- Agent QA Readability Score: {metrics['qa_score']:.1f}/100\n\n"
134
+ f"Failures (Prioritize these!):\n"
135
+ f"- Hard Evaluation Errors: "
136
+ f"{metrics['hard_errors'] if metrics['hard_errors'] else 'None'}\n"
137
+ f"- QC/Governance Errors: "
138
+ f"{metrics['qc_errors'] if metrics['qc_errors'] else 'None'}\n\n"
139
+ f"QA Feedback Snippets:\n"
140
+ + "\n".join(
141
+ [f" * {q['entity']}: {q['feedback']}" for q in metrics["qa_entities"]]
142
+ )
143
+ )
144
+ return [
145
+ {"role": "system", "content": sys_prompt},
146
+ {"role": "user", "content": user_content},
147
+ ]
148
+
149
+
150
+ def parse_final_report_response(raw_content: str) -> dict[str, Any]:
151
+ parsed_json = json.loads(raw_content)
152
+ if isinstance(parsed_json, dict):
153
+ return parsed_json
154
+ raise ValueError("JSON response is not a dictionary.")
@@ -15,6 +15,17 @@ from openai import OpenAI
15
15
  from pydantic import BaseModel
16
16
  from rich.console import Console
17
17
 
18
+ from python_harness.python_file_inventory import collect_python_files
19
+ from python_harness.soft_eval_report import (
20
+ build_final_report_messages,
21
+ build_mock_final_report,
22
+ build_mock_summary,
23
+ collect_hard_errors,
24
+ determine_verdict,
25
+ extract_metrics,
26
+ parse_final_report_response,
27
+ )
28
+
18
29
  console = Console()
19
30
 
20
31
  class FileSummary(BaseModel):
@@ -57,23 +68,7 @@ class SoftEvaluator:
57
68
  Recursively find all Python files in the target directory,
58
69
  excluding hidden dirs and .venv.
59
70
  """
60
- python_files = []
61
- for root, dirs, files in os.walk(self.target_path):
62
- # Exclude hidden directories and virtual environments
63
- dirs[:] = [
64
- d
65
- for d in dirs
66
- if not d.startswith(".") and d not in (
67
- "__pycache__",
68
- "venv",
69
- "env",
70
- "vendors",
71
- )
72
- ]
73
- for file in files:
74
- if file.endswith(".py"):
75
- python_files.append(Path(root) / file)
76
- return python_files
71
+ return collect_python_files(self.target_path)
77
72
 
78
73
  def _read_file_text(self, file_path: Path) -> str:
79
74
  return file_path.read_text(encoding="utf-8")
@@ -164,145 +159,36 @@ class SoftEvaluator:
164
159
  qc_results: dict[str, Any],
165
160
  soft_results: dict[str, Any],
166
161
  ) -> dict[str, Any]:
167
- cc_issues = hard_results.get("radon_cc", {}).get("issues", [])
168
- mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
169
- avg_mi = sum(mi_scores.values()) / len(mi_scores) if mi_scores else 100.0
170
- return {
171
- "cc_issues": cc_issues,
172
- "avg_mi": avg_mi,
173
- "hard_failed": not hard_results.get("all_passed", True),
174
- "qc_failed": not qc_results.get("all_passed", True),
175
- "qc_errors": qc_results.get("failures", []),
176
- "qa_score": soft_results.get("understandability_score", 100.0),
177
- "qa_entities": soft_results.get("qa_results", {}).get(
178
- "sampled_entities", []
179
- ),
180
- "hard_errors": self._collect_hard_errors(hard_results),
181
- }
162
+ return extract_metrics(hard_results, qc_results, soft_results)
182
163
 
183
164
  def _collect_hard_errors(self, hard_results: dict[str, Any]) -> list[str]:
184
- if hard_results.get("all_passed", True):
185
- return []
186
-
187
- hard_errors = []
188
- if hard_results.get("ruff", {}).get("status") != "success":
189
- hard_errors.append("Linter (Ruff) failed.")
190
- if hard_results.get("mypy", {}).get("status") != "success":
191
- hard_errors.append("Type checker (Mypy) failed.")
192
- if hard_results.get("pytest", {}).get("status") != "success":
193
- hard_errors.append(
194
- hard_results.get("pytest", {}).get(
195
- "error_message", "Tests or Coverage failed."
196
- )
197
- )
198
- return hard_errors
165
+ return collect_hard_errors(hard_results)
199
166
 
200
167
  def _determine_verdict(self, metrics: dict[str, Any], mock: bool = False) -> str:
201
- suffix = " (Mock)" if mock else ""
202
- if metrics["hard_failed"] or metrics["qc_failed"]:
203
- return f"Fail{suffix}"
204
- passed = (
205
- metrics["avg_mi"] > 50
206
- and metrics["qa_score"] > 75
207
- and not metrics["cc_issues"]
208
- )
209
- return f"Pass{suffix}" if passed else f"Fail{suffix}"
168
+ return determine_verdict(metrics, mock=mock)
210
169
 
211
170
  def _build_mock_summary(
212
171
  self,
213
172
  metrics: dict[str, Any],
214
173
  hard_results: dict[str, Any],
215
174
  ) -> str:
216
- summary_parts = []
217
- if metrics["hard_failed"]:
218
- pytest_err = hard_results.get("pytest", {}).get("error_message", "")
219
- summary_parts.append(f"Hard evaluation failed. {pytest_err}".strip())
220
- if metrics["qc_failed"]:
221
- summary_parts.append("Governance QC failed.")
222
- if not summary_parts:
223
- summary_parts.append("Mock evaluation completed without LLM.")
224
- return " ".join(summary_parts)
175
+ return build_mock_summary(metrics, hard_results)
225
176
 
226
177
  def _build_mock_final_report(
227
178
  self,
228
179
  hard_results: dict[str, Any],
229
180
  metrics: dict[str, Any],
230
181
  ) -> dict[str, Any]:
231
- return {
232
- "verdict": self._determine_verdict(metrics, mock=True),
233
- "summary": self._build_mock_summary(metrics, hard_results),
234
- "suggestions": [
235
- {
236
- "title": "Mock Suggestion 1",
237
- "description": "Add more docstrings.",
238
- "target_file": "all",
239
- },
240
- {
241
- "title": "Mock Suggestion 2",
242
- "description": "Refactor large functions.",
243
- "target_file": "all",
244
- },
245
- {
246
- "title": "Mock Suggestion 3",
247
- "description": "Improve test coverage.",
248
- "target_file": "tests/",
249
- },
250
- ],
251
- }
182
+ return build_mock_final_report(hard_results, metrics)
252
183
 
253
184
  def _build_final_report_messages(
254
185
  self,
255
186
  metrics: dict[str, Any],
256
187
  ) -> list[dict[str, str]]:
257
- sys_prompt = (
258
- "You are an elite Python Codebase Evaluator. You have just analyzed "
259
- "a repository. Your task is to provide a final judgment and EXACTLY "
260
- "3 concrete, actionable improvement suggestions.\n"
261
- "If the codebase failed its Hard or QC evaluations (e.g. tests "
262
- "failed, coverage is low, or governance violated), your suggestions "
263
- "MUST prioritize fixing those issues.\n"
264
- "Otherwise, focus on refactoring/quality improvements without "
265
- "changing external functionality.\n\n"
266
- "Output MUST be in valid JSON matching this schema:\n"
267
- "{\n"
268
- ' "verdict": "Pass" or "Fail",\n'
269
- ' "summary": "One paragraph summary of codebase health and '
270
- 'any critical failures",\n'
271
- ' "suggestions": [\n'
272
- ' {"title": "str", "description": "str", "target_file": "str"}\n'
273
- " ]\n"
274
- "}\n"
275
- "Rule for Verdict: If there are Hard Failures or QC Failures, "
276
- "verdict MUST be Fail. Otherwise, Pass if Average Maintainability "
277
- "> 50 and QA Score > 75 and no Critical CC issues (>15). "
278
- "Otherwise Fail."
279
- )
280
- user_content = (
281
- f"Metrics:\n"
282
- f"- Average Maintainability Index (MI): {metrics['avg_mi']:.1f}/100\n"
283
- f"- Number of functions with Cyclomatic Complexity > 15: "
284
- f"{len(metrics['cc_issues'])}\n"
285
- f"- Agent QA Readability Score: {metrics['qa_score']:.1f}/100\n\n"
286
- f"Failures (Prioritize these!):\n"
287
- f"- Hard Evaluation Errors: "
288
- f"{metrics['hard_errors'] if metrics['hard_errors'] else 'None'}\n"
289
- f"- QC/Governance Errors: "
290
- f"{metrics['qc_errors'] if metrics['qc_errors'] else 'None'}\n\n"
291
- f"QA Feedback Snippets:\n"
292
- + "\n".join(
293
- [f" * {q['entity']}: {q['feedback']}" for q in metrics["qa_entities"]]
294
- )
295
- )
296
- return [
297
- {"role": "system", "content": sys_prompt},
298
- {"role": "user", "content": user_content},
299
- ]
188
+ return build_final_report_messages(metrics)
300
189
 
301
190
  def _parse_final_report_response(self, raw_content: str) -> dict[str, Any]:
302
- parsed_json = json.loads(raw_content)
303
- if isinstance(parsed_json, dict):
304
- return parsed_json
305
- raise ValueError("JSON response is not a dictionary.")
191
+ return parse_final_report_response(raw_content)
306
192
 
307
193
  def calculate_token_complexity(self, file_path: Path) -> int:
308
194
  """
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-harness
3
- Version: 0.0.11
3
+ Version: 0.0.12
4
4
  Summary: An agentic codebase evaluation and evolution tool for Python projects.
5
5
  Author-email: Mingli Yuan <mingli.yuan@gmail.com>
6
6
  License: MIT
@@ -5,7 +5,9 @@ python_harness/__init__.py
5
5
  python_harness/cli.py
6
6
  python_harness/evaluator.py
7
7
  python_harness/hard_evaluator.py
8
+ python_harness/python_file_inventory.py
8
9
  python_harness/qc_evaluator.py
10
+ python_harness/soft_eval_report.py
9
11
  python_harness/soft_evaluator.py
10
12
  python_harness.egg-info/PKG-INFO
11
13
  python_harness.egg-info/SOURCES.txt
@@ -501,3 +501,17 @@ def test_measure_surfaces_hard_tool_errors(monkeypatch: Any) -> None:
501
501
  assert "No module named mypy" in result.stdout
502
502
  assert "Pytest/Coverage issues found" in result.stdout
503
503
  assert "No module named pytest" in result.stdout
504
+
505
+
506
+ def test_mi_scorecard_uses_warning_color_below_70() -> None:
507
+ """
508
+ Test that MI below 70 is no longer rendered as healthy green.
509
+ """
510
+ assert cli_module._mi_scorecard_color(65.0) == "yellow"
511
+
512
+
513
+ def test_mi_scorecard_uses_green_at_70() -> None:
514
+ """
515
+ Test that MI 70 is rendered at the healthy threshold.
516
+ """
517
+ assert cli_module._mi_scorecard_color(70.0) == "green"
@@ -79,6 +79,7 @@ def test_radon_cc_syntax_error(monkeypatch: Any, tmp_path: Path) -> None:
79
79
  # and writing an error to stderr (which happens when there are syntax errors)
80
80
  import subprocess
81
81
  original_run = subprocess.run
82
+ (tmp_path / "bad.py").write_text("def broken(:\n")
82
83
 
83
84
  def mock_run(args: Any, **kwargs: Any) -> Any:
84
85
  # Check if the command is for radon cc (sys.executable, -m, radon, cc)
@@ -372,6 +373,21 @@ def test_run_pytest_surfaces_stderr(monkeypatch: Any, tmp_path: Path) -> None:
372
373
  assert result["error_message"] == "No module named pytest"
373
374
 
374
375
 
376
+ def test_radon_mi_targets_exclude_test_files(tmp_path: Path) -> None:
377
+ """
378
+ Test that maintainability scoring ignores test files and directories.
379
+ """
380
+ (tmp_path / "pkg").mkdir()
381
+ (tmp_path / "pkg" / "keep.py").write_text("x = 1\n")
382
+ (tmp_path / "tests").mkdir()
383
+ (tmp_path / "tests" / "test_skip.py").write_text("x = 1\n")
384
+ (tmp_path / "test_skip.py").write_text("x = 1\n")
385
+
386
+ evaluator = HardEvaluator(str(tmp_path))
387
+
388
+ assert evaluator._radon_metric_targets() == [str(tmp_path / "pkg" / "keep.py")]
389
+
390
+
375
391
  def test_evaluate_fails_when_coverage_report_missing(monkeypatch: Any) -> None:
376
392
  """
377
393
  Test that missing coverage data fails the hard gate even when tests pass.
@@ -111,6 +111,66 @@ def test_generate_final_report_mock_fails_on_hard_failure() -> None:
111
111
  os.environ["LLM_API_KEY"] = old_key
112
112
 
113
113
 
114
+ def test_determine_verdict_fails_below_mi_70(tmp_path: Path) -> None:
115
+ """
116
+ Test that MI below 70 no longer qualifies for a passing verdict.
117
+ """
118
+ evaluator = SoftEvaluator(str(tmp_path))
119
+
120
+ verdict = evaluator._determine_verdict(
121
+ {
122
+ "hard_failed": False,
123
+ "qc_failed": False,
124
+ "avg_mi": 65.0,
125
+ "qa_score": 90.0,
126
+ "cc_issues": [],
127
+ }
128
+ )
129
+
130
+ assert verdict == "Fail"
131
+
132
+
133
+ def test_determine_verdict_passes_at_mi_70(tmp_path: Path) -> None:
134
+ """
135
+ Test that MI of 70 is sufficient for a passing verdict.
136
+ """
137
+ evaluator = SoftEvaluator(str(tmp_path))
138
+
139
+ verdict = evaluator._determine_verdict(
140
+ {
141
+ "hard_failed": False,
142
+ "qc_failed": False,
143
+ "avg_mi": 70.0,
144
+ "qa_score": 90.0,
145
+ "cc_issues": [],
146
+ }
147
+ )
148
+
149
+ assert verdict == "Pass"
150
+
151
+
152
+ def test_final_report_prompt_mentions_mi_70_threshold(tmp_path: Path) -> None:
153
+ """
154
+ Test that the final report prompt advertises the updated MI threshold.
155
+ """
156
+ evaluator = SoftEvaluator(str(tmp_path))
157
+
158
+ messages = evaluator._build_final_report_messages(
159
+ {
160
+ "avg_mi": 70.0,
161
+ "cc_issues": [],
162
+ "qa_score": 90.0,
163
+ "hard_errors": [],
164
+ "qc_errors": [],
165
+ "qa_entities": [],
166
+ "hard_failed": False,
167
+ "qc_failed": False,
168
+ }
169
+ )
170
+
171
+ assert "Average Maintainability >= 70" in messages[0]["content"]
172
+
173
+
114
174
  def test_read_file_text_helper_reads_utf8_content(tmp_path: Path) -> None:
115
175
  """
116
176
  Test that the file-reading helper returns UTF-8 text content.
@@ -145,6 +205,8 @@ def test_get_python_files_filters_hidden_and_virtualenv_dirs(tmp_path: Path) ->
145
205
  (tmp_path / "venv" / "skip.py").write_text("x = 1\n")
146
206
  (tmp_path / "vendors").mkdir()
147
207
  (tmp_path / "vendors" / "skip.py").write_text("x = 1\n")
208
+ (tmp_path / "tests").mkdir()
209
+ (tmp_path / "tests" / "test_skip.py").write_text("x = 1\n")
148
210
 
149
211
  evaluator = SoftEvaluator(str(tmp_path))
150
212
 
File without changes