python-harness 0.0.10__tar.gz → 0.0.12__tar.gz

This diff shows the changes between publicly released versions of this package, as published to one of the supported registries. The information is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (24)
  1. {python_harness-0.0.10/python_harness.egg-info → python_harness-0.0.12}/PKG-INFO +6 -6
  2. {python_harness-0.0.10 → python_harness-0.0.12}/pyproject.toml +6 -6
  3. {python_harness-0.0.10 → python_harness-0.0.12}/python_harness/__init__.py +1 -1
  4. {python_harness-0.0.10 → python_harness-0.0.12}/python_harness/cli.py +21 -3
  5. {python_harness-0.0.10 → python_harness-0.0.12}/python_harness/hard_evaluator.py +21 -3
  6. python_harness-0.0.12/python_harness/python_file_inventory.py +27 -0
  7. python_harness-0.0.12/python_harness/soft_eval_report.py +154 -0
  8. {python_harness-0.0.10 → python_harness-0.0.12}/python_harness/soft_evaluator.py +19 -133
  9. {python_harness-0.0.10 → python_harness-0.0.12/python_harness.egg-info}/PKG-INFO +6 -6
  10. {python_harness-0.0.10 → python_harness-0.0.12}/python_harness.egg-info/SOURCES.txt +2 -0
  11. {python_harness-0.0.10 → python_harness-0.0.12}/tests/test_cli.py +88 -0
  12. {python_harness-0.0.10 → python_harness-0.0.12}/tests/test_hard_evaluator.py +80 -0
  13. {python_harness-0.0.10 → python_harness-0.0.12}/tests/test_soft_evaluator.py +62 -0
  14. {python_harness-0.0.10 → python_harness-0.0.12}/LICENSE +0 -0
  15. {python_harness-0.0.10 → python_harness-0.0.12}/README.md +0 -0
  16. {python_harness-0.0.10 → python_harness-0.0.12}/python_harness/evaluator.py +0 -0
  17. {python_harness-0.0.10 → python_harness-0.0.12}/python_harness/qc_evaluator.py +0 -0
  18. {python_harness-0.0.10 → python_harness-0.0.12}/python_harness.egg-info/dependency_links.txt +0 -0
  19. {python_harness-0.0.10 → python_harness-0.0.12}/python_harness.egg-info/entry_points.txt +0 -0
  20. {python_harness-0.0.10 → python_harness-0.0.12}/python_harness.egg-info/requires.txt +3 -3
  21. {python_harness-0.0.10 → python_harness-0.0.12}/python_harness.egg-info/top_level.txt +0 -0
  22. {python_harness-0.0.10 → python_harness-0.0.12}/setup.cfg +0 -0
  23. {python_harness-0.0.10 → python_harness-0.0.12}/tests/test_evaluator.py +0 -0
  24. {python_harness-0.0.10 → python_harness-0.0.12}/tests/test_qc_evaluator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-harness
3
- Version: 0.0.10
3
+ Version: 0.0.12
4
4
  Summary: An agentic codebase evaluation and evolution tool for Python projects.
5
5
  Author-email: Mingli Yuan <mingli.yuan@gmail.com>
6
6
  License: MIT
@@ -15,13 +15,13 @@ Requires-Dist: anthropic>=0.18.0
15
15
  Requires-Dist: tenacity>=8.2.0
16
16
  Requires-Dist: tiktoken>=0.6.0
17
17
  Requires-Dist: python-dotenv>=1.0.0
18
+ Requires-Dist: pytest>=8.0.0
19
+ Requires-Dist: pytest-cov>=4.1.0
20
+ Requires-Dist: ruff>=0.3.0
21
+ Requires-Dist: mypy>=1.9.0
22
+ Requires-Dist: radon>=6.0.1
18
23
  Provides-Extra: dev
19
- Requires-Dist: pytest>=8.0.0; extra == "dev"
20
- Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
21
- Requires-Dist: ruff>=0.3.0; extra == "dev"
22
- Requires-Dist: mypy>=1.9.0; extra == "dev"
23
24
  Requires-Dist: ty>=0.0.1; extra == "dev"
24
- Requires-Dist: radon>=6.0.1; extra == "dev"
25
25
  Dynamic: license-file
26
26
 
27
27
  # Python Harness
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "python-harness"
3
- version = "0.0.10"
3
+ version = "0.0.12"
4
4
  description = "An agentic codebase evaluation and evolution tool for Python projects."
5
5
  requires-python = ">=3.10"
6
6
  readme = "README.md"
@@ -17,18 +17,18 @@ dependencies = [
17
17
  "tenacity>=8.2.0",
18
18
  "tiktoken>=0.6.0",
19
19
  "python-dotenv>=1.0.0",
20
- ]
21
-
22
- [project.optional-dependencies]
23
- dev = [
24
20
  "pytest>=8.0.0",
25
21
  "pytest-cov>=4.1.0",
26
22
  "ruff>=0.3.0",
27
23
  "mypy>=1.9.0",
28
- "ty>=0.0.1", # Assuming ty is available or will be replaced with actual LSP integration
29
24
  "radon>=6.0.1",
30
25
  ]
31
26
 
27
+ [project.optional-dependencies]
28
+ dev = [
29
+ "ty>=0.0.1", # Assuming ty is available or will be replaced with actual LSP integration
30
+ ]
31
+
32
32
  [build-system]
33
33
  requires = ["setuptools>=61.0"]
34
34
  build-backend = "setuptools.build_meta"
@@ -2,4 +2,4 @@
2
2
  Python Harness - An agentic evaluation tool for codebases.
3
3
  """
4
4
 
5
- __version__ = "0.0.10"
5
+ __version__ = "0.0.12"
@@ -21,6 +21,8 @@ else:
21
21
 
22
22
  app = typer.Typer(help="Agentic harness tool for universal Python codebase evaluation.")
23
23
  console = Console()
24
+ MI_HEALTHY_THRESHOLD = 70.0
25
+ MI_WARNING_THRESHOLD = 40.0
24
26
 
25
27
 
26
28
  def _print_detail_block(title: str, details: str, color: str) -> None:
@@ -33,13 +35,18 @@ def _print_detail_block(title: str, details: str, color: str) -> None:
33
35
  console.print()
34
36
 
35
37
 
36
- def _print_ruff_issues(issues: list[dict[str, Any]]) -> None:
38
+ def _print_ruff_issues(
39
+ issues: list[dict[str, Any]],
40
+ error_message: str = "",
41
+ ) -> None:
37
42
  console.print("[red]Ruff issues found:[/red]")
38
43
  for issue in issues:
39
44
  file = issue.get("filename", "unknown")
40
45
  line = issue.get("location", {}).get("row", "?")
41
46
  msg = issue.get("message", "unknown issue")
42
47
  console.print(f" - {file}:{line} {msg}")
48
+ if not issues and error_message:
49
+ console.print(f" {error_message}")
43
50
  console.print()
44
51
 
45
52
 
@@ -99,7 +106,10 @@ def _print_hard_failure_details(hard_results: dict[str, Any]) -> None:
99
106
 
100
107
  ruff_issues = hard_results.get("ruff", {}).get("issues", [])
101
108
  if hard_results.get("ruff", {}).get("status") != "success":
102
- _print_ruff_issues(ruff_issues)
109
+ _print_ruff_issues(
110
+ ruff_issues,
111
+ str(hard_results.get("ruff", {}).get("error_message", "")),
112
+ )
103
113
 
104
114
  if hard_results.get("mypy", {}).get("status") != "success":
105
115
  output = str(hard_results.get("mypy", {}).get("output", ""))
@@ -125,13 +135,21 @@ def _print_hard_evaluation_summary(hard_results: dict[str, Any]) -> None:
125
135
  _print_hard_failure_details(hard_results)
126
136
 
127
137
 
138
+ def _mi_scorecard_color(avg_mi: float) -> str:
139
+ if avg_mi >= MI_HEALTHY_THRESHOLD:
140
+ return "green"
141
+ if avg_mi >= MI_WARNING_THRESHOLD:
142
+ return "yellow"
143
+ return "red"
144
+
145
+
128
146
  def _print_mi_scorecard(hard_results: dict[str, Any]) -> None:
129
147
  mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
130
148
  if not mi_scores:
131
149
  return
132
150
 
133
151
  avg_mi = sum(mi_scores.values()) / len(mi_scores)
134
- color = "green" if avg_mi > 50 else "yellow" if avg_mi > 20 else "red"
152
+ color = _mi_scorecard_color(avg_mi)
135
153
  console.print(f"[{color}]Average Maintainability Index: {avg_mi:.1f}/100[/{color}]")
136
154
 
137
155
 
@@ -11,6 +11,8 @@ from typing import Any
11
11
 
12
12
  from rich.console import Console
13
13
 
14
+ from python_harness.python_file_inventory import collect_python_files
15
+
14
16
  console = Console()
15
17
  PYTEST_TIMEOUT_SECONDS = 60
16
18
 
@@ -22,6 +24,9 @@ class HardEvaluator:
22
24
  def __init__(self, target_path: str):
23
25
  self.target_path = Path(target_path).resolve()
24
26
 
27
+ def _radon_metric_targets(self) -> list[str]:
28
+ return [str(file_path) for file_path in collect_python_files(self.target_path)]
29
+
25
30
  def run_ruff(self) -> dict[str, Any]:
26
31
  """
27
32
  Run Ruff linter and return results.
@@ -47,6 +52,7 @@ class HardEvaluator:
47
52
  "status": status,
48
53
  "issues": issues,
49
54
  "return_code": result.returncode,
55
+ "error_message": result.stderr.strip(),
50
56
  }
51
57
  except Exception as e:
52
58
  return {"status": "error", "error_message": str(e)}
@@ -65,7 +71,7 @@ class HardEvaluator:
65
71
  status = "success" if result.returncode == 0 else "failed"
66
72
  return {
67
73
  "status": status,
68
- "output": result.stdout,
74
+ "output": result.stdout or result.stderr,
69
75
  "return_code": result.returncode,
70
76
  }
71
77
  except Exception as e:
@@ -111,6 +117,14 @@ class HardEvaluator:
111
117
  Flag any function/method with CC > 15 as a failure.
112
118
  """
113
119
  try:
120
+ targets = self._radon_metric_targets()
121
+ if not targets:
122
+ return {
123
+ "status": "success",
124
+ "issues": [],
125
+ "return_code": 0,
126
+ "output": "",
127
+ }
114
128
  result = subprocess.run(
115
129
  [
116
130
  sys.executable,
@@ -119,7 +133,7 @@ class HardEvaluator:
119
133
  "cc",
120
134
  "-j",
121
135
  "-a",
122
- str(self.target_path),
136
+ *targets,
123
137
  ],
124
138
  capture_output=True,
125
139
  text=True,
@@ -177,8 +191,11 @@ class HardEvaluator:
177
191
  but it contributes to the scorecard.
178
192
  """
179
193
  try:
194
+ targets = self._radon_metric_targets()
195
+ if not targets:
196
+ return {"status": "success", "mi_scores": {}, "return_code": 0}
180
197
  result = subprocess.run(
181
- [sys.executable, "-m", "radon", "mi", "-j", str(self.target_path)],
198
+ [sys.executable, "-m", "radon", "mi", "-j", *targets],
182
199
  capture_output=True,
183
200
  text=True,
184
201
  check=False
@@ -243,6 +260,7 @@ class HardEvaluator:
243
260
  "output": result.stdout,
244
261
  "return_code": result.returncode,
245
262
  "coverage_percentage": coverage_percentage,
263
+ "error_message": result.stderr.strip(),
246
264
  }
247
265
  except subprocess.TimeoutExpired:
248
266
  return {
@@ -0,0 +1,27 @@
1
+ """
2
+ Python file discovery helpers.
3
+ """
4
+
5
+ from pathlib import Path
6
+
7
+ SKIPPED_DIRS = {"__pycache__", "env", "test", "tests", "vendors", "venv"}
8
+
9
+
10
+ def should_skip_python_path(file_path: Path, root: Path) -> bool:
11
+ if file_path.name.startswith("test_") or file_path.name.endswith("_test.py"):
12
+ return True
13
+ try:
14
+ relative_parts = file_path.relative_to(root).parts
15
+ except ValueError:
16
+ relative_parts = file_path.parts
17
+ return any(part.startswith(".") or part in SKIPPED_DIRS for part in relative_parts)
18
+
19
+
20
+ def collect_python_files(root: Path) -> list[Path]:
21
+ if root.is_file():
22
+ return [root] if root.suffix == ".py" else []
23
+ return [
24
+ file_path
25
+ for file_path in sorted(root.rglob("*.py"))
26
+ if not should_skip_python_path(file_path, root)
27
+ ]
@@ -0,0 +1,154 @@
1
+ """
2
+ Report-building helpers for soft evaluation.
3
+ """
4
+
5
+ import json
6
+ from typing import Any
7
+
8
+ MI_PASS_THRESHOLD = 70.0
9
+ QA_PASS_THRESHOLD = 75.0
10
+
11
+
12
+ def collect_hard_errors(hard_results: dict[str, Any]) -> list[str]:
13
+ if hard_results.get("all_passed", True):
14
+ return []
15
+
16
+ hard_errors = []
17
+ if hard_results.get("ruff", {}).get("status") != "success":
18
+ hard_errors.append("Linter (Ruff) failed.")
19
+ if hard_results.get("mypy", {}).get("status") != "success":
20
+ hard_errors.append("Type checker (Mypy) failed.")
21
+ if hard_results.get("pytest", {}).get("status") != "success":
22
+ hard_errors.append(
23
+ hard_results.get("pytest", {}).get(
24
+ "error_message",
25
+ "Tests or Coverage failed.",
26
+ )
27
+ )
28
+ return hard_errors
29
+
30
+
31
+ def extract_metrics(
32
+ hard_results: dict[str, Any],
33
+ qc_results: dict[str, Any],
34
+ soft_results: dict[str, Any],
35
+ ) -> dict[str, Any]:
36
+ mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
37
+ avg_mi = sum(mi_scores.values()) / len(mi_scores) if mi_scores else 100.0
38
+ return {
39
+ "avg_mi": avg_mi,
40
+ "cc_issues": hard_results.get("radon_cc", {}).get("issues", []),
41
+ "hard_errors": collect_hard_errors(hard_results),
42
+ "hard_failed": not hard_results.get("all_passed", True),
43
+ "qa_entities": soft_results.get("qa_results", {}).get("sampled_entities", []),
44
+ "qa_score": soft_results.get("understandability_score", 100.0),
45
+ "qc_errors": qc_results.get("failures", []),
46
+ "qc_failed": not qc_results.get("all_passed", True),
47
+ }
48
+
49
+
50
+ def determine_verdict(metrics: dict[str, Any], mock: bool = False) -> str:
51
+ suffix = " (Mock)" if mock else ""
52
+ if metrics["hard_failed"] or metrics["qc_failed"]:
53
+ return f"Fail{suffix}"
54
+ passed = (
55
+ metrics["avg_mi"] >= MI_PASS_THRESHOLD
56
+ and metrics["qa_score"] > QA_PASS_THRESHOLD
57
+ and not metrics["cc_issues"]
58
+ )
59
+ return f"Pass{suffix}" if passed else f"Fail{suffix}"
60
+
61
+
62
+ def build_mock_summary(
63
+ metrics: dict[str, Any],
64
+ hard_results: dict[str, Any],
65
+ ) -> str:
66
+ summary_parts = []
67
+ if metrics["hard_failed"]:
68
+ pytest_err = hard_results.get("pytest", {}).get("error_message", "")
69
+ summary_parts.append(f"Hard evaluation failed. {pytest_err}".strip())
70
+ if metrics["qc_failed"]:
71
+ summary_parts.append("Governance QC failed.")
72
+ if not summary_parts:
73
+ summary_parts.append("Mock evaluation completed without LLM.")
74
+ return " ".join(summary_parts)
75
+
76
+
77
+ def build_mock_final_report(
78
+ hard_results: dict[str, Any],
79
+ metrics: dict[str, Any],
80
+ ) -> dict[str, Any]:
81
+ return {
82
+ "verdict": determine_verdict(metrics, mock=True),
83
+ "summary": build_mock_summary(metrics, hard_results),
84
+ "suggestions": [
85
+ {
86
+ "title": "Mock Suggestion 1",
87
+ "description": "Add more docstrings.",
88
+ "target_file": "all",
89
+ },
90
+ {
91
+ "title": "Mock Suggestion 2",
92
+ "description": "Refactor large functions.",
93
+ "target_file": "all",
94
+ },
95
+ {
96
+ "title": "Mock Suggestion 3",
97
+ "description": "Improve test coverage.",
98
+ "target_file": "tests/",
99
+ },
100
+ ],
101
+ }
102
+
103
+
104
+ def build_final_report_messages(metrics: dict[str, Any]) -> list[dict[str, str]]:
105
+ sys_prompt = (
106
+ "You are an elite Python Codebase Evaluator. You have just analyzed "
107
+ "a repository. Your task is to provide a final judgment and EXACTLY "
108
+ "3 concrete, actionable improvement suggestions.\n"
109
+ "If the codebase failed its Hard or QC evaluations (e.g. tests "
110
+ "failed, coverage is low, or governance violated), your suggestions "
111
+ "MUST prioritize fixing those issues.\n"
112
+ "Otherwise, focus on refactoring/quality improvements without "
113
+ "changing external functionality.\n\n"
114
+ "Output MUST be in valid JSON matching this schema:\n"
115
+ "{\n"
116
+ ' "verdict": "Pass" or "Fail",\n'
117
+ ' "summary": "One paragraph summary of codebase health and '
118
+ 'any critical failures",\n'
119
+ ' "suggestions": [\n'
120
+ ' {"title": "str", "description": "str", "target_file": "str"}\n'
121
+ " ]\n"
122
+ "}\n"
123
+ "Rule for Verdict: If there are Hard Failures or QC Failures, "
124
+ "verdict MUST be Fail. Otherwise, Pass if Average Maintainability "
125
+ f">= {MI_PASS_THRESHOLD:.0f} and QA Score > {QA_PASS_THRESHOLD:.0f} "
126
+ "and no Critical CC issues (>15). Otherwise Fail."
127
+ )
128
+ user_content = (
129
+ f"Metrics:\n"
130
+ f"- Average Maintainability Index (MI): {metrics['avg_mi']:.1f}/100\n"
131
+ f"- Number of functions with Cyclomatic Complexity > 15: "
132
+ f"{len(metrics['cc_issues'])}\n"
133
+ f"- Agent QA Readability Score: {metrics['qa_score']:.1f}/100\n\n"
134
+ f"Failures (Prioritize these!):\n"
135
+ f"- Hard Evaluation Errors: "
136
+ f"{metrics['hard_errors'] if metrics['hard_errors'] else 'None'}\n"
137
+ f"- QC/Governance Errors: "
138
+ f"{metrics['qc_errors'] if metrics['qc_errors'] else 'None'}\n\n"
139
+ f"QA Feedback Snippets:\n"
140
+ + "\n".join(
141
+ [f" * {q['entity']}: {q['feedback']}" for q in metrics["qa_entities"]]
142
+ )
143
+ )
144
+ return [
145
+ {"role": "system", "content": sys_prompt},
146
+ {"role": "user", "content": user_content},
147
+ ]
148
+
149
+
150
+ def parse_final_report_response(raw_content: str) -> dict[str, Any]:
151
+ parsed_json = json.loads(raw_content)
152
+ if isinstance(parsed_json, dict):
153
+ return parsed_json
154
+ raise ValueError("JSON response is not a dictionary.")
@@ -15,6 +15,17 @@ from openai import OpenAI
15
15
  from pydantic import BaseModel
16
16
  from rich.console import Console
17
17
 
18
+ from python_harness.python_file_inventory import collect_python_files
19
+ from python_harness.soft_eval_report import (
20
+ build_final_report_messages,
21
+ build_mock_final_report,
22
+ build_mock_summary,
23
+ collect_hard_errors,
24
+ determine_verdict,
25
+ extract_metrics,
26
+ parse_final_report_response,
27
+ )
28
+
18
29
  console = Console()
19
30
 
20
31
  class FileSummary(BaseModel):
@@ -57,23 +68,7 @@ class SoftEvaluator:
57
68
  Recursively find all Python files in the target directory,
58
69
  excluding hidden dirs and .venv.
59
70
  """
60
- python_files = []
61
- for root, dirs, files in os.walk(self.target_path):
62
- # Exclude hidden directories and virtual environments
63
- dirs[:] = [
64
- d
65
- for d in dirs
66
- if not d.startswith(".") and d not in (
67
- "__pycache__",
68
- "venv",
69
- "env",
70
- "vendors",
71
- )
72
- ]
73
- for file in files:
74
- if file.endswith(".py"):
75
- python_files.append(Path(root) / file)
76
- return python_files
71
+ return collect_python_files(self.target_path)
77
72
 
78
73
  def _read_file_text(self, file_path: Path) -> str:
79
74
  return file_path.read_text(encoding="utf-8")
@@ -164,145 +159,36 @@ class SoftEvaluator:
164
159
  qc_results: dict[str, Any],
165
160
  soft_results: dict[str, Any],
166
161
  ) -> dict[str, Any]:
167
- cc_issues = hard_results.get("radon_cc", {}).get("issues", [])
168
- mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
169
- avg_mi = sum(mi_scores.values()) / len(mi_scores) if mi_scores else 100.0
170
- return {
171
- "cc_issues": cc_issues,
172
- "avg_mi": avg_mi,
173
- "hard_failed": not hard_results.get("all_passed", True),
174
- "qc_failed": not qc_results.get("all_passed", True),
175
- "qc_errors": qc_results.get("failures", []),
176
- "qa_score": soft_results.get("understandability_score", 100.0),
177
- "qa_entities": soft_results.get("qa_results", {}).get(
178
- "sampled_entities", []
179
- ),
180
- "hard_errors": self._collect_hard_errors(hard_results),
181
- }
162
+ return extract_metrics(hard_results, qc_results, soft_results)
182
163
 
183
164
  def _collect_hard_errors(self, hard_results: dict[str, Any]) -> list[str]:
184
- if hard_results.get("all_passed", True):
185
- return []
186
-
187
- hard_errors = []
188
- if hard_results.get("ruff", {}).get("status") != "success":
189
- hard_errors.append("Linter (Ruff) failed.")
190
- if hard_results.get("mypy", {}).get("status") != "success":
191
- hard_errors.append("Type checker (Mypy) failed.")
192
- if hard_results.get("pytest", {}).get("status") != "success":
193
- hard_errors.append(
194
- hard_results.get("pytest", {}).get(
195
- "error_message", "Tests or Coverage failed."
196
- )
197
- )
198
- return hard_errors
165
+ return collect_hard_errors(hard_results)
199
166
 
200
167
  def _determine_verdict(self, metrics: dict[str, Any], mock: bool = False) -> str:
201
- suffix = " (Mock)" if mock else ""
202
- if metrics["hard_failed"] or metrics["qc_failed"]:
203
- return f"Fail{suffix}"
204
- passed = (
205
- metrics["avg_mi"] > 50
206
- and metrics["qa_score"] > 75
207
- and not metrics["cc_issues"]
208
- )
209
- return f"Pass{suffix}" if passed else f"Fail{suffix}"
168
+ return determine_verdict(metrics, mock=mock)
210
169
 
211
170
  def _build_mock_summary(
212
171
  self,
213
172
  metrics: dict[str, Any],
214
173
  hard_results: dict[str, Any],
215
174
  ) -> str:
216
- summary_parts = []
217
- if metrics["hard_failed"]:
218
- pytest_err = hard_results.get("pytest", {}).get("error_message", "")
219
- summary_parts.append(f"Hard evaluation failed. {pytest_err}".strip())
220
- if metrics["qc_failed"]:
221
- summary_parts.append("Governance QC failed.")
222
- if not summary_parts:
223
- summary_parts.append("Mock evaluation completed without LLM.")
224
- return " ".join(summary_parts)
175
+ return build_mock_summary(metrics, hard_results)
225
176
 
226
177
  def _build_mock_final_report(
227
178
  self,
228
179
  hard_results: dict[str, Any],
229
180
  metrics: dict[str, Any],
230
181
  ) -> dict[str, Any]:
231
- return {
232
- "verdict": self._determine_verdict(metrics, mock=True),
233
- "summary": self._build_mock_summary(metrics, hard_results),
234
- "suggestions": [
235
- {
236
- "title": "Mock Suggestion 1",
237
- "description": "Add more docstrings.",
238
- "target_file": "all",
239
- },
240
- {
241
- "title": "Mock Suggestion 2",
242
- "description": "Refactor large functions.",
243
- "target_file": "all",
244
- },
245
- {
246
- "title": "Mock Suggestion 3",
247
- "description": "Improve test coverage.",
248
- "target_file": "tests/",
249
- },
250
- ],
251
- }
182
+ return build_mock_final_report(hard_results, metrics)
252
183
 
253
184
  def _build_final_report_messages(
254
185
  self,
255
186
  metrics: dict[str, Any],
256
187
  ) -> list[dict[str, str]]:
257
- sys_prompt = (
258
- "You are an elite Python Codebase Evaluator. You have just analyzed "
259
- "a repository. Your task is to provide a final judgment and EXACTLY "
260
- "3 concrete, actionable improvement suggestions.\n"
261
- "If the codebase failed its Hard or QC evaluations (e.g. tests "
262
- "failed, coverage is low, or governance violated), your suggestions "
263
- "MUST prioritize fixing those issues.\n"
264
- "Otherwise, focus on refactoring/quality improvements without "
265
- "changing external functionality.\n\n"
266
- "Output MUST be in valid JSON matching this schema:\n"
267
- "{\n"
268
- ' "verdict": "Pass" or "Fail",\n'
269
- ' "summary": "One paragraph summary of codebase health and '
270
- 'any critical failures",\n'
271
- ' "suggestions": [\n'
272
- ' {"title": "str", "description": "str", "target_file": "str"}\n'
273
- " ]\n"
274
- "}\n"
275
- "Rule for Verdict: If there are Hard Failures or QC Failures, "
276
- "verdict MUST be Fail. Otherwise, Pass if Average Maintainability "
277
- "> 50 and QA Score > 75 and no Critical CC issues (>15). "
278
- "Otherwise Fail."
279
- )
280
- user_content = (
281
- f"Metrics:\n"
282
- f"- Average Maintainability Index (MI): {metrics['avg_mi']:.1f}/100\n"
283
- f"- Number of functions with Cyclomatic Complexity > 15: "
284
- f"{len(metrics['cc_issues'])}\n"
285
- f"- Agent QA Readability Score: {metrics['qa_score']:.1f}/100\n\n"
286
- f"Failures (Prioritize these!):\n"
287
- f"- Hard Evaluation Errors: "
288
- f"{metrics['hard_errors'] if metrics['hard_errors'] else 'None'}\n"
289
- f"- QC/Governance Errors: "
290
- f"{metrics['qc_errors'] if metrics['qc_errors'] else 'None'}\n\n"
291
- f"QA Feedback Snippets:\n"
292
- + "\n".join(
293
- [f" * {q['entity']}: {q['feedback']}" for q in metrics["qa_entities"]]
294
- )
295
- )
296
- return [
297
- {"role": "system", "content": sys_prompt},
298
- {"role": "user", "content": user_content},
299
- ]
188
+ return build_final_report_messages(metrics)
300
189
 
301
190
  def _parse_final_report_response(self, raw_content: str) -> dict[str, Any]:
302
- parsed_json = json.loads(raw_content)
303
- if isinstance(parsed_json, dict):
304
- return parsed_json
305
- raise ValueError("JSON response is not a dictionary.")
191
+ return parse_final_report_response(raw_content)
306
192
 
307
193
  def calculate_token_complexity(self, file_path: Path) -> int:
308
194
  """
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-harness
3
- Version: 0.0.10
3
+ Version: 0.0.12
4
4
  Summary: An agentic codebase evaluation and evolution tool for Python projects.
5
5
  Author-email: Mingli Yuan <mingli.yuan@gmail.com>
6
6
  License: MIT
@@ -15,13 +15,13 @@ Requires-Dist: anthropic>=0.18.0
15
15
  Requires-Dist: tenacity>=8.2.0
16
16
  Requires-Dist: tiktoken>=0.6.0
17
17
  Requires-Dist: python-dotenv>=1.0.0
18
+ Requires-Dist: pytest>=8.0.0
19
+ Requires-Dist: pytest-cov>=4.1.0
20
+ Requires-Dist: ruff>=0.3.0
21
+ Requires-Dist: mypy>=1.9.0
22
+ Requires-Dist: radon>=6.0.1
18
23
  Provides-Extra: dev
19
- Requires-Dist: pytest>=8.0.0; extra == "dev"
20
- Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
21
- Requires-Dist: ruff>=0.3.0; extra == "dev"
22
- Requires-Dist: mypy>=1.9.0; extra == "dev"
23
24
  Requires-Dist: ty>=0.0.1; extra == "dev"
24
- Requires-Dist: radon>=6.0.1; extra == "dev"
25
25
  Dynamic: license-file
26
26
 
27
27
  # Python Harness
@@ -5,7 +5,9 @@ python_harness/__init__.py
5
5
  python_harness/cli.py
6
6
  python_harness/evaluator.py
7
7
  python_harness/hard_evaluator.py
8
+ python_harness/python_file_inventory.py
8
9
  python_harness/qc_evaluator.py
10
+ python_harness/soft_eval_report.py
9
11
  python_harness/soft_evaluator.py
10
12
  python_harness.egg-info/PKG-INFO
11
13
  python_harness.egg-info/SOURCES.txt
@@ -427,3 +427,91 @@ def test_refine_reports_suggestions(monkeypatch: Any) -> None:
427
427
  assert result.exit_code == 0
428
428
  assert "Found 2 suggestions. Starting evolution branches..." in result.stdout
429
429
  assert "Evolution engine skeleton ready." in result.stdout
430
+
431
+
432
+ def test_measure_surfaces_hard_tool_errors(monkeypatch: Any) -> None:
433
+ """
434
+ Test that measure prints hard-tool error details when tool invocations fail early.
435
+ """
436
+ class DummyHardEvaluator:
437
+ def evaluate(self) -> dict[str, Any]:
438
+ return {
439
+ "all_passed": False,
440
+ "ruff": {
441
+ "status": "failed",
442
+ "issues": [],
443
+ "error_message": "No module named ruff",
444
+ },
445
+ "mypy": {"status": "failed", "output": "No module named mypy"},
446
+ "ty": {
447
+ "status": "warning",
448
+ "error_message": "ty executable not found. Skipping ty checks.",
449
+ },
450
+ "radon_cc": {
451
+ "status": "warning",
452
+ "issues": [],
453
+ "error_message": "No module named radon",
454
+ },
455
+ "radon_mi": {"status": "success", "mi_scores": {}},
456
+ "pytest": {
457
+ "status": "failed",
458
+ "error_message": "No module named pytest",
459
+ },
460
+ }
461
+
462
+ class DummyQcEvaluator:
463
+ def evaluate(self) -> dict[str, Any]:
464
+ return {"all_passed": True, "failures": []}
465
+
466
+ class DummySoftEvaluator:
467
+ def evaluate(self) -> dict[str, Any]:
468
+ return {
469
+ "package_summary": {
470
+ "total_files": 1,
471
+ "total_tokens": 1,
472
+ "package_understanding": "Mock understanding",
473
+ },
474
+ "understandability_score": 100.0,
475
+ "qa_results": {"sampled_entities": []},
476
+ }
477
+
478
+ def generate_final_report(
479
+ self,
480
+ hard_results: dict[str, Any],
481
+ qc_results: dict[str, Any],
482
+ soft_results: dict[str, Any],
483
+ ) -> dict[str, Any]:
484
+ return {"verdict": "Fail", "summary": "Mock summary", "suggestions": []}
485
+
486
+ class DummyEvaluator:
487
+ def __init__(self, path: str):
488
+ self.path = path
489
+ self.hard_evaluator = DummyHardEvaluator()
490
+ self.qc_evaluator = DummyQcEvaluator()
491
+ self.soft_evaluator = DummySoftEvaluator()
492
+
493
+ monkeypatch.setattr(cli_module, "Evaluator", DummyEvaluator)
494
+
495
+ result = runner.invoke(app, ["measure", "."])
496
+
497
+ assert result.exit_code == 1
498
+ assert "Ruff issues found" in result.stdout
499
+ assert "No module named ruff" in result.stdout
500
+ assert "Mypy issues found" in result.stdout
501
+ assert "No module named mypy" in result.stdout
502
+ assert "Pytest/Coverage issues found" in result.stdout
503
+ assert "No module named pytest" in result.stdout
504
+
505
+
506
+ def test_mi_scorecard_uses_warning_color_below_70() -> None:
507
+ """
508
+ Test that MI below 70 is no longer rendered as healthy green.
509
+ """
510
+ assert cli_module._mi_scorecard_color(65.0) == "yellow"
511
+
512
+
513
+ def test_mi_scorecard_uses_green_at_70() -> None:
514
+ """
515
+ Test that MI 70 is rendered at the healthy threshold.
516
+ """
517
+ assert cli_module._mi_scorecard_color(70.0) == "green"
@@ -79,6 +79,7 @@ def test_radon_cc_syntax_error(monkeypatch: Any, tmp_path: Path) -> None:
79
79
  # and writing an error to stderr (which happens when there are syntax errors)
80
80
  import subprocess
81
81
  original_run = subprocess.run
82
+ (tmp_path / "bad.py").write_text("def broken(:\n")
82
83
 
83
84
  def mock_run(args: Any, **kwargs: Any) -> Any:
84
85
  # Check if the command is for radon cc (sys.executable, -m, radon, cc)
@@ -287,6 +288,49 @@ def test_run_mypy_returns_stdout(monkeypatch: Any) -> None:
287
288
  assert "error: nope" in result["output"]
288
289
 
289
290
 
291
+ def test_run_ruff_surfaces_stderr_when_no_json_issues(monkeypatch: Any) -> None:
292
+ """
293
+ Test that run_ruff preserves stderr when Ruff fails before emitting JSON.
294
+ """
295
+ def mock_run(args: Any, **kwargs: Any) -> Any:
296
+ class MockResult:
297
+ returncode = 1
298
+ stdout = ""
299
+ stderr = "No module named ruff"
300
+
301
+ return MockResult()
302
+
303
+ monkeypatch.setattr("subprocess.run", mock_run)
304
+
305
+ evaluator = HardEvaluator(".")
306
+ result = evaluator.run_ruff()
307
+
308
+ assert result["status"] == "failed"
309
+ assert result["issues"] == []
310
+ assert result["error_message"] == "No module named ruff"
311
+
312
+
313
+ def test_run_mypy_surfaces_stderr(monkeypatch: Any) -> None:
314
+ """
315
+ Test that run_mypy preserves stderr when mypy fails before stdout output.
316
+ """
317
+ def mock_run(args: Any, **kwargs: Any) -> Any:
318
+ class MockResult:
319
+ returncode = 1
320
+ stdout = ""
321
+ stderr = "No module named mypy"
322
+
323
+ return MockResult()
324
+
325
+ monkeypatch.setattr("subprocess.run", mock_run)
326
+
327
+ evaluator = HardEvaluator(".")
328
+ result = evaluator.run_mypy()
329
+
330
+ assert result["status"] == "failed"
331
+ assert result["output"] == "No module named mypy"
332
+
333
+
290
334
  def test_run_radon_mi_reads_scores(monkeypatch: Any) -> None:
291
335
  """
292
336
  Test that run_radon_mi parses maintainability scores from JSON.
@@ -308,6 +352,42 @@ def test_run_radon_mi_reads_scores(monkeypatch: Any) -> None:
308
352
  assert result["mi_scores"] == {"a.py": 77.0}
309
353
 
310
354
 
355
+ def test_run_pytest_surfaces_stderr(monkeypatch: Any, tmp_path: Path) -> None:
356
+ """
357
+ Test that run_pytest preserves stderr when pytest fails early.
358
+ """
359
+ def mock_run(args: Any, **kwargs: Any) -> Any:
360
+ class MockResult:
361
+ returncode = 1
362
+ stdout = ""
363
+ stderr = "No module named pytest"
364
+
365
+ return MockResult()
366
+
367
+ monkeypatch.setattr("subprocess.run", mock_run)
368
+
369
+ evaluator = HardEvaluator(str(tmp_path))
370
+ result = evaluator.run_pytest()
371
+
372
+ assert result["status"] == "failed"
373
+ assert result["error_message"] == "No module named pytest"
374
+
375
+
376
+ def test_radon_mi_targets_exclude_test_files(tmp_path: Path) -> None:
377
+ """
378
+ Test that maintainability scoring ignores test files and directories.
379
+ """
380
+ (tmp_path / "pkg").mkdir()
381
+ (tmp_path / "pkg" / "keep.py").write_text("x = 1\n")
382
+ (tmp_path / "tests").mkdir()
383
+ (tmp_path / "tests" / "test_skip.py").write_text("x = 1\n")
384
+ (tmp_path / "test_skip.py").write_text("x = 1\n")
385
+
386
+ evaluator = HardEvaluator(str(tmp_path))
387
+
388
+ assert evaluator._radon_metric_targets() == [str(tmp_path / "pkg" / "keep.py")]
389
+
390
+
311
391
  def test_evaluate_fails_when_coverage_report_missing(monkeypatch: Any) -> None:
312
392
  """
313
393
  Test that missing coverage data fails the hard gate even when tests pass.
@@ -111,6 +111,66 @@ def test_generate_final_report_mock_fails_on_hard_failure() -> None:
111
111
  os.environ["LLM_API_KEY"] = old_key
112
112
 
113
113
 
114
+ def test_determine_verdict_fails_below_mi_70(tmp_path: Path) -> None:
115
+ """
116
+ Test that MI below 70 no longer qualifies for a passing verdict.
117
+ """
118
+ evaluator = SoftEvaluator(str(tmp_path))
119
+
120
+ verdict = evaluator._determine_verdict(
121
+ {
122
+ "hard_failed": False,
123
+ "qc_failed": False,
124
+ "avg_mi": 65.0,
125
+ "qa_score": 90.0,
126
+ "cc_issues": [],
127
+ }
128
+ )
129
+
130
+ assert verdict == "Fail"
131
+
132
+
133
+ def test_determine_verdict_passes_at_mi_70(tmp_path: Path) -> None:
134
+ """
135
+ Test that MI of 70 is sufficient for a passing verdict.
136
+ """
137
+ evaluator = SoftEvaluator(str(tmp_path))
138
+
139
+ verdict = evaluator._determine_verdict(
140
+ {
141
+ "hard_failed": False,
142
+ "qc_failed": False,
143
+ "avg_mi": 70.0,
144
+ "qa_score": 90.0,
145
+ "cc_issues": [],
146
+ }
147
+ )
148
+
149
+ assert verdict == "Pass"
150
+
151
+
152
+ def test_final_report_prompt_mentions_mi_70_threshold(tmp_path: Path) -> None:
153
+ """
154
+ Test that the final report prompt advertises the updated MI threshold.
155
+ """
156
+ evaluator = SoftEvaluator(str(tmp_path))
157
+
158
+ messages = evaluator._build_final_report_messages(
159
+ {
160
+ "avg_mi": 70.0,
161
+ "cc_issues": [],
162
+ "qa_score": 90.0,
163
+ "hard_errors": [],
164
+ "qc_errors": [],
165
+ "qa_entities": [],
166
+ "hard_failed": False,
167
+ "qc_failed": False,
168
+ }
169
+ )
170
+
171
+ assert "Average Maintainability >= 70" in messages[0]["content"]
172
+
173
+
114
174
  def test_read_file_text_helper_reads_utf8_content(tmp_path: Path) -> None:
115
175
  """
116
176
  Test that the file-reading helper returns UTF-8 text content.
@@ -145,6 +205,8 @@ def test_get_python_files_filters_hidden_and_virtualenv_dirs(tmp_path: Path) ->
145
205
  (tmp_path / "venv" / "skip.py").write_text("x = 1\n")
146
206
  (tmp_path / "vendors").mkdir()
147
207
  (tmp_path / "vendors" / "skip.py").write_text("x = 1\n")
208
+ (tmp_path / "tests").mkdir()
209
+ (tmp_path / "tests" / "test_skip.py").write_text("x = 1\n")
148
210
 
149
211
  evaluator = SoftEvaluator(str(tmp_path))
150
212
 
File without changes
@@ -6,11 +6,11 @@ anthropic>=0.18.0
6
6
  tenacity>=8.2.0
7
7
  tiktoken>=0.6.0
8
8
  python-dotenv>=1.0.0
9
-
10
- [dev]
11
9
  pytest>=8.0.0
12
10
  pytest-cov>=4.1.0
13
11
  ruff>=0.3.0
14
12
  mypy>=1.9.0
15
- ty>=0.0.1
16
13
  radon>=6.0.1
14
+
15
+ [dev]
16
+ ty>=0.0.1