python-harness 0.0.5__tar.gz → 0.0.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. {python_harness-0.0.5/python_harness.egg-info → python_harness-0.0.8}/PKG-INFO +1 -1
  2. {python_harness-0.0.5 → python_harness-0.0.8}/pyproject.toml +1 -1
  3. {python_harness-0.0.5 → python_harness-0.0.8}/python_harness/cli.py +36 -8
  4. {python_harness-0.0.5 → python_harness-0.0.8}/python_harness/evaluator.py +1 -1
  5. {python_harness-0.0.5 → python_harness-0.0.8}/python_harness/hard_evaluator.py +50 -5
  6. {python_harness-0.0.5 → python_harness-0.0.8}/python_harness/soft_evaluator.py +38 -7
  7. {python_harness-0.0.5 → python_harness-0.0.8/python_harness.egg-info}/PKG-INFO +1 -1
  8. python_harness-0.0.8/tests/test_hard_evaluator.py +95 -0
  9. python_harness-0.0.5/tests/test_hard_evaluator.py +0 -29
  10. {python_harness-0.0.5 → python_harness-0.0.8}/LICENSE +0 -0
  11. {python_harness-0.0.5 → python_harness-0.0.8}/README.md +0 -0
  12. {python_harness-0.0.5 → python_harness-0.0.8}/python_harness/__init__.py +0 -0
  13. {python_harness-0.0.5 → python_harness-0.0.8}/python_harness/qc_evaluator.py +0 -0
  14. {python_harness-0.0.5 → python_harness-0.0.8}/python_harness.egg-info/SOURCES.txt +0 -0
  15. {python_harness-0.0.5 → python_harness-0.0.8}/python_harness.egg-info/dependency_links.txt +0 -0
  16. {python_harness-0.0.5 → python_harness-0.0.8}/python_harness.egg-info/entry_points.txt +0 -0
  17. {python_harness-0.0.5 → python_harness-0.0.8}/python_harness.egg-info/requires.txt +0 -0
  18. {python_harness-0.0.5 → python_harness-0.0.8}/python_harness.egg-info/top_level.txt +0 -0
  19. {python_harness-0.0.5 → python_harness-0.0.8}/setup.cfg +0 -0
  20. {python_harness-0.0.5 → python_harness-0.0.8}/tests/test_cli.py +0 -0
  21. {python_harness-0.0.5 → python_harness-0.0.8}/tests/test_evaluator.py +0 -0
  22. {python_harness-0.0.5 → python_harness-0.0.8}/tests/test_soft_evaluator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-harness
3
- Version: 0.0.5
3
+ Version: 0.0.8
4
4
  Summary: An agentic codebase evaluation and evolution tool for Python projects.
5
5
  Author-email: Mingli Yuan <mingli.yuan@gmail.com>
6
6
  License: MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "python-harness"
3
- version = "0.0.5"
3
+ version = "0.0.8"
4
4
  description = "An agentic codebase evaluation and evolution tool for Python projects."
5
5
  requires-python = ">=3.10"
6
6
  readme = "README.md"
@@ -130,9 +130,32 @@ def measure(path: str = typer.Argument(".", help="The path to evaluate")) -> Non
130
130
  f" - {issue['file']}: {issue['type']} '{issue['name']}' "
131
131
  f"has CC {issue['complexity']}"
132
132
  )
133
- sys.exit(1)
133
+
134
+ # If radon failed for another reason
135
+ # (e.g. radon not installed or syntax error)
136
+ if not issues and hard_results["radon_cc"].get("error_message"):
137
+ err_msg = hard_results['radon_cc'].get('error_message')
138
+ console.print(f"[red]Radon CC Error:[/red] {err_msg}")
139
+ elif not issues:
140
+ console.print(
141
+ "[red]Radon CC failed but no specific issues were parsed.[/red]"
142
+ )
143
+ elif hard_results["radon_cc"]["status"] == "warning":
144
+ err_msg = hard_results['radon_cc'].get('error_message')
145
+ console.print(f"[yellow]Radon CC warning:[/yellow] {err_msg}")
134
146
 
135
- console.print("[bold green]Hard Evaluation Passed![/bold green]")
147
+ if hard_results.get("pytest", {}).get("status") == "failed":
148
+ error_msg = hard_results["pytest"].get("error_message", "Tests failed")
149
+ console.print(f"[red]Pytest/Coverage issues found:[/red] {error_msg}")
150
+
151
+ # DO NOT sys.exit(1) here anymore!
152
+ # We want to generate the report even if it fails.
153
+ console.print(
154
+ "[yellow]Continuing to soft evaluation to generate "
155
+ "suggestions despite hard failures...[/yellow]"
156
+ )
157
+ else:
158
+ console.print("[bold green]Hard Evaluation Passed![/bold green]")
136
159
 
137
160
  # Print Maintainability Index scorecard
138
161
  mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
@@ -155,11 +178,15 @@ def measure(path: str = typer.Argument(".", help="The path to evaluate")) -> Non
155
178
  )
156
179
  for failure in qc_results["failures"]:
157
180
  console.print(f"[red]- {failure}[/red]")
158
- sys.exit(1)
159
-
160
- console.print(
161
- "[bold green]Governance QC Passed! (Change is admissible)[/bold green]"
162
- )
181
+ # DO NOT sys.exit(1) here! We want to generate suggestions for QC failures too.
182
+ console.print(
183
+ "[yellow]Continuing to soft evaluation to generate "
184
+ "suggestions despite QC failures...[/yellow]"
185
+ )
186
+ else:
187
+ console.print(
188
+ "[bold green]Governance QC Passed! (Change is admissible)[/bold green]"
189
+ )
163
190
 
164
191
  # 3. Soft Evaluation/Readability (Third Fence)
165
192
  console.print(
@@ -194,8 +221,9 @@ def measure(path: str = typer.Argument(".", help="The path to evaluate")) -> Non
194
221
  console.print("\n[yellow]Evaluation completed. Generating report...[/yellow]\n")
195
222
 
196
223
  # Generate Final Report
224
+ # Pass all results to the reporter so it knows *why* things failed
197
225
  final_report = evaluator.soft_evaluator.generate_final_report(
198
- hard_results, soft_results
226
+ hard_results, qc_results, soft_results
199
227
  )
200
228
 
201
229
  if final_report:
@@ -30,7 +30,7 @@ class Evaluator:
30
30
 
31
31
  # Generate Final Synthesized Report with 3 Suggestions
32
32
  final_report = self.soft_evaluator.generate_final_report(
33
- hard_results, soft_results
33
+ hard_results, qc_results, soft_results
34
34
  )
35
35
 
36
36
  return {
@@ -123,16 +123,32 @@ class HardEvaluator:
123
123
  "complexity": block.get('complexity')
124
124
  })
125
125
 
126
- if issues:
126
+ if result.returncode != 0:
127
+ # E.g. syntax error in target code preventing radon from parsing
128
+ status = "failed"
129
+ elif issues:
127
130
  status = "failed"
128
131
 
129
132
  return {
130
133
  "status": status,
131
134
  "issues": issues,
132
135
  "return_code": result.returncode,
133
- "output": result.stdout
136
+ "output": result.stdout,
137
+ "error_message": result.stderr if result.returncode != 0 else ""
138
+ }
139
+ except FileNotFoundError:
140
+ return {
141
+ "status": "warning",
142
+ "issues": [],
143
+ "error_message": "radon executable not found. Please install it."
134
144
  }
135
145
  except Exception as e:
146
+ if "No such file or directory: 'radon'" in str(e):
147
+ return {
148
+ "status": "warning",
149
+ "issues": [],
150
+ "error_message": "radon executable not found. Please install it."
151
+ }
136
152
  return {"status": "error", "error_message": str(e)}
137
153
 
138
154
  def run_radon_mi(self) -> dict[str, Any]:
@@ -160,7 +176,19 @@ class HardEvaluator:
160
176
  "mi_scores": mi_scores,
161
177
  "return_code": result.returncode,
162
178
  }
179
+ except FileNotFoundError:
180
+ return {
181
+ "status": "warning",
182
+ "mi_scores": {},
183
+ "error_message": "radon executable not found. Please install it."
184
+ }
163
185
  except Exception as e:
186
+ if "No such file or directory: 'radon'" in str(e):
187
+ return {
188
+ "status": "warning",
189
+ "mi_scores": {},
190
+ "error_message": "radon executable not found. Please install it."
191
+ }
164
192
  return {"status": "error", "error_message": str(e)}
165
193
 
166
194
  def run_pytest(self) -> dict[str, Any]:
@@ -195,13 +223,29 @@ class HardEvaluator:
195
223
  ty_res = self.run_ty()
196
224
  radon_cc_res = self.run_radon_cc()
197
225
  radon_mi_res = self.run_radon_mi()
198
- # pytest_res = self.run_pytest() # Better handled as a separate stage
226
+ pytest_res = self.run_pytest()
199
227
 
228
+ # Parse pytest coverage to check if it's < 90%
229
+ cov_percentage = 0.0
230
+ if pytest_res.get("status") == "success" and pytest_res.get("output"):
231
+ try:
232
+ cov_data = json.loads(pytest_res["output"])
233
+ cov_percentage = cov_data.get("totals", {}).get("percent_covered", 0.0)
234
+ if cov_percentage < 90.0:
235
+ pytest_res["status"] = "failed"
236
+ pytest_res["error_message"] = (
237
+ f"Test coverage is {cov_percentage:.2f}%, "
238
+ f"which is below the 90% threshold."
239
+ )
240
+ except Exception:
241
+ pass
242
+
200
243
  all_passed = (
201
244
  ruff_res.get("status") == "success" and
202
245
  mypy_res.get("status") == "success" and
203
246
  ty_res.get("status") in ("success", "warning") and
204
- radon_cc_res.get("status") == "success"
247
+ radon_cc_res.get("status") in ("success", "warning") and
248
+ pytest_res.get("status") == "success"
205
249
  )
206
250
 
207
251
  return {
@@ -210,5 +254,6 @@ class HardEvaluator:
210
254
  "mypy": mypy_res,
211
255
  "ty": ty_res,
212
256
  "radon_cc": radon_cc_res,
213
- "radon_mi": radon_mi_res
257
+ "radon_mi": radon_mi_res,
258
+ "pytest": pytest_res
214
259
  }
@@ -373,7 +373,10 @@ class SoftEvaluator:
373
373
  }
374
374
 
375
375
  def generate_final_report(
376
- self, hard_results: dict[str, Any], soft_results: dict[str, Any]
376
+ self,
377
+ hard_results: dict[str, Any],
378
+ qc_results: dict[str, Any],
379
+ soft_results: dict[str, Any]
377
380
  ) -> dict[str, Any]:
378
381
  """
379
382
  Synthesize all evaluation results into a final verdict and exactly
@@ -408,25 +411,48 @@ class SoftEvaluator:
408
411
  mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
409
412
  avg_mi = sum(mi_scores.values()) / len(mi_scores) if mi_scores else 100.0
410
413
 
414
+ # Extract failures
415
+ hard_failed = not hard_results.get("all_passed", True)
416
+
417
+ hard_errors = []
418
+ if hard_failed:
419
+ if hard_results.get("ruff", {}).get("status") != "success":
420
+ hard_errors.append("Linter (Ruff) failed.")
421
+ if hard_results.get("mypy", {}).get("status") != "success":
422
+ hard_errors.append("Type checker (Mypy) failed.")
423
+ if hard_results.get("pytest", {}).get("status") != "success":
424
+ pytest_err = hard_results.get("pytest", {}).get(
425
+ "error_message", "Tests or Coverage failed."
426
+ )
427
+ hard_errors.append(pytest_err)
428
+
429
+ qc_errors = qc_results.get("failures", [])
430
+
411
431
  qa_score = soft_results.get("understandability_score", 100.0)
412
432
  qa_entities = soft_results.get("qa_results", {}).get("sampled_entities", [])
413
433
 
414
434
  sys_prompt = (
415
435
  "You are an elite Python Codebase Evaluator. You have just analyzed "
416
436
  "a repository. Your task is to provide a final judgment and EXACTLY "
417
- "3 concrete, actionable improvement suggestions. These suggestions "
418
- "MUST NOT change the external functionality (they are refactoring/"
419
- "quality improvements).\n\n"
437
+ "3 concrete, actionable improvement suggestions.\n"
438
+ "If the codebase failed its Hard or QC evaluations (e.g. tests "
439
+ "failed, coverage is low, or governance violated), your suggestions "
440
+ "MUST prioritize fixing those issues.\n"
441
+ "Otherwise, focus on refactoring/quality improvements without "
442
+ "changing external functionality.\n\n"
420
443
  "Output MUST be in valid JSON matching this schema:\n"
421
444
  "{\n"
422
445
  ' "verdict": "Pass" or "Fail",\n'
423
- ' "summary": "One paragraph summary of codebase health",\n'
446
+ ' "summary": "One paragraph summary of codebase health and '
447
+ 'any critical failures",\n'
424
448
  ' "suggestions": [\n'
425
449
  ' {"title": "str", "description": "str", "target_file": "str"}\n'
426
450
  " ]\n"
427
451
  "}\n"
428
- "Rule for Verdict: Pass if Average Maintainability > 50 and "
429
- "QA Score > 75 and no Critical CC issues (>15). Otherwise Fail."
452
+ "Rule for Verdict: If there are Hard Failures or QC Failures, "
453
+ "verdict MUST be Fail. Otherwise, Pass if Average Maintainability "
454
+ "> 50 and QA Score > 75 and no Critical CC issues (>15). "
455
+ "Otherwise Fail."
430
456
  )
431
457
 
432
458
  user_content = (
@@ -435,6 +461,11 @@ class SoftEvaluator:
435
461
  f"- Number of functions with Cyclomatic Complexity > 15: "
436
462
  f"{len(cc_issues)}\n"
437
463
  f"- Agent QA Readability Score: {qa_score:.1f}/100\n\n"
464
+ f"Failures (Prioritize these!):\n"
465
+ f"- Hard Evaluation Errors: "
466
+ f"{hard_errors if hard_errors else 'None'}\n"
467
+ f"- QC/Governance Errors: "
468
+ f"{qc_errors if qc_errors else 'None'}\n\n"
438
469
  f"QA Feedback Snippets:\n"
439
470
  + "\n".join(
440
471
  [f" * {q['entity']}: {q['feedback']}" for q in qa_entities]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-harness
3
- Version: 0.0.5
3
+ Version: 0.0.8
4
4
  Summary: An agentic codebase evaluation and evolution tool for Python projects.
5
5
  Author-email: Mingli Yuan <mingli.yuan@gmail.com>
6
6
  License: MIT
@@ -0,0 +1,95 @@
1
+ """
2
+ Tests for hard evaluation logic.
3
+ """
4
+
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from python_harness.hard_evaluator import HardEvaluator
9
+
10
+
11
+ def test_hard_evaluator_methods() -> None:
12
+ """
13
+ Test methods of HardEvaluator.
14
+ """
15
+ evaluator = HardEvaluator(".")
16
+
17
+ ruff_result = evaluator.run_ruff()
18
+ assert "status" in ruff_result
19
+
20
+ mypy_result = evaluator.run_mypy()
21
+ assert "status" in mypy_result
22
+
23
+ ty_result = evaluator.run_ty()
24
+ assert "status" in ty_result
25
+
26
+ # pytest_result = evaluator.run_pytest() # Causes infinite loop when run in test
27
+ # assert "status" in pytest_result
28
+
29
+ eval_result = evaluator.evaluate()
30
+ assert "ruff" in eval_result
31
+ assert "mypy" in eval_result
32
+ assert "ty" in eval_result
33
+
34
+ def test_ty_fallback_behavior(monkeypatch: Any) -> None:
35
+ """
36
+ Test that run_ty gracefully falls back to a warning when 'ty' is not installed.
37
+ """
38
+ # Create a mock subprocess.run that always raises FileNotFoundError
39
+ # to simulate 'ty' not being on the system PATH
40
+ def mock_run(*args: Any, **kwargs: Any) -> Any:
41
+ raise FileNotFoundError("[Errno 2] No such file or directory: 'ty'")
42
+
43
+ monkeypatch.setattr("subprocess.run", mock_run)
44
+
45
+ evaluator = HardEvaluator(".")
46
+ result = evaluator.run_ty()
47
+
48
+ assert result["status"] == "warning"
49
+ assert "ty executable not found" in result["error_message"]
50
+
51
+ def test_ty_fallback_behavior_oserror(monkeypatch: Any) -> None:
52
+ """
53
+ Test that run_ty gracefully falls back to a warning when a generic Exception
54
+ containing the Errno 2 string is thrown.
55
+ """
56
+ def mock_run(*args: Any, **kwargs: Any) -> Any:
57
+ raise Exception("[Errno 2] No such file or directory: 'ty'")
58
+
59
+ monkeypatch.setattr("subprocess.run", mock_run)
60
+
61
+ evaluator = HardEvaluator(".")
62
+ result = evaluator.run_ty()
63
+
64
+ assert result["status"] == "warning"
65
+ assert "ty executable not found" in result["error_message"]
66
+
67
+ def test_radon_cc_syntax_error(monkeypatch: Any, tmp_path: Path) -> None:
68
+ """
69
+ Test that run_radon_cc correctly captures and reports stderr when radon
70
+ fails (e.g. due to syntax errors in the target codebase).
71
+ """
72
+ # Create a mock subprocess.run that simulates radon exiting with code 1
73
+ # and writing an error to stderr (which happens when there are syntax errors)
74
+ import subprocess
75
+ original_run = subprocess.run
76
+
77
+ def mock_run(args: Any, **kwargs: Any) -> Any:
78
+ if args and args[0] == "radon" and args[1] == "cc":
79
+ # Simulate radon failing on syntax error
80
+ class MockResult:
81
+ returncode = 1
82
+ stdout = ""
83
+ stderr = "ERROR: SyntaxError in bad.py"
84
+ return MockResult()
85
+ return original_run(args, **kwargs)
86
+
87
+ monkeypatch.setattr("subprocess.run", mock_run)
88
+
89
+ evaluator = HardEvaluator(str(tmp_path))
90
+ result = evaluator.run_radon_cc()
91
+
92
+ assert result["status"] == "failed"
93
+ assert len(result.get("issues", [])) == 0
94
+ # Radon should output to stderr because of the syntax error
95
+ assert "SyntaxError" in result["error_message"] or result["return_code"] != 0
@@ -1,29 +0,0 @@
1
- """
2
- Tests for hard evaluation logic.
3
- """
4
-
5
- from python_harness.hard_evaluator import HardEvaluator
6
-
7
-
8
- def test_hard_evaluator_methods() -> None:
9
- """
10
- Test methods of HardEvaluator.
11
- """
12
- evaluator = HardEvaluator(".")
13
-
14
- ruff_result = evaluator.run_ruff()
15
- assert "status" in ruff_result
16
-
17
- mypy_result = evaluator.run_mypy()
18
- assert "status" in mypy_result
19
-
20
- ty_result = evaluator.run_ty()
21
- assert "status" in ty_result
22
-
23
- # pytest_result = evaluator.run_pytest() # Causes infinite loop when run in test
24
- # assert "status" in pytest_result
25
-
26
- eval_result = evaluator.evaluate()
27
- assert "ruff" in eval_result
28
- assert "mypy" in eval_result
29
- assert "ty" in eval_result
File without changes
File without changes
File without changes