PyPI - python-harness - Versions diffs - 0.0.5__tar.gz → 0.0.8__tar.gz - Mend

python-harness 0.0.5__tar.gz → 0.0.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

{python_harness-0.0.5/python_harness.egg-info → python_harness-0.0.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: python-harness
-Version: 0.0.5
+Version: 0.0.8
 Summary: An agentic codebase evaluation and evolution tool for Python projects.
 Author-email: Mingli Yuan <mingli.yuan@gmail.com>
 License: MIT

{python_harness-0.0.5 → python_harness-0.0.8}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "python-harness"
-version = "0.0.5"
+version = "0.0.8"
 description = "An agentic codebase evaluation and evolution tool for Python projects."
 requires-python = ">=3.10"
 readme = "README.md"

{python_harness-0.0.5 → python_harness-0.0.8}/python_harness/cli.py RENAMED Viewed

@@ -130,9 +130,32 @@ def measure(path: str = typer.Argument(".", help="The path to evaluate")) -> Non
                     f"  - {issue['file']}: {issue['type']} '{issue['name']}' "
                     f"has CC {issue['complexity']}"
                 )
-        sys.exit(1)
+            # If radon failed for another reason
+            # (e.g. radon not installed or syntax error)
+            if not issues and hard_results["radon_cc"].get("error_message"):
+                err_msg = hard_results['radon_cc'].get('error_message')
+                console.print(f"[red]Radon CC Error:[/red] {err_msg}")
+            elif not issues:
+                console.print(
+                    "[red]Radon CC failed but no specific issues were parsed.[/red]"
+                )
+        elif hard_results["radon_cc"]["status"] == "warning":
+            err_msg = hard_results['radon_cc'].get('error_message')
+            console.print(f"[yellow]Radon CC warning:[/yellow] {err_msg}")
-    console.print("[bold green]Hard Evaluation Passed![/bold green]")
+        if hard_results.get("pytest", {}).get("status") == "failed":
+            error_msg = hard_results["pytest"].get("error_message", "Tests failed")
+            console.print(f"[red]Pytest/Coverage issues found:[/red] {error_msg}")
+        # DO NOT sys.exit(1) here anymore!
+        # We want to generate the report even if it fails.
+        console.print(
+            "[yellow]Continuing to soft evaluation to generate "
+            "suggestions despite hard failures...[/yellow]"
+        )
+    else:
+        console.print("[bold green]Hard Evaluation Passed![/bold green]")
     # Print Maintainability Index scorecard
     mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
@@ -155,11 +178,15 @@ def measure(path: str = typer.Argument(".", help="The path to evaluate")) -> Non
         )
         for failure in qc_results["failures"]:
             console.print(f"[red]- {failure}[/red]")
-        sys.exit(1)
-    console.print(
-        "[bold green]Governance QC Passed! (Change is admissible)[/bold green]"
-    )
+        # DO NOT sys.exit(1) here! We want to generate suggestions for QC failures too.
+        console.print(
+            "[yellow]Continuing to soft evaluation to generate "
+            "suggestions despite QC failures...[/yellow]"
+        )
+    else:
+        console.print(
+            "[bold green]Governance QC Passed! (Change is admissible)[/bold green]"
+        )
     # 3. Soft Evaluation/Readability (Third Fence)
     console.print(
@@ -194,8 +221,9 @@ def measure(path: str = typer.Argument(".", help="The path to evaluate")) -> Non
     console.print("\n[yellow]Evaluation completed. Generating report...[/yellow]\n")
     # Generate Final Report
+    # Pass all results to the reporter so it knows *why* things failed
     final_report = evaluator.soft_evaluator.generate_final_report(
-        hard_results, soft_results
+        hard_results, qc_results, soft_results
     )
     if final_report:

{python_harness-0.0.5 → python_harness-0.0.8}/python_harness/evaluator.py RENAMED Viewed

@@ -30,7 +30,7 @@ class Evaluator:
         # Generate Final Synthesized Report with 3 Suggestions
         final_report = self.soft_evaluator.generate_final_report(
-            hard_results, soft_results
+            hard_results, qc_results, soft_results
         )
         return {

{python_harness-0.0.5 → python_harness-0.0.8}/python_harness/hard_evaluator.py RENAMED Viewed

@@ -123,16 +123,32 @@ class HardEvaluator:
                                     "complexity": block.get('complexity')
                                 })
-            if issues:
+            if result.returncode != 0:
+                # E.g. syntax error in target code preventing radon from parsing
+                status = "failed"
+            elif issues:
                 status = "failed"
             return {
                 "status": status,
                 "issues": issues,
                 "return_code": result.returncode,
-                "output": result.stdout
+                "output": result.stdout,
+                "error_message": result.stderr if result.returncode != 0 else ""
+            }
+        except FileNotFoundError:
+            return {
+                "status": "warning",
+                "issues": [],
+                "error_message": "radon executable not found. Please install it."
             }
         except Exception as e:
+            if "No such file or directory: 'radon'" in str(e):
+                return {
+                    "status": "warning",
+                    "issues": [],
+                    "error_message": "radon executable not found. Please install it."
+                }
             return {"status": "error", "error_message": str(e)}
     def run_radon_mi(self) -> dict[str, Any]:
@@ -160,7 +176,19 @@ class HardEvaluator:
                 "mi_scores": mi_scores,
                 "return_code": result.returncode,
             }
+        except FileNotFoundError:
+            return {
+                "status": "warning",
+                "mi_scores": {},
+                "error_message": "radon executable not found. Please install it."
+            }
         except Exception as e:
+            if "No such file or directory: 'radon'" in str(e):
+                return {
+                    "status": "warning",
+                    "mi_scores": {},
+                    "error_message": "radon executable not found. Please install it."
+                }
             return {"status": "error", "error_message": str(e)}
     def run_pytest(self) -> dict[str, Any]:
@@ -195,13 +223,29 @@ class HardEvaluator:
         ty_res = self.run_ty()
         radon_cc_res = self.run_radon_cc()
         radon_mi_res = self.run_radon_mi()
-        # pytest_res = self.run_pytest() # Better handled as a separate stage
+        pytest_res = self.run_pytest()
+        # Parse pytest coverage to check if it's < 90%
+        cov_percentage = 0.0
+        if pytest_res.get("status") == "success" and pytest_res.get("output"):
+            try:
+                cov_data = json.loads(pytest_res["output"])
+                cov_percentage = cov_data.get("totals", {}).get("percent_covered", 0.0)
+                if cov_percentage < 90.0:
+                    pytest_res["status"] = "failed"
+                    pytest_res["error_message"] = (
+                        f"Test coverage is {cov_percentage:.2f}%, "
+                        f"which is below the 90% threshold."
+                    )
+            except Exception:
+                pass
         all_passed = (
             ruff_res.get("status") == "success" and
             mypy_res.get("status") == "success" and
             ty_res.get("status") in ("success", "warning") and
-            radon_cc_res.get("status") == "success"
+            radon_cc_res.get("status") in ("success", "warning") and
+            pytest_res.get("status") == "success"
         )
         return {
@@ -210,5 +254,6 @@ class HardEvaluator:
             "mypy": mypy_res,
             "ty": ty_res,
             "radon_cc": radon_cc_res,
-            "radon_mi": radon_mi_res
+            "radon_mi": radon_mi_res,
+            "pytest": pytest_res
         }

{python_harness-0.0.5 → python_harness-0.0.8}/python_harness/soft_evaluator.py RENAMED Viewed

@@ -373,7 +373,10 @@ class SoftEvaluator:
         }
     def generate_final_report(
-        self, hard_results: dict[str, Any], soft_results: dict[str, Any]
+        self,
+        hard_results: dict[str, Any],
+        qc_results: dict[str, Any],
+        soft_results: dict[str, Any]
     ) -> dict[str, Any]:
         """
         Synthesize all evaluation results into a final verdict and exactly
@@ -408,25 +411,48 @@ class SoftEvaluator:
             mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
             avg_mi = sum(mi_scores.values()) / len(mi_scores) if mi_scores else 100.0
+            # Extract failures
+            hard_failed = not hard_results.get("all_passed", True)
+            hard_errors = []
+            if hard_failed:
+                if hard_results.get("ruff", {}).get("status") != "success":
+                    hard_errors.append("Linter (Ruff) failed.")
+                if hard_results.get("mypy", {}).get("status") != "success":
+                    hard_errors.append("Type checker (Mypy) failed.")
+                if hard_results.get("pytest", {}).get("status") != "success":
+                    pytest_err = hard_results.get("pytest", {}).get(
+                        "error_message", "Tests or Coverage failed."
+                    )
+                    hard_errors.append(pytest_err)
+            qc_errors = qc_results.get("failures", [])
             qa_score = soft_results.get("understandability_score", 100.0)
             qa_entities = soft_results.get("qa_results", {}).get("sampled_entities", [])
             sys_prompt = (
                 "You are an elite Python Codebase Evaluator. You have just analyzed "
                 "a repository. Your task is to provide a final judgment and EXACTLY "
-                "3 concrete, actionable improvement suggestions. These suggestions "
-                "MUST NOT change the external functionality (they are refactoring/"
-                "quality improvements).\n\n"
+                "3 concrete, actionable improvement suggestions.\n"
+                "If the codebase failed its Hard or QC evaluations (e.g. tests "
+                "failed, coverage is low, or governance violated), your suggestions "
+                "MUST prioritize fixing those issues.\n"
+                "Otherwise, focus on refactoring/quality improvements without "
+                "changing external functionality.\n\n"
                 "Output MUST be in valid JSON matching this schema:\n"
                 "{\n"
                 '  "verdict": "Pass" or "Fail",\n'
-                '  "summary": "One paragraph summary of codebase health",\n'
+                '  "summary": "One paragraph summary of codebase health and '
+                'any critical failures",\n'
                 '  "suggestions": [\n'
                 '    {"title": "str", "description": "str", "target_file": "str"}\n'
                 "  ]\n"
                 "}\n"
-                "Rule for Verdict: Pass if Average Maintainability > 50 and "
-                "QA Score > 75 and no Critical CC issues (>15). Otherwise Fail."
+                "Rule for Verdict: If there are Hard Failures or QC Failures, "
+                "verdict MUST be Fail. Otherwise, Pass if Average Maintainability "
+                "> 50 and QA Score > 75 and no Critical CC issues (>15). "
+                "Otherwise Fail."
             )
             user_content = (
@@ -435,6 +461,11 @@ class SoftEvaluator:
                 f"- Number of functions with Cyclomatic Complexity > 15: "
                 f"{len(cc_issues)}\n"
                 f"- Agent QA Readability Score: {qa_score:.1f}/100\n\n"
+                f"Failures (Prioritize these!):\n"
+                f"- Hard Evaluation Errors: "
+                f"{hard_errors if hard_errors else 'None'}\n"
+                f"- QC/Governance Errors: "
+                f"{qc_errors if qc_errors else 'None'}\n\n"
                 f"QA Feedback Snippets:\n"
                 + "\n".join(
                     [f"  * {q['entity']}: {q['feedback']}" for q in qa_entities]

{python_harness-0.0.5 → python_harness-0.0.8/python_harness.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: python-harness
-Version: 0.0.5
+Version: 0.0.8
 Summary: An agentic codebase evaluation and evolution tool for Python projects.
 Author-email: Mingli Yuan <mingli.yuan@gmail.com>
 License: MIT

python_harness-0.0.8/tests/test_hard_evaluator.py ADDED Viewed

@@ -0,0 +1,95 @@
+"""
+Tests for hard evaluation logic.
+"""
+from pathlib import Path
+from typing import Any
+from python_harness.hard_evaluator import HardEvaluator
+def test_hard_evaluator_methods() -> None:
+    """
+    Test methods of HardEvaluator.
+    """
+    evaluator = HardEvaluator(".")
+    ruff_result = evaluator.run_ruff()
+    assert "status" in ruff_result
+    mypy_result = evaluator.run_mypy()
+    assert "status" in mypy_result
+    ty_result = evaluator.run_ty()
+    assert "status" in ty_result
+    # pytest_result = evaluator.run_pytest() # Causes infinite loop when run in test
+    # assert "status" in pytest_result
+    eval_result = evaluator.evaluate()
+    assert "ruff" in eval_result
+    assert "mypy" in eval_result
+    assert "ty" in eval_result
+def test_ty_fallback_behavior(monkeypatch: Any) -> None:
+    """
+    Test that run_ty gracefully falls back to a warning when 'ty' is not installed.
+    """
+    # Create a mock subprocess.run that always raises FileNotFoundError
+    # to simulate 'ty' not being on the system PATH
+    def mock_run(*args: Any, **kwargs: Any) -> Any:
+        raise FileNotFoundError("[Errno 2] No such file or directory: 'ty'")
+    monkeypatch.setattr("subprocess.run", mock_run)
+    evaluator = HardEvaluator(".")
+    result = evaluator.run_ty()
+    assert result["status"] == "warning"
+    assert "ty executable not found" in result["error_message"]
+def test_ty_fallback_behavior_oserror(monkeypatch: Any) -> None:
+    """
+    Test that run_ty gracefully falls back to a warning when a generic Exception
+    containing the Errno 2 string is thrown.
+    """
+    def mock_run(*args: Any, **kwargs: Any) -> Any:
+        raise Exception("[Errno 2] No such file or directory: 'ty'")
+    monkeypatch.setattr("subprocess.run", mock_run)
+    evaluator = HardEvaluator(".")
+    result = evaluator.run_ty()
+    assert result["status"] == "warning"
+    assert "ty executable not found" in result["error_message"]
+def test_radon_cc_syntax_error(monkeypatch: Any, tmp_path: Path) -> None:
+    """
+    Test that run_radon_cc correctly captures and reports stderr when radon
+    fails (e.g. due to syntax errors in the target codebase).
+    """
+    # Create a mock subprocess.run that simulates radon exiting with code 1
+    # and writing an error to stderr (which happens when there are syntax errors)
+    import subprocess
+    original_run = subprocess.run
+    def mock_run(args: Any, **kwargs: Any) -> Any:
+        if args and args[0] == "radon" and args[1] == "cc":
+            # Simulate radon failing on syntax error
+            class MockResult:
+                returncode = 1
+                stdout = ""
+                stderr = "ERROR: SyntaxError in bad.py"
+            return MockResult()
+        return original_run(args, **kwargs)
+    monkeypatch.setattr("subprocess.run", mock_run)
+    evaluator = HardEvaluator(str(tmp_path))
+    result = evaluator.run_radon_cc()
+    assert result["status"] == "failed"
+    assert len(result.get("issues", [])) == 0
+    # Radon should output to stderr because of the syntax error
+    assert "SyntaxError" in result["error_message"] or result["return_code"] != 0

python_harness-0.0.5/tests/test_hard_evaluator.py DELETED Viewed

@@ -1,29 +0,0 @@
-"""
-Tests for hard evaluation logic.
-"""
-from python_harness.hard_evaluator import HardEvaluator
-def test_hard_evaluator_methods() -> None:
-    """
-    Test methods of HardEvaluator.
-    """
-    evaluator = HardEvaluator(".")
-    ruff_result = evaluator.run_ruff()
-    assert "status" in ruff_result
-    mypy_result = evaluator.run_mypy()
-    assert "status" in mypy_result
-    ty_result = evaluator.run_ty()
-    assert "status" in ty_result
-    # pytest_result = evaluator.run_pytest() # Causes infinite loop when run in test
-    # assert "status" in pytest_result
-    eval_result = evaluator.evaluate()
-    assert "ruff" in eval_result
-    assert "mypy" in eval_result
-    assert "ty" in eval_result