python-harness 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Mingli Yuan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,88 @@
1
+ Metadata-Version: 2.4
2
+ Name: python-harness
3
+ Version: 0.0.1
4
+ Summary: An agentic codebase evaluation and evolution tool for Python projects.
5
+ Author-email: Mingli Yuan <mingli.yuan@gmail.com>
6
+ License: MIT
7
+ Requires-Python: >=3.11
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: typer>=0.9.0
11
+ Requires-Dist: rich>=13.0.0
12
+ Requires-Dist: pydantic>=2.0.0
13
+ Requires-Dist: openai>=1.0.0
14
+ Requires-Dist: anthropic>=0.18.0
15
+ Requires-Dist: tenacity>=8.2.0
16
+ Requires-Dist: tiktoken>=0.6.0
17
+ Requires-Dist: python-dotenv>=1.0.0
18
+ Provides-Extra: dev
19
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
20
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
21
+ Requires-Dist: ruff>=0.3.0; extra == "dev"
22
+ Requires-Dist: mypy>=1.9.0; extra == "dev"
23
+ Requires-Dist: ty>=0.0.1; extra == "dev"
24
+ Requires-Dist: radon>=6.0.1; extra == "dev"
25
+ Dynamic: license-file
26
+
27
+ # Python Harness
28
+
29
+ An agentic codebase evaluation and evolution tool for Python projects.
30
+
31
+ `python-harness` is designed to be a universal standard tool—just like `pytest` or `ruff`—but instead of just checking syntax or running tests, it evaluates the **architecture, readability, and governance** of your codebase using both static analysis and LLMs (DeepSeek/OpenAI).
32
+
33
+ ## Features
34
+
35
+ 1. **Hard Evaluation (First Fence)**: Enforces strict rules using `ruff`, `mypy`, and `ty`. Evaluates Cyclomatic Complexity (CC) and Maintainability Index (MI) via `radon`.
36
+ 2. **Governance QC (Second Fence)**: Checks if the changes violate core project governance or attempt to bypass the evaluation rules themselves.
37
+ 3. **Soft Evaluation (Third Fence)**:
38
+ - Calculates architecture metrics like Fan-out (coupling).
39
+ - Generates a holistic package understanding using LLMs.
40
+ - Performs "Blind QA": Randomly samples functions/classes and tests the LLM's ability to understand them without context.
41
+ 4. **Actionable Output**: Synthesizes the evaluation into a final `Pass/Fail` verdict with exactly 3 concrete, actionable refactoring suggestions.
42
+
43
+ ## Installation
44
+
45
+ You can install `python-harness` using `uv` or `pip`:
46
+
47
+ ```bash
48
+ uv pip install python-harness
49
+ ```
50
+
51
+ ## Configuration
52
+
53
+ `python-harness` requires an LLM to perform its soft evaluation. Create a `.env` file in the root of your project:
54
+
55
+ ```env
56
+ LLM_API_KEY=your_api_key_here
57
+ LLM_BASE_URL=https://api.deepseek.com/v1
58
+ LLM_MODEL_NAME=deepseek-reasoner
59
+ LLM_MINI_MODEL_NAME=deepseek-chat
60
+ ```
61
+
62
+ *(Note: If you don't provide an API key, the harness will safely run in Mock mode.)*
63
+
64
+ ## Usage
65
+
66
+ ### 1. Measure
67
+
68
+ To evaluate your codebase, run the `measure` command in your project directory:
69
+
70
+ ```bash
71
+ harness measure .
72
+ ```
73
+
74
+ This will run the full 3-fence evaluation and output a report with a final verdict and top 3 improvement suggestions.
75
+
76
+ ### 2. Refine (Evolution Loop - WIP)
77
+
78
+ The `refine` command is an Agentic Edit-Test-Improve loop. It takes the suggestions generated by `measure`, automatically creates branches (variants), applies the changes, runs the tests (`pytest`), and picks the best variant.
79
+
80
+ ```bash
81
+ harness refine . --steps 1 --max-retries 3
82
+ ```
83
+
84
+ ## License
85
+
86
+ MIT License. See [LICENSE](LICENSE) for more details.
87
+
88
+ A harness toolkit for Python projects
@@ -0,0 +1,62 @@
1
+ # Python Harness
2
+
3
+ An agentic codebase evaluation and evolution tool for Python projects.
4
+
5
+ `python-harness` is designed to be a universal standard tool—just like `pytest` or `ruff`—but instead of just checking syntax or running tests, it evaluates the **architecture, readability, and governance** of your codebase using both static analysis and LLMs (DeepSeek/OpenAI).
6
+
7
+ ## Features
8
+
9
+ 1. **Hard Evaluation (First Fence)**: Enforces strict rules using `ruff`, `mypy`, and `ty`. Evaluates Cyclomatic Complexity (CC) and Maintainability Index (MI) via `radon`.
10
+ 2. **Governance QC (Second Fence)**: Checks if the changes violate core project governance or attempt to bypass the evaluation rules themselves.
11
+ 3. **Soft Evaluation (Third Fence)**:
12
+ - Calculates architecture metrics like Fan-out (coupling).
13
+ - Generates a holistic package understanding using LLMs.
14
+ - Performs "Blind QA": Randomly samples functions/classes and tests the LLM's ability to understand them without context.
15
+ 4. **Actionable Output**: Synthesizes the evaluation into a final `Pass/Fail` verdict with exactly 3 concrete, actionable refactoring suggestions.
16
+
17
+ ## Installation
18
+
19
+ You can install `python-harness` using `uv` or `pip`:
20
+
21
+ ```bash
22
+ uv pip install python-harness
23
+ ```
24
+
25
+ ## Configuration
26
+
27
+ `python-harness` requires an LLM to perform its soft evaluation. Create a `.env` file in the root of your project:
28
+
29
+ ```env
30
+ LLM_API_KEY=your_api_key_here
31
+ LLM_BASE_URL=https://api.deepseek.com/v1
32
+ LLM_MODEL_NAME=deepseek-reasoner
33
+ LLM_MINI_MODEL_NAME=deepseek-chat
34
+ ```
35
+
36
+ *(Note: If you don't provide an API key, the harness will safely run in Mock mode.)*
37
+
38
+ ## Usage
39
+
40
+ ### 1. Measure
41
+
42
+ To evaluate your codebase, run the `measure` command in your project directory:
43
+
44
+ ```bash
45
+ harness measure .
46
+ ```
47
+
48
+ This will run the full 3-fence evaluation and output a report with a final verdict and top 3 improvement suggestions.
49
+
50
+ ### 2. Refine (Evolution Loop - WIP)
51
+
52
+ The `refine` command is an Agentic Edit-Test-Improve loop. It takes the suggestions generated by `measure`, automatically creates branches (variants), applies the changes, runs the tests (`pytest`), and picks the best variant.
53
+
54
+ ```bash
55
+ harness refine . --steps 1 --max-retries 3
56
+ ```
57
+
58
+ ## License
59
+
60
+ MIT License. See [LICENSE](LICENSE) for more details.
61
+
62
+ A harness toolkit for Python projects
@@ -0,0 +1,65 @@
1
[project]
name = "python-harness"
version = "0.0.1"
description = "An agentic codebase evaluation and evolution tool for Python projects."
requires-python = ">=3.11"
readme = "README.md"
authors = [
    {name = "Mingli Yuan", email = "mingli.yuan@gmail.com"}
]
license = {text = "MIT"}
# Runtime dependencies: CLI (typer/rich), data models (pydantic),
# LLM clients (openai/anthropic), retries (tenacity), token counting
# (tiktoken), and .env configuration loading (python-dotenv).
dependencies = [
    "typer>=0.9.0",
    "rich>=13.0.0",
    "pydantic>=2.0.0",
    "openai>=1.0.0",
    "anthropic>=0.18.0",
    "tenacity>=8.2.0",
    "tiktoken>=0.6.0",
    "python-dotenv>=1.0.0",
]

# Development tooling — mirrors the tools invoked by the harness's own
# hard-evaluation fence (ruff, mypy, ty, radon) plus the test stack.
[project.optional-dependencies]
dev = [
    "pytest>=8.0.0",
    "pytest-cov>=4.1.0",
    "ruff>=0.3.0",
    "mypy>=1.9.0",
    "ty>=0.0.1", # Assuming ty is available or will be replaced with actual LSP integration
    "radon>=6.0.1",
]

[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[tool.setuptools.packages.find]
include = ["python_harness", "python_harness.*"]

# Console entry point: `harness` invokes the Typer app in python_harness/cli.py.
[project.scripts]
harness = "python_harness.cli:app"

[tool.ruff]
line-length = 88
target-version = "py311"

[tool.ruff.lint]
select = ["E", "F", "I", "UP", "B", "SIM"]
ignore = []

# Strict type checking; vendored third-party code is excluded.
[tool.mypy]
python_version = "3.11"
strict = true
warn_return_any = true
warn_unused_configs = true
exclude = [
    "vendors/.*"
]

# NOTE: coverage flags in addopts apply to every pytest invocation.
[tool.pytest.ini_options]
minversion = "8.0"
addopts = "-ra -q --cov=python_harness --cov-report=term-missing --cov-report=html"
testpaths = [
    "tests",
]
65
+
@@ -0,0 +1,5 @@
1
+ """
2
+ Python Harness - An agentic evaluation tool for codebases.
3
+ """
4
+
5
+ __version__ = "0.0.1"
@@ -0,0 +1,207 @@
1
+ """
2
+ Command-line interface for python-harness.
3
+ """
4
+
5
+ import os
6
+ import sys
7
+
8
+ import typer
9
+ from dotenv import load_dotenv
10
+ from rich.console import Console
11
+
12
+ from python_harness.evaluator import Evaluator
13
+
14
# Load environment configuration before the CLI app is built so that LLM
# settings from .env are visible to every command. A .env in the current
# working directory takes priority; otherwise python-dotenv's default
# upward search is used.
_env_file = os.path.join(os.getcwd(), ".env")
if os.path.exists(_env_file):
    load_dotenv(dotenv_path=_env_file)
else:
    load_dotenv()

app = typer.Typer(help="Agentic harness tool for universal Python codebase evaluation.")
console = Console()
23
+
24
+
25
@app.command()
def refine(
    path: str = typer.Argument(".", help="The path to evaluate and evolve"),
    steps: int = typer.Option(1, help="Number of evolution steps to perform"),
    max_retries: int = typer.Option(3, help="Maximum retries per variant if tests fail")
) -> None:
    """
    Refine the codebase through an agentic Edit-Test-Improve loop.
    Generates variants based on suggestions, tests them, and picks the best.
    """
    console.print(
        f"[bold magenta]Starting evolution loop for path:[/bold magenta] {path} "
        f"[dim](steps={steps}, max_retries={max_retries})[/dim]"
    )

    # A baseline evaluation supplies the concrete suggestions to evolve on.
    baseline = Evaluator(path)
    console.print("[cyan]Running baseline evaluation...[/cyan]")
    hard = baseline.hard_evaluator.evaluate()
    soft = baseline.soft_evaluator.evaluate()
    report = baseline.soft_evaluator.generate_final_report(hard, soft)

    suggestions = report.get("suggestions", [])
    if not suggestions:
        console.print("[yellow]No suggestions found to evolve. Exiting.[/yellow]")
        return

    console.print(
        f"[green]Found {len(suggestions)} suggestions. "
        f"Starting evolution branches...[/green]"
    )

    # TODO: implement the git branching and agent mutation loop:
    #   for each step, for each suggestion -> check out a variant branch,
    #   ask the LLM to apply the suggestion, run pytest (feeding failures
    #   back for up to max_retries attempts), re-measure on success, and
    #   finally check out the best-scoring variant.
    console.print(
        "[yellow]Evolution engine skeleton ready. "
        "Actual git mutation logic pending.[/yellow]"
    )
78
@app.command()
def measure(path: str = typer.Argument(".", help="The path to evaluate")) -> None:
    """
    Measure the codebase against hard, soft, and governance constraints.
    Outputs a final report with scores and actionable improvement suggestions.

    Exits with status 1 if any fence fails or the final verdict contains
    "Fail". (Decomposed into private helpers so this command passes the
    harness's own CC <= 15 fence; output is unchanged.)
    """
    console.print(
        f"[bold green]Starting harness measurement for path:[/bold green] {path}"
    )

    evaluator = Evaluator(path)

    # 1. Hard Evaluation Gate (First Fence)
    console.print("[bold blue]Running Hard Evaluation (ruff, mypy)...[/bold blue]")
    hard_results = evaluator.hard_evaluator.evaluate()

    if not hard_results["all_passed"]:
        console.print("[bold red]Hard Evaluation Failed! Exiting.[/bold red]")
        _report_hard_failures(hard_results)
        sys.exit(1)

    console.print("[bold green]Hard Evaluation Passed![/bold green]")
    _print_mi_scorecard(hard_results)

    # 2. Governance/QC Evaluation (Second Fence)
    console.print("\n[bold blue]Running Governance QC (Second Fence)...[/bold blue]")
    qc_results = evaluator.qc_evaluator.evaluate()

    if not qc_results["all_passed"]:
        console.print("[bold red]Governance QC Failed! Exiting.[/bold red]")
        console.print(
            "[red]The proposed changes violate governance constraints "
            "or lack sufficient evidence.[/red]"
        )
        for failure in qc_results["failures"]:
            console.print(f"[red]- {failure}[/red]")
        sys.exit(1)

    console.print(
        "[bold green]Governance QC Passed! (Change is admissible)[/bold green]"
    )

    # 3. Soft Evaluation/Readability (Third Fence)
    console.print(
        "[bold blue]Running Soft Evaluation "
        "(Readability & Understandability)...[/bold blue]"
    )
    soft_results = evaluator.soft_evaluator.evaluate()
    _print_soft_results(soft_results)

    console.print("\n[yellow]Evaluation completed. Generating report...[/yellow]\n")

    # Generate Final Report
    final_report = evaluator.soft_evaluator.generate_final_report(
        hard_results, soft_results
    )

    if final_report:
        verdict = _print_final_report(final_report)
        if "Fail" in verdict:
            sys.exit(1)


def _report_hard_failures(hard_results: dict) -> None:
    """Print per-tool diagnostics for a failed hard-evaluation gate."""
    if hard_results["ruff"]["status"] != "success":
        console.print("[red]Ruff issues found.[/red]")
    if hard_results["mypy"]["status"] != "success":
        output = hard_results["mypy"].get("output", "")
        console.print(f"[red]Mypy issues found:[/red]\n{output}")
    if hard_results["ty"]["status"] != "success":
        output = hard_results["ty"].get("output", "")
        console.print(f"[red]Ty issues found:[/red]\n{output}")
    if hard_results["radon_cc"]["status"] != "success":
        issues = hard_results["radon_cc"].get("issues", [])
        console.print(
            f"[red]Cyclomatic Complexity too high "
            f"({len(issues)} functions > 15):[/red]"
        )
        for issue in issues:
            console.print(
                f" - {issue['file']}: {issue['type']} '{issue['name']}' "
                f"has CC {issue['complexity']}"
            )


def _print_mi_scorecard(hard_results: dict) -> None:
    """Print the average Maintainability Index (diagnostic, never gates)."""
    mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
    if mi_scores:
        avg_mi = sum(mi_scores.values()) / len(mi_scores)
        color = "green" if avg_mi > 50 else "yellow" if avg_mi > 20 else "red"
        console.print(
            f"[{color}]Average Maintainability Index: {avg_mi:.1f}/100[/{color}]"
        )


def _print_soft_results(soft_results: dict) -> None:
    """Print the package summary, understandability score, and blind-QA samples."""
    pkg_summary = soft_results["package_summary"]
    console.print(
        f"[green]Analyzed {pkg_summary['total_files']} files with a total of "
        f"{pkg_summary['total_tokens']} tokens.[/green]"
    )
    console.print(
        f"[magenta]Agent's Understanding of the Package:[/magenta]\n"
        f"{pkg_summary['package_understanding']}"
    )

    console.print(
        f"\n[cyan]Overall Understandability Score:[/cyan] "
        f"{soft_results['understandability_score']:.1f}/100"
    )

    qa_results = soft_results.get("qa_results", {}).get("sampled_entities", [])
    if qa_results:
        console.print("\n[bold yellow]Blind QA Sampling Results:[/bold yellow]")
        for qa in qa_results:
            color = "green" if qa['score'] >= 80 else "red"
            console.print(f" - [{color}]{qa['entity']}: Score {qa['score']}[/{color}]")
            console.print(f" [dim]Feedback: {qa['feedback']}[/dim]")


def _print_final_report(final_report: dict) -> str:
    """Print the synthesized verdict and suggestions; return the verdict string."""
    verdict = final_report.get("verdict", "Unknown")
    verdict_color = "bold green" if "Pass" in verdict else "bold red"

    console.print(
        f"[{verdict_color}]=== FINAL VERDICT: {verdict} ===[/{verdict_color}]"
    )
    console.print(f"[bold]Summary:[/bold] {final_report.get('summary', '')}\n")

    suggestions = final_report.get("suggestions", [])
    if suggestions:
        console.print("[bold cyan]Top 3 Improvement Suggestions:[/bold cyan]")
        for i, sug in enumerate(suggestions, 1):
            console.print(
                f" {i}. [bold]{sug.get('title', 'Suggestion')}[/bold] "
                f"(Target: [yellow]{sug.get('target_file', 'unknown')}[/yellow])"
            )
            console.print(f" [dim]{sug.get('description', '')}[/dim]")
    return verdict
204
+
205
+
206
# Allow running this module directly as a script in addition to the
# `harness` console entry point.
if __name__ == "__main__":
    app()
@@ -0,0 +1,42 @@
1
+ """
2
+ Core module for integrating all evaluations and producing the final report.
3
+ """
4
+
5
+ from typing import Any
6
+
7
+ from python_harness.hard_evaluator import HardEvaluator
8
+ from python_harness.qc_evaluator import QCEvaluator
9
+ from python_harness.soft_evaluator import SoftEvaluator
10
+
11
+
12
class Evaluator:
    """
    Main evaluator coordinating hard, QC, and soft assessments.

    Owns one specialised evaluator per fence, all pointed at the same
    target path.
    """

    def __init__(self, target_path: str):
        self.target_path = target_path
        self.hard_evaluator = HardEvaluator(target_path)
        self.qc_evaluator = QCEvaluator(target_path)
        self.soft_evaluator = SoftEvaluator(target_path)

    def run(self) -> dict[str, Any]:
        """
        Run the complete evaluation process.

        Executes all three fences, then synthesizes the final report
        (including the 3 improvement suggestions) from the hard and
        soft results.
        """
        hard = self.hard_evaluator.evaluate()
        qc = self.qc_evaluator.evaluate()
        soft = self.soft_evaluator.evaluate()

        return {
            "hard_evaluation": hard,
            "qc_evaluation": qc,
            "soft_evaluation": soft,
            "final_report": self.soft_evaluator.generate_final_report(hard, soft),
            "overall_status": "success",
        }
@@ -0,0 +1,200 @@
1
+ """
2
+ Core module for integrating hard evaluation tools like ruff, mypy, and pytest.
3
+ """
4
+
5
+ import json
6
+ import subprocess
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ from rich.console import Console
11
+
12
# NOTE(review): this Console instance is not referenced anywhere in the
# visible code of this module — confirm whether it can be removed.
console = Console()
13
+
14
class HardEvaluator:
    """
    Evaluator for collecting structural code quality metrics.

    Each ``run_*`` method shells out to one external tool (ruff, mypy, ty,
    radon, pytest) and normalizes the outcome into a dict that always has a
    ``status`` key: "success", "failed" (tool ran, found problems), or
    "error" (tool missing / crashed / unparseable output).
    """

    # Cyclomatic-complexity gate; any block above this fails the build.
    # Previously hard-coded inline; exposed here so it can be tuned.
    CC_THRESHOLD: int = 15

    def __init__(self, target_path: str):
        # Resolve once so every subprocess sees a stable absolute path.
        self.target_path = Path(target_path).resolve()

    def _run_command(self, args: list[str]) -> "subprocess.CompletedProcess[str]":
        """Run an external tool, capturing text output; never raises on exit code."""
        return subprocess.run(args, capture_output=True, text=True, check=False)

    def _run_simple_tool(self, args: list[str]) -> dict[str, Any]:
        """
        Shared wrapper for tools whose stdout is reported verbatim
        (mypy, ty, pytest). Returns ``status``/``output``/``return_code``,
        or an "error" dict if the tool could not be executed at all.
        """
        try:
            result = self._run_command(args)
            status = "success" if result.returncode == 0 else "failed"
            return {
                "status": status,
                "output": result.stdout,
                "return_code": result.returncode,
            }
        except Exception as e:
            # Broad on purpose: a missing binary must not crash the harness,
            # only mark this tool as errored.
            return {"status": "error", "error_message": str(e)}

    def run_ruff(self) -> dict[str, Any]:
        """
        Run the Ruff linter.

        Returns ``status``, the JSON-parsed ``issues`` list, and
        ``return_code``.
        """
        try:
            result = self._run_command(
                ["ruff", "check", str(self.target_path), "--output-format", "json"]
            )
            issues = json.loads(result.stdout) if result.stdout else []
            status = "success" if result.returncode == 0 else "failed"
            return {
                "status": status,
                "issues": issues,
                "return_code": result.returncode,
            }
        except Exception as e:
            return {"status": "error", "error_message": str(e)}

    def run_mypy(self) -> dict[str, Any]:
        """
        Run Mypy type checker and return results.
        """
        return self._run_simple_tool(["mypy", str(self.target_path)])

    def run_ty(self) -> dict[str, Any]:
        """
        Run ty language server checks.
        """
        return self._run_simple_tool(["ty", "check", str(self.target_path)])

    def run_radon_cc(self) -> dict[str, Any]:
        """
        Run Radon cyclomatic complexity check.
        Flags any function/method with CC > ``CC_THRESHOLD`` as a failure.
        """
        try:
            result = self._run_command(
                ["radon", "cc", "-j", "-a", str(self.target_path)]
            )

            issues: list[dict[str, Any]] = []
            if result.stdout:
                data = json.loads(result.stdout)
                for file_path, blocks in data.items():
                    if not isinstance(blocks, list):
                        # radon reports per-file parse errors as dicts; skip.
                        continue
                    for block in blocks:
                        if block.get('complexity', 0) > self.CC_THRESHOLD:
                            issues.append({
                                "file": file_path,
                                "name": block.get('name'),
                                "type": block.get('type'),
                                "complexity": block.get('complexity')
                            })

            return {
                "status": "failed" if issues else "success",
                "issues": issues,
                "return_code": result.returncode,
                "output": result.stdout
            }
        except Exception as e:
            return {"status": "error", "error_message": str(e)}

    def run_radon_mi(self) -> dict[str, Any]:
        """
        Run Radon Maintainability Index (MI) check.
        This is a diagnostic metric, so it won't fail the build,
        but it contributes to the scorecard.
        """
        try:
            result = self._run_command(["radon", "mi", "-j", str(self.target_path)])

            mi_scores: dict[str, float] = {}
            if result.stdout:
                data = json.loads(result.stdout)
                for file_path, info in data.items():
                    mi_scores[file_path] = info.get('mi', 100.0)

            return {
                "status": "success",
                "mi_scores": mi_scores,
                "return_code": result.returncode,
            }
        except Exception as e:
            return {"status": "error", "error_message": str(e)}

    def run_pytest(self) -> dict[str, Any]:
        """
        Run Pytest test suite and return coverage results.

        Executed as a subprocess: running pytest within pytest can cause
        issues or hang.
        """
        return self._run_simple_tool(
            ["pytest", str(self.target_path), "--cov", "--cov-report=json"]
        )

    def evaluate(self) -> dict[str, Any]:
        """
        Execute all hard evaluation tools.

        Returns per-tool results plus ``all_passed``, which is True only
        when ruff, mypy, ty, and the CC check all succeed. MI is diagnostic
        and never gates; pytest is better handled as a separate stage.
        """
        results = {
            "ruff": self.run_ruff(),
            "mypy": self.run_mypy(),
            "ty": self.run_ty(),
            "radon_cc": self.run_radon_cc(),
            "radon_mi": self.run_radon_mi(),
        }

        gating = ("ruff", "mypy", "ty", "radon_cc")
        all_passed = all(results[k].get("status") == "success" for k in gating)

        return {"all_passed": all_passed, **results}