python-harness 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- python_harness-0.0.1/LICENSE +21 -0
- python_harness-0.0.1/PKG-INFO +88 -0
- python_harness-0.0.1/README.md +62 -0
- python_harness-0.0.1/pyproject.toml +65 -0
- python_harness-0.0.1/python_harness/__init__.py +5 -0
- python_harness-0.0.1/python_harness/cli.py +207 -0
- python_harness-0.0.1/python_harness/evaluator.py +42 -0
- python_harness-0.0.1/python_harness/hard_evaluator.py +200 -0
- python_harness-0.0.1/python_harness/qc_evaluator.py +89 -0
- python_harness-0.0.1/python_harness/soft_evaluator.py +486 -0
- python_harness-0.0.1/python_harness.egg-info/PKG-INFO +88 -0
- python_harness-0.0.1/python_harness.egg-info/SOURCES.txt +19 -0
- python_harness-0.0.1/python_harness.egg-info/dependency_links.txt +1 -0
- python_harness-0.0.1/python_harness.egg-info/entry_points.txt +2 -0
- python_harness-0.0.1/python_harness.egg-info/requires.txt +16 -0
- python_harness-0.0.1/python_harness.egg-info/top_level.txt +1 -0
- python_harness-0.0.1/setup.cfg +4 -0
- python_harness-0.0.1/tests/test_cli.py +26 -0
- python_harness-0.0.1/tests/test_evaluator.py +18 -0
- python_harness-0.0.1/tests/test_hard_evaluator.py +29 -0
- python_harness-0.0.1/tests/test_soft_evaluator.py +42 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mingli Yuan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: python-harness
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: An agentic codebase evaluation and evolution tool for Python projects.
|
|
5
|
+
Author-email: Mingli Yuan <mingli.yuan@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.11
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: typer>=0.9.0
|
|
11
|
+
Requires-Dist: rich>=13.0.0
|
|
12
|
+
Requires-Dist: pydantic>=2.0.0
|
|
13
|
+
Requires-Dist: openai>=1.0.0
|
|
14
|
+
Requires-Dist: anthropic>=0.18.0
|
|
15
|
+
Requires-Dist: tenacity>=8.2.0
|
|
16
|
+
Requires-Dist: tiktoken>=0.6.0
|
|
17
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
20
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
21
|
+
Requires-Dist: ruff>=0.3.0; extra == "dev"
|
|
22
|
+
Requires-Dist: mypy>=1.9.0; extra == "dev"
|
|
23
|
+
Requires-Dist: ty>=0.0.1; extra == "dev"
|
|
24
|
+
Requires-Dist: radon>=6.0.1; extra == "dev"
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# Python Harness
|
|
28
|
+
|
|
29
|
+
An agentic codebase evaluation and evolution tool for Python projects.
|
|
30
|
+
|
|
31
|
+
`python-harness` is designed to be a universal standard tool—just like `pytest` or `ruff`—but instead of just checking syntax or running tests, it evaluates the **architecture, readability, and governance** of your codebase using both static analysis and LLMs (DeepSeek/OpenAI).
|
|
32
|
+
|
|
33
|
+
## Features
|
|
34
|
+
|
|
35
|
+
1. **Hard Evaluation (First Fence)**: Enforces strict rules using `ruff`, `mypy`, and `ty`. Evaluates Cyclomatic Complexity (CC) and Maintainability Index (MI) via `radon`.
|
|
36
|
+
2. **Governance QC (Second Fence)**: Checks if the changes violate core project governance or attempt to bypass the evaluation rules themselves.
|
|
37
|
+
3. **Soft Evaluation (Third Fence)**:
|
|
38
|
+
- Calculates architecture metrics like Fan-out (coupling).
|
|
39
|
+
- Generates a holistic package understanding using LLMs.
|
|
40
|
+
- Performs "Blind QA": Randomly samples functions/classes and tests the LLM's ability to understand them without context.
|
|
41
|
+
4. **Actionable Output**: Synthesizes the evaluation into a final `Pass/Fail` verdict with exactly 3 concrete, actionable refactoring suggestions.
|
|
42
|
+
|
|
43
|
+
## Installation
|
|
44
|
+
|
|
45
|
+
You can install `python-harness` using `uv` or `pip`:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
uv pip install python-harness
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Configuration
|
|
52
|
+
|
|
53
|
+
`python-harness` requires an LLM to perform its soft evaluation. Create a `.env` file in the root of your project:
|
|
54
|
+
|
|
55
|
+
```env
|
|
56
|
+
LLM_API_KEY=your_api_key_here
|
|
57
|
+
LLM_BASE_URL=https://api.deepseek.com/v1
|
|
58
|
+
LLM_MODEL_NAME=deepseek-reasoner
|
|
59
|
+
LLM_MINI_MODEL_NAME=deepseek-chat
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
*(Note: If you don't provide an API key, the harness will safely run in Mock mode.)*
|
|
63
|
+
|
|
64
|
+
## Usage
|
|
65
|
+
|
|
66
|
+
### 1. Measure
|
|
67
|
+
|
|
68
|
+
To evaluate your codebase, run the `measure` command in your project directory:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
harness measure .
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
This will run the full 3-fence evaluation and output a report with a final verdict and top 3 improvement suggestions.
|
|
75
|
+
|
|
76
|
+
### 2. Refine (Evolution Loop - WIP)
|
|
77
|
+
|
|
78
|
+
The `refine` command is an Agentic Edit-Test-Improve loop. It takes the suggestions generated by `measure`, automatically creates branches (variants), applies the changes, runs the tests (`pytest`), and picks the best variant.
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
harness refine . --steps 1 --max-retries 3
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## License
|
|
85
|
+
|
|
86
|
+
MIT License. See [LICENSE](LICENSE) for more details.
|
|
87
|
+
|
|
88
|
+
A harness toolkit for Python projects
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Python Harness
|
|
2
|
+
|
|
3
|
+
An agentic codebase evaluation and evolution tool for Python projects.
|
|
4
|
+
|
|
5
|
+
`python-harness` is designed to be a universal standard tool—just like `pytest` or `ruff`—but instead of just checking syntax or running tests, it evaluates the **architecture, readability, and governance** of your codebase using both static analysis and LLMs (DeepSeek/OpenAI).
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
1. **Hard Evaluation (First Fence)**: Enforces strict rules using `ruff`, `mypy`, and `ty`. Evaluates Cyclomatic Complexity (CC) and Maintainability Index (MI) via `radon`.
|
|
10
|
+
2. **Governance QC (Second Fence)**: Checks if the changes violate core project governance or attempt to bypass the evaluation rules themselves.
|
|
11
|
+
3. **Soft Evaluation (Third Fence)**:
|
|
12
|
+
- Calculates architecture metrics like Fan-out (coupling).
|
|
13
|
+
- Generates a holistic package understanding using LLMs.
|
|
14
|
+
- Performs "Blind QA": Randomly samples functions/classes and tests the LLM's ability to understand them without context.
|
|
15
|
+
4. **Actionable Output**: Synthesizes the evaluation into a final `Pass/Fail` verdict with exactly 3 concrete, actionable refactoring suggestions.
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
You can install `python-harness` using `uv` or `pip`:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
uv pip install python-harness
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Configuration
|
|
26
|
+
|
|
27
|
+
`python-harness` requires an LLM to perform its soft evaluation. Create a `.env` file in the root of your project:
|
|
28
|
+
|
|
29
|
+
```env
|
|
30
|
+
LLM_API_KEY=your_api_key_here
|
|
31
|
+
LLM_BASE_URL=https://api.deepseek.com/v1
|
|
32
|
+
LLM_MODEL_NAME=deepseek-reasoner
|
|
33
|
+
LLM_MINI_MODEL_NAME=deepseek-chat
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
*(Note: If you don't provide an API key, the harness will safely run in Mock mode.)*
|
|
37
|
+
|
|
38
|
+
## Usage
|
|
39
|
+
|
|
40
|
+
### 1. Measure
|
|
41
|
+
|
|
42
|
+
To evaluate your codebase, run the `measure` command in your project directory:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
harness measure .
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
This will run the full 3-fence evaluation and output a report with a final verdict and top 3 improvement suggestions.
|
|
49
|
+
|
|
50
|
+
### 2. Refine (Evolution Loop - WIP)
|
|
51
|
+
|
|
52
|
+
The `refine` command is an Agentic Edit-Test-Improve loop. It takes the suggestions generated by `measure`, automatically creates branches (variants), applies the changes, runs the tests (`pytest`), and picks the best variant.
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
harness refine . --steps 1 --max-retries 3
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## License
|
|
59
|
+
|
|
60
|
+
MIT License. See [LICENSE](LICENSE) for more details.
|
|
61
|
+
|
|
62
|
+
A harness toolkit for Python projects
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "python-harness"
|
|
3
|
+
version = "0.0.1"
|
|
4
|
+
description = "An agentic codebase evaluation and evolution tool for Python projects."
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
authors = [
|
|
8
|
+
{name = "Mingli Yuan", email = "mingli.yuan@gmail.com"}
|
|
9
|
+
]
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
dependencies = [
|
|
12
|
+
"typer>=0.9.0",
|
|
13
|
+
"rich>=13.0.0",
|
|
14
|
+
"pydantic>=2.0.0",
|
|
15
|
+
"openai>=1.0.0",
|
|
16
|
+
"anthropic>=0.18.0",
|
|
17
|
+
"tenacity>=8.2.0",
|
|
18
|
+
"tiktoken>=0.6.0",
|
|
19
|
+
"python-dotenv>=1.0.0",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[project.optional-dependencies]
|
|
23
|
+
dev = [
|
|
24
|
+
"pytest>=8.0.0",
|
|
25
|
+
"pytest-cov>=4.1.0",
|
|
26
|
+
"ruff>=0.3.0",
|
|
27
|
+
"mypy>=1.9.0",
|
|
28
|
+
"ty>=0.0.1", # Assuming ty is available or will be replaced with actual LSP integration
|
|
29
|
+
"radon>=6.0.1",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[build-system]
|
|
33
|
+
requires = ["setuptools>=61.0"]
|
|
34
|
+
build-backend = "setuptools.build_meta"
|
|
35
|
+
|
|
36
|
+
[tool.setuptools.packages.find]
|
|
37
|
+
include = ["python_harness", "python_harness.*"]
|
|
38
|
+
|
|
39
|
+
[project.scripts]
|
|
40
|
+
harness = "python_harness.cli:app"
|
|
41
|
+
|
|
42
|
+
[tool.ruff]
|
|
43
|
+
line-length = 88
|
|
44
|
+
target-version = "py311"
|
|
45
|
+
|
|
46
|
+
[tool.ruff.lint]
|
|
47
|
+
select = ["E", "F", "I", "UP", "B", "SIM"]
|
|
48
|
+
ignore = []
|
|
49
|
+
|
|
50
|
+
[tool.mypy]
|
|
51
|
+
python_version = "3.11"
|
|
52
|
+
strict = true
|
|
53
|
+
warn_return_any = true
|
|
54
|
+
warn_unused_configs = true
|
|
55
|
+
exclude = [
|
|
56
|
+
"vendors/.*"
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
[tool.pytest.ini_options]
|
|
60
|
+
minversion = "8.0"
|
|
61
|
+
addopts = "-ra -q --cov=python_harness --cov-report=term-missing --cov-report=html"
|
|
62
|
+
testpaths = [
|
|
63
|
+
"tests",
|
|
64
|
+
]
|
|
65
|
+
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Command-line interface for python-harness.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
import typer
|
|
9
|
+
from dotenv import load_dotenv
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
|
|
12
|
+
from python_harness.evaluator import Evaluator
|
|
13
|
+
|
|
14
|
+
# Try to find .env file explicitly before anything else executes
|
|
15
|
+
env_path = os.path.join(os.getcwd(), '.env')
|
|
16
|
+
if os.path.exists(env_path):
|
|
17
|
+
load_dotenv(dotenv_path=env_path)
|
|
18
|
+
else:
|
|
19
|
+
load_dotenv() # Fallback to default search
|
|
20
|
+
|
|
21
|
+
app = typer.Typer(help="Agentic harness tool for universal Python codebase evaluation.")
|
|
22
|
+
console = Console()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@app.command()
def refine(
    path: str = typer.Argument(".", help="The path to evaluate and evolve"),
    steps: int = typer.Option(1, help="Number of evolution steps to perform"),
    max_retries: int = typer.Option(3, help="Maximum retries per variant if tests fail")
) -> None:
    """
    Refine the codebase through an agentic Edit-Test-Improve loop.

    Runs a baseline evaluation to collect suggestions, then (once the
    mutation engine is implemented) creates one variant per suggestion,
    applies the change, runs the tests, and keeps the best variant.
    """
    console.print(
        f"[bold magenta]Starting evolution loop for path:[/bold magenta] {path} "
        f"[dim](steps={steps}, max_retries={max_retries})[/dim]"
    )

    # 1. First, run a baseline evaluation to get suggestions.
    evaluator = Evaluator(path)
    console.print("[cyan]Running baseline evaluation...[/cyan]")
    hard_results = evaluator.hard_evaluator.evaluate()
    soft_results = evaluator.soft_evaluator.evaluate()
    baseline_report = evaluator.soft_evaluator.generate_final_report(
        hard_results, soft_results
    )

    suggestions = baseline_report.get("suggestions", [])
    if not suggestions:
        console.print("[yellow]No suggestions found to evolve. Exiting.[/yellow]")
        return

    # Fix: the second fragment had a needless f-prefix (ruff F541).
    console.print(
        f"[green]Found {len(suggestions)} suggestions. "
        "Starting evolution branches...[/green]"
    )

    # TODO: Implement the Git branching and Agent modification logic here.
    # The loop will be:
    # for step in range(steps):
    #     for suggestion in suggestions:
    #         checkout new branch variant-X
    #         for retry in range(max_retries):
    #             ask LLM to apply suggestion to code
    #             run pytest
    #             if pytest passes:
    #                 run harness . to get new score
    #                 break
    #             else:
    #                 feed error back to LLM for retry
    # compare all variants and checkout the best one

    console.print(
        "[yellow]Evolution engine skeleton ready. "
        "Actual git mutation logic pending.[/yellow]"
    )

|
78
|
+
def _print_hard_failures(hard_results: dict) -> None:
    """Print per-tool detail for a failed hard-evaluation run."""
    if hard_results["ruff"]["status"] != "success":
        console.print("[red]Ruff issues found.[/red]")
    if hard_results["mypy"]["status"] != "success":
        output = hard_results["mypy"].get("output", "")
        console.print(f"[red]Mypy issues found:[/red]\n{output}")
    if hard_results["ty"]["status"] != "success":
        output = hard_results["ty"].get("output", "")
        console.print(f"[red]Ty issues found:[/red]\n{output}")
    if hard_results["radon_cc"]["status"] != "success":
        issues = hard_results["radon_cc"].get("issues", [])
        console.print(
            "[red]Cyclomatic Complexity too high "
            f"({len(issues)} functions > 15):[/red]"
        )
        for issue in issues:
            console.print(
                f" - {issue['file']}: {issue['type']} '{issue['name']}' "
                f"has CC {issue['complexity']}"
            )


def _print_mi_scorecard(hard_results: dict) -> None:
    """Print the average Maintainability Index, color-coded by band."""
    mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
    if mi_scores:
        avg_mi = sum(mi_scores.values()) / len(mi_scores)
        color = "green" if avg_mi > 50 else "yellow" if avg_mi > 20 else "red"
        console.print(
            f"[{color}]Average Maintainability Index: {avg_mi:.1f}/100[/{color}]"
        )


def _print_soft_results(soft_results: dict) -> None:
    """Print the package summary, understandability score, and blind-QA samples."""
    pkg_summary = soft_results["package_summary"]
    console.print(
        f"[green]Analyzed {pkg_summary['total_files']} files with a total of "
        f"{pkg_summary['total_tokens']} tokens.[/green]"
    )
    console.print(
        f"[magenta]Agent's Understanding of the Package:[/magenta]\n"
        f"{pkg_summary['package_understanding']}"
    )

    console.print(
        f"\n[cyan]Overall Understandability Score:[/cyan] "
        f"{soft_results['understandability_score']:.1f}/100"
    )

    qa_results = soft_results.get("qa_results", {}).get("sampled_entities", [])
    if qa_results:
        console.print("\n[bold yellow]Blind QA Sampling Results:[/bold yellow]")
        for qa in qa_results:
            color = "green" if qa['score'] >= 80 else "red"
            console.print(f" - [{color}]{qa['entity']}: Score {qa['score']}[/{color}]")
            console.print(f" [dim]Feedback: {qa['feedback']}[/dim]")


def _print_final_report(final_report: dict) -> bool:
    """
    Print the verdict, summary, and suggestions.

    Returns True when the verdict is not a failure, False otherwise.
    """
    verdict = final_report.get("verdict", "Unknown")
    verdict_color = "bold green" if "Pass" in verdict else "bold red"

    console.print(
        f"[{verdict_color}]=== FINAL VERDICT: {verdict} ===[/{verdict_color}]"
    )
    console.print(f"[bold]Summary:[/bold] {final_report.get('summary', '')}\n")

    suggestions = final_report.get("suggestions", [])
    if suggestions:
        console.print("[bold cyan]Top 3 Improvement Suggestions:[/bold cyan]")
        for i, sug in enumerate(suggestions, 1):
            console.print(
                f" {i}. [bold]{sug.get('title', 'Suggestion')}[/bold] "
                f"(Target: [yellow]{sug.get('target_file', 'unknown')}[/yellow])"
            )
            console.print(f" [dim]{sug.get('description', '')}[/dim]")

    return "Fail" not in verdict


@app.command()
def measure(path: str = typer.Argument(".", help="The path to evaluate")) -> None:
    """
    Measure the codebase against hard, soft, and governance constraints.

    Outputs a final report with scores and actionable improvement
    suggestions. Exits with status 1 when any fence fails or the final
    verdict contains "Fail".
    """
    console.print(
        f"[bold green]Starting harness measurement for path:[/bold green] {path}"
    )

    evaluator = Evaluator(path)

    # 1. Hard Evaluation Gate (First Fence)
    console.print("[bold blue]Running Hard Evaluation (ruff, mypy)...[/bold blue]")
    hard_results = evaluator.hard_evaluator.evaluate()

    if not hard_results["all_passed"]:
        console.print("[bold red]Hard Evaluation Failed! Exiting.[/bold red]")
        _print_hard_failures(hard_results)
        sys.exit(1)

    console.print("[bold green]Hard Evaluation Passed![/bold green]")
    _print_mi_scorecard(hard_results)

    # 2. Governance/QC Evaluation (Second Fence)
    console.print("\n[bold blue]Running Governance QC (Second Fence)...[/bold blue]")
    qc_results = evaluator.qc_evaluator.evaluate()

    if not qc_results["all_passed"]:
        console.print("[bold red]Governance QC Failed! Exiting.[/bold red]")
        console.print(
            "[red]The proposed changes violate governance constraints "
            "or lack sufficient evidence.[/red]"
        )
        for failure in qc_results["failures"]:
            console.print(f"[red]- {failure}[/red]")
        sys.exit(1)

    console.print(
        "[bold green]Governance QC Passed! (Change is admissible)[/bold green]"
    )

    # 3. Soft Evaluation/Readability (Third Fence)
    console.print(
        "[bold blue]Running Soft Evaluation "
        "(Readability & Understandability)...[/bold blue]"
    )
    soft_results = evaluator.soft_evaluator.evaluate()
    _print_soft_results(soft_results)

    console.print("\n[yellow]Evaluation completed. Generating report...[/yellow]\n")

    # Generate Final Report
    final_report = evaluator.soft_evaluator.generate_final_report(
        hard_results, soft_results
    )

    if final_report and not _print_final_report(final_report):
        sys.exit(1)


if __name__ == "__main__":
    app()
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core module for integrating all evaluations and producing the final report.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from python_harness.hard_evaluator import HardEvaluator
|
|
8
|
+
from python_harness.qc_evaluator import QCEvaluator
|
|
9
|
+
from python_harness.soft_evaluator import SoftEvaluator
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Evaluator:
    """
    Facade that wires together the hard, QC, and soft evaluators for a
    single target path and can run the full three-fence pipeline.
    """

    def __init__(self, target_path: str):
        self.target_path = target_path
        self.hard_evaluator = HardEvaluator(target_path)
        self.qc_evaluator = QCEvaluator(target_path)
        self.soft_evaluator = SoftEvaluator(target_path)

    def run(self) -> dict[str, Any]:
        """
        Execute every evaluator in order and bundle the results.

        Returns a mapping with the raw results of each fence, the
        synthesized final report (verdict plus suggestions), and an
        overall status flag.
        """
        report: dict[str, Any] = {
            "hard_evaluation": self.hard_evaluator.evaluate(),
            "qc_evaluation": self.qc_evaluator.evaluate(),
            "soft_evaluation": self.soft_evaluator.evaluate(),
        }
        # The soft evaluator synthesizes the final report from both the
        # hard and soft result sets.
        report["final_report"] = self.soft_evaluator.generate_final_report(
            report["hard_evaluation"], report["soft_evaluation"]
        )
        report["overall_status"] = "success"
        return report
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core module for integrating hard evaluation tools like ruff, mypy, and pytest.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import subprocess
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
|
|
12
|
+
console = Console()
|
|
13
|
+
|
|
14
|
+
class HardEvaluator:
    """
    Evaluator for collecting structural code quality metrics.

    Each ``run_*`` method shells out to an external tool (ruff, mypy,
    ty, radon, pytest) and normalizes the outcome into a dict carrying
    at least a ``status`` key: "success", "failed" (tool ran and found
    problems), or "error" (the tool could not be executed at all).
    """

    def __init__(self, target_path: str):
        # Resolve once so every subprocess receives an absolute path.
        self.target_path = Path(target_path).resolve()

    def _run_tool(self, cmd: list[str]) -> dict[str, Any]:
        """
        Run *cmd* as a subprocess and normalize its outcome.

        Returns {"status", "output", "return_code"} when the process ran,
        or {"status": "error", "error_message"} when it could not be
        launched (e.g. the tool is not installed).
        """
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                check=False
            )
        except Exception as e:
            return {"status": "error", "error_message": str(e)}
        return {
            "status": "success" if result.returncode == 0 else "failed",
            "output": result.stdout,
            "return_code": result.returncode,
        }

    def run_ruff(self) -> dict[str, Any]:
        """
        Run the Ruff linter and return its issues parsed from JSON output.
        """
        res = self._run_tool(
            ["ruff", "check", str(self.target_path), "--output-format", "json"]
        )
        if res["status"] == "error":
            return res
        try:
            issues = json.loads(res["output"]) if res["output"] else []
        except Exception as e:
            return {"status": "error", "error_message": str(e)}
        return {
            "status": res["status"],
            "issues": issues,
            "return_code": res["return_code"],
        }

    def run_mypy(self) -> dict[str, Any]:
        """
        Run the Mypy type checker and return results.
        """
        return self._run_tool(["mypy", str(self.target_path)])

    def run_ty(self) -> dict[str, Any]:
        """
        Run ty language server checks.
        """
        return self._run_tool(["ty", "check", str(self.target_path)])

    def run_radon_cc(self) -> dict[str, Any]:
        """
        Run Radon cyclomatic complexity check.

        Flags any function/method with CC > 15 as a failure.
        """
        try:
            result = subprocess.run(
                ["radon", "cc", "-j", "-a", str(self.target_path)],
                capture_output=True,
                text=True,
                check=False
            )

            issues: list[dict[str, Any]] = []
            if result.stdout:
                data = json.loads(result.stdout)
                for file_path, blocks in data.items():
                    # radon reports per-file parse errors as dicts; only
                    # lists contain measurable blocks.
                    if not isinstance(blocks, list):
                        continue
                    for block in blocks:
                        if block.get('complexity', 0) > 15:
                            issues.append({
                                "file": file_path,
                                "name": block.get('name'),
                                "type": block.get('type'),
                                "complexity": block.get('complexity')
                            })
            elif result.returncode != 0:
                # Bug fix: previously a radon crash that produced no JSON
                # output was reported as "success". Surface it instead.
                return {
                    "status": "error",
                    "error_message": result.stderr or "radon cc produced no output",
                }

            return {
                "status": "failed" if issues else "success",
                "issues": issues,
                "return_code": result.returncode,
                "output": result.stdout
            }
        except Exception as e:
            return {"status": "error", "error_message": str(e)}

    def run_radon_mi(self) -> dict[str, Any]:
        """
        Run Radon Maintainability Index (MI) check.

        This is a diagnostic metric, so it won't fail the build,
        but it contributes to the scorecard.
        """
        try:
            result = subprocess.run(
                ["radon", "mi", "-j", str(self.target_path)],
                capture_output=True,
                text=True,
                check=False
            )

            mi_scores: dict[str, float] = {}
            if result.stdout:
                data = json.loads(result.stdout)
                for file_path, info in data.items():
                    # radon reports unparseable files as error strings;
                    # skip them instead of crashing into the except below.
                    if isinstance(info, dict):
                        mi_scores[file_path] = info.get('mi', 100.0)

            return {
                "status": "success",
                "mi_scores": mi_scores,
                "return_code": result.returncode,
            }
        except Exception as e:
            return {"status": "error", "error_message": str(e)}

    def run_pytest(self) -> dict[str, Any]:
        """
        Run the Pytest test suite and return coverage results.

        Always executed as a subprocess: running pytest in-process from
        within pytest can cause issues or hang.
        """
        return self._run_tool(
            ["pytest", str(self.target_path), "--cov", "--cov-report=json"]
        )

    def evaluate(self) -> dict[str, Any]:
        """
        Execute all hard evaluation tools.

        Returns a dictionary with per-tool results and "all_passed",
        which is True only when ruff, mypy, ty, and radon-cc all succeed.
        The MI score is diagnostic only and never gates the result, and
        pytest is better handled as a separate stage.
        """
        results = {
            "ruff": self.run_ruff(),
            "mypy": self.run_mypy(),
            "ty": self.run_ty(),
            "radon_cc": self.run_radon_cc(),
            "radon_mi": self.run_radon_mi(),
        }
        gating = ("ruff", "mypy", "ty", "radon_cc")
        return {
            "all_passed": all(results[k].get("status") == "success" for k in gating),
            **results,
        }
|