python-harness 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- python_harness/__init__.py +5 -0
- python_harness/cli.py +207 -0
- python_harness/evaluator.py +42 -0
- python_harness/hard_evaluator.py +200 -0
- python_harness/qc_evaluator.py +89 -0
- python_harness/soft_evaluator.py +486 -0
- python_harness-0.0.1.dist-info/METADATA +88 -0
- python_harness-0.0.1.dist-info/RECORD +12 -0
- python_harness-0.0.1.dist-info/WHEEL +5 -0
- python_harness-0.0.1.dist-info/entry_points.txt +2 -0
- python_harness-0.0.1.dist-info/licenses/LICENSE +21 -0
- python_harness-0.0.1.dist-info/top_level.txt +1 -0
python_harness/cli.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Command-line interface for python-harness.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
import typer
|
|
9
|
+
from dotenv import load_dotenv
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
|
|
12
|
+
from python_harness.evaluator import Evaluator
|
|
13
|
+
|
|
14
|
+
# Try to find .env file explicitly before anything else executes
|
|
15
|
+
env_path = os.path.join(os.getcwd(), '.env')
|
|
16
|
+
if os.path.exists(env_path):
|
|
17
|
+
load_dotenv(dotenv_path=env_path)
|
|
18
|
+
else:
|
|
19
|
+
load_dotenv() # Fallback to default search
|
|
20
|
+
|
|
21
|
+
app = typer.Typer(help="Agentic harness tool for universal Python codebase evaluation.")
|
|
22
|
+
console = Console()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@app.command()
def refine(
    path: str = typer.Argument(".", help="The path to evaluate and evolve"),
    steps: int = typer.Option(1, help="Number of evolution steps to perform"),
    max_retries: int = typer.Option(3, help="Maximum retries per variant if tests fail")
) -> None:
    """
    Refine the codebase through an agentic Edit-Test-Improve loop.

    Generates variants based on suggestions, tests them, and picks the best.
    """
    console.print(
        f"[bold magenta]Starting evolution loop for path:[/bold magenta] {path} "
        f"[dim](steps={steps}, max_retries={max_retries})[/dim]"
    )

    # Step 1: a baseline evaluation yields the improvement suggestions that
    # seed the evolution branches.
    runner = Evaluator(path)
    console.print("[cyan]Running baseline evaluation...[/cyan]")
    hard_metrics = runner.hard_evaluator.evaluate()
    soft_metrics = runner.soft_evaluator.evaluate()
    baseline = runner.soft_evaluator.generate_final_report(
        hard_metrics, soft_metrics
    )

    pending = baseline.get("suggestions", [])
    if not pending:
        console.print("[yellow]No suggestions found to evolve. Exiting.[/yellow]")
        return

    console.print(
        f"[green]Found {len(pending)} suggestions. "
        f"Starting evolution branches...[/green]"
    )

    # TODO: Implement the Git branching and Agent modification logic here.
    # Planned loop:
    #   for step in range(steps):
    #       for suggestion in pending:
    #           checkout new branch variant-X
    #           for retry in range(max_retries):
    #               ask LLM to apply suggestion to code
    #               run pytest
    #               if pytest passes: run `harness .` for a new score; break
    #               else: feed error back to LLM for retry
    #       compare all variants and checkout the best one

    console.print(
        "[yellow]Evolution engine skeleton ready. "
        "Actual git mutation logic pending.[/yellow]"
    )
|
78
|
+
@app.command()
def measure(path: str = typer.Argument(".", help="The path to evaluate")) -> None:
    """
    Measure the codebase against hard, soft, and governance constraints.

    Outputs a final report with scores and actionable improvement suggestions.
    Exits with status 1 when any fence fails or the final verdict is a Fail.
    """
    console.print(
        f"[bold green]Starting harness measurement for path:[/bold green] {path}"
    )

    evaluator = Evaluator(path)

    # 1. Hard Evaluation Gate (First Fence)
    console.print("[bold blue]Running Hard Evaluation (ruff, mypy)...[/bold blue]")
    hard_results = evaluator.hard_evaluator.evaluate()

    if not hard_results["all_passed"]:
        console.print("[bold red]Hard Evaluation Failed! Exiting.[/bold red]")
        _print_hard_failures(hard_results)
        sys.exit(1)

    console.print("[bold green]Hard Evaluation Passed![/bold green]")
    _print_maintainability(hard_results)

    # 2. Governance/QC Evaluation (Second Fence)
    console.print("\n[bold blue]Running Governance QC (Second Fence)...[/bold blue]")
    qc_results = evaluator.qc_evaluator.evaluate()

    if not qc_results["all_passed"]:
        console.print("[bold red]Governance QC Failed! Exiting.[/bold red]")
        console.print(
            "[red]The proposed changes violate governance constraints "
            "or lack sufficient evidence.[/red]"
        )
        for failure in qc_results["failures"]:
            console.print(f"[red]- {failure}[/red]")
        sys.exit(1)

    console.print(
        "[bold green]Governance QC Passed! (Change is admissible)[/bold green]"
    )

    # 3. Soft Evaluation/Readability (Third Fence)
    console.print(
        "[bold blue]Running Soft Evaluation "
        "(Readability & Understandability)...[/bold blue]"
    )
    soft_results = evaluator.soft_evaluator.evaluate()
    _print_soft_results(soft_results)

    console.print("\n[yellow]Evaluation completed. Generating report...[/yellow]\n")

    # Generate Final Report and translate a failing verdict into exit code 1.
    final_report = evaluator.soft_evaluator.generate_final_report(
        hard_results, soft_results
    )
    if final_report and _print_final_report(final_report):
        sys.exit(1)


def _print_hard_failures(hard_results: dict) -> None:
    """Print per-tool details for a failed hard-evaluation run."""
    if hard_results["ruff"]["status"] != "success":
        console.print("[red]Ruff issues found.[/red]")
    if hard_results["mypy"]["status"] != "success":
        output = hard_results["mypy"].get("output", "")
        console.print(f"[red]Mypy issues found:[/red]\n{output}")
    if hard_results["ty"]["status"] != "success":
        output = hard_results["ty"].get("output", "")
        console.print(f"[red]Ty issues found:[/red]\n{output}")
    if hard_results["radon_cc"]["status"] != "success":
        issues = hard_results["radon_cc"].get("issues", [])
        console.print(
            f"[red]Cyclomatic Complexity too high "
            f"({len(issues)} functions > 15):[/red]"
        )
        for issue in issues:
            console.print(
                f" - {issue['file']}: {issue['type']} '{issue['name']}' "
                f"has CC {issue['complexity']}"
            )


def _print_maintainability(hard_results: dict) -> None:
    """Print the average Maintainability Index scorecard (diagnostic only)."""
    mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
    if not mi_scores:
        return
    avg_mi = sum(mi_scores.values()) / len(mi_scores)
    color = "green" if avg_mi > 50 else "yellow" if avg_mi > 20 else "red"
    console.print(
        f"[{color}]Average Maintainability Index: {avg_mi:.1f}/100[/{color}]"
    )


def _print_soft_results(soft_results: dict) -> None:
    """Print the soft-evaluation summary, score, and blind-QA samples."""
    pkg_summary = soft_results["package_summary"]
    console.print(
        f"[green]Analyzed {pkg_summary['total_files']} files with a total of "
        f"{pkg_summary['total_tokens']} tokens.[/green]"
    )
    console.print(
        f"[magenta]Agent's Understanding of the Package:[/magenta]\n"
        f"{pkg_summary['package_understanding']}"
    )
    console.print(
        f"\n[cyan]Overall Understandability Score:[/cyan] "
        f"{soft_results['understandability_score']:.1f}/100"
    )
    qa_results = soft_results.get("qa_results", {}).get("sampled_entities", [])
    if qa_results:
        console.print("\n[bold yellow]Blind QA Sampling Results:[/bold yellow]")
        for qa in qa_results:
            color = "green" if qa['score'] >= 80 else "red"
            console.print(f" - [{color}]{qa['entity']}: Score {qa['score']}[/{color}]")
            console.print(f" [dim]Feedback: {qa['feedback']}[/dim]")


def _print_final_report(final_report: dict) -> bool:
    """Print verdict, summary, and suggestions; return True when the verdict fails."""
    verdict = final_report.get("verdict", "Unknown")
    verdict_color = "bold green" if "Pass" in verdict else "bold red"

    console.print(
        f"[{verdict_color}]=== FINAL VERDICT: {verdict} ===[/{verdict_color}]"
    )
    console.print(f"[bold]Summary:[/bold] {final_report.get('summary', '')}\n")

    suggestions = final_report.get("suggestions", [])
    if suggestions:
        console.print("[bold cyan]Top 3 Improvement Suggestions:[/bold cyan]")
        for i, sug in enumerate(suggestions, 1):
            console.print(
                f" {i}. [bold]{sug.get('title', 'Suggestion')}[/bold] "
                f"(Target: [yellow]{sug.get('target_file', 'unknown')}[/yellow])"
            )
            console.print(f" [dim]{sug.get('description', '')}[/dim]")

    return "Fail" in verdict
|
204
|
+
|
|
205
|
+
|
|
206
|
+
if __name__ == "__main__":
    # Allow direct script execution in addition to the entry-point console script.
    app()
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core module for integrating all evaluations and producing the final report.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from python_harness.hard_evaluator import HardEvaluator
|
|
8
|
+
from python_harness.qc_evaluator import QCEvaluator
|
|
9
|
+
from python_harness.soft_evaluator import SoftEvaluator
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Evaluator:
    """
    Main evaluator coordinating hard, QC, and soft assessments.

    Owns one sub-evaluator per fence; `run` executes all of them and folds
    the results into a single report dictionary.
    """

    def __init__(self, target_path: str):
        self.target_path = target_path
        # One sub-evaluator per fence, all pointed at the same target.
        self.hard_evaluator = HardEvaluator(target_path)
        self.qc_evaluator = QCEvaluator(target_path)
        self.soft_evaluator = SoftEvaluator(target_path)

    def run(self) -> dict[str, Any]:
        """
        Run the complete evaluation process.

        Returns a dict with per-fence results, the synthesized final report
        (including its three improvement suggestions), and an overall status.
        """
        hard = self.hard_evaluator.evaluate()
        qc = self.qc_evaluator.evaluate()
        soft = self.soft_evaluator.evaluate()

        report = self.soft_evaluator.generate_final_report(hard, soft)

        return {
            "hard_evaluation": hard,
            "qc_evaluation": qc,
            "soft_evaluation": soft,
            "final_report": report,
            "overall_status": "success",
        }
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core module for integrating hard evaluation tools like ruff, mypy, and pytest.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import subprocess
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
|
|
12
|
+
console = Console()
|
|
13
|
+
|
|
14
|
+
class HardEvaluator:
    """
    Evaluator for collecting structural code quality metrics.

    Each ``run_*`` method shells out to one external tool (ruff, mypy, ty,
    radon, pytest) and normalizes the outcome into a dict carrying at least
    a ``status`` key: "success", "failed", or "error" (tool missing/crashed).
    """

    def __init__(self, target_path: str):
        # Resolve once so every tool invocation sees the same absolute path.
        self.target_path = Path(target_path).resolve()

    @staticmethod
    def _run_tool(args: list[str]) -> "subprocess.CompletedProcess[str]":
        """Run an external tool, capturing text output; never raises on a non-zero exit."""
        return subprocess.run(args, capture_output=True, text=True, check=False)

    def _run_output_tool(self, args: list[str]) -> dict[str, Any]:
        """
        Run a tool whose stdout is reported verbatim (mypy, ty, pytest).

        Returns {"status", "output", "return_code"} on completion, or
        {"status": "error", "error_message"} if the tool could not run.
        """
        try:
            result = self._run_tool(args)
            return {
                "status": "success" if result.returncode == 0 else "failed",
                "output": result.stdout,
                "return_code": result.returncode,
            }
        except Exception as e:
            return {"status": "error", "error_message": str(e)}

    def run_ruff(self) -> dict[str, Any]:
        """Run Ruff linter and return its JSON-parsed issue list."""
        try:
            result = self._run_tool(
                ["ruff", "check", str(self.target_path), "--output-format", "json"]
            )
            issues = json.loads(result.stdout) if result.stdout else []
            return {
                "status": "success" if result.returncode == 0 else "failed",
                "issues": issues,
                "return_code": result.returncode,
            }
        except Exception as e:
            return {"status": "error", "error_message": str(e)}

    def run_mypy(self) -> dict[str, Any]:
        """Run Mypy type checker and return results."""
        return self._run_output_tool(["mypy", str(self.target_path)])

    def run_ty(self) -> dict[str, Any]:
        """Run ty language server checks."""
        return self._run_output_tool(["ty", "check", str(self.target_path)])

    def run_radon_cc(self) -> dict[str, Any]:
        """
        Run Radon cyclomatic complexity check.

        Flag any function/method with CC > 15 as a failure; the status is
        "failed" as soon as one such block exists.
        """
        try:
            result = self._run_tool(["radon", "cc", "-j", "-a", str(self.target_path)])

            issues: list[dict[str, Any]] = []
            if result.stdout:
                data = json.loads(result.stdout)
                for file_path, blocks in data.items():
                    # Radon emits a non-list error object for unparseable files.
                    if not isinstance(blocks, list):
                        continue
                    for block in blocks:
                        if block.get('complexity', 0) > 15:
                            issues.append({
                                "file": file_path,
                                "name": block.get('name'),
                                "type": block.get('type'),
                                "complexity": block.get('complexity')
                            })

            return {
                "status": "failed" if issues else "success",
                "issues": issues,
                "return_code": result.returncode,
                "output": result.stdout
            }
        except Exception as e:
            return {"status": "error", "error_message": str(e)}

    def run_radon_mi(self) -> dict[str, Any]:
        """
        Run Radon Maintainability Index (MI) check.

        This is a diagnostic metric, so it won't fail the build,
        but it contributes to the scorecard.
        """
        try:
            result = self._run_tool(["radon", "mi", "-j", str(self.target_path)])

            mi_scores: dict[str, float] = {}
            if result.stdout:
                data = json.loads(result.stdout)
                for file_path, info in data.items():
                    # Missing 'mi' (e.g. radon error entries) defaults to a perfect score.
                    mi_scores[file_path] = info.get('mi', 100.0)

            return {
                "status": "success",
                "mi_scores": mi_scores,
                "return_code": result.returncode,
            }
        except Exception as e:
            return {"status": "error", "error_message": str(e)}

    def run_pytest(self) -> dict[str, Any]:
        """Run Pytest test suite and return coverage results."""
        # Run as a subprocess: invoking pytest in-process from within pytest
        # can hang or interfere with the host session.
        return self._run_output_tool(
            ["pytest", str(self.target_path), "--cov", "--cov-report=json"]
        )

    def evaluate(self) -> dict[str, Any]:
        """
        Execute all hard evaluation tools.

        Returns per-tool results plus ``all_passed``, which requires ruff,
        mypy, ty, and radon CC to all report "success" (MI is diagnostic;
        pytest is better handled as a separate stage).
        """
        results = {
            "ruff": self.run_ruff(),
            "mypy": self.run_mypy(),
            "ty": self.run_ty(),
            "radon_cc": self.run_radon_cc(),
            "radon_mi": self.run_radon_mi(),
        }
        gating = ("ruff", "mypy", "ty", "radon_cc")
        all_passed = all(results[k].get("status") == "success" for k in gating)
        return {"all_passed": all_passed, **results}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core module for evaluating self-improvement Governance and Quality Control (QC).
|
|
3
|
+
Based on a simplified version of the LUCA/Sympan profile.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class QCEvaluator:
    """
    Evaluator for checking Governance and QC constraints.

    Every check returns {"status", "failures"}; `evaluate` aggregates the
    failure lists and passes only when none were reported.
    """

    def __init__(self, target_path: str):
        self.target_path = Path(target_path).resolve()

    def check_hard_invariants(self) -> dict[str, Any]:
        """
        Verify that fundamental identity invariants are preserved.

        - Ensure no bypassing of core evaluation logic.
        - Check for explicit architectural violations.
        """
        # Placeholder: a real implementation would inspect git diffs or file
        # modification times to detect direct edits to the evaluation core
        # made without a proper class D proposal. For now the invariant is
        # assumed to hold.
        return {"status": "success", "failures": []}

    def check_obligations(self) -> dict[str, Any]:
        """
        Verify that necessary evidence and obligations are met for the changes made.

        Every change MUST provide an Improvement Case (evidence).
        """
        # Placeholder: a real implementation would read the proposal manifest's
        # required obligations and confirm matching benchmark/holdout reports
        # exist in the artifacts.
        return {"status": "success", "failures": []}

    def check_self_touch(self) -> dict[str, Any]:
        """
        Verify if the agent modified the evaluation or governance criteria (Level 1/2).

        If it did, flag it for external certification.
        """
        # Placeholder: modifications to QC rules or evaluation logic MUST
        # require external certification (Human or higher-level Judge).
        return {"status": "success", "failures": []}

    def evaluate(self) -> dict[str, Any]:
        """
        Run all QC checks and aggregate their failures.
        """
        results = {
            "invariants": self.check_hard_invariants(),
            "obligations": self.check_obligations(),
            "self_touch": self.check_self_touch(),
        }

        failures: list[str] = []
        for res in results.values():
            failures.extend(res.get("failures", []))

        return {
            "all_passed": not failures,
            "failures": failures,
            **results,
        }
|
|
@@ -0,0 +1,486 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core module for agentic soft evaluation and code understanding.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import ast
|
|
6
|
+
import contextlib
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
import random
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
import tiktoken
|
|
14
|
+
from openai import OpenAI
|
|
15
|
+
from pydantic import BaseModel
|
|
16
|
+
from rich.console import Console
|
|
17
|
+
|
|
18
|
+
console = Console()
|
|
19
|
+
|
|
20
|
+
class FileSummary(BaseModel):
    """Structured LLM response describing a single analyzed file."""

    # One-paragraph description of the file's purpose.
    summary: str
    # Names of the file's notable classes/functions/globals.
    key_entities: list[str]
    # Estimated cognitive complexity on the model's 1-10 scale.
    complexity_score: int
|
|
24
|
+
|
|
25
|
+
class SoftEvaluator:
|
|
26
|
+
"""
|
|
27
|
+
Evaluator for agentic code understanding and reasoning.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
def __init__(self, target_path: str):
|
|
31
|
+
self.target_path = Path(target_path).resolve()
|
|
32
|
+
# Initialize token counter (using cl100k_base for gpt-4/claude-3)
|
|
33
|
+
self.encoding: Any = None
|
|
34
|
+
with contextlib.suppress(Exception):
|
|
35
|
+
self.encoding = tiktoken.get_encoding("cl100k_base")
|
|
36
|
+
|
|
37
|
+
# Initialize OpenAI client only if API key is present
|
|
38
|
+
self.client = None
|
|
39
|
+
api_key = os.environ.get("LLM_API_KEY")
|
|
40
|
+
base_url = os.environ.get("LLM_BASE_URL", "https://api.deepseek.com/v1")
|
|
41
|
+
self.model_name = os.environ.get("LLM_MODEL_NAME", "deepseek-reasoner")
|
|
42
|
+
self.mini_model_name = os.environ.get("LLM_MINI_MODEL_NAME", "deepseek-chat")
|
|
43
|
+
|
|
44
|
+
if api_key:
|
|
45
|
+
self.client = OpenAI(api_key=api_key, base_url=base_url)
|
|
46
|
+
else:
|
|
47
|
+
console.print(
|
|
48
|
+
"[yellow]Warning: LLM_API_KEY not set. "
|
|
49
|
+
"Agent will run in mock mode.[/yellow]"
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
# Store extracted AST entities for sampling
|
|
53
|
+
self.extracted_entities: list[dict[str, Any]] = []
|
|
54
|
+
|
|
55
|
+
def _get_python_files(self) -> list[Path]:
|
|
56
|
+
"""
|
|
57
|
+
Recursively find all Python files in the target directory,
|
|
58
|
+
excluding hidden dirs and .venv.
|
|
59
|
+
"""
|
|
60
|
+
python_files = []
|
|
61
|
+
for root, dirs, files in os.walk(self.target_path):
|
|
62
|
+
# Exclude hidden directories and virtual environments
|
|
63
|
+
dirs[:] = [
|
|
64
|
+
d
|
|
65
|
+
for d in dirs
|
|
66
|
+
if not d.startswith(".") and d not in (
|
|
67
|
+
"__pycache__",
|
|
68
|
+
"venv",
|
|
69
|
+
"env",
|
|
70
|
+
"vendors",
|
|
71
|
+
)
|
|
72
|
+
]
|
|
73
|
+
for file in files:
|
|
74
|
+
if file.endswith(".py"):
|
|
75
|
+
python_files.append(Path(root) / file)
|
|
76
|
+
return python_files
|
|
77
|
+
|
|
78
|
+
def calculate_token_complexity(self, file_path: Path) -> int:
|
|
79
|
+
"""
|
|
80
|
+
Calculate the token count for a given file as a proxy
|
|
81
|
+
for cognitive complexity.
|
|
82
|
+
"""
|
|
83
|
+
if not self.encoding:
|
|
84
|
+
return 0
|
|
85
|
+
|
|
86
|
+
try:
|
|
87
|
+
content = file_path.read_text(encoding="utf-8")
|
|
88
|
+
return len(self.encoding.encode(content))
|
|
89
|
+
except Exception as e:
|
|
90
|
+
console.print(
|
|
91
|
+
f"[yellow]Warning: Could not read {file_path} for token counting: "
|
|
92
|
+
f"{e}[/yellow]"
|
|
93
|
+
)
|
|
94
|
+
return 0
|
|
95
|
+
|
|
96
|
+
def _extract_ast_entities(self, file_path: Path, content: str) -> None:
|
|
97
|
+
"""
|
|
98
|
+
Parse the AST of a file to extract:
|
|
99
|
+
1. Classes and functions for later QA sampling.
|
|
100
|
+
2. Fan-out (number of imported external modules) to measure coupling.
|
|
101
|
+
"""
|
|
102
|
+
try:
|
|
103
|
+
tree = ast.parse(content)
|
|
104
|
+
|
|
105
|
+
# Calculate Fan-out (number of unique imported top-level modules)
|
|
106
|
+
imported_modules = set()
|
|
107
|
+
for node in ast.walk(tree):
|
|
108
|
+
if isinstance(node, ast.Import):
|
|
109
|
+
for alias in node.names:
|
|
110
|
+
imported_modules.add(alias.name.split('.')[0])
|
|
111
|
+
elif isinstance(node, ast.ImportFrom) and node.module:
|
|
112
|
+
imported_modules.add(node.module.split('.')[0])
|
|
113
|
+
|
|
114
|
+
fan_out = len(imported_modules)
|
|
115
|
+
|
|
116
|
+
# Extract classes and functions
|
|
117
|
+
for node in ast.walk(tree):
|
|
118
|
+
if isinstance(
|
|
119
|
+
node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)
|
|
120
|
+
):
|
|
121
|
+
try:
|
|
122
|
+
source_segment = ast.get_source_segment(content, node)
|
|
123
|
+
if source_segment:
|
|
124
|
+
if isinstance(node, ast.ClassDef):
|
|
125
|
+
entity_type = "Class"
|
|
126
|
+
else:
|
|
127
|
+
entity_type = "Function"
|
|
128
|
+
self.extracted_entities.append(
|
|
129
|
+
{
|
|
130
|
+
"file": file_path.name,
|
|
131
|
+
"type": entity_type,
|
|
132
|
+
"name": node.name,
|
|
133
|
+
"code": source_segment,
|
|
134
|
+
"fan_out": fan_out, # Context
|
|
135
|
+
}
|
|
136
|
+
)
|
|
137
|
+
except Exception:
|
|
138
|
+
pass
|
|
139
|
+
except SyntaxError:
|
|
140
|
+
pass # Skip files with syntax errors
|
|
141
|
+
|
|
142
|
+
def summarize_file(self, file_path: Path) -> dict[str, Any]:
|
|
143
|
+
"""
|
|
144
|
+
Use LLM agent to summarize a single file's logic and structure.
|
|
145
|
+
"""
|
|
146
|
+
tokens = self.calculate_token_complexity(file_path)
|
|
147
|
+
|
|
148
|
+
try:
|
|
149
|
+
content = file_path.read_text(encoding="utf-8")
|
|
150
|
+
self._extract_ast_entities(file_path, content)
|
|
151
|
+
except Exception:
|
|
152
|
+
content = ""
|
|
153
|
+
|
|
154
|
+
simulated_summary = f"File {file_path.name} contains {tokens} tokens."
|
|
155
|
+
key_entities: list[str] = []
|
|
156
|
+
|
|
157
|
+
# Only call LLM if file is not empty and not too large (e.g. > 100k tokens)
|
|
158
|
+
if self.client and content and 0 < tokens < 100000:
|
|
159
|
+
try:
|
|
160
|
+
sys_prompt = (
|
|
161
|
+
"You are a senior Python architect. Analyze the provided Python "
|
|
162
|
+
"file and provide a concise summary of its purpose, a list of "
|
|
163
|
+
"its key entities (classes/functions/globals), and an estimated "
|
|
164
|
+
"cognitive complexity score (1-10).\n"
|
|
165
|
+
"Output MUST be in valid JSON matching this schema: "
|
|
166
|
+
'{"summary": "str", "key_entities": ["str"], "complexity_score": 1}'
|
|
167
|
+
)
|
|
168
|
+
completion = self.client.chat.completions.create(
|
|
169
|
+
model=self.mini_model_name,
|
|
170
|
+
messages=[
|
|
171
|
+
{
|
|
172
|
+
"role": "system",
|
|
173
|
+
"content": sys_prompt,
|
|
174
|
+
},
|
|
175
|
+
{
|
|
176
|
+
"role": "user",
|
|
177
|
+
"content": (
|
|
178
|
+
f"File name: {file_path.name}\n\nContent:\n"
|
|
179
|
+
f"```python\n{content}\n```"
|
|
180
|
+
),
|
|
181
|
+
},
|
|
182
|
+
],
|
|
183
|
+
response_format={"type": "json_object"},
|
|
184
|
+
)
|
|
185
|
+
|
|
186
|
+
content_str = completion.choices[0].message.content
|
|
187
|
+
if content_str:
|
|
188
|
+
result = json.loads(content_str)
|
|
189
|
+
simulated_summary = result.get("summary", simulated_summary)
|
|
190
|
+
key_entities = result.get("key_entities", key_entities)
|
|
191
|
+
except Exception as e:
|
|
192
|
+
console.print(
|
|
193
|
+
f"[yellow] Agent failed to read {file_path.name}: {e}[/yellow]"
|
|
194
|
+
)
|
|
195
|
+
|
|
196
|
+
try:
|
|
197
|
+
rel_path = str(file_path.relative_to(self.target_path))
|
|
198
|
+
except ValueError:
|
|
199
|
+
rel_path = str(file_path)
|
|
200
|
+
|
|
201
|
+
return {
|
|
202
|
+
"file": rel_path,
|
|
203
|
+
"tokens": tokens,
|
|
204
|
+
"summary": simulated_summary,
|
|
205
|
+
"key_entities": key_entities,
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
def summarize_package(self) -> dict[str, Any]:
|
|
209
|
+
"""
|
|
210
|
+
Aggregate file summaries into a package-level understanding.
|
|
211
|
+
"""
|
|
212
|
+
files = self._get_python_files()
|
|
213
|
+
file_summaries = []
|
|
214
|
+
total_tokens = 0
|
|
215
|
+
|
|
216
|
+
console.print(
|
|
217
|
+
f"[cyan]Agent is analyzing {len(files)} Python files "
|
|
218
|
+
f"for cognitive load and architecture...[/cyan]"
|
|
219
|
+
)
|
|
220
|
+
|
|
221
|
+
for file in files:
|
|
222
|
+
summary_data = self.summarize_file(file)
|
|
223
|
+
file_summaries.append(summary_data)
|
|
224
|
+
total_tokens += summary_data["tokens"]
|
|
225
|
+
|
|
226
|
+
# Synthesize package architecture
|
|
227
|
+
package_understanding = (
|
|
228
|
+
f"The package contains {len(files)} files with a total cognitive load "
|
|
229
|
+
f"of {total_tokens} tokens."
|
|
230
|
+
)
|
|
231
|
+
|
|
232
|
+
if self.client and file_summaries:
|
|
233
|
+
try:
|
|
234
|
+
console.print(
|
|
235
|
+
"[cyan]Agent is synthesizing global package architecture...[/cyan]"
|
|
236
|
+
)
|
|
237
|
+
manifest_lines = [
|
|
238
|
+
f"- {s['file']}: {s['summary']} "
|
|
239
|
+
f"(Entities: {', '.join(s['key_entities'])})"
|
|
240
|
+
for s in file_summaries
|
|
241
|
+
]
|
|
242
|
+
manifest = "\n".join(manifest_lines)
|
|
243
|
+
|
|
244
|
+
sys_prompt = (
|
|
245
|
+
"You are a senior software architect. Based on the following "
|
|
246
|
+
"summaries of individual files in a Python package, write a "
|
|
247
|
+
"coherent, high-level explanation of how this entire package "
|
|
248
|
+
"works and what its primary responsibilities are. Be concise "
|
|
249
|
+
"but comprehensive."
|
|
250
|
+
)
|
|
251
|
+
|
|
252
|
+
completion = self.client.chat.completions.create(
|
|
253
|
+
model=self.model_name,
|
|
254
|
+
messages=[
|
|
255
|
+
{
|
|
256
|
+
"role": "system",
|
|
257
|
+
"content": sys_prompt,
|
|
258
|
+
},
|
|
259
|
+
{
|
|
260
|
+
"role": "user",
|
|
261
|
+
"content": f"Package files and summaries:\n{manifest}",
|
|
262
|
+
},
|
|
263
|
+
],
|
|
264
|
+
)
|
|
265
|
+
|
|
266
|
+
package_understanding = (
|
|
267
|
+
completion.choices[0].message.content or package_understanding
|
|
268
|
+
)
|
|
269
|
+
except Exception as e:
|
|
270
|
+
console.print(
|
|
271
|
+
f"[yellow]Agent failed to synthesize package: {e}[/yellow]"
|
|
272
|
+
)
|
|
273
|
+
|
|
274
|
+
return {
|
|
275
|
+
"total_files": len(files),
|
|
276
|
+
"total_tokens": total_tokens,
|
|
277
|
+
"file_level_summaries": file_summaries,
|
|
278
|
+
"package_understanding": package_understanding,
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
def run_sampling_qa(self) -> dict[str, Any]:
    """
    Randomly sample modules/variables and ask the Agent questions
    to measure understandability.

    Returns:
        dict with keys:
        - ``qa_score``: average readability score (0-100) across the sample.
        - ``sampled_entities``: per-entity dicts with ``entity``, ``score``,
          and ``feedback``.
        - ``note``: short status string.
    """
    # No entities extracted: nothing to judge, so report a perfect score
    # rather than failing the evaluation.
    if not self.extracted_entities:
        return {
            "qa_score": 100.0,
            "sampled_entities": [],
            "note": "No entities found for sampling.",
        }

    # Randomly sample up to 3 entities
    sample_size = min(3, len(self.extracted_entities))
    sampled = random.sample(self.extracted_entities, sample_size)

    console.print(
        f"\n[cyan]Agent is running Blind QA on {sample_size} "
        f"sampled entities...[/cyan]"
    )

    qa_results = []
    total_score = 0.0

    for entity in sampled:
        # Each extracted entity carries its name, kind (class/function),
        # source code, and the module's fan-out metric.
        entity_name = entity["name"]
        entity_type = entity["type"]
        entity_code = entity["code"]
        fan_out = entity.get("fan_out", 0)

        if not self.client:
            # Mock evaluation: no LLM configured, assume perfect readability.
            score = 100.0
            feedback = "Mock evaluation: Code is perfectly readable."
        else:
            try:
                # System prompt pins the judge persona and forces a strict
                # JSON response schema so the reply can be parsed below.
                sys_prompt = (
                    "You are an expert Code Reviewer and Software Architect. "
                    "You will be given a snippet of Python code (a class or "
                    "function) along with its module's Fan-out metric (number "
                    "of external dependencies). Your task is to evaluate its "
                    "readability and structural cohesion.\n"
                    "Output MUST be in valid JSON matching this schema: "
                    '{"explanation": "str", "readability_score": 1, '
                    '"feedback": "str"}\n'
                    "- `explanation`: Briefly explain what this code does.\n"
                    "- `readability_score`: A score from 0 to 100.\n"
                    "- `feedback`: What makes it easy/hard to understand? "
                    "Does a high Fan-out indicate bad cohesion here?"
                )

                user_content = (
                    f"Module Fan-out (Dependencies): {fan_out}\n\n"
                    f"Code Snippet:\n```python\n{entity_code}\n```"
                )

                # Use the cheaper "mini" model for per-entity scoring and
                # request a JSON-object response to match the schema above.
                completion = self.client.chat.completions.create(
                    model=self.mini_model_name,
                    messages=[
                        {"role": "system", "content": sys_prompt},
                        {"role": "user", "content": user_content},
                    ],
                    response_format={"type": "json_object"},
                )

                content_str = completion.choices[0].message.content
                if content_str:
                    result = json.loads(content_str)
                    # Missing score defaults to 100 (benefit of the doubt).
                    score = float(result.get("readability_score", 100))
                    feedback = result.get("feedback", "")
                else:
                    # Empty reply: penalize mildly rather than zeroing out.
                    score = 80.0
                    feedback = "Failed to parse Agent response."
            except Exception as e:
                # API/parse failure counts as fully unreadable for this entity.
                score = 0.0
                feedback = f"Error during Agent evaluation: {e}"

        total_score += score
        qa_results.append(
            {
                "entity": f"{entity_type} {entity_name} (from {entity['file']})",
                "score": score,
                "feedback": feedback,
            }
        )

    # sample_size >= 1 here, but keep the guard for safety.
    final_average_score = total_score / sample_size if sample_size > 0 else 100.0

    return {
        "qa_score": final_average_score,
        "sampled_entities": qa_results,
        "note": "Sampling QA completed.",
    }
|
|
374
|
+
|
|
375
|
+
def generate_final_report(
    self, hard_results: dict[str, Any], soft_results: dict[str, Any]
) -> dict[str, Any]:
    """
    Synthesize all evaluation results into a final verdict and exactly
    3 actionable suggestions.

    Args:
        hard_results: Output of the hard (static-analysis) evaluator;
            ``radon_cc``/``radon_mi`` sub-dicts are read if present.
        soft_results: Output of the soft evaluator; reads
            ``understandability_score`` and ``qa_results``.

    Returns:
        dict with ``verdict``, ``summary``, and ``suggestions`` keys.
        On LLM failure the verdict is ``"Error"`` with empty suggestions.
    """
    # Without an LLM client, return a fixed mock report so the pipeline
    # still produces a well-formed result.
    if not self.client:
        return {
            "verdict": "Pass (Mock)",
            "summary": "Mock evaluation completed without LLM.",
            "suggestions": [
                {
                    "title": "Mock Suggestion 1",
                    "description": "Add more docstrings.",
                    "target_file": "all"
                },
                {
                    "title": "Mock Suggestion 2",
                    "description": "Refactor large functions.",
                    "target_file": "all"
                },
                {
                    "title": "Mock Suggestion 3",
                    "description": "Improve test coverage.",
                    "target_file": "tests/"
                }
            ]
        }

    try:
        # Extract key metrics for the prompt
        cc_issues = hard_results.get("radon_cc", {}).get("issues", [])
        mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
        # Empty MI map defaults to a perfect 100 rather than dividing by zero.
        avg_mi = sum(mi_scores.values()) / len(mi_scores) if mi_scores else 100.0

        qa_score = soft_results.get("understandability_score", 100.0)
        qa_entities = soft_results.get("qa_results", {}).get("sampled_entities", [])

        # System prompt embeds the output JSON schema and the pass/fail rule
        # so the model's verdict is deterministic given the metrics.
        sys_prompt = (
            "You are an elite Python Codebase Evaluator. You have just analyzed "
            "a repository. Your task is to provide a final judgment and EXACTLY "
            "3 concrete, actionable improvement suggestions. These suggestions "
            "MUST NOT change the external functionality (they are refactoring/"
            "quality improvements).\n\n"
            "Output MUST be in valid JSON matching this schema:\n"
            "{\n"
            ' "verdict": "Pass" or "Fail",\n'
            ' "summary": "One paragraph summary of codebase health",\n'
            ' "suggestions": [\n'
            ' {"title": "str", "description": "str", "target_file": "str"}\n'
            " ]\n"
            "}\n"
            "Rule for Verdict: Pass if Average Maintainability > 50 and "
            "QA Score > 75 and no Critical CC issues (>15). Otherwise Fail."
        )

        user_content = (
            f"Metrics:\n"
            f"- Average Maintainability Index (MI): {avg_mi:.1f}/100\n"
            f"- Number of functions with Cyclomatic Complexity > 15: "
            f"{len(cc_issues)}\n"
            f"- Agent QA Readability Score: {qa_score:.1f}/100\n\n"
            f"QA Feedback Snippets:\n"
            + "\n".join(
                [f" * {q['entity']}: {q['feedback']}" for q in qa_entities]
            )
        )

        # Use the full-size model for the final synthesis; force JSON output.
        completion = self.client.chat.completions.create(
            model=self.model_name,
            messages=[
                {"role": "system", "content": sys_prompt},
                {"role": "user", "content": user_content},
            ],
            response_format={"type": "json_object"},
        )

        content_str = completion.choices[0].message.content
        if content_str:
            parsed_json = json.loads(content_str)
            if isinstance(parsed_json, dict):
                return parsed_json
            else:
                # Valid JSON but wrong shape (e.g. a list) — treat as failure.
                raise ValueError("JSON response is not a dictionary.")
        else:
            raise ValueError("Empty response from Agent.")
    except Exception as e:
        # Any failure (API, parse, schema) degrades to an Error report
        # instead of propagating — callers always get the same dict shape.
        console.print(f"[yellow]Failed to generate final report: {e}[/yellow]")
        return {
            "verdict": "Error",
            "summary": f"Failed to synthesize report: {e}",
            "suggestions": []
        }
|
|
469
|
+
def evaluate(self) -> dict[str, Any]:
    """Run the soft-evaluation pipeline: package summary plus Blind QA.

    Returns:
        dict with ``status``, ``understandability_score``,
        ``package_summary``, and ``qa_results`` keys.
    """
    summary = self.summarize_package()
    qa = self.run_sampling_qa()

    # The understandability score is currently just the Blind-QA average;
    # a richer heuristic (e.g. token density + LLM judge) may replace it.
    return {
        "status": "success",
        "understandability_score": qa["qa_score"],
        "package_summary": summary,
        "qa_results": qa,
    }
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: python-harness
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: An agentic codebase evaluation and evolution tool for Python projects.
|
|
5
|
+
Author-email: Mingli Yuan <mingli.yuan@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.11
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: typer>=0.9.0
|
|
11
|
+
Requires-Dist: rich>=13.0.0
|
|
12
|
+
Requires-Dist: pydantic>=2.0.0
|
|
13
|
+
Requires-Dist: openai>=1.0.0
|
|
14
|
+
Requires-Dist: anthropic>=0.18.0
|
|
15
|
+
Requires-Dist: tenacity>=8.2.0
|
|
16
|
+
Requires-Dist: tiktoken>=0.6.0
|
|
17
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
20
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
21
|
+
Requires-Dist: ruff>=0.3.0; extra == "dev"
|
|
22
|
+
Requires-Dist: mypy>=1.9.0; extra == "dev"
|
|
23
|
+
Requires-Dist: ty>=0.0.1; extra == "dev"
|
|
24
|
+
Requires-Dist: radon>=6.0.1; extra == "dev"
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# Python Harness
|
|
28
|
+
|
|
29
|
+
An agentic codebase evaluation and evolution tool for Python projects.
|
|
30
|
+
|
|
31
|
+
`python-harness` is designed to be a universal standard tool—just like `pytest` or `ruff`—but instead of just checking syntax or running tests, it evaluates the **architecture, readability, and governance** of your codebase using both static analysis and LLMs (DeepSeek/OpenAI).
|
|
32
|
+
|
|
33
|
+
## Features
|
|
34
|
+
|
|
35
|
+
1. **Hard Evaluation (First Fence)**: Enforces strict rules using `ruff`, `mypy`, and `ty`. Evaluates Cyclomatic Complexity (CC) and Maintainability Index (MI) via `radon`.
|
|
36
|
+
2. **Governance QC (Second Fence)**: Checks if the changes violate core project governance or attempt to bypass the evaluation rules themselves.
|
|
37
|
+
3. **Soft Evaluation (Third Fence)**:
|
|
38
|
+
- Calculates architecture metrics like Fan-out (coupling).
|
|
39
|
+
- Generates a holistic package understanding using LLMs.
|
|
40
|
+
- Performs "Blind QA": Randomly samples functions/classes and tests the LLM's ability to understand them without context.
|
|
41
|
+
4. **Actionable Output**: Synthesizes the evaluation into a final `Pass/Fail` verdict with exactly 3 concrete, actionable refactoring suggestions.
|
|
42
|
+
|
|
43
|
+
## Installation
|
|
44
|
+
|
|
45
|
+
You can install `python-harness` using `uv` or `pip`:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
uv pip install python-harness
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Configuration
|
|
52
|
+
|
|
53
|
+
`python-harness` requires an LLM to perform its soft evaluation. Create a `.env` file in the root of your project:
|
|
54
|
+
|
|
55
|
+
```env
|
|
56
|
+
LLM_API_KEY=your_api_key_here
|
|
57
|
+
LLM_BASE_URL=https://api.deepseek.com/v1
|
|
58
|
+
LLM_MODEL_NAME=deepseek-reasoner
|
|
59
|
+
LLM_MINI_MODEL_NAME=deepseek-chat
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
*(Note: If you don't provide an API key, the harness will safely run in Mock mode).*
|
|
63
|
+
|
|
64
|
+
## Usage
|
|
65
|
+
|
|
66
|
+
### 1. Measure
|
|
67
|
+
|
|
68
|
+
To evaluate your codebase, run the `measure` command in your project directory:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
harness measure .
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
This will run the full 3-fence evaluation and output a report with a final verdict and top 3 improvement suggestions.
|
|
75
|
+
|
|
76
|
+
### 2. Refine (Evolution Loop - WIP)
|
|
77
|
+
|
|
78
|
+
The `refine` command is an Agentic Edit-Test-Improve loop. It takes the suggestions generated by `measure`, automatically creates branches (variants), applies the changes, runs the tests (`pytest`), and picks the best variant.
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
harness refine . --steps 1 --max-retries 3
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## License
|
|
85
|
+
|
|
86
|
+
MIT License. See [LICENSE](LICENSE) for more details.
|
|
87
|
+
|
|
88
|
+
A harness toolkit for Python projects
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
python_harness/__init__.py,sha256=i1W_gUvOQUO7yp0Vu_jQbOYiqR5Nf651N5A36E118X8,90
|
|
2
|
+
python_harness/cli.py,sha256=zyhC5gqo3CaieRKLkcaw_zDkOZr0PnEdzorG2AfU4bA,7815
|
|
3
|
+
python_harness/evaluator.py,sha256=Mfyg5vvL0GMKxnKLqokd5dxn0J1ob8yBd7-316zsaGw,1307
|
|
4
|
+
python_harness/hard_evaluator.py,sha256=AHcS2jn1GyePv8UK0vvpa5rH7bwXChpg5A_5m5WHGxk,6669
|
|
5
|
+
python_harness/qc_evaluator.py,sha256=Mw_nxu253ERwV4lzWEhTTL9iN3_qBsMmi72Fz9cA4Fw,2883
|
|
6
|
+
python_harness/soft_evaluator.py,sha256=OfTaRT2h14_VzNlMx1qlj2RzF_dcD-TFhjZaCri-TUg,19117
|
|
7
|
+
python_harness-0.0.1.dist-info/licenses/LICENSE,sha256=rMiBapfK7KDDmBOyspVvaqy1OFWHUe-0DoiRE9A3dL0,1068
|
|
8
|
+
python_harness-0.0.1.dist-info/METADATA,sha256=knKHBMROxKTsbd3_SSJFDr80HVjPIa5bdBEHSJ0Sknc,3149
|
|
9
|
+
python_harness-0.0.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
10
|
+
python_harness-0.0.1.dist-info/entry_points.txt,sha256=6OPLethPEEz4MlRoxgUx7SYmN22P2aogi1jaorcrVVM,51
|
|
11
|
+
python_harness-0.0.1.dist-info/top_level.txt,sha256=PxPMOpwPhfTaZxV4tX2LuRS5Sb6MEGutOm62DYMXXCQ,15
|
|
12
|
+
python_harness-0.0.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mingli Yuan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
python_harness
|