python-harness 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
"""Python Harness - an agentic evaluation tool for codebases.

The package version is exposed here as ``__version__``.
"""

__version__ = "0.0.1"
python_harness/cli.py ADDED
@@ -0,0 +1,207 @@
1
+ """
2
+ Command-line interface for python-harness.
3
+ """
4
+
5
+ import os
6
+ import sys
7
+
8
+ import typer
9
+ from dotenv import load_dotenv
10
+ from rich.console import Console
11
+
12
+ from python_harness.evaluator import Evaluator
13
+
14
# Load environment variables before any command executes, preferring an
# explicit .env in the current working directory over dotenv's default
# upward search from the module location.
env_path = os.path.join(os.getcwd(), '.env')
if not os.path.exists(env_path):
    load_dotenv()  # Fallback to default search
else:
    load_dotenv(dotenv_path=env_path)

app = typer.Typer(help="Agentic harness tool for universal Python codebase evaluation.")
console = Console()
23
+
24
+
25
@app.command()
def refine(
    path: str = typer.Argument(".", help="The path to evaluate and evolve"),
    steps: int = typer.Option(1, help="Number of evolution steps to perform"),
    max_retries: int = typer.Option(3, help="Maximum retries per variant if tests fail")
) -> None:
    """
    Refine the codebase through an agentic Edit-Test-Improve loop.
    Generates variants based on suggestions, tests them, and picks the best.
    """
    # NOTE: typer renders this docstring as the command's --help text.
    # `steps` and `max_retries` are accepted but not yet used — the evolution
    # loop below is still a documented skeleton (see TODO).
    console.print(
        f"[bold magenta]Starting evolution loop for path:[/bold magenta] {path} "
        f"[dim](steps={steps}, max_retries={max_retries})[/dim]"
    )

    # 1. First, run a baseline evaluation to get suggestions
    evaluator = Evaluator(path)
    console.print("[cyan]Running baseline evaluation...[/cyan]")
    hard_results = evaluator.hard_evaluator.evaluate()
    soft_results = evaluator.soft_evaluator.evaluate()
    baseline_report = evaluator.soft_evaluator.generate_final_report(
        hard_results, soft_results
    )

    # Nothing to evolve if the baseline report produced no actionable items.
    suggestions = baseline_report.get("suggestions", [])
    if not suggestions:
        console.print("[yellow]No suggestions found to evolve. Exiting.[/yellow]")
        return

    console.print(
        f"[green]Found {len(suggestions)} suggestions. "
        f"Starting evolution branches...[/green]"
    )

    # TODO: Implement the Git branching and Agent modification logic here.
    # The loop will be:
    #   for step in range(steps):
    #       for suggestion in suggestions:
    #           checkout new branch variant-X
    #           for retry in range(max_retries):
    #               ask LLM to apply suggestion to code
    #               run pytest
    #               if pytest passes:
    #                   run harness . to get new score
    #                   break
    #               else:
    #                   feed error back to LLM for retry
    #       compare all variants and checkout the best one

    console.print(
        "[yellow]Evolution engine skeleton ready. "
        "Actual git mutation logic pending.[/yellow]"
    )
78
@app.command()
def measure(path: str = typer.Argument(".", help="The path to evaluate")) -> None:
    """
    Measure the codebase against hard, soft, and governance constraints.
    Outputs a final report with scores and actionable improvement suggestions.

    Exits with status 1 if the hard gate fails, the governance gate fails,
    or the final LLM verdict contains "Fail".
    """
    console.print(
        f"[bold green]Starting harness measurement for path:[/bold green] {path}"
    )

    evaluator = Evaluator(path)

    # 1. Hard Evaluation Gate (First Fence)
    console.print("[bold blue]Running Hard Evaluation (ruff, mypy)...[/bold blue]")
    hard_results = evaluator.hard_evaluator.evaluate()

    if not hard_results["all_passed"]:
        # Report every failing tool before exiting so the user sees all
        # problems in one run rather than one at a time.
        console.print("[bold red]Hard Evaluation Failed! Exiting.[/bold red]")
        if hard_results["ruff"]["status"] != "success":
            console.print("[red]Ruff issues found.[/red]")
        if hard_results["mypy"]["status"] != "success":
            output = hard_results["mypy"].get("output", "")
            console.print(f"[red]Mypy issues found:[/red]\n{output}")
        if hard_results["ty"]["status"] != "success":
            output = hard_results["ty"].get("output", "")
            console.print(f"[red]Ty issues found:[/red]\n{output}")
        if hard_results["radon_cc"]["status"] != "success":
            issues = hard_results["radon_cc"].get("issues", [])
            console.print(
                f"[red]Cyclomatic Complexity too high "
                f"({len(issues)} functions > 15):[/red]"
            )
            for issue in issues:
                console.print(
                    f"  - {issue['file']}: {issue['type']} '{issue['name']}' "
                    f"has CC {issue['complexity']}"
                )
        sys.exit(1)

    console.print("[bold green]Hard Evaluation Passed![/bold green]")

    # Print Maintainability Index scorecard (diagnostic only — never gates).
    mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
    if mi_scores:
        avg_mi = sum(mi_scores.values()) / len(mi_scores)
        # Radon MI thresholds: >50 healthy, 20-50 warning, <=20 problematic.
        color = "green" if avg_mi > 50 else "yellow" if avg_mi > 20 else "red"
        console.print(
            f"[{color}]Average Maintainability Index: {avg_mi:.1f}/100[/{color}]"
        )

    # 2. Governance/QC Evaluation (Second Fence)
    console.print("\n[bold blue]Running Governance QC (Second Fence)...[/bold blue]")
    qc_results = evaluator.qc_evaluator.evaluate()

    if not qc_results["all_passed"]:
        console.print("[bold red]Governance QC Failed! Exiting.[/bold red]")
        console.print(
            "[red]The proposed changes violate governance constraints "
            "or lack sufficient evidence.[/red]"
        )
        for failure in qc_results["failures"]:
            console.print(f"[red]- {failure}[/red]")
        sys.exit(1)

    console.print(
        "[bold green]Governance QC Passed! (Change is admissible)[/bold green]"
    )

    # 3. Soft Evaluation/Readability (Third Fence)
    console.print(
        "[bold blue]Running Soft Evaluation "
        "(Readability & Understandability)...[/bold blue]"
    )
    soft_results = evaluator.soft_evaluator.evaluate()

    pkg_summary = soft_results["package_summary"]
    console.print(
        f"[green]Analyzed {pkg_summary['total_files']} files with a total of "
        f"{pkg_summary['total_tokens']} tokens.[/green]"
    )
    console.print(
        f"[magenta]Agent's Understanding of the Package:[/magenta]\n"
        f"{pkg_summary['package_understanding']}"
    )

    console.print(
        f"\n[cyan]Overall Understandability Score:[/cyan] "
        f"{soft_results['understandability_score']:.1f}/100"
    )

    # Blind QA results are per-sampled-entity scores (0-100) with feedback.
    qa_results = soft_results.get("qa_results", {}).get("sampled_entities", [])
    if qa_results:
        console.print("\n[bold yellow]Blind QA Sampling Results:[/bold yellow]")
        for qa in qa_results:
            color = "green" if qa['score'] >= 80 else "red"
            console.print(f"  - [{color}]{qa['entity']}: Score {qa['score']}[/{color}]")
            console.print(f"    [dim]Feedback: {qa['feedback']}[/dim]")

    console.print("\n[yellow]Evaluation completed. Generating report...[/yellow]\n")

    # Generate Final Report (synthesized verdict + exactly 3 suggestions)
    final_report = evaluator.soft_evaluator.generate_final_report(
        hard_results, soft_results
    )

    if final_report:
        verdict = final_report.get("verdict", "Unknown")
        # Substring check: matches "Pass" and "Pass (Mock)" alike.
        verdict_color = "bold green" if "Pass" in verdict else "bold red"

        console.print(
            f"[{verdict_color}]=== FINAL VERDICT: {verdict} ===[/{verdict_color}]"
        )
        console.print(f"[bold]Summary:[/bold] {final_report.get('summary', '')}\n")

        suggestions = final_report.get("suggestions", [])
        if suggestions:
            console.print("[bold cyan]Top 3 Improvement Suggestions:[/bold cyan]")
            for i, sug in enumerate(suggestions, 1):
                console.print(
                    f"  {i}. [bold]{sug.get('title', 'Suggestion')}[/bold] "
                    f"(Target: [yellow]{sug.get('target_file', 'unknown')}[/yellow])"
                )
                console.print(f"     [dim]{sug.get('description', '')}[/dim]")

        if "Fail" in verdict:
            sys.exit(1)
204
+
205
+
206
# Allow running the CLI directly (e.g. `python python_harness/cli.py`).
if __name__ == "__main__":
    app()
@@ -0,0 +1,42 @@
1
+ """
2
+ Core module for integrating all evaluations and producing the final report.
3
+ """
4
+
5
+ from typing import Any
6
+
7
+ from python_harness.hard_evaluator import HardEvaluator
8
+ from python_harness.qc_evaluator import QCEvaluator
9
+ from python_harness.soft_evaluator import SoftEvaluator
10
+
11
+
12
+ class Evaluator:
13
+ """
14
+ Main evaluator coordinating hard, QC, and soft assessments.
15
+ """
16
+
17
+ def __init__(self, target_path: str):
18
+ self.target_path = target_path
19
+ self.hard_evaluator = HardEvaluator(target_path)
20
+ self.qc_evaluator = QCEvaluator(target_path)
21
+ self.soft_evaluator = SoftEvaluator(target_path)
22
+
23
+ def run(self) -> dict[str, Any]:
24
+ """
25
+ Run the complete evaluation process.
26
+ """
27
+ hard_results = self.hard_evaluator.evaluate()
28
+ qc_results = self.qc_evaluator.evaluate()
29
+ soft_results = self.soft_evaluator.evaluate()
30
+
31
+ # Generate Final Synthesized Report with 3 Suggestions
32
+ final_report = self.soft_evaluator.generate_final_report(
33
+ hard_results, soft_results
34
+ )
35
+
36
+ return {
37
+ "hard_evaluation": hard_results,
38
+ "qc_evaluation": qc_results,
39
+ "soft_evaluation": soft_results,
40
+ "final_report": final_report,
41
+ "overall_status": "success",
42
+ }
@@ -0,0 +1,200 @@
1
+ """
2
+ Core module for integrating hard evaluation tools like ruff, mypy, and pytest.
3
+ """
4
+
5
+ import json
6
+ import subprocess
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ from rich.console import Console
11
+
12
+ console = Console()
13
+
14
class HardEvaluator:
    """
    Evaluator for collecting structural code quality metrics.

    Each ``run_*`` method shells out to one external tool (ruff, mypy, ty,
    radon, pytest) and normalizes its outcome into a plain dict whose
    ``status`` is ``"success"``, ``"failed"`` (tool ran and reported
    problems), or ``"error"`` (tool could not be executed at all, e.g.
    not installed).
    """

    def __init__(self, target_path: str):
        # Resolve once so every tool sees the same absolute path.
        self.target_path = Path(target_path).resolve()

    @staticmethod
    def _run(args: list[str]) -> subprocess.CompletedProcess[str]:
        """Run an external tool, capturing text output; never raises on a
        non-zero exit code (check=False)."""
        return subprocess.run(args, capture_output=True, text=True, check=False)

    def _run_text_tool(self, args: list[str]) -> dict[str, Any]:
        """Shared wrapper for tools whose stdout is reported verbatim.

        Returns {"status", "output", "return_code"} when the tool ran, or
        {"status": "error", "error_message"} when it could not be launched.
        Factored out of run_mypy/run_ty/run_pytest, which were identical
        copy-pasted bodies.
        """
        try:
            result = self._run(args)
            status = "success" if result.returncode == 0 else "failed"
            return {
                "status": status,
                "output": result.stdout,
                "return_code": result.returncode,
            }
        except Exception as e:
            return {"status": "error", "error_message": str(e)}

    def run_ruff(self) -> dict[str, Any]:
        """
        Run Ruff linter and return results.

        Parses ruff's JSON output into an ``issues`` list; a JSON parse
        failure is reported as status "error".
        """
        try:
            result = self._run(
                ["ruff", "check", str(self.target_path), "--output-format", "json"]
            )
            issues = json.loads(result.stdout) if result.stdout else []
            status = "success" if result.returncode == 0 else "failed"
            return {
                "status": status,
                "issues": issues,
                "return_code": result.returncode,
            }
        except Exception as e:
            return {"status": "error", "error_message": str(e)}

    def run_mypy(self) -> dict[str, Any]:
        """
        Run Mypy type checker and return results.
        """
        return self._run_text_tool(["mypy", str(self.target_path)])

    def run_ty(self) -> dict[str, Any]:
        """
        Run ty language server checks.
        """
        return self._run_text_tool(["ty", "check", str(self.target_path)])

    def run_radon_cc(self) -> dict[str, Any]:
        """
        Run Radon cyclomatic complexity check.
        Flag any function/method with CC > 15 as a failure.
        """
        try:
            result = self._run(["radon", "cc", "-j", "-a", str(self.target_path)])

            issues = []
            status = "success"

            if result.stdout:
                data = json.loads(result.stdout)
                for file_path, blocks in data.items():
                    # radon emits a dict (error object) instead of a list
                    # for files it could not parse; skip those entries.
                    if isinstance(blocks, list):
                        for block in blocks:
                            if block.get('complexity', 0) > 15:
                                issues.append({
                                    "file": file_path,
                                    "name": block.get('name'),
                                    "type": block.get('type'),
                                    "complexity": block.get('complexity')
                                })

            if issues:
                status = "failed"

            return {
                "status": status,
                "issues": issues,
                "return_code": result.returncode,
                "output": result.stdout
            }
        except Exception as e:
            return {"status": "error", "error_message": str(e)}

    def run_radon_mi(self) -> dict[str, Any]:
        """
        Run Radon Maintainability Index (MI) check.
        This is a diagnostic metric, so it won't fail the build,
        but it contributes to the scorecard.
        """
        try:
            result = self._run(["radon", "mi", "-j", str(self.target_path)])

            mi_scores = {}
            if result.stdout:
                data = json.loads(result.stdout)
                for file_path, info in data.items():
                    # Entries radon could not score default to a perfect MI
                    # so they don't drag the average down.
                    mi_scores[file_path] = info.get('mi', 100.0)

            return {
                "status": "success",
                "mi_scores": mi_scores,
                "return_code": result.returncode,
            }
        except Exception as e:
            return {"status": "error", "error_message": str(e)}

    def run_pytest(self) -> dict[str, Any]:
        """
        Run Pytest test suite and return coverage results.
        """
        # Run pytest in a subprocess: invoking it in-process (while we may
        # ourselves be running under pytest) can hang or corrupt state.
        return self._run_text_tool(
            ["pytest", str(self.target_path), "--cov", "--cov-report=json"]
        )

    def evaluate(self) -> dict[str, Any]:
        """
        Execute all hard evaluation tools.
        Returns a dictionary with results and an overall success boolean.
        Note: radon MI is informational and does not affect ``all_passed``.
        """
        ruff_res = self.run_ruff()
        mypy_res = self.run_mypy()
        ty_res = self.run_ty()
        radon_cc_res = self.run_radon_cc()
        radon_mi_res = self.run_radon_mi()
        # pytest_res = self.run_pytest()  # Better handled as a separate stage

        all_passed = (
            ruff_res.get("status") == "success" and
            mypy_res.get("status") == "success" and
            ty_res.get("status") == "success" and
            radon_cc_res.get("status") == "success"
        )

        return {
            "all_passed": all_passed,
            "ruff": ruff_res,
            "mypy": mypy_res,
            "ty": ty_res,
            "radon_cc": radon_cc_res,
            "radon_mi": radon_mi_res
        }
@@ -0,0 +1,89 @@
1
+ """
2
+ Core module for evaluating self-improvement Governance and Quality Control (QC).
3
+ Based on a simplified version of the LUCA/Sympan profile.
4
+ """
5
+
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+
10
class QCEvaluator:
    """
    Evaluator for checking Governance and QC constraints.

    All three checks are currently simulated: they document the intended
    governance rules and always report a clean pass.
    """

    def __init__(self, target_path: str):
        self.target_path = Path(target_path).resolve()

    @staticmethod
    def _clean_result() -> dict[str, Any]:
        # Every check returns a fresh dict so callers may mutate safely.
        return {"status": "success", "failures": []}

    def check_hard_invariants(self) -> dict[str, Any]:
        """
        Verify that fundamental identity invariants are preserved.
        - Ensure no bypassing of core evaluation logic.
        - Check for explicit architectural violations.
        """
        # Placeholder: a real system would inspect git diffs or file
        # modification times to detect direct edits to the evaluation core
        # made without a proper class D proposal.
        return self._clean_result()

    def check_obligations(self) -> dict[str, Any]:
        """
        Verify that necessary evidence and obligations are met for the changes made.
        Every change MUST provide an Improvement Case (evidence).
        """
        # Placeholder: a real implementation would read the proposal
        # manifest's required obligations and confirm the corresponding
        # reports (benchmark, holdout) exist among the artifacts.
        return self._clean_result()

    def check_self_touch(self) -> dict[str, Any]:
        """
        Verify if the agent modified the evaluation or governance criteria (Level 1/2).
        If it did, flag it for external certification.
        """
        # Placeholder: modifications to QC rules or evaluation logic MUST
        # require external certification (Human or higher-level Judge).
        return self._clean_result()

    def evaluate(self) -> dict[str, Any]:
        """
        Run all QC checks.
        """
        invariants = self.check_hard_invariants()
        obligations = self.check_obligations()
        self_touch = self.check_self_touch()

        # Flatten the per-check failure lists into one aggregate list.
        failures: list[str] = [
            failure
            for check in (invariants, obligations, self_touch)
            for failure in check.get("failures", [])
        ]

        return {
            "all_passed": not failures,
            "failures": failures,
            "invariants": invariants,
            "obligations": obligations,
            "self_touch": self_touch,
        }
@@ -0,0 +1,486 @@
1
+ """
2
+ Core module for agentic soft evaluation and code understanding.
3
+ """
4
+
5
+ import ast
6
+ import contextlib
7
+ import json
8
+ import os
9
+ import random
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ import tiktoken
14
+ from openai import OpenAI
15
+ from pydantic import BaseModel
16
+ from rich.console import Console
17
+
18
+ console = Console()
19
+
20
class FileSummary(BaseModel):
    # Structured response schema for the per-file summarization agent.
    # NOTE(review): not currently wired into summarize_file, which parses
    # the LLM's JSON by hand — confirm before removing or reusing.
    summary: str  # one-paragraph description of the file's purpose
    key_entities: list[str]  # names of notable classes/functions/globals
    complexity_score: int  # agent-estimated cognitive complexity (1-10)
25
class SoftEvaluator:
    """
    Evaluator for agentic code understanding and reasoning.

    Works in two modes: live (LLM_API_KEY set — calls an OpenAI-compatible
    endpoint) and mock (no key — returns canned, always-passing results).
    """

    def __init__(self, target_path: str):
        self.target_path = Path(target_path).resolve()
        # Initialize token counter (using cl100k_base for gpt-4/claude-3);
        # left as None if tiktoken cannot load the encoding.
        self.encoding: Any = None
        with contextlib.suppress(Exception):
            self.encoding = tiktoken.get_encoding("cl100k_base")

        # Initialize OpenAI client only if API key is present
        self.client = None
        api_key = os.environ.get("LLM_API_KEY")
        base_url = os.environ.get("LLM_BASE_URL", "https://api.deepseek.com/v1")
        self.model_name = os.environ.get("LLM_MODEL_NAME", "deepseek-reasoner")
        self.mini_model_name = os.environ.get("LLM_MINI_MODEL_NAME", "deepseek-chat")

        if api_key:
            self.client = OpenAI(api_key=api_key, base_url=base_url)
        else:
            console.print(
                "[yellow]Warning: LLM_API_KEY not set. "
                "Agent will run in mock mode.[/yellow]"
            )

        # Store extracted AST entities for sampling
        self.extracted_entities: list[dict[str, Any]] = []

    def _get_python_files(self) -> list[Path]:
        """
        Recursively find all Python files in the target directory,
        excluding hidden dirs and .venv.
        """
        python_files = []
        for root, dirs, files in os.walk(self.target_path):
            # Exclude hidden directories and virtual environments by
            # pruning dirs in place (os.walk honors in-place edits).
            dirs[:] = [
                d
                for d in dirs
                if not d.startswith(".") and d not in (
                    "__pycache__",
                    "venv",
                    "env",
                    "vendors",
                )
            ]
            for file in files:
                if file.endswith(".py"):
                    python_files.append(Path(root) / file)
        return python_files

    def calculate_token_complexity(self, file_path: Path) -> int:
        """
        Calculate the token count for a given file as a proxy
        for cognitive complexity.

        Returns 0 when no encoding is available or the file is unreadable.
        """
        if not self.encoding:
            return 0

        try:
            content = file_path.read_text(encoding="utf-8")
            return len(self.encoding.encode(content))
        except Exception as e:
            console.print(
                f"[yellow]Warning: Could not read {file_path} for token counting: "
                f"{e}[/yellow]"
            )
            return 0

    def _extract_ast_entities(self, file_path: Path, content: str) -> None:
        """
        Parse the AST of a file to extract:
        1. Classes and functions for later QA sampling.
        2. Fan-out (number of imported external modules) to measure coupling.

        Appends to ``self.extracted_entities``; silently skips files that
        fail to parse.
        """
        try:
            tree = ast.parse(content)

            # Calculate Fan-out (number of unique imported top-level modules)
            imported_modules = set()
            for node in ast.walk(tree):
                if isinstance(node, ast.Import):
                    for alias in node.names:
                        imported_modules.add(alias.name.split('.')[0])
                elif isinstance(node, ast.ImportFrom) and node.module:
                    imported_modules.add(node.module.split('.')[0])

            fan_out = len(imported_modules)

            # Extract classes and functions.
            # NOTE(review): ast.walk visits nested defs too, so a method is
            # recorded both inside its class snippet and as its own entity —
            # confirm this duplication is intended for sampling.
            for node in ast.walk(tree):
                if isinstance(
                    node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)
                ):
                    try:
                        source_segment = ast.get_source_segment(content, node)
                        if source_segment:
                            if isinstance(node, ast.ClassDef):
                                entity_type = "Class"
                            else:
                                entity_type = "Function"
                            self.extracted_entities.append(
                                {
                                    "file": file_path.name,
                                    "type": entity_type,
                                    "name": node.name,
                                    "code": source_segment,
                                    "fan_out": fan_out,  # Context
                                }
                            )
                    except Exception:
                        pass
        except SyntaxError:
            pass  # Skip files with syntax errors

    def summarize_file(self, file_path: Path) -> dict[str, Any]:
        """
        Use LLM agent to summarize a single file's logic and structure.

        Falls back to a token-count summary when the client is absent, the
        file is empty/unreadable, or it exceeds the size cap.
        """
        tokens = self.calculate_token_complexity(file_path)

        try:
            content = file_path.read_text(encoding="utf-8")
            self._extract_ast_entities(file_path, content)
        except Exception:
            content = ""

        simulated_summary = f"File {file_path.name} contains {tokens} tokens."
        key_entities: list[str] = []

        # Only call LLM if file is not empty and not too large (e.g. > 100k tokens)
        if self.client and content and 0 < tokens < 100000:
            try:
                sys_prompt = (
                    "You are a senior Python architect. Analyze the provided Python "
                    "file and provide a concise summary of its purpose, a list of "
                    "its key entities (classes/functions/globals), and an estimated "
                    "cognitive complexity score (1-10).\n"
                    "Output MUST be in valid JSON matching this schema: "
                    '{"summary": "str", "key_entities": ["str"], "complexity_score": 1}'
                )
                completion = self.client.chat.completions.create(
                    model=self.mini_model_name,
                    messages=[
                        {
                            "role": "system",
                            "content": sys_prompt,
                        },
                        {
                            "role": "user",
                            "content": (
                                f"File name: {file_path.name}\n\nContent:\n"
                                f"```python\n{content}\n```"
                            ),
                        },
                    ],
                    response_format={"type": "json_object"},
                )

                content_str = completion.choices[0].message.content
                if content_str:
                    result = json.loads(content_str)
                    simulated_summary = result.get("summary", simulated_summary)
                    key_entities = result.get("key_entities", key_entities)
            except Exception as e:
                # Any LLM/parse failure keeps the fallback summary.
                console.print(
                    f"[yellow]  Agent failed to read {file_path.name}: {e}[/yellow]"
                )

        # Prefer a path relative to the target; fall back to absolute when
        # the file lies outside it.
        try:
            rel_path = str(file_path.relative_to(self.target_path))
        except ValueError:
            rel_path = str(file_path)

        return {
            "file": rel_path,
            "tokens": tokens,
            "summary": simulated_summary,
            "key_entities": key_entities,
        }

    def summarize_package(self) -> dict[str, Any]:
        """
        Aggregate file summaries into a package-level understanding.
        """
        files = self._get_python_files()
        file_summaries = []
        total_tokens = 0

        console.print(
            f"[cyan]Agent is analyzing {len(files)} Python files "
            f"for cognitive load and architecture...[/cyan]"
        )

        for file in files:
            summary_data = self.summarize_file(file)
            file_summaries.append(summary_data)
            total_tokens += summary_data["tokens"]

        # Synthesize package architecture (fallback text if no LLM client).
        package_understanding = (
            f"The package contains {len(files)} files with a total cognitive load "
            f"of {total_tokens} tokens."
        )

        if self.client and file_summaries:
            try:
                console.print(
                    "[cyan]Agent is synthesizing global package architecture...[/cyan]"
                )
                # Build a one-line-per-file manifest as the LLM's input.
                manifest_lines = [
                    f"- {s['file']}: {s['summary']} "
                    f"(Entities: {', '.join(s['key_entities'])})"
                    for s in file_summaries
                ]
                manifest = "\n".join(manifest_lines)

                sys_prompt = (
                    "You are a senior software architect. Based on the following "
                    "summaries of individual files in a Python package, write a "
                    "coherent, high-level explanation of how this entire package "
                    "works and what its primary responsibilities are. Be concise "
                    "but comprehensive."
                )

                completion = self.client.chat.completions.create(
                    model=self.model_name,
                    messages=[
                        {
                            "role": "system",
                            "content": sys_prompt,
                        },
                        {
                            "role": "user",
                            "content": f"Package files and summaries:\n{manifest}",
                        },
                    ],
                )

                package_understanding = (
                    completion.choices[0].message.content or package_understanding
                )
            except Exception as e:
                console.print(
                    f"[yellow]Agent failed to synthesize package: {e}[/yellow]"
                )

        return {
            "total_files": len(files),
            "total_tokens": total_tokens,
            "file_level_summaries": file_summaries,
            "package_understanding": package_understanding,
        }

    def run_sampling_qa(self) -> dict[str, Any]:
        """
        Randomly sample modules/variables and ask the Agent questions
        to measure understandability.

        Returns a qa_score (0-100 average) plus per-entity feedback.
        Mock mode scores every sample 100.
        """
        if not self.extracted_entities:
            return {
                "qa_score": 100.0,
                "sampled_entities": [],
                "note": "No entities found for sampling.",
            }

        # Randomly sample up to 3 entities
        sample_size = min(3, len(self.extracted_entities))
        sampled = random.sample(self.extracted_entities, sample_size)

        console.print(
            f"\n[cyan]Agent is running Blind QA on {sample_size} "
            f"sampled entities...[/cyan]"
        )

        qa_results = []
        total_score = 0.0

        for entity in sampled:
            entity_name = entity["name"]
            entity_type = entity["type"]
            entity_code = entity["code"]
            fan_out = entity.get("fan_out", 0)

            if not self.client:
                # Mock evaluation
                score = 100.0
                feedback = "Mock evaluation: Code is perfectly readable."
            else:
                try:
                    sys_prompt = (
                        "You are an expert Code Reviewer and Software Architect. "
                        "You will be given a snippet of Python code (a class or "
                        "function) along with its module's Fan-out metric (number "
                        "of external dependencies). Your task is to evaluate its "
                        "readability and structural cohesion.\n"
                        "Output MUST be in valid JSON matching this schema: "
                        '{"explanation": "str", "readability_score": 1, '
                        '"feedback": "str"}\n'
                        "- `explanation`: Briefly explain what this code does.\n"
                        "- `readability_score`: A score from 0 to 100.\n"
                        "- `feedback`: What makes it easy/hard to understand? "
                        "Does a high Fan-out indicate bad cohesion here?"
                    )

                    user_content = (
                        f"Module Fan-out (Dependencies): {fan_out}\n\n"
                        f"Code Snippet:\n```python\n{entity_code}\n```"
                    )

                    completion = self.client.chat.completions.create(
                        model=self.mini_model_name,
                        messages=[
                            {"role": "system", "content": sys_prompt},
                            {"role": "user", "content": user_content},
                        ],
                        response_format={"type": "json_object"},
                    )

                    content_str = completion.choices[0].message.content
                    if content_str:
                        result = json.loads(content_str)
                        score = float(result.get("readability_score", 100))
                        feedback = result.get("feedback", "")
                    else:
                        # Empty response: count as a partial pass.
                        score = 80.0
                        feedback = "Failed to parse Agent response."
                except Exception as e:
                    # Hard failure: score the sample 0 to surface the problem.
                    score = 0.0
                    feedback = f"Error during Agent evaluation: {e}"

            total_score += score
            qa_results.append(
                {
                    "entity": f"{entity_type} {entity_name} (from {entity['file']})",
                    "score": score,
                    "feedback": feedback,
                }
            )

        final_average_score = total_score / sample_size if sample_size > 0 else 100.0

        return {
            "qa_score": final_average_score,
            "sampled_entities": qa_results,
            "note": "Sampling QA completed.",
        }

    def generate_final_report(
        self, hard_results: dict[str, Any], soft_results: dict[str, Any]
    ) -> dict[str, Any]:
        """
        Synthesize all evaluation results into a final verdict and exactly
        3 actionable suggestions.

        In mock mode returns a canned "Pass (Mock)" report; any LLM failure
        yields a verdict of "Error" with an empty suggestion list.
        """
        if not self.client:
            return {
                "verdict": "Pass (Mock)",
                "summary": "Mock evaluation completed without LLM.",
                "suggestions": [
                    {
                        "title": "Mock Suggestion 1",
                        "description": "Add more docstrings.",
                        "target_file": "all"
                    },
                    {
                        "title": "Mock Suggestion 2",
                        "description": "Refactor large functions.",
                        "target_file": "all"
                    },
                    {
                        "title": "Mock Suggestion 3",
                        "description": "Improve test coverage.",
                        "target_file": "tests/"
                    }
                ]
            }

        try:
            # Extract key metrics for the prompt
            cc_issues = hard_results.get("radon_cc", {}).get("issues", [])
            mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
            avg_mi = sum(mi_scores.values()) / len(mi_scores) if mi_scores else 100.0

            qa_score = soft_results.get("understandability_score", 100.0)
            qa_entities = soft_results.get("qa_results", {}).get("sampled_entities", [])

            sys_prompt = (
                "You are an elite Python Codebase Evaluator. You have just analyzed "
                "a repository. Your task is to provide a final judgment and EXACTLY "
                "3 concrete, actionable improvement suggestions. These suggestions "
                "MUST NOT change the external functionality (they are refactoring/"
                "quality improvements).\n\n"
                "Output MUST be in valid JSON matching this schema:\n"
                "{\n"
                '  "verdict": "Pass" or "Fail",\n'
                '  "summary": "One paragraph summary of codebase health",\n'
                '  "suggestions": [\n'
                '    {"title": "str", "description": "str", "target_file": "str"}\n'
                "  ]\n"
                "}\n"
                "Rule for Verdict: Pass if Average Maintainability > 50 and "
                "QA Score > 75 and no Critical CC issues (>15). Otherwise Fail."
            )

            user_content = (
                f"Metrics:\n"
                f"- Average Maintainability Index (MI): {avg_mi:.1f}/100\n"
                f"- Number of functions with Cyclomatic Complexity > 15: "
                f"{len(cc_issues)}\n"
                f"- Agent QA Readability Score: {qa_score:.1f}/100\n\n"
                f"QA Feedback Snippets:\n"
                + "\n".join(
                    [f"  * {q['entity']}: {q['feedback']}" for q in qa_entities]
                )
            )

            completion = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": sys_prompt},
                    {"role": "user", "content": user_content},
                ],
                response_format={"type": "json_object"},
            )

            content_str = completion.choices[0].message.content
            if content_str:
                parsed_json = json.loads(content_str)
                if isinstance(parsed_json, dict):
                    return parsed_json
                else:
                    raise ValueError("JSON response is not a dictionary.")
            else:
                raise ValueError("Empty response from Agent.")
        except Exception as e:
            console.print(f"[yellow]Failed to generate final report: {e}[/yellow]")
            return {
                "verdict": "Error",
                "summary": f"Failed to synthesize report: {e}",
                "suggestions": []
            }
    def evaluate(self) -> dict[str, Any]:
        """
        Execute soft evaluation workflows including summarization and Q&A.
        """
        package_summary = self.summarize_package()
        qa_results = self.run_sampling_qa()

        # The understandability score is currently taken directly from the
        # Blind QA average; the token-density heuristic mentioned below is
        # not implemented yet.
        # In reality, this will be based on the QA results and LLM judge
        understandability_score = qa_results["qa_score"]

        return {
            "status": "success",
            "understandability_score": understandability_score,
            "package_summary": package_summary,
            "qa_results": qa_results,
        }
@@ -0,0 +1,88 @@
1
+ Metadata-Version: 2.4
2
+ Name: python-harness
3
+ Version: 0.0.1
4
+ Summary: An agentic codebase evaluation and evolution tool for Python projects.
5
+ Author-email: Mingli Yuan <mingli.yuan@gmail.com>
6
+ License: MIT
7
+ Requires-Python: >=3.11
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: typer>=0.9.0
11
+ Requires-Dist: rich>=13.0.0
12
+ Requires-Dist: pydantic>=2.0.0
13
+ Requires-Dist: openai>=1.0.0
14
+ Requires-Dist: anthropic>=0.18.0
15
+ Requires-Dist: tenacity>=8.2.0
16
+ Requires-Dist: tiktoken>=0.6.0
17
+ Requires-Dist: python-dotenv>=1.0.0
18
+ Provides-Extra: dev
19
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
20
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
21
+ Requires-Dist: ruff>=0.3.0; extra == "dev"
22
+ Requires-Dist: mypy>=1.9.0; extra == "dev"
23
+ Requires-Dist: ty>=0.0.1; extra == "dev"
24
+ Requires-Dist: radon>=6.0.1; extra == "dev"
25
+ Dynamic: license-file
26
+
27
+ # Python Harness
28
+
29
+ An agentic codebase evaluation and evolution tool for Python projects.
30
+
31
+ `python-harness` is designed to be a universal standard tool—just like `pytest` or `ruff`—but instead of just checking syntax or running tests, it evaluates the **architecture, readability, and governance** of your codebase using both static analysis and LLMs (DeepSeek/OpenAI).
32
+
33
+ ## Features
34
+
35
+ 1. **Hard Evaluation (First Fence)**: Enforces strict rules using `ruff`, `mypy`, and `ty`. Evaluates Cyclomatic Complexity (CC) and Maintainability Index (MI) via `radon`.
36
+ 2. **Governance QC (Second Fence)**: Checks if the changes violate core project governance or attempt to bypass the evaluation rules themselves.
37
+ 3. **Soft Evaluation (Third Fence)**:
38
+ - Calculates architecture metrics like Fan-out (coupling).
39
+ - Generates a holistic package understanding using LLMs.
40
+ - Performs "Blind QA": Randomly samples functions/classes and tests the LLM's ability to understand them without context.
41
+ 4. **Actionable Output**: Synthesizes the evaluation into a final `Pass/Fail` verdict with exactly 3 concrete, actionable refactoring suggestions.
42
+
43
+ ## Installation
44
+
45
+ You can install `python-harness` using `uv` or `pip`:
46
+
47
+ ```bash
48
+ uv pip install python-harness
49
+ ```
50
+
51
+ ## Configuration
52
+
53
+ `python-harness` requires an LLM to perform its soft evaluation. Create a `.env` file in the root of your project:
54
+
55
+ ```env
56
+ LLM_API_KEY=your_api_key_here
57
+ LLM_BASE_URL=https://api.deepseek.com/v1
58
+ LLM_MODEL_NAME=deepseek-reasoner
59
+ LLM_MINI_MODEL_NAME=deepseek-chat
60
+ ```
61
+
62
+ *(Note: If you don't provide an API key, the harness will safely run in Mock mode).*
63
+
64
+ ## Usage
65
+
66
+ ### 1. Measure
67
+
68
+ To evaluate your codebase, run the `measure` command in your project directory:
69
+
70
+ ```bash
71
+ harness measure .
72
+ ```
73
+
74
+ This will run the full 3-fence evaluation and output a report with a final verdict and top 3 improvement suggestions.
75
+
76
+ ### 2. Refine (Evolution Loop - WIP)
77
+
78
+ The `refine` command is an Agentic Edit-Test-Improve loop. It takes the suggestions generated by `measure`, automatically creates branches (variants), applies the changes, runs the tests (`pytest`), and picks the best variant.
79
+
80
+ ```bash
81
+ harness refine . --steps 1 --max-retries 3
82
+ ```
83
+
84
+ ## License
85
+
86
+ MIT License. See [LICENSE](LICENSE) for more details.
87
+
88
+ A harness toolkit for Python projects.
@@ -0,0 +1,12 @@
1
+ python_harness/__init__.py,sha256=i1W_gUvOQUO7yp0Vu_jQbOYiqR5Nf651N5A36E118X8,90
2
+ python_harness/cli.py,sha256=zyhC5gqo3CaieRKLkcaw_zDkOZr0PnEdzorG2AfU4bA,7815
3
+ python_harness/evaluator.py,sha256=Mfyg5vvL0GMKxnKLqokd5dxn0J1ob8yBd7-316zsaGw,1307
4
+ python_harness/hard_evaluator.py,sha256=AHcS2jn1GyePv8UK0vvpa5rH7bwXChpg5A_5m5WHGxk,6669
5
+ python_harness/qc_evaluator.py,sha256=Mw_nxu253ERwV4lzWEhTTL9iN3_qBsMmi72Fz9cA4Fw,2883
6
+ python_harness/soft_evaluator.py,sha256=OfTaRT2h14_VzNlMx1qlj2RzF_dcD-TFhjZaCri-TUg,19117
7
+ python_harness-0.0.1.dist-info/licenses/LICENSE,sha256=rMiBapfK7KDDmBOyspVvaqy1OFWHUe-0DoiRE9A3dL0,1068
8
+ python_harness-0.0.1.dist-info/METADATA,sha256=knKHBMROxKTsbd3_SSJFDr80HVjPIa5bdBEHSJ0Sknc,3149
9
+ python_harness-0.0.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
10
+ python_harness-0.0.1.dist-info/entry_points.txt,sha256=6OPLethPEEz4MlRoxgUx7SYmN22P2aogi1jaorcrVVM,51
11
+ python_harness-0.0.1.dist-info/top_level.txt,sha256=PxPMOpwPhfTaZxV4tX2LuRS5Sb6MEGutOm62DYMXXCQ,15
12
+ python_harness-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ harness = python_harness.cli:app
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Mingli Yuan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ python_harness