python-harness 0.0.6__tar.gz → 0.0.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {python_harness-0.0.6/python_harness.egg-info → python_harness-0.0.10}/PKG-INFO +1 -1
  2. {python_harness-0.0.6 → python_harness-0.0.10}/pyproject.toml +1 -2
  3. {python_harness-0.0.6 → python_harness-0.0.10}/python_harness/__init__.py +1 -1
  4. python_harness-0.0.10/python_harness/cli.py +312 -0
  5. {python_harness-0.0.6 → python_harness-0.0.10}/python_harness/evaluator.py +1 -1
  6. {python_harness-0.0.6 → python_harness-0.0.10}/python_harness/hard_evaluator.py +99 -16
  7. {python_harness-0.0.6 → python_harness-0.0.10}/python_harness/soft_evaluator.py +252 -126
  8. {python_harness-0.0.6 → python_harness-0.0.10/python_harness.egg-info}/PKG-INFO +1 -1
  9. {python_harness-0.0.6 → python_harness-0.0.10}/python_harness.egg-info/SOURCES.txt +1 -0
  10. python_harness-0.0.10/tests/test_cli.py +429 -0
  11. python_harness-0.0.10/tests/test_evaluator.py +37 -0
  12. python_harness-0.0.10/tests/test_hard_evaluator.py +358 -0
  13. python_harness-0.0.10/tests/test_qc_evaluator.py +65 -0
  14. python_harness-0.0.10/tests/test_soft_evaluator.py +354 -0
  15. python_harness-0.0.6/python_harness/cli.py +0 -235
  16. python_harness-0.0.6/tests/test_cli.py +0 -26
  17. python_harness-0.0.6/tests/test_evaluator.py +0 -18
  18. python_harness-0.0.6/tests/test_hard_evaluator.py +0 -95
  19. python_harness-0.0.6/tests/test_soft_evaluator.py +0 -42
  20. {python_harness-0.0.6 → python_harness-0.0.10}/LICENSE +0 -0
  21. {python_harness-0.0.6 → python_harness-0.0.10}/README.md +0 -0
  22. {python_harness-0.0.6 → python_harness-0.0.10}/python_harness/qc_evaluator.py +0 -0
  23. {python_harness-0.0.6 → python_harness-0.0.10}/python_harness.egg-info/dependency_links.txt +0 -0
  24. {python_harness-0.0.6 → python_harness-0.0.10}/python_harness.egg-info/entry_points.txt +0 -0
  25. {python_harness-0.0.6 → python_harness-0.0.10}/python_harness.egg-info/requires.txt +0 -0
  26. {python_harness-0.0.6 → python_harness-0.0.10}/python_harness.egg-info/top_level.txt +0 -0
  27. {python_harness-0.0.6 → python_harness-0.0.10}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-harness
3
- Version: 0.0.6
3
+ Version: 0.0.10
4
4
  Summary: An agentic codebase evaluation and evolution tool for Python projects.
5
5
  Author-email: Mingli Yuan <mingli.yuan@gmail.com>
6
6
  License: MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "python-harness"
3
- version = "0.0.6"
3
+ version = "0.0.10"
4
4
  description = "An agentic codebase evaluation and evolution tool for Python projects."
5
5
  requires-python = ">=3.10"
6
6
  readme = "README.md"
@@ -62,4 +62,3 @@ addopts = "-ra -q --cov=python_harness --cov-report=term-missing --cov-report=ht
62
62
  testpaths = [
63
63
  "tests",
64
64
  ]
65
-
@@ -2,4 +2,4 @@
2
2
  Python Harness - An agentic evaluation tool for codebases.
3
3
  """
4
4
 
5
- __version__ = "0.0.1"
5
+ __version__ = "0.0.10"
@@ -0,0 +1,312 @@
"""
Command-line interface for python-harness.
"""

import os
import sys
from typing import Any

import typer
from dotenv import load_dotenv
from rich.console import Console

from python_harness.evaluator import Evaluator

# Try to find .env file explicitly before anything else executes
# (an explicit path to CWD/.env takes precedence over dotenv's own search).
env_path = os.path.join(os.getcwd(), '.env')
if os.path.exists(env_path):
    load_dotenv(dotenv_path=env_path)
else:
    load_dotenv()  # Fallback to default search

# Module-level Typer application and Rich console shared by all commands.
app = typer.Typer(help="Agentic harness tool for universal Python codebase evaluation.")
console = Console()
def _print_detail_block(title: str, details: str, color: str) -> None:
    """Print a colored heading, then each non-blank line of *details* indented."""
    console.print(f"[{color}]{title}:[/{color}]")
    for raw_line in details.splitlines():
        if raw_line.strip():
            console.print(f" {raw_line.rstrip()}")
    console.print()
+
35
+
36
def _print_ruff_issues(issues: list[dict[str, Any]]) -> None:
    """Render each ruff finding as `file:line message` under a red heading."""
    console.print("[red]Ruff issues found:[/red]")
    for entry in issues:
        filename = entry.get("filename", "unknown")
        row = entry.get("location", {}).get("row", "?")
        message = entry.get("message", "unknown issue")
        console.print(f" - {filename}:{row} {message}")
    console.print()
+
45
+
46
def _print_ty_result(ty_results: dict[str, Any]) -> None:
    """Summarize a ty run: success is silent, warnings go yellow, failures red."""
    status = ty_results.get("status")
    if status == "success":
        return
    if status == "warning":
        warning_text = str(ty_results.get("error_message", "ty not found"))
        _print_detail_block("Ty warning", warning_text, "yellow")
        return

    # Failure path: prefer captured stdout, fall back to stderr, then a notice.
    output = str(ty_results.get("output", ""))
    if output:
        _print_detail_block("Ty issues found", output, "red")
        return
    error_msg = str(ty_results.get("error_message", ""))
    if error_msg:
        _print_detail_block("Ty error", error_msg, "red")
        return
    console.print("[red]Ty failed, but no standard output was captured.[/red]")
+
64
+
65
def _print_radon_cc_result(radon_results: dict[str, Any]) -> None:
    """Report radon cyclomatic-complexity results; silent unless warning/failed."""
    status = radon_results.get("status")
    if status == "warning":
        _print_detail_block(
            "Radon CC warning", str(radon_results.get("error_message", "")), "yellow"
        )
        return
    if status != "failed":
        return

    offenders = radon_results.get("issues", [])
    if offenders:
        # One line per function whose complexity exceeded the threshold.
        console.print(
            f"[red]Cyclomatic Complexity too high "
            f"({len(offenders)} functions > 15):[/red]"
        )
        for offender in offenders:
            console.print(
                f" - {offender['file']}: {offender['type']} '{offender['name']}' "
                f"has CC {offender['complexity']}"
            )
        console.print()
        return

    # Failed without parsed issues: fall back to the raw error, then a notice.
    error_text = str(radon_results.get("error_message", ""))
    if error_text:
        _print_detail_block("Radon CC error", error_text, "red")
        return
    console.print("[red]Radon CC failed but no specific issues were parsed.[/red]")
    console.print()
+
95
+
96
def _print_hard_failure_details(hard_results: dict[str, Any]) -> None:
    """Dump per-tool diagnostics after a failed hard evaluation."""
    console.print("[bold red]Hard Evaluation Failed![/bold red]")
    console.print()

    ruff_res = hard_results.get("ruff", {})
    if ruff_res.get("status") != "success":
        _print_ruff_issues(ruff_res.get("issues", []))

    mypy_res = hard_results.get("mypy", {})
    if mypy_res.get("status") != "success":
        _print_detail_block("Mypy issues found", str(mypy_res.get("output", "")), "red")

    # These helpers decide internally whether anything needs printing.
    _print_ty_result(hard_results.get("ty", {}))
    _print_radon_cc_result(hard_results.get("radon_cc", {}))

    pytest_res = hard_results.get("pytest", {})
    if pytest_res.get("status") == "failed":
        _print_detail_block(
            "Pytest/Coverage issues found",
            str(pytest_res.get("error_message", "")),
            "red",
        )

    console.print(
        "[yellow]Continuing to soft evaluation to generate "
        "suggestions despite hard failures...[/yellow]"
    )
+
120
+
121
def _print_hard_evaluation_summary(hard_results: dict[str, Any]) -> None:
    """Print a pass banner, or delegate to the detailed failure report."""
    if not hard_results["all_passed"]:
        _print_hard_failure_details(hard_results)
        return
    console.print("[bold green]Hard Evaluation Passed![/bold green]")
+
127
+
128
def _print_mi_scorecard(hard_results: dict[str, Any]) -> None:
    """Print the average radon Maintainability Index, color-coded by band."""
    mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
    if not mi_scores:
        return

    avg_mi = sum(mi_scores.values()) / len(mi_scores)
    # Bands: > 50 healthy, 20-50 borderline, otherwise poor.
    if avg_mi > 50:
        color = "green"
    elif avg_mi > 20:
        color = "yellow"
    else:
        color = "red"
    console.print(f"[{color}]Average Maintainability Index: {avg_mi:.1f}/100[/{color}]")
+
137
+
138
def _print_qc_summary(qc_results: dict[str, Any]) -> None:
    """Announce the governance QC stage and report its pass/fail outcome."""
    console.print()
    console.print("[bold blue]Running Governance QC (Second Fence)...[/bold blue]")

    if not qc_results["all_passed"]:
        console.print("[bold red]Governance QC Failed![/bold red]")
        console.print()
        console.print(
            "[red]The proposed changes violate governance constraints "
            "or lack sufficient evidence.[/red]"
        )
        for reason in qc_results["failures"]:
            console.print(f"[red]- {reason}[/red]")
        console.print()
        # QC failure is non-fatal: the run proceeds so suggestions still come out.
        console.print(
            "[yellow]Continuing to soft evaluation to generate "
            "suggestions despite QC failures...[/yellow]"
        )
        console.print()
        return

    console.print(
        "[bold green]Governance QC Passed! (Change is admissible)[/bold green]"
    )
    console.print()
+
164
+
165
def _print_soft_evaluation_start() -> None:
    """Announce the start of the soft (readability) evaluation stage."""
    banner = (
        "[bold blue]Running Soft Evaluation "
        "(Readability & Understandability)...[/bold blue]"
    )
    console.print(banner)
+
171
+
172
def _print_soft_summary(soft_results: dict[str, Any]) -> None:
    """Print package stats, understandability score, and blind-QA samples."""
    summary = soft_results["package_summary"]
    console.print(
        f"[green]Analyzed {summary['total_files']} files with a total of "
        f"{summary['total_tokens']} tokens.[/green]"
    )
    console.print(
        f"[magenta]Agent's Understanding of the Package:[/magenta]\n"
        f"{summary['package_understanding']}"
    )

    console.print()
    console.print(
        f"[cyan]Overall Understandability Score:[/cyan] "
        f"{soft_results['understandability_score']:.1f}/100"
    )

    sampled = soft_results.get("qa_results", {}).get("sampled_entities", [])
    if sampled:
        console.print()
        console.print("[bold yellow]Blind QA Sampling Results:[/bold yellow]")
        for sample in sampled:
            # Scores of 80+ render green; anything lower renders red.
            tint = "green" if sample["score"] >= 80 else "red"
            console.print(
                f" - [{tint}]{sample['entity']}: Score {sample['score']}[/{tint}]"
            )
            console.print(f" [dim]Feedback: {sample['feedback']}[/dim]")

    console.print()
    console.print("[yellow]Evaluation completed. Generating report...[/yellow]")
    console.print()
+
202
+
203
def _print_final_report(final_report: dict[str, Any]) -> None:
    """Print the verdict banner, summary line, and top improvement suggestions."""
    verdict = str(final_report.get("verdict", "Unknown"))
    # Any verdict containing "Pass" renders green; everything else red.
    banner_color = "bold green" if "Pass" in verdict else "bold red"

    console.print(
        f"[{banner_color}]=== FINAL VERDICT: {verdict} ===[/{banner_color}]"
    )
    console.print(f"[bold]Summary:[/bold] {final_report.get('summary', '')}")
    console.print()

    suggestions = final_report.get("suggestions", [])
    if not suggestions:
        return
    console.print("[bold cyan]Top 3 Improvement Suggestions:[/bold cyan]")
    for rank, suggestion in enumerate(suggestions, 1):
        console.print(
            f" {rank}. [bold]{suggestion.get('title', 'Suggestion')}[/bold] "
            f"(Target: [yellow]{suggestion.get('target_file', 'unknown')}[/yellow])"
        )
        console.print(f" [dim]{suggestion.get('description', '')}[/dim]")
+
223
+
224
@app.command()
def refine(
    path: str = typer.Argument(".", help="The path to evaluate and evolve"),
    steps: int = typer.Option(1, help="Number of evolution steps to perform"),
    max_retries: int = typer.Option(3, help="Maximum retries per variant if tests fail")
) -> None:
    """
    Refine the codebase through an agentic Edit-Test-Improve loop.
    Generates variants based on suggestions, tests them, and picks the best.
    """
    console.print(
        f"[bold magenta]Starting evolution loop for path:[/bold magenta] {path} "
        f"[dim](steps={steps}, max_retries={max_retries})[/dim]"
    )

    # Step 1: baseline evaluation — its report supplies the evolution targets.
    evaluator = Evaluator(path)
    console.print("[cyan]Running baseline evaluation...[/cyan]")
    hard_results = evaluator.hard_evaluator.evaluate()
    soft_results = evaluator.soft_evaluator.evaluate()
    # QC is stubbed as all-passed here; refine only needs the suggestions.
    baseline_report = evaluator.soft_evaluator.generate_final_report(
        hard_results, {"all_passed": True, "failures": []}, soft_results
    )

    suggestions = baseline_report.get("suggestions", [])
    if not suggestions:
        console.print("[yellow]No suggestions found to evolve. Exiting.[/yellow]")
        return

    console.print(
        f"[green]Found {len(suggestions)} suggestions. "
        f"Starting evolution branches...[/green]"
    )

    # TODO: Implement the Git branching and Agent modification logic here.
    # Planned loop:
    #   for step in range(steps):
    #       for suggestion in suggestions:
    #           checkout new branch variant-X
    #           for retry in range(max_retries):
    #               ask LLM to apply suggestion to code
    #               run pytest
    #               if pytest passes:
    #                   run harness . to get new score
    #                   break
    #               else:
    #                   feed error back to LLM for retry
    #   compare all variants and checkout the best one

    console.print(
        "[yellow]Evolution engine skeleton ready. "
        "Actual git mutation logic pending.[/yellow]"
    )
@app.command()
def measure(path: str = typer.Argument(".", help="The path to evaluate")) -> None:
    """
    Measure the codebase against hard, soft, and governance constraints.
    Outputs a final report with scores and actionable improvement suggestions.
    """
    console.print(
        f"[bold green]Starting harness measurement for path:[/bold green] {path}"
    )

    evaluator = Evaluator(path)

    # Stage 1: hard checks (linters, type checkers, complexity, tests).
    console.print("[bold blue]Running Hard Evaluation (ruff, mypy)...[/bold blue]")
    hard_results = evaluator.hard_evaluator.evaluate()
    _print_hard_evaluation_summary(hard_results)
    _print_mi_scorecard(hard_results)

    # Stage 2: governance QC gate.
    qc_results = evaluator.qc_evaluator.evaluate()
    _print_qc_summary(qc_results)

    # Stage 3: soft (LLM-based) readability evaluation.
    _print_soft_evaluation_start()
    soft_results = evaluator.soft_evaluator.evaluate()
    _print_soft_summary(soft_results)

    # Synthesize everything into the final report; bail out if none produced.
    final_report = evaluator.soft_evaluator.generate_final_report(
        hard_results, qc_results, soft_results
    )
    if not final_report:
        return

    _print_final_report(final_report)
    # Non-zero exit so CI pipelines can gate on a failing verdict.
    if "Fail" in str(final_report.get("verdict", "Unknown")):
        sys.exit(1)
+
310
+
311
# Allow running this module directly as a script (typer dispatches the CLI).
if __name__ == "__main__":
    app()
@@ -30,7 +30,7 @@ class Evaluator:
30
30
 
31
31
  # Generate Final Synthesized Report with 3 Suggestions
32
32
  final_report = self.soft_evaluator.generate_final_report(
33
- hard_results, soft_results
33
+ hard_results, qc_results, soft_results
34
34
  )
35
35
 
36
36
  return {
@@ -4,12 +4,15 @@ Core module for integrating hard evaluation tools like ruff, mypy, and pytest.
4
4
 
5
5
  import json
6
6
  import subprocess
7
+ import sys
8
+ import tempfile
7
9
  from pathlib import Path
8
10
  from typing import Any
9
11
 
10
12
  from rich.console import Console
11
13
 
12
14
  console = Console()
15
+ PYTEST_TIMEOUT_SECONDS = 60
13
16
 
14
17
  class HardEvaluator:
15
18
  """
@@ -25,7 +28,15 @@ class HardEvaluator:
25
28
  """
26
29
  try:
27
30
  result = subprocess.run(
28
- ["ruff", "check", str(self.target_path), "--output-format", "json"],
31
+ [
32
+ sys.executable,
33
+ "-m",
34
+ "ruff",
35
+ "check",
36
+ str(self.target_path),
37
+ "--output-format",
38
+ "json",
39
+ ],
29
40
  capture_output=True,
30
41
  text=True,
31
42
  check=False
@@ -46,7 +57,7 @@ class HardEvaluator:
46
57
  """
47
58
  try:
48
59
  result = subprocess.run(
49
- ["mypy", str(self.target_path)],
60
+ [sys.executable, "-m", "mypy", str(self.target_path)],
50
61
  capture_output=True,
51
62
  text=True,
52
63
  check=False
@@ -101,7 +112,15 @@ class HardEvaluator:
101
112
  """
102
113
  try:
103
114
  result = subprocess.run(
104
- ["radon", "cc", "-j", "-a", str(self.target_path)],
115
+ [
116
+ sys.executable,
117
+ "-m",
118
+ "radon",
119
+ "cc",
120
+ "-j",
121
+ "-a",
122
+ str(self.target_path),
123
+ ],
105
124
  capture_output=True,
106
125
  text=True,
107
126
  check=False
@@ -136,7 +155,19 @@ class HardEvaluator:
136
155
  "output": result.stdout,
137
156
  "error_message": result.stderr if result.returncode != 0 else ""
138
157
  }
158
+ except FileNotFoundError:
159
+ return {
160
+ "status": "warning",
161
+ "issues": [],
162
+ "error_message": "radon executable not found. Please install it."
163
+ }
139
164
  except Exception as e:
165
+ if "No module named radon" in str(e) or "radon" in str(e):
166
+ return {
167
+ "status": "warning",
168
+ "issues": [],
169
+ "error_message": "radon executable not found. Please install it."
170
+ }
140
171
  return {"status": "error", "error_message": str(e)}
141
172
 
142
173
  def run_radon_mi(self) -> dict[str, Any]:
@@ -147,7 +178,7 @@ class HardEvaluator:
147
178
  """
148
179
  try:
149
180
  result = subprocess.run(
150
- ["radon", "mi", "-j", str(self.target_path)],
181
+ [sys.executable, "-m", "radon", "mi", "-j", str(self.target_path)],
151
182
  capture_output=True,
152
183
  text=True,
153
184
  check=False
@@ -164,7 +195,19 @@ class HardEvaluator:
164
195
  "mi_scores": mi_scores,
165
196
  "return_code": result.returncode,
166
197
  }
198
+ except FileNotFoundError:
199
+ return {
200
+ "status": "warning",
201
+ "mi_scores": {},
202
+ "error_message": "radon executable not found. Please install it."
203
+ }
167
204
  except Exception as e:
205
+ if "No module named radon" in str(e) or "radon" in str(e):
206
+ return {
207
+ "status": "warning",
208
+ "mi_scores": {},
209
+ "error_message": "radon executable not found. Please install it."
210
+ }
168
211
  return {"status": "error", "error_message": str(e)}
169
212
 
170
213
  def run_pytest(self) -> dict[str, Any]:
@@ -172,22 +215,44 @@ class HardEvaluator:
172
215
  Run Pytest test suite and return coverage results.
173
216
  """
174
217
  try:
175
- # When pytest is run within pytest, it can cause issues or hang.
176
- # Here we just run it as a subprocess to gather results.
177
- result = subprocess.run(
178
- ["pytest", str(self.target_path), "--cov", "--cov-report=json"],
179
- capture_output=True,
180
- text=True,
181
- check=False
182
- )
218
+ with tempfile.TemporaryDirectory() as tmp_dir:
219
+ coverage_report = Path(tmp_dir) / "coverage.json"
220
+ result = subprocess.run(
221
+ [
222
+ sys.executable,
223
+ "-m",
224
+ "pytest",
225
+ str(self.target_path),
226
+ "--cov",
227
+ f"--cov-report=json:{coverage_report}",
228
+ ],
229
+ capture_output=True,
230
+ text=True,
231
+ check=False,
232
+ timeout=PYTEST_TIMEOUT_SECONDS,
233
+ )
234
+ coverage_percentage = None
235
+ if coverage_report.exists():
236
+ coverage_data = json.loads(coverage_report.read_text())
237
+ coverage_percentage = coverage_data.get("totals", {}).get(
238
+ "percent_covered"
239
+ )
183
240
  status = "success" if result.returncode == 0 else "failed"
184
241
  return {
185
242
  "status": status,
186
243
  "output": result.stdout,
187
244
  "return_code": result.returncode,
245
+ "coverage_percentage": coverage_percentage,
246
+ }
247
+ except subprocess.TimeoutExpired:
248
+ return {
249
+ "status": "failed",
250
+ "error_message": (
251
+ f"Pytest run timed out after {PYTEST_TIMEOUT_SECONDS} seconds."
252
+ ),
188
253
  }
189
254
  except Exception as e:
190
- return {"status": "error", "error_message": str(e)}
255
+ return {"status": "error", "error_message": str(e)}
191
256
 
192
257
  def evaluate(self) -> dict[str, Any]:
193
258
  """
@@ -199,13 +264,30 @@ class HardEvaluator:
199
264
  ty_res = self.run_ty()
200
265
  radon_cc_res = self.run_radon_cc()
201
266
  radon_mi_res = self.run_radon_mi()
202
- # pytest_res = self.run_pytest() # Better handled as a separate stage
267
+ pytest_res = self.run_pytest()
203
268
 
269
+ # Parse pytest coverage to check if it's < 90%
270
+ cov_percentage = pytest_res.get("coverage_percentage")
271
+ if pytest_res.get("status") == "success":
272
+ if isinstance(cov_percentage, (int, float)):
273
+ if cov_percentage < 90.0:
274
+ pytest_res["status"] = "failed"
275
+ pytest_res["error_message"] = (
276
+ f"Test coverage is {cov_percentage:.2f}%, "
277
+ f"which is below the 90% threshold."
278
+ )
279
+ else:
280
+ pytest_res["status"] = "failed"
281
+ pytest_res["error_message"] = (
282
+ "Coverage report was missing or unreadable."
283
+ )
284
+
204
285
  all_passed = (
205
286
  ruff_res.get("status") == "success" and
206
287
  mypy_res.get("status") == "success" and
207
288
  ty_res.get("status") in ("success", "warning") and
208
- radon_cc_res.get("status") == "success"
289
+ radon_cc_res.get("status") in ("success", "warning") and
290
+ pytest_res.get("status") == "success"
209
291
  )
210
292
 
211
293
  return {
@@ -214,5 +296,6 @@ class HardEvaluator:
214
296
  "mypy": mypy_res,
215
297
  "ty": ty_res,
216
298
  "radon_cc": radon_cc_res,
217
- "radon_mi": radon_mi_res
299
+ "radon_mi": radon_mi_res,
300
+ "pytest": pytest_res
218
301
  }