PyPI - python-harness - Versions diffs - 0.0.11__tar.gz → 0.0.13__tar.gz - Mend

python-harness 0.0.11__tar.gz → 0.0.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

{python_harness-0.0.11/python_harness.egg-info → python_harness-0.0.13}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: python-harness
-Version: 0.0.11
+Version: 0.0.13
 Summary: An agentic codebase evaluation and evolution tool for Python projects.
 Author-email: Mingli Yuan <mingli.yuan@gmail.com>
 License: MIT

{python_harness-0.0.11 → python_harness-0.0.13}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "python-harness"
-version = "0.0.11"
+version = "0.0.13"
 description = "An agentic codebase evaluation and evolution tool for Python projects."
 requires-python = ">=3.10"
 readme = "README.md"

{python_harness-0.0.11 → python_harness-0.0.13}/python_harness/__init__.py RENAMED Viewed

@@ -2,4 +2,4 @@
 Python Harness - An agentic evaluation tool for codebases.
 """
-__version__ = "0.0.11"
+__version__ = "0.0.12"

{python_harness-0.0.11 → python_harness-0.0.13}/python_harness/cli.py RENAMED Viewed

@@ -4,6 +4,7 @@ Command-line interface for python-harness.
 import os
 import sys
+from pathlib import Path
 from typing import Any
 import typer
@@ -11,6 +12,7 @@ from dotenv import load_dotenv
 from rich.console import Console
 from python_harness.evaluator import Evaluator
+from python_harness.refine_engine import run_refine
 # Try to find .env file explicitly before anything else executes
 env_path = os.path.join(os.getcwd(), '.env')
@@ -21,6 +23,8 @@ else:
 app = typer.Typer(help="Agentic harness tool for universal Python codebase evaluation.")
 console = Console()
+MI_HEALTHY_THRESHOLD = 70.0
+MI_WARNING_THRESHOLD = 40.0
 def _print_detail_block(title: str, details: str, color: str) -> None:
@@ -133,13 +137,21 @@ def _print_hard_evaluation_summary(hard_results: dict[str, Any]) -> None:
     _print_hard_failure_details(hard_results)
+def _mi_scorecard_color(avg_mi: float) -> str:
+    if avg_mi >= MI_HEALTHY_THRESHOLD:
+        return "green"
+    if avg_mi >= MI_WARNING_THRESHOLD:
+        return "yellow"
+    return "red"
 def _print_mi_scorecard(hard_results: dict[str, Any]) -> None:
     mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
     if not mi_scores:
         return
     avg_mi = sum(mi_scores.values()) / len(mi_scores)
-    color = "green" if avg_mi > 50 else "yellow" if avg_mi > 20 else "red"
+    color = _mi_scorecard_color(avg_mi)
     console.print(f"[{color}]Average Maintainability Index: {avg_mi:.1f}/100[/{color}]")
@@ -221,7 +233,7 @@ def _print_final_report(final_report: dict[str, Any]) -> None:
     suggestions = final_report.get("suggestions", [])
     if suggestions:
         console.print("[bold cyan]Top 3 Improvement Suggestions:[/bold cyan]")
-        for i, sug in enumerate(suggestions, 1):
+        for i, sug in enumerate(suggestions[:3], 1):
             console.print(
                 f"  {i}. [bold]{sug.get('title', 'Suggestion')}[/bold] "
                 f"(Target: [yellow]{sug.get('target_file', 'unknown')}[/yellow])"
@@ -232,56 +244,30 @@ def _print_final_report(final_report: dict[str, Any]) -> None:
 @app.command()
 def refine(
     path: str = typer.Argument(".", help="The path to evaluate and evolve"),
-    steps: int = typer.Option(1, help="Number of evolution steps to perform"),
-    max_retries: int = typer.Option(3, help="Maximum retries per variant if tests fail")
+    max_retries: int = typer.Option(3, help="Maximum retries per candidate"),
+    loop: bool = typer.Option(False, help="Keep refining winners across rounds"),
+    max_rounds: int = typer.Option(3, help="Maximum refine rounds when looping"),
 ) -> None:
     """
-    Refine the codebase through an agentic Edit-Test-Improve loop.
-    Generates variants based on suggestions, tests them, and picks the best.
+    Refine the codebase through a fixed two-level search and optional loop.
     """
     console.print(
-        f"[bold magenta]Starting evolution loop for path:[/bold magenta] {path} "
-        f"[dim](steps={steps}, max_retries={max_retries})[/dim]"
-    )
-    # 1. First, run a baseline evaluation to get suggestions
-    evaluator = Evaluator(path)
-    console.print("[cyan]Running baseline evaluation...[/cyan]")
-    hard_results = evaluator.hard_evaluator.evaluate()
-    soft_results = evaluator.soft_evaluator.evaluate()
-    baseline_report = evaluator.soft_evaluator.generate_final_report(
-        hard_results, {"all_passed": True, "failures": []}, soft_results
-    )
-    suggestions = baseline_report.get("suggestions", [])
-    if not suggestions:
-        console.print("[yellow]No suggestions found to evolve. Exiting.[/yellow]")
-        return
-    console.print(
-        f"[green]Found {len(suggestions)} suggestions. "
-        f"Starting evolution branches...[/green]"
+        f"[bold magenta]Starting refine for path:[/bold magenta] {path} "
+        f"[dim](loop={loop}, max_rounds={max_rounds}, "
+        f"max_retries={max_retries})[/dim]"
     )
-    # TODO: Implement the Git branching and Agent modification logic here.
-    # The loop will be:
-    # for step in range(steps):
-    #   for suggestion in suggestions:
-    #     checkout new branch variant-X
-    #     for retry in range(max_retries):
-    #       ask LLM to apply suggestion to code
-    #       run pytest
-    #       if pytest passes:
-    #         run harness . to get new score
-    #         break
-    #       else:
-    #         feed error back to LLM for retry
-    #   compare all variants and checkout the best one
-    console.print(
-        "[yellow]Evolution engine skeleton ready. "
-        "Actual git mutation logic pending.[/yellow]"
+    target_path = Path(path).resolve()
+    result = run_refine(
+        target_path=target_path,
+        max_retries=max_retries,
+        loop=loop,
+        max_rounds=max_rounds,
+        progress_callback=lambda message: console.print(f"[dim]{message}[/dim]"),
     )
+    console.print(f"[green]winner_id:[/green] {result['winner_id']}")
+    console.print(f"[cyan]rounds_completed:[/cyan] {result['rounds_completed']}")
+    console.print(f"[yellow]stop_reason:[/yellow] {result['stop_reason']}")
 @app.command()
 def measure(path: str = typer.Argument(".", help="The path to evaluate")) -> None:
     """

{python_harness-0.0.11 → python_harness-0.0.13}/python_harness/hard_evaluator.py RENAMED Viewed

@@ -11,6 +11,8 @@ from typing import Any
 from rich.console import Console
+from python_harness.python_file_inventory import collect_python_files
 console = Console()
 PYTEST_TIMEOUT_SECONDS = 60
@@ -22,6 +24,9 @@ class HardEvaluator:
     def __init__(self, target_path: str):
         self.target_path = Path(target_path).resolve()
+    def _radon_metric_targets(self) -> list[str]:
+        return [str(file_path) for file_path in collect_python_files(self.target_path)]
     def run_ruff(self) -> dict[str, Any]:
         """
         Run Ruff linter and return results.
@@ -112,6 +117,14 @@ class HardEvaluator:
         Flag any function/method with CC > 15 as a failure.
         """
         try:
+            targets = self._radon_metric_targets()
+            if not targets:
+                return {
+                    "status": "success",
+                    "issues": [],
+                    "return_code": 0,
+                    "output": "",
+                }
             result = subprocess.run(
                 [
                     sys.executable,
@@ -120,7 +133,7 @@ class HardEvaluator:
                     "cc",
                     "-j",
                     "-a",
-                    str(self.target_path),
+                    *targets,
                 ],
                 capture_output=True,
                 text=True,
@@ -178,8 +191,11 @@ class HardEvaluator:
         but it contributes to the scorecard.
         """
         try:
+            targets = self._radon_metric_targets()
+            if not targets:
+                return {"status": "success", "mi_scores": {}, "return_code": 0}
             result = subprocess.run(
-                [sys.executable, "-m", "radon", "mi", "-j", str(self.target_path)],
+                [sys.executable, "-m", "radon", "mi", "-j", *targets],
                 capture_output=True,
                 text=True,
                 check=False

python_harness-0.0.13/python_harness/llm_client.py ADDED Viewed

@@ -0,0 +1,32 @@
+import os
+from dataclasses import dataclass
+from typing import Any
+from openai import OpenAI
+@dataclass(frozen=True)
+class LLMSettings:
+    api_key: str | None
+    base_url: str
+    model_name: str
+    mini_model_name: str
+    request_timeout_seconds: float
+def load_llm_settings() -> LLMSettings:
+    return LLMSettings(
+        api_key=os.environ.get("LLM_API_KEY"),
+        base_url=os.environ.get("LLM_BASE_URL", "https://api.deepseek.com/v1"),
+        model_name=os.environ.get("LLM_MODEL_NAME", "deepseek-reasoner"),
+        mini_model_name=os.environ.get("LLM_MINI_MODEL_NAME", "deepseek-chat"),
+        request_timeout_seconds=float(
+            os.environ.get("LLM_REQUEST_TIMEOUT_SECONDS", "60")
+        ),
+    )
+def build_llm_client(settings: LLMSettings) -> Any | None:
+    if not settings.api_key:
+        return None
+    return OpenAI(api_key=settings.api_key, base_url=settings.base_url)

python_harness-0.0.13/python_harness/python_file_inventory.py ADDED Viewed

@@ -0,0 +1,27 @@
+"""
+Python file discovery helpers.
+"""
+from pathlib import Path
+SKIPPED_DIRS = {"__pycache__", "env", "test", "tests", "vendors", "venv"}
+def should_skip_python_path(file_path: Path, root: Path) -> bool:
+    if file_path.name.startswith("test_") or file_path.name.endswith("_test.py"):
+        return True
+    try:
+        relative_parts = file_path.relative_to(root).parts
+    except ValueError:
+        relative_parts = file_path.parts
+    return any(part.startswith(".") or part in SKIPPED_DIRS for part in relative_parts)
+def collect_python_files(root: Path) -> list[Path]:
+    if root.is_file():
+        return [root] if root.suffix == ".py" else []
+    return [
+        file_path
+        for file_path in sorted(root.rglob("*.py"))
+        if not should_skip_python_path(file_path, root)
+    ]

python_harness-0.0.13/python_harness/refine_apply.py ADDED Viewed

@@ -0,0 +1,177 @@
+import json
+from pathlib import Path
+from typing import Any, cast
+from python_harness.llm_client import build_llm_client, load_llm_settings
+from python_harness.python_file_inventory import collect_python_files
+class NullSuggestionApplier:
+    def apply(
+        self,
+        workspace: Path,
+        suggestion: dict[str, str],
+        failure_feedback: str = "",
+    ) -> dict[str, Any]:
+        return {
+            "ok": True,
+            "touched_files": [],
+            "failure_reason": "",
+            "suggestion_title": suggestion.get("title", ""),
+            "failure_feedback": failure_feedback,
+            "workspace": str(workspace),
+        }
+class LLMSuggestionApplier:
+    def __init__(
+        self,
+        client: Any | None = None,
+        model_name: str | None = None,
+    ) -> None:
+        settings = load_llm_settings()
+        self.client = client if client is not None else build_llm_client(settings)
+        self.model_name = model_name or settings.mini_model_name
+        self.request_timeout_seconds = settings.request_timeout_seconds
+    def _select_files(self, workspace: Path, suggestion: dict[str, str]) -> list[Path]:
+        target_file = suggestion.get("target_file", "").strip()
+        if target_file and target_file != "all":
+            target_path = workspace / target_file
+            if target_path.is_file():
+                return [target_path]
+            if target_path.is_dir():
+                return sorted(target_path.rglob("*.py"))[:3]
+        return collect_python_files(workspace)[:3]
+    def _build_messages(
+        self,
+        workspace: Path,
+        suggestion: dict[str, str],
+        failure_feedback: str,
+        files: list[Path],
+    ) -> list[dict[str, str]]:
+        inventory = "\n".join(
+            f"- {file_path.relative_to(workspace)}"
+            for file_path in collect_python_files(workspace)
+        )
+        file_blocks = "\n\n".join(
+            (
+                f"FILE: {file_path.relative_to(workspace)}\n"
+                f"```python\n{file_path.read_text(encoding='utf-8')}\n```"
+            )
+            for file_path in files
+        )
+        system_prompt = (
+            "You apply a single repository improvement suggestion. "
+            "Return only valid JSON with schema "
+            '{"updates":[{"path":"relative/path.py","content":"full file content"}]}. '
+            "Make the smallest possible change that satisfies the suggestion "
+            "and preserves behavior. "
+            "Never write files outside the workspace."
+        )
+        user_prompt = (
+            f"Suggestion title: {suggestion.get('title', '')}\n"
+            f"Suggestion description: {suggestion.get('description', '')}\n"
+            f"Suggestion target_file: {suggestion.get('target_file', 'all')}\n"
+            f"Failure feedback from previous attempt: {failure_feedback or 'None'}\n\n"
+            f"Workspace python inventory:\n{inventory}\n\n"
+            f"Editable file contents:\n{file_blocks}"
+        )
+        return [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ]
+    def _parse_updates(self, raw_content: str) -> list[dict[str, str]]:
+        payload = json.loads(raw_content)
+        updates = payload.get("updates", [])
+        if not isinstance(updates, list):
+            raise ValueError("LLM updates payload must contain a list")
+        parsed: list[dict[str, str]] = []
+        for update in updates:
+            if not isinstance(update, dict):
+                continue
+            path = update.get("path")
+            content = update.get("content")
+            if isinstance(path, str) and isinstance(content, str):
+                parsed.append({"path": path, "content": content})
+        if not parsed:
+            raise ValueError("LLM returned no file updates")
+        return parsed
+    def apply(
+        self,
+        workspace: Path,
+        suggestion: dict[str, str],
+        failure_feedback: str = "",
+    ) -> dict[str, Any]:
+        if self.client is None:
+            return {
+                "ok": False,
+                "touched_files": [],
+                "failure_reason": "LLM_API_KEY not configured",
+            }
+        files = self._select_files(workspace, suggestion)
+        if not files:
+            return {
+                "ok": False,
+                "touched_files": [],
+                "failure_reason": "No editable files selected for suggestion",
+            }
+        client = cast(Any, self.client)
+        try:
+            completion = client.chat.completions.create(
+                model=self.model_name,
+                messages=self._build_messages(
+                    workspace,
+                    suggestion,
+                    failure_feedback,
+                    files,
+                ),
+                response_format={"type": "json_object"},
+                timeout=self.request_timeout_seconds,
+            )
+        except Exception as exc:
+            return {
+                "ok": False,
+                "touched_files": [],
+                "failure_reason": str(exc),
+                "retryable": False,
+            }
+        content = completion.choices[0].message.content
+        if not content:
+            return {
+                "ok": False,
+                "touched_files": [],
+                "failure_reason": "LLM returned empty response",
+                "retryable": False,
+            }
+        try:
+            updates = self._parse_updates(content)
+            touched_files: list[str] = []
+            for update in updates:
+                destination = (workspace / update["path"]).resolve()
+                if not destination.is_relative_to(workspace.resolve()):
+                    raise ValueError("LLM update path is outside workspace")
+                destination.parent.mkdir(parents=True, exist_ok=True)
+                destination.write_text(update["content"], encoding="utf-8")
+                touched_files.append(str(destination.relative_to(workspace)))
+        except Exception as exc:
+            return {
+                "ok": False,
+                "touched_files": [],
+                "failure_reason": str(exc),
+                "retryable": False,
+            }
+        return {
+            "ok": True,
+            "touched_files": touched_files,
+            "failure_reason": "",
+            "suggestion_title": suggestion.get("title", ""),
+            "failure_feedback": failure_feedback,
+            "workspace": str(workspace),
+        }

python_harness-0.0.13/python_harness/refine_checks.py ADDED Viewed

@@ -0,0 +1,29 @@
+import subprocess
+import sys
+from pathlib import Path
+def run_command(path: Path, args: list[str]) -> tuple[bool, str]:
+    command_cwd = path if path.is_dir() else path.parent
+    completed = subprocess.run(
+        args,
+        cwd=command_cwd,
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    output = (completed.stdout + completed.stderr).strip()
+    return completed.returncode == 0, output
+def default_self_check_runner(path: Path) -> tuple[bool, str]:
+    checks = [
+        [sys.executable, "-m", "ruff", "check", str(path)],
+        [sys.executable, "-m", "mypy", str(path)],
+        [sys.executable, "-m", "pytest", str(path)],
+    ]
+    for args in checks:
+        ok, output = run_command(path, args)
+        if not ok:
+            return False, output
+    return True, ""

python_harness-0.0.13/python_harness/refine_engine.py ADDED Viewed

@@ -0,0 +1,41 @@
+from collections.abc import Callable
+from pathlib import Path
+from typing import Any
+from python_harness.refine_checks import default_self_check_runner
+from python_harness.refine_execution import (
+    execute_candidate as _execute_candidate,
+)
+from python_harness.refine_rounds import (
+    default_evaluator_runner,
+    default_workspace_root,
+    suggestions_from,
+    validate_workspace_root,
+)
+from python_harness.refine_rounds import (
+    run_refine as _run_refine,
+)
+from python_harness.refine_rounds import (
+    run_refine_round as _run_refine_round,
+)
+SelfCheckRunner = Callable[[Path], tuple[bool, str]]
+EvaluatorRunner = Callable[[Path], dict[str, Any]]
+_default_evaluator_runner = default_evaluator_runner
+_default_self_check_runner = default_self_check_runner
+_default_workspace_root = default_workspace_root
+_suggestions_from = suggestions_from
+_validate_workspace_root = validate_workspace_root
+def execute_candidate(*args: Any, **kwargs: Any) -> Any:
+    return _execute_candidate(*args, **kwargs)
+def run_refine_round(*args: Any, **kwargs: Any) -> Any:
+    return _run_refine_round(*args, **kwargs)
+def run_refine(*args: Any, **kwargs: Any) -> Any:
+    return _run_refine(*args, **kwargs)

python_harness-0.0.13/python_harness/refine_execution.py ADDED Viewed

@@ -0,0 +1,114 @@
+from pathlib import Path
+from typing import Any
+from python_harness.refine_models import Candidate, SuggestionApplier
+from python_harness.refine_workspace import create_candidate_workspace
+def _emit(progress_callback: Any, message: str) -> None:
+    if progress_callback is not None:
+        progress_callback(message)
+def execute_candidate(
+    *,
+    parent: Candidate,
+    candidate_id: str,
+    suggestion: dict[str, str],
+    workspace_root: Path,
+    applier: SuggestionApplier,
+    self_check_runner: Any,
+    evaluator_runner: Any,
+    max_retries: int,
+    progress_callback: Any = None,
+) -> Candidate:
+    workspace = create_candidate_workspace(
+        parent.workspace,
+        workspace_root,
+        candidate_id,
+    )
+    feedback = ""
+    retries = 0
+    suggestion_title = suggestion.get("title", candidate_id)
+    while True:
+        apply_result: dict[str, Any] | None = None
+        _emit(
+            progress_callback,
+            f"{candidate_id} apply started: {suggestion_title}",
+        )
+        try:
+            apply_result = applier.apply(
+                workspace,
+                suggestion,
+                failure_feedback=feedback,
+            )
+            if not bool(apply_result.get("ok", False)):
+                feedback = str(
+                    apply_result.get("failure_reason") or "suggestion apply failed"
+                )
+                raise RuntimeError(feedback)
+            _emit(progress_callback, f"{candidate_id} apply passed")
+        except Exception as exc:
+            feedback = str(exc)
+            retryable = True
+            if apply_result is not None:
+                retryable = bool(apply_result.get("retryable", True))
+            _emit(progress_callback, f"{candidate_id} apply failed: {feedback}")
+            if not retryable:
+                return Candidate(
+                    id=candidate_id,
+                    parent_id=parent.id,
+                    depth=parent.depth + 1,
+                    workspace=workspace,
+                    suggestion_trace=parent.suggestion_trace + (suggestion_title,),
+                    status="failed",
+                    retry_count=retries,
+                    selection_reason=feedback,
+                )
+            retries += 1
+            if retries > max_retries:
+                return Candidate(
+                    id=candidate_id,
+                    parent_id=parent.id,
+                    depth=parent.depth + 1,
+                    workspace=workspace,
+                    suggestion_trace=parent.suggestion_trace + (suggestion_title,),
+                    status="failed",
+                    retry_count=retries - 1,
+                    selection_reason=feedback,
+                )
+            continue
+        _emit(progress_callback, f"{candidate_id} guardrail 1 started")
+        is_ok, feedback = self_check_runner(workspace)
+        if is_ok:
+            _emit(progress_callback, f"{candidate_id} guardrail 1 passed")
+            _emit(progress_callback, f"{candidate_id} guardrail 2 started")
+            evaluation = evaluator_runner(workspace)
+            _emit(progress_callback, f"{candidate_id} guardrail 2 passed")
+            return Candidate(
+                id=candidate_id,
+                parent_id=parent.id,
+                depth=parent.depth + 1,
+                workspace=workspace,
+                suggestion_trace=parent.suggestion_trace + (suggestion_title,),
+                evaluation=evaluation,
+                status="measured",
+                retry_count=retries,
+            )
+        _emit(progress_callback, f"{candidate_id} guardrail 1 failed")
+        _emit(progress_callback, feedback)
+        retries += 1
+        if retries > max_retries:
+            return Candidate(
+                id=candidate_id,
+                parent_id=parent.id,
+                depth=parent.depth + 1,
+                workspace=workspace,
+                suggestion_trace=parent.suggestion_trace + (suggestion_title,),
+                status="failed",
+                retry_count=retries - 1,
+                selection_reason=str(feedback),
+            )

python_harness-0.0.13/python_harness/refine_models.py ADDED Viewed

@@ -0,0 +1,40 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Protocol
+@dataclass(slots=True)
+class Candidate:
+    id: str
+    parent_id: str | None
+    depth: int
+    workspace: Path
+    suggestion_trace: tuple[str, ...]
+    evaluation: dict[str, Any] | None = None
+    status: str = "pending"
+    retry_count: int = 0
+    selection_reason: str = ""
+@dataclass(slots=True)
+class SelectionResult:
+    winner: Candidate
+    ordered_ids: list[str]
+    reason: str
+@dataclass(slots=True)
+class RefineRoundResult:
+    baseline: Candidate
+    candidates: list[Candidate] = field(default_factory=list)
+    winner: Candidate | None = None
+    stop_reason: str = ""
+class SuggestionApplier(Protocol):
+    def apply(
+        self,
+        workspace: Path,
+        suggestion: dict[str, str],
+        failure_feedback: str = "",
+    ) -> dict[str, Any]: ...

python-harness 0.0.11__tar.gz → 0.0.13__tar.gz

python-harness 0.0.11__tar.gz → 0.0.13__tar.gz