python-harness 0.0.11__tar.gz → 0.0.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {python_harness-0.0.11/python_harness.egg-info → python_harness-0.0.12}/PKG-INFO +1 -1
- {python_harness-0.0.11 → python_harness-0.0.12}/pyproject.toml +1 -1
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness/__init__.py +1 -1
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness/cli.py +11 -1
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness/hard_evaluator.py +18 -2
- python_harness-0.0.12/python_harness/python_file_inventory.py +27 -0
- python_harness-0.0.12/python_harness/soft_eval_report.py +154 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness/soft_evaluator.py +19 -133
- {python_harness-0.0.11 → python_harness-0.0.12/python_harness.egg-info}/PKG-INFO +1 -1
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness.egg-info/SOURCES.txt +2 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/tests/test_cli.py +14 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/tests/test_hard_evaluator.py +16 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/tests/test_soft_evaluator.py +62 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/LICENSE +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/README.md +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness/evaluator.py +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness/qc_evaluator.py +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness.egg-info/dependency_links.txt +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness.egg-info/entry_points.txt +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness.egg-info/requires.txt +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/python_harness.egg-info/top_level.txt +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/setup.cfg +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/tests/test_evaluator.py +0 -0
- {python_harness-0.0.11 → python_harness-0.0.12}/tests/test_qc_evaluator.py +0 -0
|
@@ -21,6 +21,8 @@ else:
|
|
|
21
21
|
|
|
22
22
|
app = typer.Typer(help="Agentic harness tool for universal Python codebase evaluation.")
|
|
23
23
|
console = Console()
|
|
24
|
+
MI_HEALTHY_THRESHOLD = 70.0
|
|
25
|
+
MI_WARNING_THRESHOLD = 40.0
|
|
24
26
|
|
|
25
27
|
|
|
26
28
|
def _print_detail_block(title: str, details: str, color: str) -> None:
|
|
@@ -133,13 +135,21 @@ def _print_hard_evaluation_summary(hard_results: dict[str, Any]) -> None:
|
|
|
133
135
|
_print_hard_failure_details(hard_results)
|
|
134
136
|
|
|
135
137
|
|
|
138
|
+
def _mi_scorecard_color(avg_mi: float) -> str:
    """Map an average Maintainability Index to a Rich color name.

    "green" at or above the healthy threshold, "yellow" at or above the
    warning threshold, "red" below both.
    """
    if avg_mi < MI_WARNING_THRESHOLD:
        return "red"
    return "green" if avg_mi >= MI_HEALTHY_THRESHOLD else "yellow"
|
|
144
|
+
|
|
145
|
+
|
|
136
146
|
def _print_mi_scorecard(hard_results: dict[str, Any]) -> None:
    """Print the average Maintainability Index, color-coded by health.

    Stays silent when no files were scored rather than printing a
    meaningless average.
    """
    scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
    if not scores:
        return
    average = sum(scores.values()) / len(scores)
    color = _mi_scorecard_color(average)
    console.print(f"[{color}]Average Maintainability Index: {average:.1f}/100[/{color}]")
|
|
144
154
|
|
|
145
155
|
|
|
@@ -11,6 +11,8 @@ from typing import Any
|
|
|
11
11
|
|
|
12
12
|
from rich.console import Console
|
|
13
13
|
|
|
14
|
+
from python_harness.python_file_inventory import collect_python_files
|
|
15
|
+
|
|
14
16
|
console = Console()
|
|
15
17
|
PYTEST_TIMEOUT_SECONDS = 60
|
|
16
18
|
|
|
@@ -22,6 +24,9 @@ class HardEvaluator:
|
|
|
22
24
|
def __init__(self, target_path: str):
|
|
23
25
|
self.target_path = Path(target_path).resolve()
|
|
24
26
|
|
|
27
|
+
def _radon_metric_targets(self) -> list[str]:
    """Return the Python files radon should analyze, as string paths."""
    return [str(path) for path in collect_python_files(self.target_path)]
|
|
29
|
+
|
|
25
30
|
def run_ruff(self) -> dict[str, Any]:
|
|
26
31
|
"""
|
|
27
32
|
Run Ruff linter and return results.
|
|
@@ -112,6 +117,14 @@ class HardEvaluator:
|
|
|
112
117
|
Flag any function/method with CC > 15 as a failure.
|
|
113
118
|
"""
|
|
114
119
|
try:
|
|
120
|
+
targets = self._radon_metric_targets()
|
|
121
|
+
if not targets:
|
|
122
|
+
return {
|
|
123
|
+
"status": "success",
|
|
124
|
+
"issues": [],
|
|
125
|
+
"return_code": 0,
|
|
126
|
+
"output": "",
|
|
127
|
+
}
|
|
115
128
|
result = subprocess.run(
|
|
116
129
|
[
|
|
117
130
|
sys.executable,
|
|
@@ -120,7 +133,7 @@ class HardEvaluator:
|
|
|
120
133
|
"cc",
|
|
121
134
|
"-j",
|
|
122
135
|
"-a",
|
|
123
|
-
|
|
136
|
+
*targets,
|
|
124
137
|
],
|
|
125
138
|
capture_output=True,
|
|
126
139
|
text=True,
|
|
@@ -178,8 +191,11 @@ class HardEvaluator:
|
|
|
178
191
|
but it contributes to the scorecard.
|
|
179
192
|
"""
|
|
180
193
|
try:
|
|
194
|
+
targets = self._radon_metric_targets()
|
|
195
|
+
if not targets:
|
|
196
|
+
return {"status": "success", "mi_scores": {}, "return_code": 0}
|
|
181
197
|
result = subprocess.run(
|
|
182
|
-
[sys.executable, "-m", "radon", "mi", "-j",
|
|
198
|
+
[sys.executable, "-m", "radon", "mi", "-j", *targets],
|
|
183
199
|
capture_output=True,
|
|
184
200
|
text=True,
|
|
185
201
|
check=False
|
|
"""
Python file discovery helpers.
"""

from pathlib import Path

# Directory names that never contribute to analysis targets.
SKIPPED_DIRS = {"__pycache__", "env", "test", "tests", "vendors", "venv"}


def should_skip_python_path(file_path: Path, root: Path) -> bool:
    """Return True when *file_path* should be excluded from analysis.

    A file is skipped when its name looks like a test module
    (``test_*.py`` / ``*_test.py``) or when any path component relative
    to *root* is hidden (dot-prefixed) or a conventionally ignored
    directory.
    """
    name = file_path.name
    if name.startswith("test_") or name.endswith("_test.py"):
        return True
    try:
        parts = file_path.relative_to(root).parts
    except ValueError:
        # file_path lies outside root; fall back to its own components.
        parts = file_path.parts
    for part in parts:
        if part.startswith(".") or part in SKIPPED_DIRS:
            return True
    return False


def collect_python_files(root: Path) -> list[Path]:
    """Collect the analyzable ``.py`` files under *root*, sorted by path.

    A *root* that is itself a file yields a one-element list when it is a
    Python file, otherwise an empty list.
    """
    if root.is_file():
        return [root] if root.suffix == ".py" else []
    discovered: list[Path] = []
    for candidate in sorted(root.rglob("*.py")):
        if not should_skip_python_path(candidate, root):
            discovered.append(candidate)
    return discovered
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Report-building helpers for soft evaluation.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
MI_PASS_THRESHOLD = 70.0
|
|
9
|
+
QA_PASS_THRESHOLD = 75.0
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def collect_hard_errors(hard_results: dict[str, Any]) -> list[str]:
    """Summarize failing hard-evaluation tools as human-readable strings.

    Returns an empty list when the hard gate passed overall; otherwise one
    entry per failing tool. Pytest failures surface the tool-provided
    error message when one exists.
    """
    if hard_results.get("all_passed", True):
        return []

    errors: list[str] = []
    simple_tools = (
        ("ruff", "Linter (Ruff) failed."),
        ("mypy", "Type checker (Mypy) failed."),
    )
    for tool, message in simple_tools:
        if hard_results.get(tool, {}).get("status") != "success":
            errors.append(message)
    pytest_result = hard_results.get("pytest", {})
    if pytest_result.get("status") != "success":
        errors.append(pytest_result.get("error_message", "Tests or Coverage failed."))
    return errors
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def extract_metrics(
    hard_results: dict[str, Any],
    qc_results: dict[str, Any],
    soft_results: dict[str, Any],
) -> dict[str, Any]:
    """Flatten hard/QC/soft evaluation outputs into one metrics mapping."""
    scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
    if scores:
        average_mi = sum(scores.values()) / len(scores)
    else:
        # Nothing was scored: treat the codebase as perfectly maintainable.
        average_mi = 100.0
    metrics: dict[str, Any] = {}
    metrics["avg_mi"] = average_mi
    metrics["cc_issues"] = hard_results.get("radon_cc", {}).get("issues", [])
    metrics["hard_errors"] = collect_hard_errors(hard_results)
    metrics["hard_failed"] = not hard_results.get("all_passed", True)
    metrics["qa_entities"] = soft_results.get("qa_results", {}).get("sampled_entities", [])
    metrics["qa_score"] = soft_results.get("understandability_score", 100.0)
    metrics["qc_errors"] = qc_results.get("failures", [])
    metrics["qc_failed"] = not qc_results.get("all_passed", True)
    return metrics
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def determine_verdict(metrics: dict[str, Any], mock: bool = False) -> str:
    """Compute the overall "Pass"/"Fail" verdict from aggregated metrics.

    Hard or QC failures are unconditional fails; otherwise the codebase
    must clear the MI and QA thresholds with no critical complexity
    issues. Mock runs carry a " (Mock)" suffix on the verdict.
    """
    suffix = " (Mock)" if mock else ""
    if metrics["hard_failed"] or metrics["qc_failed"]:
        return f"Fail{suffix}"
    meets_mi = metrics["avg_mi"] >= MI_PASS_THRESHOLD
    meets_qa = metrics["qa_score"] > QA_PASS_THRESHOLD
    clean_cc = not metrics["cc_issues"]
    verdict = "Pass" if (meets_mi and meets_qa and clean_cc) else "Fail"
    return f"{verdict}{suffix}"
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def build_mock_summary(
    metrics: dict[str, Any],
    hard_results: dict[str, Any],
) -> str:
    """Build a one-line summary for mock (LLM-free) evaluation runs."""
    parts: list[str] = []
    if metrics["hard_failed"]:
        pytest_error = hard_results.get("pytest", {}).get("error_message", "")
        parts.append(f"Hard evaluation failed. {pytest_error}".strip())
    if metrics["qc_failed"]:
        parts.append("Governance QC failed.")
    # Fall back to a neutral message when nothing failed.
    return " ".join(parts) if parts else "Mock evaluation completed without LLM."
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def build_mock_final_report(
    hard_results: dict[str, Any],
    metrics: dict[str, Any],
) -> dict[str, Any]:
    """Assemble the canned final report used when no LLM is available."""
    canned_suggestions = [
        {
            "title": "Mock Suggestion 1",
            "description": "Add more docstrings.",
            "target_file": "all",
        },
        {
            "title": "Mock Suggestion 2",
            "description": "Refactor large functions.",
            "target_file": "all",
        },
        {
            "title": "Mock Suggestion 3",
            "description": "Improve test coverage.",
            "target_file": "tests/",
        },
    ]
    return {
        "verdict": determine_verdict(metrics, mock=True),
        "summary": build_mock_summary(metrics, hard_results),
        "suggestions": canned_suggestions,
    }
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def build_final_report_messages(metrics: dict[str, Any]) -> list[dict[str, str]]:
    """Build the system/user chat messages for the LLM final-report call.

    The system prompt pins the required JSON output schema and the
    pass/fail rule (thresholds interpolated from module constants); the
    user message carries the concrete metrics, failure lists, and QA
    feedback snippets.
    """
    sys_prompt = (
        "You are an elite Python Codebase Evaluator. You have just analyzed "
        "a repository. Your task is to provide a final judgment and EXACTLY "
        "3 concrete, actionable improvement suggestions.\n"
        "If the codebase failed its Hard or QC evaluations (e.g. tests "
        "failed, coverage is low, or governance violated), your suggestions "
        "MUST prioritize fixing those issues.\n"
        "Otherwise, focus on refactoring/quality improvements without "
        "changing external functionality.\n\n"
        "Output MUST be in valid JSON matching this schema:\n"
        "{\n"
        ' "verdict": "Pass" or "Fail",\n'
        ' "summary": "One paragraph summary of codebase health and '
        'any critical failures",\n'
        ' "suggestions": [\n'
        ' {"title": "str", "description": "str", "target_file": "str"}\n'
        " ]\n"
        "}\n"
        "Rule for Verdict: If there are Hard Failures or QC Failures, "
        "verdict MUST be Fail. Otherwise, Pass if Average Maintainability "
        f">= {MI_PASS_THRESHOLD:.0f} and QA Score > {QA_PASS_THRESHOLD:.0f} "
        "and no Critical CC issues (>15). Otherwise Fail."
    )
    user_content = (
        f"Metrics:\n"
        f"- Average Maintainability Index (MI): {metrics['avg_mi']:.1f}/100\n"
        f"- Number of functions with Cyclomatic Complexity > 15: "
        f"{len(metrics['cc_issues'])}\n"
        f"- Agent QA Readability Score: {metrics['qa_score']:.1f}/100\n\n"
        f"Failures (Prioritize these!):\n"
        f"- Hard Evaluation Errors: "
        f"{metrics['hard_errors'] if metrics['hard_errors'] else 'None'}\n"
        f"- QC/Governance Errors: "
        f"{metrics['qc_errors'] if metrics['qc_errors'] else 'None'}\n\n"
        f"QA Feedback Snippets:\n"
        + "\n".join(
            [f" * {q['entity']}: {q['feedback']}" for q in metrics["qa_entities"]]
        )
    )
    return [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": user_content},
    ]
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def parse_final_report_response(raw_content: str) -> dict[str, Any]:
    """Parse the LLM's JSON reply, insisting on a top-level object.

    Raises:
        ValueError: if the JSON decodes to anything other than a dict.
        json.JSONDecodeError: if *raw_content* is not valid JSON.
    """
    decoded = json.loads(raw_content)
    if not isinstance(decoded, dict):
        raise ValueError("JSON response is not a dictionary.")
    return decoded
|
|
@@ -15,6 +15,17 @@ from openai import OpenAI
|
|
|
15
15
|
from pydantic import BaseModel
|
|
16
16
|
from rich.console import Console
|
|
17
17
|
|
|
18
|
+
from python_harness.python_file_inventory import collect_python_files
|
|
19
|
+
from python_harness.soft_eval_report import (
|
|
20
|
+
build_final_report_messages,
|
|
21
|
+
build_mock_final_report,
|
|
22
|
+
build_mock_summary,
|
|
23
|
+
collect_hard_errors,
|
|
24
|
+
determine_verdict,
|
|
25
|
+
extract_metrics,
|
|
26
|
+
parse_final_report_response,
|
|
27
|
+
)
|
|
28
|
+
|
|
18
29
|
console = Console()
|
|
19
30
|
|
|
20
31
|
class FileSummary(BaseModel):
|
|
@@ -57,23 +68,7 @@ class SoftEvaluator:
|
|
|
57
68
|
Recursively find all Python files in the target directory,
|
|
58
69
|
excluding hidden dirs and .venv.
|
|
59
70
|
"""
|
|
60
|
-
|
|
61
|
-
for root, dirs, files in os.walk(self.target_path):
|
|
62
|
-
# Exclude hidden directories and virtual environments
|
|
63
|
-
dirs[:] = [
|
|
64
|
-
d
|
|
65
|
-
for d in dirs
|
|
66
|
-
if not d.startswith(".") and d not in (
|
|
67
|
-
"__pycache__",
|
|
68
|
-
"venv",
|
|
69
|
-
"env",
|
|
70
|
-
"vendors",
|
|
71
|
-
)
|
|
72
|
-
]
|
|
73
|
-
for file in files:
|
|
74
|
-
if file.endswith(".py"):
|
|
75
|
-
python_files.append(Path(root) / file)
|
|
76
|
-
return python_files
|
|
71
|
+
return collect_python_files(self.target_path)
|
|
77
72
|
|
|
78
73
|
def _read_file_text(self, file_path: Path) -> str:
|
|
79
74
|
return file_path.read_text(encoding="utf-8")
|
|
@@ -164,145 +159,36 @@ class SoftEvaluator:
|
|
|
164
159
|
qc_results: dict[str, Any],
|
|
165
160
|
soft_results: dict[str, Any],
|
|
166
161
|
) -> dict[str, Any]:
|
|
167
|
-
|
|
168
|
-
mi_scores = hard_results.get("radon_mi", {}).get("mi_scores", {})
|
|
169
|
-
avg_mi = sum(mi_scores.values()) / len(mi_scores) if mi_scores else 100.0
|
|
170
|
-
return {
|
|
171
|
-
"cc_issues": cc_issues,
|
|
172
|
-
"avg_mi": avg_mi,
|
|
173
|
-
"hard_failed": not hard_results.get("all_passed", True),
|
|
174
|
-
"qc_failed": not qc_results.get("all_passed", True),
|
|
175
|
-
"qc_errors": qc_results.get("failures", []),
|
|
176
|
-
"qa_score": soft_results.get("understandability_score", 100.0),
|
|
177
|
-
"qa_entities": soft_results.get("qa_results", {}).get(
|
|
178
|
-
"sampled_entities", []
|
|
179
|
-
),
|
|
180
|
-
"hard_errors": self._collect_hard_errors(hard_results),
|
|
181
|
-
}
|
|
162
|
+
return extract_metrics(hard_results, qc_results, soft_results)
|
|
182
163
|
|
|
183
164
|
def _collect_hard_errors(self, hard_results: dict[str, Any]) -> list[str]:
|
|
184
|
-
|
|
185
|
-
return []
|
|
186
|
-
|
|
187
|
-
hard_errors = []
|
|
188
|
-
if hard_results.get("ruff", {}).get("status") != "success":
|
|
189
|
-
hard_errors.append("Linter (Ruff) failed.")
|
|
190
|
-
if hard_results.get("mypy", {}).get("status") != "success":
|
|
191
|
-
hard_errors.append("Type checker (Mypy) failed.")
|
|
192
|
-
if hard_results.get("pytest", {}).get("status") != "success":
|
|
193
|
-
hard_errors.append(
|
|
194
|
-
hard_results.get("pytest", {}).get(
|
|
195
|
-
"error_message", "Tests or Coverage failed."
|
|
196
|
-
)
|
|
197
|
-
)
|
|
198
|
-
return hard_errors
|
|
165
|
+
return collect_hard_errors(hard_results)
|
|
199
166
|
|
|
200
167
|
def _determine_verdict(self, metrics: dict[str, Any], mock: bool = False) -> str:
|
|
201
|
-
|
|
202
|
-
if metrics["hard_failed"] or metrics["qc_failed"]:
|
|
203
|
-
return f"Fail{suffix}"
|
|
204
|
-
passed = (
|
|
205
|
-
metrics["avg_mi"] > 50
|
|
206
|
-
and metrics["qa_score"] > 75
|
|
207
|
-
and not metrics["cc_issues"]
|
|
208
|
-
)
|
|
209
|
-
return f"Pass{suffix}" if passed else f"Fail{suffix}"
|
|
168
|
+
return determine_verdict(metrics, mock=mock)
|
|
210
169
|
|
|
211
170
|
def _build_mock_summary(
|
|
212
171
|
self,
|
|
213
172
|
metrics: dict[str, Any],
|
|
214
173
|
hard_results: dict[str, Any],
|
|
215
174
|
) -> str:
|
|
216
|
-
|
|
217
|
-
if metrics["hard_failed"]:
|
|
218
|
-
pytest_err = hard_results.get("pytest", {}).get("error_message", "")
|
|
219
|
-
summary_parts.append(f"Hard evaluation failed. {pytest_err}".strip())
|
|
220
|
-
if metrics["qc_failed"]:
|
|
221
|
-
summary_parts.append("Governance QC failed.")
|
|
222
|
-
if not summary_parts:
|
|
223
|
-
summary_parts.append("Mock evaluation completed without LLM.")
|
|
224
|
-
return " ".join(summary_parts)
|
|
175
|
+
return build_mock_summary(metrics, hard_results)
|
|
225
176
|
|
|
226
177
|
def _build_mock_final_report(
|
|
227
178
|
self,
|
|
228
179
|
hard_results: dict[str, Any],
|
|
229
180
|
metrics: dict[str, Any],
|
|
230
181
|
) -> dict[str, Any]:
|
|
231
|
-
return
|
|
232
|
-
"verdict": self._determine_verdict(metrics, mock=True),
|
|
233
|
-
"summary": self._build_mock_summary(metrics, hard_results),
|
|
234
|
-
"suggestions": [
|
|
235
|
-
{
|
|
236
|
-
"title": "Mock Suggestion 1",
|
|
237
|
-
"description": "Add more docstrings.",
|
|
238
|
-
"target_file": "all",
|
|
239
|
-
},
|
|
240
|
-
{
|
|
241
|
-
"title": "Mock Suggestion 2",
|
|
242
|
-
"description": "Refactor large functions.",
|
|
243
|
-
"target_file": "all",
|
|
244
|
-
},
|
|
245
|
-
{
|
|
246
|
-
"title": "Mock Suggestion 3",
|
|
247
|
-
"description": "Improve test coverage.",
|
|
248
|
-
"target_file": "tests/",
|
|
249
|
-
},
|
|
250
|
-
],
|
|
251
|
-
}
|
|
182
|
+
return build_mock_final_report(hard_results, metrics)
|
|
252
183
|
|
|
253
184
|
def _build_final_report_messages(
|
|
254
185
|
self,
|
|
255
186
|
metrics: dict[str, Any],
|
|
256
187
|
) -> list[dict[str, str]]:
|
|
257
|
-
|
|
258
|
-
"You are an elite Python Codebase Evaluator. You have just analyzed "
|
|
259
|
-
"a repository. Your task is to provide a final judgment and EXACTLY "
|
|
260
|
-
"3 concrete, actionable improvement suggestions.\n"
|
|
261
|
-
"If the codebase failed its Hard or QC evaluations (e.g. tests "
|
|
262
|
-
"failed, coverage is low, or governance violated), your suggestions "
|
|
263
|
-
"MUST prioritize fixing those issues.\n"
|
|
264
|
-
"Otherwise, focus on refactoring/quality improvements without "
|
|
265
|
-
"changing external functionality.\n\n"
|
|
266
|
-
"Output MUST be in valid JSON matching this schema:\n"
|
|
267
|
-
"{\n"
|
|
268
|
-
' "verdict": "Pass" or "Fail",\n'
|
|
269
|
-
' "summary": "One paragraph summary of codebase health and '
|
|
270
|
-
'any critical failures",\n'
|
|
271
|
-
' "suggestions": [\n'
|
|
272
|
-
' {"title": "str", "description": "str", "target_file": "str"}\n'
|
|
273
|
-
" ]\n"
|
|
274
|
-
"}\n"
|
|
275
|
-
"Rule for Verdict: If there are Hard Failures or QC Failures, "
|
|
276
|
-
"verdict MUST be Fail. Otherwise, Pass if Average Maintainability "
|
|
277
|
-
"> 50 and QA Score > 75 and no Critical CC issues (>15). "
|
|
278
|
-
"Otherwise Fail."
|
|
279
|
-
)
|
|
280
|
-
user_content = (
|
|
281
|
-
f"Metrics:\n"
|
|
282
|
-
f"- Average Maintainability Index (MI): {metrics['avg_mi']:.1f}/100\n"
|
|
283
|
-
f"- Number of functions with Cyclomatic Complexity > 15: "
|
|
284
|
-
f"{len(metrics['cc_issues'])}\n"
|
|
285
|
-
f"- Agent QA Readability Score: {metrics['qa_score']:.1f}/100\n\n"
|
|
286
|
-
f"Failures (Prioritize these!):\n"
|
|
287
|
-
f"- Hard Evaluation Errors: "
|
|
288
|
-
f"{metrics['hard_errors'] if metrics['hard_errors'] else 'None'}\n"
|
|
289
|
-
f"- QC/Governance Errors: "
|
|
290
|
-
f"{metrics['qc_errors'] if metrics['qc_errors'] else 'None'}\n\n"
|
|
291
|
-
f"QA Feedback Snippets:\n"
|
|
292
|
-
+ "\n".join(
|
|
293
|
-
[f" * {q['entity']}: {q['feedback']}" for q in metrics["qa_entities"]]
|
|
294
|
-
)
|
|
295
|
-
)
|
|
296
|
-
return [
|
|
297
|
-
{"role": "system", "content": sys_prompt},
|
|
298
|
-
{"role": "user", "content": user_content},
|
|
299
|
-
]
|
|
188
|
+
return build_final_report_messages(metrics)
|
|
300
189
|
|
|
301
190
|
def _parse_final_report_response(self, raw_content: str) -> dict[str, Any]:
|
|
302
|
-
|
|
303
|
-
if isinstance(parsed_json, dict):
|
|
304
|
-
return parsed_json
|
|
305
|
-
raise ValueError("JSON response is not a dictionary.")
|
|
191
|
+
return parse_final_report_response(raw_content)
|
|
306
192
|
|
|
307
193
|
def calculate_token_complexity(self, file_path: Path) -> int:
|
|
308
194
|
"""
|
|
@@ -5,7 +5,9 @@ python_harness/__init__.py
|
|
|
5
5
|
python_harness/cli.py
|
|
6
6
|
python_harness/evaluator.py
|
|
7
7
|
python_harness/hard_evaluator.py
|
|
8
|
+
python_harness/python_file_inventory.py
|
|
8
9
|
python_harness/qc_evaluator.py
|
|
10
|
+
python_harness/soft_eval_report.py
|
|
9
11
|
python_harness/soft_evaluator.py
|
|
10
12
|
python_harness.egg-info/PKG-INFO
|
|
11
13
|
python_harness.egg-info/SOURCES.txt
|
|
@@ -501,3 +501,17 @@ def test_measure_surfaces_hard_tool_errors(monkeypatch: Any) -> None:
|
|
|
501
501
|
assert "No module named mypy" in result.stdout
|
|
502
502
|
assert "Pytest/Coverage issues found" in result.stdout
|
|
503
503
|
assert "No module named pytest" in result.stdout
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
def test_mi_scorecard_uses_warning_color_below_70() -> None:
|
|
507
|
+
"""
|
|
508
|
+
Test that MI below 70 is no longer rendered as healthy green.
|
|
509
|
+
"""
|
|
510
|
+
assert cli_module._mi_scorecard_color(65.0) == "yellow"
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
def test_mi_scorecard_uses_green_at_70() -> None:
|
|
514
|
+
"""
|
|
515
|
+
Test that MI 70 is rendered at the healthy threshold.
|
|
516
|
+
"""
|
|
517
|
+
assert cli_module._mi_scorecard_color(70.0) == "green"
|
|
@@ -79,6 +79,7 @@ def test_radon_cc_syntax_error(monkeypatch: Any, tmp_path: Path) -> None:
|
|
|
79
79
|
# and writing an error to stderr (which happens when there are syntax errors)
|
|
80
80
|
import subprocess
|
|
81
81
|
original_run = subprocess.run
|
|
82
|
+
(tmp_path / "bad.py").write_text("def broken(:\n")
|
|
82
83
|
|
|
83
84
|
def mock_run(args: Any, **kwargs: Any) -> Any:
|
|
84
85
|
# Check if the command is for radon cc (sys.executable, -m, radon, cc)
|
|
@@ -372,6 +373,21 @@ def test_run_pytest_surfaces_stderr(monkeypatch: Any, tmp_path: Path) -> None:
|
|
|
372
373
|
assert result["error_message"] == "No module named pytest"
|
|
373
374
|
|
|
374
375
|
|
|
376
|
+
def test_radon_mi_targets_exclude_test_files(tmp_path: Path) -> None:
|
|
377
|
+
"""
|
|
378
|
+
Test that maintainability scoring ignores test files and directories.
|
|
379
|
+
"""
|
|
380
|
+
(tmp_path / "pkg").mkdir()
|
|
381
|
+
(tmp_path / "pkg" / "keep.py").write_text("x = 1\n")
|
|
382
|
+
(tmp_path / "tests").mkdir()
|
|
383
|
+
(tmp_path / "tests" / "test_skip.py").write_text("x = 1\n")
|
|
384
|
+
(tmp_path / "test_skip.py").write_text("x = 1\n")
|
|
385
|
+
|
|
386
|
+
evaluator = HardEvaluator(str(tmp_path))
|
|
387
|
+
|
|
388
|
+
assert evaluator._radon_metric_targets() == [str(tmp_path / "pkg" / "keep.py")]
|
|
389
|
+
|
|
390
|
+
|
|
375
391
|
def test_evaluate_fails_when_coverage_report_missing(monkeypatch: Any) -> None:
|
|
376
392
|
"""
|
|
377
393
|
Test that missing coverage data fails the hard gate even when tests pass.
|
|
@@ -111,6 +111,66 @@ def test_generate_final_report_mock_fails_on_hard_failure() -> None:
|
|
|
111
111
|
os.environ["LLM_API_KEY"] = old_key
|
|
112
112
|
|
|
113
113
|
|
|
114
|
+
def test_determine_verdict_fails_below_mi_70(tmp_path: Path) -> None:
|
|
115
|
+
"""
|
|
116
|
+
Test that MI below 70 no longer qualifies for a passing verdict.
|
|
117
|
+
"""
|
|
118
|
+
evaluator = SoftEvaluator(str(tmp_path))
|
|
119
|
+
|
|
120
|
+
verdict = evaluator._determine_verdict(
|
|
121
|
+
{
|
|
122
|
+
"hard_failed": False,
|
|
123
|
+
"qc_failed": False,
|
|
124
|
+
"avg_mi": 65.0,
|
|
125
|
+
"qa_score": 90.0,
|
|
126
|
+
"cc_issues": [],
|
|
127
|
+
}
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
assert verdict == "Fail"
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def test_determine_verdict_passes_at_mi_70(tmp_path: Path) -> None:
|
|
134
|
+
"""
|
|
135
|
+
Test that MI of 70 is sufficient for a passing verdict.
|
|
136
|
+
"""
|
|
137
|
+
evaluator = SoftEvaluator(str(tmp_path))
|
|
138
|
+
|
|
139
|
+
verdict = evaluator._determine_verdict(
|
|
140
|
+
{
|
|
141
|
+
"hard_failed": False,
|
|
142
|
+
"qc_failed": False,
|
|
143
|
+
"avg_mi": 70.0,
|
|
144
|
+
"qa_score": 90.0,
|
|
145
|
+
"cc_issues": [],
|
|
146
|
+
}
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
assert verdict == "Pass"
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def test_final_report_prompt_mentions_mi_70_threshold(tmp_path: Path) -> None:
|
|
153
|
+
"""
|
|
154
|
+
Test that the final report prompt advertises the updated MI threshold.
|
|
155
|
+
"""
|
|
156
|
+
evaluator = SoftEvaluator(str(tmp_path))
|
|
157
|
+
|
|
158
|
+
messages = evaluator._build_final_report_messages(
|
|
159
|
+
{
|
|
160
|
+
"avg_mi": 70.0,
|
|
161
|
+
"cc_issues": [],
|
|
162
|
+
"qa_score": 90.0,
|
|
163
|
+
"hard_errors": [],
|
|
164
|
+
"qc_errors": [],
|
|
165
|
+
"qa_entities": [],
|
|
166
|
+
"hard_failed": False,
|
|
167
|
+
"qc_failed": False,
|
|
168
|
+
}
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
assert "Average Maintainability >= 70" in messages[0]["content"]
|
|
172
|
+
|
|
173
|
+
|
|
114
174
|
def test_read_file_text_helper_reads_utf8_content(tmp_path: Path) -> None:
|
|
115
175
|
"""
|
|
116
176
|
Test that the file-reading helper returns UTF-8 text content.
|
|
@@ -145,6 +205,8 @@ def test_get_python_files_filters_hidden_and_virtualenv_dirs(tmp_path: Path) ->
|
|
|
145
205
|
(tmp_path / "venv" / "skip.py").write_text("x = 1\n")
|
|
146
206
|
(tmp_path / "vendors").mkdir()
|
|
147
207
|
(tmp_path / "vendors" / "skip.py").write_text("x = 1\n")
|
|
208
|
+
(tmp_path / "tests").mkdir()
|
|
209
|
+
(tmp_path / "tests" / "test_skip.py").write_text("x = 1\n")
|
|
148
210
|
|
|
149
211
|
evaluator = SoftEvaluator(str(tmp_path))
|
|
150
212
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{python_harness-0.0.11 → python_harness-0.0.12}/python_harness.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|