latch-eval-tools 0.1.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. latch_eval_tools/__init__.py +64 -0
  2. latch_eval_tools/answer_extraction.py +35 -0
  3. latch_eval_tools/cli/__init__.py +0 -0
  4. latch_eval_tools/cli/eval_lint.py +185 -0
  5. latch_eval_tools/eval_server.py +570 -0
  6. latch_eval_tools/faas_utils.py +13 -0
  7. latch_eval_tools/graders/__init__.py +40 -0
  8. latch_eval_tools/graders/base.py +29 -0
  9. latch_eval_tools/graders/distribution.py +102 -0
  10. latch_eval_tools/graders/label_set.py +75 -0
  11. latch_eval_tools/graders/marker_gene.py +317 -0
  12. latch_eval_tools/graders/multiple_choice.py +38 -0
  13. latch_eval_tools/graders/numeric.py +137 -0
  14. latch_eval_tools/graders/spatial.py +93 -0
  15. latch_eval_tools/harness/__init__.py +27 -0
  16. latch_eval_tools/harness/claudecode.py +212 -0
  17. latch_eval_tools/harness/minisweagent.py +265 -0
  18. latch_eval_tools/harness/plotsagent.py +156 -0
  19. latch_eval_tools/harness/runner.py +191 -0
  20. latch_eval_tools/harness/utils.py +191 -0
  21. latch_eval_tools/headless_eval_server.py +727 -0
  22. latch_eval_tools/linter/__init__.py +25 -0
  23. latch_eval_tools/linter/explanations.py +331 -0
  24. latch_eval_tools/linter/runner.py +146 -0
  25. latch_eval_tools/linter/schema.py +126 -0
  26. latch_eval_tools/linter/validators.py +595 -0
  27. latch_eval_tools/types.py +30 -0
  28. latch_eval_tools/wrapper_entrypoint.py +316 -0
  29. latch_eval_tools-0.1.0.dist-info/METADATA +118 -0
  30. latch_eval_tools-0.1.0.dist-info/RECORD +33 -0
  31. latch_eval_tools-0.1.0.dist-info/WHEEL +4 -0
  32. latch_eval_tools-0.1.0.dist-info/entry_points.txt +2 -0
  33. latch_eval_tools-0.1.0.dist-info/licenses/LICENSE +1 -0
latch_eval_tools/harness/plotsagent.py
@@ -0,0 +1,156 @@
+ import json
+ import os
+ import subprocess
+ import time
+ from pathlib import Path
+
+ EVAL_TIMEOUT = 600
+
+
+ def run_plotsagent_task(
+     task_prompt: str,
+     work_dir: Path,
+     model_name: str | None = None,
+     eval_timeout: int = EVAL_TIMEOUT,
+ ) -> dict:
+     """Run PlotsAgent on a task.
+
+     Args:
+         task_prompt: Task description for the agent
+         work_dir: Working directory for the agent
+         model_name: Optional model name (e.g., "anthropic/claude-sonnet-4")
+         eval_timeout: Timeout for entire evaluation (seconds)
+
+     Returns:
+         dict with keys "answer" (parsed JSON or None) and "metadata"
+     """
+     agent_log_file = work_dir / "agent_output.log"
+     if agent_log_file.exists():
+         agent_log_file.unlink()
+
+     eval_config = {
+         "id": work_dir.name,
+         "task": task_prompt,
+         "data_node": None,
+         "grader": None,
+     }
+
+     eval_file = work_dir / "eval_config.json"
+     eval_file.write_text(json.dumps(eval_config, indent=2))
+
+     output_file = work_dir / "eval_output.json"
+
+     faas_python = Path(os.environ.get("PLOTS_FAAS_PYTHON", "/root/plots-faas-venv/bin/python"))
+
+     cmd = [
+         str(faas_python),
+         "-m", "latch_eval_tools.eval_server",
+         "--headless",
+         "--eval", str(eval_file),
+         "-o", str(output_file),
+     ]
+
+     env = {
+         **os.environ,
+         "LATCH_PLOTS_FAAS_PATH": os.environ.get("LATCH_PLOTS_FAAS_PATH", "/root/latch-plots-faas"),
+     }
+
+     start_time = time.time()
+     timed_out = False
+
+     try:
+         with open(agent_log_file, "w") as log_f:
+             subprocess.run(
+                 cmd,
+                 env=env,
+                 stdout=log_f,
+                 stderr=subprocess.STDOUT,
+                 timeout=eval_timeout,
+             )
+     except subprocess.TimeoutExpired:
+         timed_out = True
+         with open(agent_log_file, "a") as log_f:
+             log_f.write(f"\n\nAgent timed out after {eval_timeout} seconds")
+
+     duration = time.time() - start_time
+     print(f"Agent output saved to: {agent_log_file}")
+
+     eval_id = work_dir.name
+     workspace_dir = output_file.parent / "workspaces" / eval_id
+
+     trajectory = []
+     if workspace_dir.exists():
+         trajectory_src = workspace_dir / "trajectory.json"
+         if trajectory_src.exists():
+             try:
+                 trajectory = json.loads(trajectory_src.read_text())
+                 trajectory_dst = work_dir / "trajectory.json"
+                 trajectory_dst.write_text(json.dumps(trajectory, indent=2))
+                 print(f"Trajectory saved to: {trajectory_dst}")
+             except json.JSONDecodeError:
+                 pass
+
+     agent_answer = None
+     error_details = None
+
+     if output_file.exists():
+         try:
+             results = json.loads(output_file.read_text())
+             evals = results.get("evals", [])
+             if evals:
+                 eval_entry = evals[0]
+                 agent_answer = eval_entry.get("agent_answer")
+                 if agent_answer is not None:
+                     eval_answer_file = work_dir / "eval_answer.json"
+                     eval_answer_file.write_text(json.dumps(agent_answer, indent=2))
+         except json.JSONDecodeError as e:
+             error_details = {"error": f"Failed to parse output: {e}"}
+
+     if agent_answer is None:
+         eval_answer_file = work_dir / "eval_answer.json"
+         if eval_answer_file.exists():
+             try:
+                 agent_answer = json.loads(eval_answer_file.read_text())
+             except json.JSONDecodeError:
+                 pass
+
+     if agent_answer is None:
+         if workspace_dir.exists():
+             ws_eval_answer = workspace_dir / "eval_answer.json"
+             if ws_eval_answer.exists():
+                 try:
+                     agent_answer = json.loads(ws_eval_answer.read_text())
+                     eval_answer_file = work_dir / "eval_answer.json"
+                     eval_answer_file.write_text(json.dumps(agent_answer, indent=2))
+                 except json.JSONDecodeError:
+                     pass
+
+     if agent_answer is None and not error_details:
+         log_tail = ""
+         if agent_log_file.exists():
+             log_content = agent_log_file.read_text()
+             log_tail = log_content[-1000:]
+
+         error_msg = "Agent timed out" if timed_out else "Agent did not produce an answer"
+         error_details = {
+             "error": error_msg,
+             "timed_out": timed_out,
+             "log_tail": log_tail,
+         }
+         print(f"\nWarning: {error_msg}")
+
+     n_steps = len([t for t in trajectory if t.get("type") == "assistant"])
+
+     metadata = {
+         "duration_s": round(duration, 2),
+         "model": model_name or "anthropic/claude-sonnet-4",
+         "n_steps": n_steps,
+         "n_messages": len(trajectory),
+     }
+     if timed_out:
+         metadata["timed_out"] = True
+         metadata["eval_timeout_seconds"] = eval_timeout
+     if error_details:
+         metadata["error_details"] = error_details
+
+     return {"answer": agent_answer, "metadata": metadata}
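Usage note (not part of the package): a minimal calling sketch for run_plotsagent_task. It assumes this hunk is latch_eval_tools/harness/plotsagent.py as the file list suggests, that the interpreter referenced by PLOTS_FAAS_PYTHON and the latch_eval_tools.eval_server module exist on the host, and that the paths and task text are placeholders.

    from pathlib import Path

    from latch_eval_tools.harness.plotsagent import run_plotsagent_task  # assumed module path

    work_dir = Path("/tmp/plotsagent-demo")   # illustrative working directory
    work_dir.mkdir(parents=True, exist_ok=True)

    result = run_plotsagent_task(
        task_prompt="Cluster the cells in data.h5ad and write eval_answer.json",
        work_dir=work_dir,
        eval_timeout=300,                     # tighter than the 600 s default
    )
    print(result["metadata"])                 # duration_s, model, n_steps, n_messages
    print(result["answer"])                   # parsed eval_answer.json, or None on failure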
latch_eval_tools/harness/runner.py
@@ -0,0 +1,191 @@
+ import json
+ from pathlib import Path
+
+ from latch_eval_tools.types import TestCase
+ from latch_eval_tools.graders import GRADER_REGISTRY, GraderResult
+ from latch_eval_tools.harness.utils import download_data, setup_workspace, cleanup_workspace
+
+
+ class EvalRunner:
+     """Main evaluation runner for executing benchmarks with various agents."""
+
+     def __init__(
+         self,
+         eval_path: str | Path,
+         keep_workspace: bool = False,
+         run_id: str | None = None,
+         cache_name: str = ".eval_cache",
+         workspace_name: str = ".eval_workspace",
+         benchmark_name: str = "Eval"
+     ):
+         """Initialize evaluation runner.
+
+         Args:
+             eval_path: Path to eval JSON file
+             keep_workspace: Whether to preserve workspace after completion
+             run_id: Optional run ID for organizing multiple runs
+             cache_name: Name of cache directory (e.g., .scbench, .spatialbench)
+             workspace_name: Name of workspace directory
+             benchmark_name: Display name for benchmark (e.g., "SCBench", "SpatialBench")
+         """
+         self.eval_path = Path(eval_path)
+         self.keep_workspace = keep_workspace
+         self.run_id = run_id
+         self.cache_name = cache_name
+         self.workspace_name = workspace_name
+         self.benchmark_name = benchmark_name
+
+         if not self.eval_path.exists():
+             raise FileNotFoundError(f"Eval file not found: {self.eval_path}")
+
+         eval_data = json.loads(self.eval_path.read_text())
+         self.test_case = TestCase(**eval_data)
+
+     def run(self, agent_function=None):
+         """Run evaluation with specified agent function.
+
+         Args:
+             agent_function: Callable that takes (task_prompt: str, work_dir: Path)
+                 and returns dict with keys "answer" and optionally "metadata"
+
+         Returns:
+             dict with test results including test_id, agent_answer, grader_result, passed
+         """
+         print("=" * 80)
+         print(f"Running {self.benchmark_name} evaluation: {self.test_case.id}")
+         print("=" * 80)
+
+         print("\nTask:")
+         print("-" * 80)
+         print(self.test_case.task)
+         print("-" * 80)
+
+         work_dir = setup_workspace(self.test_case.id, self.run_id, self.workspace_name)
+         print(f"\nWorking directory: {work_dir}")
+
+         print("\n" + "=" * 80)
+         print("Staging data files...")
+         print("=" * 80)
+
+         contextual_data = download_data(self.test_case.data_node, work_dir, self.cache_name)
+
+         data_context = ""
+         if contextual_data:
+             data_context = f"\n\nHere is the context of the selected nodes the user would like to use: <ContextualNodeData>{json.dumps(contextual_data)}</ContextualNodeData>"
+
+         task_prompt = f"""{self.test_case.task}
+
+ IMPORTANT: When you have completed this task:
+ 1. Write your final answer as a JSON object to a file named `eval_answer.json`
+ 2. The file should contain ONLY the JSON object with the required fields
+ 3. After writing the file, you have completed the task
+
+ Example eval_answer.json:
+ {{
+ "field1": value1,
+ "field2": value2
+ }}
+ {data_context}"""
+
+         print("\n" + "=" * 80)
+         print("Running agent on task...")
+         print("=" * 80)
+
+         agent_answer = None
+         agent_metadata = {}
+
+         if agent_function is None:
+             print("\nNo agent function provided. To run this eval, pass an agent_function that:")
+             print(" 1. Takes (task_prompt: str, work_dir: Path) as arguments")
+             print(" 2. Returns the parsed agent answer dict")
+             print(f"\nExample:")
+             print(f" def my_agent(task, work_dir):")
+             print(f" # Run your agent")
+             print(f" # Agent should write eval_answer.json to work_dir")
+             print(f" answer_file = work_dir / 'eval_answer.json'")
+             print(f" return json.loads(answer_file.read_text())")
+             print(f"\n runner = EvalRunner(eval_path)")
+             print(f" runner.run(agent_function=my_agent)")
+         else:
+             try:
+                 result = agent_function(task_prompt, work_dir)
+
+                 if isinstance(result, dict) and "answer" in result:
+                     agent_answer = result["answer"]
+                     agent_metadata = result.get("metadata", {})
+                 else:
+                     agent_answer = result
+
+                 print("\nAgent completed successfully")
+             except Exception as e:
+                 print(f"\nAgent error: {e}")
+                 import traceback
+                 traceback.print_exc()
+
+         eval_answer_path = work_dir / "eval_answer.json"
+         if agent_answer is None and eval_answer_path.exists():
+             try:
+                 agent_answer = json.loads(eval_answer_path.read_text())
+                 print(f"Loaded agent answer from eval_answer.json")
+             except json.JSONDecodeError as e:
+                 print(f"Warning: Failed to parse eval_answer.json: {e}")
+
+         grader_result = None
+         if self.test_case.grader and agent_answer is not None:
+             print("\n" + "=" * 80)
+             print("Running grader...")
+             print("=" * 80)
+
+             grader_type = self.test_case.grader.get("type")
+             grader_config = self.test_case.grader.get("config", {})
+
+             if grader_type in GRADER_REGISTRY:
+                 grader_cls = GRADER_REGISTRY[grader_type]
+                 grader = grader_cls()
+                 try:
+                     grader_result = grader.evaluate_answer(agent_answer, grader_config)
+                 except Exception as e:
+                     import traceback
+                     grader_result = GraderResult(
+                         passed=False,
+                         metrics={"grader_error": str(e)},
+                         reasoning=f"Grader failed due to malformed agent output: {e}\n\n{traceback.format_exc()}",
+                         agent_answer=agent_answer
+                     )
+
+                 print(f"\n{'✓ EVAL PASSED' if grader_result.passed else '✗ EVAL FAILED'}")
+                 print("\nGrader reasoning:")
+                 print("-" * 80)
+                 print(grader_result.reasoning)
+                 print("-" * 80)
+
+                 if grader_result.metrics:
+                     print("\nMetrics:")
+                     for key, value in grader_result.metrics.items():
+                         if isinstance(value, (list, dict)):
+                             continue
+                         print(f" {key}: {value}")
+             else:
+                 print(f"\nWarning: Unknown grader type '{grader_type}'")
+
+         print("\n" + "=" * 80)
+         print("Cleanup...")
+         print("=" * 80)
+
+         cleanup_workspace(work_dir, keep=self.keep_workspace)
+
+         if self.keep_workspace:
+             print(f"\nTo inspect results:")
+             print(f" cd {work_dir}")
+
+         result_dict = {
+             "test_id": self.test_case.id,
+             "agent_answer": agent_answer,
+             "grader_result": grader_result,
+             "passed": grader_result.passed if grader_result else None,
+         }
+
+         if agent_metadata:
+             result_dict["metadata"] = agent_metadata
+
+         return result_dict
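Usage note (not part of the package): a minimal sketch of plugging a custom agent function into EvalRunner, mirroring the contract the runner prints when no agent is supplied. The module path, eval JSON path, and the agent body are assumptions; a real agent would act on the task prompt rather than write canned fields.

    import json
    from pathlib import Path

    from latch_eval_tools.harness.runner import EvalRunner  # assumed module path

    def my_agent(task_prompt: str, work_dir: Path) -> dict:
        # A real agent would solve task_prompt and write eval_answer.json into work_dir.
        answer_file = work_dir / "eval_answer.json"
        answer_file.write_text(json.dumps({"field1": 1, "field2": 2}))
        return {"answer": json.loads(answer_file.read_text()), "metadata": {"n_steps": 1}}

    runner = EvalRunner("evals/example_eval.json", keep_workspace=True)  # placeholder eval file
    result = runner.run(agent_function=my_agent)
    print(result["passed"], result["test_id"])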
latch_eval_tools/harness/utils.py
@@ -0,0 +1,191 @@
+ import hashlib
+ import json
+ import os
+ import subprocess
+ from pathlib import Path
+
+
+ def get_project_root():
+     """Find project root by looking for pyproject.toml."""
+     current = Path(__file__).resolve()
+     while current != current.parent:
+         if (current / "pyproject.toml").exists():
+             return current
+         current = current.parent
+     return Path.cwd()
+
+
+ def get_cache_dir(cache_name: str = ".eval_cache"):
+     """Get cache directory for datasets.
+
+     Args:
+         cache_name: Name of cache directory (default: .eval_cache)
+             Can be customized per benchmark (e.g., .scbench, .spatialbench)
+     """
+     project_root = get_project_root()
+     cache_dir = project_root / cache_name / "cache"
+     cache_dir.mkdir(parents=True, exist_ok=True)
+     return cache_dir
+
+
+ def get_cache_manifest(cache_name: str = ".eval_cache"):
+     """Load cache manifest tracking downloaded datasets."""
+     cache_dir = get_cache_dir(cache_name)
+     manifest_file = cache_dir / "manifest.json"
+
+     if manifest_file.exists():
+         return json.loads(manifest_file.read_text())
+     return {}
+
+
+ def save_cache_manifest(manifest: dict, cache_name: str = ".eval_cache"):
+     """Save cache manifest."""
+     cache_dir = get_cache_dir(cache_name)
+     manifest_file = cache_dir / "manifest.json"
+     manifest_file.write_text(json.dumps(manifest, indent=2))
+
+
+ def get_cache_key(uri: str) -> str:
+     """Generate cache key from URI."""
+     uri_hash = hashlib.sha256(uri.encode()).hexdigest()[:16]
+     filename = Path(uri).name
+     return f"{uri_hash}__{filename}"
+
+
+ def download_single_dataset(uri: str, show_progress: bool = True, cache_name: str = ".eval_cache") -> Path:
+     """Download a single dataset with caching.
+
+     Args:
+         uri: URI of dataset to download (e.g., latch://...)
+         show_progress: Whether to print progress messages
+         cache_name: Name of cache directory
+
+     Returns:
+         Path to cached file
+     """
+     cache_dir = get_cache_dir(cache_name)
+     manifest = get_cache_manifest(cache_name)
+
+     if uri in manifest:
+         cached_file = cache_dir / manifest[uri]
+         if cached_file.exists():
+             if show_progress:
+                 print(f"Using cached: {Path(uri).name}")
+             return cached_file
+
+     cache_key = get_cache_key(uri)
+     cached_file = cache_dir / cache_key
+
+     if show_progress:
+         print(f"Downloading: {uri}")
+     subprocess.run(
+         ["latch", "cp", uri, str(cached_file)],
+         check=True,
+         capture_output=True
+     )
+     if show_progress:
+         print(f"Cached as: {cache_key}")
+
+     manifest[uri] = cache_key
+     save_cache_manifest(manifest, cache_name)
+
+     return cached_file
+
+
+ def download_data(data_node: str | list[str], work_dir: Path, cache_name: str = ".eval_cache") -> list[dict]:
+     """Download and symlink data files to workspace.
+
+     Args:
+         data_node: Single URI or list of URIs to download
+         work_dir: Working directory to create symlinks in
+         cache_name: Name of cache directory
+
+     Returns:
+         List of contextual data dicts with file info
+     """
+     data_nodes = data_node if isinstance(data_node, list) else ([data_node] if data_node else [])
+
+     contextual_data = []
+
+     for node in data_nodes:
+         cached_file = download_single_dataset(node, cache_name=cache_name)
+         data_filename = Path(node).name
+
+         target_file = work_dir / data_filename
+         if target_file.exists():
+             target_file.unlink()
+         os.symlink(cached_file, target_file)
+         print(f"Linked: {data_filename} -> workspace")
+
+         contextual_data.append({
+             "type": "File",
+             "path": node,
+             "local_path": data_filename,
+             "id": node.replace("latch:///", "").replace(".csv", "").replace(".h5ad", ""),
+         })
+
+     return contextual_data
+
+
+ def batch_download_datasets(uris: list[str], show_progress: bool = True, cache_name: str = ".eval_cache"):
+     """Download multiple datasets in batch.
+
+     Args:
+         uris: List of URIs to download
+         show_progress: Whether to print progress messages
+         cache_name: Name of cache directory
+     """
+     if show_progress and uris:
+         print(f"Preparing to download {len(uris)} unique dataset(s)...")
+         print("=" * 80)
+
+     for i, uri in enumerate(uris, 1):
+         if show_progress:
+             print(f"[{i}/{len(uris)}] ", end="")
+         download_single_dataset(uri, show_progress=show_progress, cache_name=cache_name)
+
+     if show_progress and uris:
+         print("=" * 80)
+         print(f"Downloaded/verified {len(uris)} dataset(s)")
+         print()
+
+
+ def setup_workspace(eval_id: str, run_id: str | None = None, workspace_name: str = ".eval_workspace") -> Path:
+     """Setup workspace directory for evaluation.
+
+     Args:
+         eval_id: ID of evaluation
+         run_id: Optional run ID for organizing multiple runs
+         workspace_name: Name of workspace directory (default: .eval_workspace)
+             Can be customized per benchmark (e.g., .scbench, .spatialbench)
+
+     Returns:
+         Path to workspace directory
+     """
+     project_root = get_project_root()
+     if run_id:
+         work_dir = project_root / workspace_name / "workspace" / run_id / eval_id
+     else:
+         work_dir = project_root / workspace_name / "workspace" / eval_id
+
+     if work_dir.exists():
+         import shutil
+         shutil.rmtree(work_dir)
+     work_dir.mkdir(parents=True)
+
+     return work_dir
+
+
+ def cleanup_workspace(work_dir: Path, keep: bool = False):
+     """Clean up workspace directory.
+
+     Args:
+         work_dir: Workspace directory to clean up
+         keep: Whether to keep the workspace
+     """
+     if keep:
+         print(f"Workspace preserved at: {work_dir}")
+     else:
+         import shutil
+         shutil.rmtree(work_dir)
+         print(f"Workspace deleted: {work_dir}")
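Usage note (not part of the package): a rough end-to-end sketch of the caching and workspace helpers above. The URI, eval ID, run ID, and benchmark names are placeholders, and it assumes the `latch` CLI is installed and able to copy the given latch:// URI.

    from latch_eval_tools.harness.utils import (
        batch_download_datasets,
        cleanup_workspace,
        download_data,
        setup_workspace,
    )

    uris = ["latch:///demo/pbmc.h5ad"]                       # placeholder URI
    batch_download_datasets(uris, cache_name=".scbench")     # warm the shared cache once per run

    work_dir = setup_workspace("demo_eval", run_id="run_001", workspace_name=".scbench")
    context = download_data(uris, work_dir, cache_name=".scbench")  # symlinks cached files into work_dir
    print(context[0]["local_path"])                          # "pbmc.h5ad"

    cleanup_workspace(work_dir, keep=False)                  # delete the workspace when done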