latch_eval_tools-0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. latch_eval_tools-0.1.0/.gitignore +6 -0
  2. latch_eval_tools-0.1.0/Justfile +7 -0
  3. latch_eval_tools-0.1.0/LICENSE +1 -0
  4. latch_eval_tools-0.1.0/PKG-INFO +118 -0
  5. latch_eval_tools-0.1.0/README.md +96 -0
  6. latch_eval_tools-0.1.0/pyproject.toml +35 -0
  7. latch_eval_tools-0.1.0/src/latch_eval_tools/__init__.py +64 -0
  8. latch_eval_tools-0.1.0/src/latch_eval_tools/answer_extraction.py +35 -0
  9. latch_eval_tools-0.1.0/src/latch_eval_tools/cli/__init__.py +0 -0
  10. latch_eval_tools-0.1.0/src/latch_eval_tools/cli/eval_lint.py +185 -0
  11. latch_eval_tools-0.1.0/src/latch_eval_tools/eval_server.py +570 -0
  12. latch_eval_tools-0.1.0/src/latch_eval_tools/faas_utils.py +13 -0
  13. latch_eval_tools-0.1.0/src/latch_eval_tools/graders/__init__.py +40 -0
  14. latch_eval_tools-0.1.0/src/latch_eval_tools/graders/base.py +29 -0
  15. latch_eval_tools-0.1.0/src/latch_eval_tools/graders/distribution.py +102 -0
  16. latch_eval_tools-0.1.0/src/latch_eval_tools/graders/label_set.py +75 -0
  17. latch_eval_tools-0.1.0/src/latch_eval_tools/graders/marker_gene.py +317 -0
  18. latch_eval_tools-0.1.0/src/latch_eval_tools/graders/multiple_choice.py +38 -0
  19. latch_eval_tools-0.1.0/src/latch_eval_tools/graders/numeric.py +137 -0
  20. latch_eval_tools-0.1.0/src/latch_eval_tools/graders/spatial.py +93 -0
  21. latch_eval_tools-0.1.0/src/latch_eval_tools/harness/__init__.py +27 -0
  22. latch_eval_tools-0.1.0/src/latch_eval_tools/harness/claudecode.py +212 -0
  23. latch_eval_tools-0.1.0/src/latch_eval_tools/harness/minisweagent.py +265 -0
  24. latch_eval_tools-0.1.0/src/latch_eval_tools/harness/plotsagent.py +156 -0
  25. latch_eval_tools-0.1.0/src/latch_eval_tools/harness/runner.py +191 -0
  26. latch_eval_tools-0.1.0/src/latch_eval_tools/harness/utils.py +191 -0
  27. latch_eval_tools-0.1.0/src/latch_eval_tools/headless_eval_server.py +727 -0
  28. latch_eval_tools-0.1.0/src/latch_eval_tools/linter/__init__.py +25 -0
  29. latch_eval_tools-0.1.0/src/latch_eval_tools/linter/explanations.py +331 -0
  30. latch_eval_tools-0.1.0/src/latch_eval_tools/linter/runner.py +146 -0
  31. latch_eval_tools-0.1.0/src/latch_eval_tools/linter/schema.py +126 -0
  32. latch_eval_tools-0.1.0/src/latch_eval_tools/linter/validators.py +595 -0
  33. latch_eval_tools-0.1.0/src/latch_eval_tools/types.py +30 -0
  34. latch_eval_tools-0.1.0/src/latch_eval_tools/wrapper_entrypoint.py +316 -0
@@ -0,0 +1,6 @@
+ .DS_Store
+ __pycache__
+ .ruff_cache
+ !**/.gitkeep
+ /credentials
+ /env
@@ -0,0 +1,7 @@
+ build:
+     rm -rf dist
+     uv build
+
+ publish:
+     uv publish --token $(<credentials/pypi_token)
+     rm -rf dist
@@ -0,0 +1 @@
+ © LatchBio LLC. All rights reserved.
@@ -0,0 +1,118 @@
+ Metadata-Version: 2.4
+ Name: latch-eval-tools
+ Version: 0.1.0
+ Summary: Shared eval tools for single-cell bench, spatial bench, and future biology benchmarks.
+ License-File: LICENSE
+ Requires-Python: >=3.10
+ Requires-Dist: aiohttp>=3.0.0
+ Requires-Dist: anthropic>=0.72.0
+ Requires-Dist: latch-config>=0.1.0
+ Requires-Dist: latch>=2.0.0
+ Requires-Dist: matplotlib>=3.0.0
+ Requires-Dist: mini-swe-agent
+ Requires-Dist: numpy>=1.24.0
+ Requires-Dist: openai>=1.0.0
+ Requires-Dist: orjson>=3.0.0
+ Requires-Dist: pydantic>=2.0.0
+ Requires-Dist: scikit-learn>=1.3.0
+ Requires-Dist: scipy>=1.10.0
+ Requires-Dist: statsmodels>=0.14.0
+ Requires-Dist: websockets>=12.0
+ Description-Content-Type: text/markdown
+
+ # latch-eval-tools
+
+ Shared eval tools for single-cell bench, spatial bench, and future biology benchmarks.
+
+ ## Installation
+
+ ```bash
+ pip install latch-eval-tools
+ ```
+
+ ## Components
+
+ ### Types
+
+ ```python
+ from latch_eval_tools import Eval, EvalResult
+
+ eval_case = Eval(
+     id="test_001",
+     task="Count cells in the dataset",
+     data_node="latch:///data/sample.h5ad",
+     grader={"type": "numeric_tolerance", "config": {...}}
+ )
+ ```
+
+ ### Graders
+
+ Available graders: `numeric_tolerance`, `label_set_jaccard`, `distribution_comparison`, `marker_gene_precision_recall`, `marker_gene_separation`, `spatial_adjacency`, `multiple_choice`
+
+ ```python
+ from latch_eval_tools.graders import get_grader, NumericToleranceGrader
+
+ grader = get_grader("numeric_tolerance")
+ result = grader.evaluate(
+     agent_answer={"n_cells": 1523},
+     config={
+         "ground_truth": {"n_cells": 1500},
+         "tolerances": {"n_cells": {"type": "relative", "value": 0.05}}
+     }
+ )
+ print(result.passed)
+ print(result.reasoning)
+ ```
+
+ ### Harness
+
+ Run evaluations with different agents:
+
+ ```python
+ from latch_eval_tools.harness import EvalRunner, run_minisweagent_task
+
+ runner = EvalRunner("evals/count_cells.json", cache_name=".scbench")
+ result = runner.run(agent_function=lambda task, work_dir:
+     run_minisweagent_task(task, work_dir, model_name="anthropic/claude-sonnet-4")
+ )
+
+ def my_agent(task_prompt: str, work_dir: Path) -> dict:
+     return {"answer": json.loads((work_dir / "eval_answer.json").read_text())}
+
+ runner.run(agent_function=my_agent)
+ ```
+
+ Built-in agents: `run_minisweagent_task`, `run_claudecode_task`, `run_plotsagent_task`
+
+ ### Linter
+
+ Validate eval JSON files:
+
+ ```bash
+ eval-lint evals/my_dataset/
+ eval-lint evals/ --format json
+ ```
+
+ ```python
+ from latch_eval_tools.linter import lint_eval, lint_directory
+
+ result = lint_eval("evals/test.json")
+ print(result.passed, result.issues)
+ ```
+
+ ## Eval JSON Schema
+
+ ```json
+ {
+   "id": "unique_test_id",
+   "task": "Task description for the agent",
+   "data_node": "latch:///path/to/data.h5ad",
+   "grader": {
+     "type": "numeric_tolerance",
+     "config": {
+       "ground_truth": {"field": 42},
+       "tolerances": {"field": {"type": "absolute", "value": 1}}
+     }
+   }
+ }
+ ```
@@ -0,0 +1,96 @@
+ # latch-eval-tools
+
+ Shared eval tools for single-cell bench, spatial bench, and future biology benchmarks.
+
+ ## Installation
+
+ ```bash
+ pip install latch-eval-tools
+ ```
+
+ ## Components
+
+ ### Types
+
+ ```python
+ from latch_eval_tools import Eval, EvalResult
+
+ eval_case = Eval(
+     id="test_001",
+     task="Count cells in the dataset",
+     data_node="latch:///data/sample.h5ad",
+     grader={"type": "numeric_tolerance", "config": {...}}
+ )
+ ```
+
+ ### Graders
+
+ Available graders: `numeric_tolerance`, `label_set_jaccard`, `distribution_comparison`, `marker_gene_precision_recall`, `marker_gene_separation`, `spatial_adjacency`, `multiple_choice`
+
+ ```python
+ from latch_eval_tools.graders import get_grader, NumericToleranceGrader
+
+ grader = get_grader("numeric_tolerance")
+ result = grader.evaluate(
+     agent_answer={"n_cells": 1523},
+     config={
+         "ground_truth": {"n_cells": 1500},
+         "tolerances": {"n_cells": {"type": "relative", "value": 0.05}}
+     }
+ )
+ print(result.passed)
+ print(result.reasoning)
+ ```
+
+ ### Harness
+
+ Run evaluations with different agents:
+
+ ```python
+ from latch_eval_tools.harness import EvalRunner, run_minisweagent_task
+
+ runner = EvalRunner("evals/count_cells.json", cache_name=".scbench")
+ result = runner.run(agent_function=lambda task, work_dir:
+     run_minisweagent_task(task, work_dir, model_name="anthropic/claude-sonnet-4")
+ )
+
+ def my_agent(task_prompt: str, work_dir: Path) -> dict:
+     return {"answer": json.loads((work_dir / "eval_answer.json").read_text())}
+
+ runner.run(agent_function=my_agent)
+ ```
+
+ Built-in agents: `run_minisweagent_task`, `run_claudecode_task`, `run_plotsagent_task`
+
+ ### Linter
+
+ Validate eval JSON files:
+
+ ```bash
+ eval-lint evals/my_dataset/
+ eval-lint evals/ --format json
+ ```
+
+ ```python
+ from latch_eval_tools.linter import lint_eval, lint_directory
+
+ result = lint_eval("evals/test.json")
+ print(result.passed, result.issues)
+ ```
+
+ ## Eval JSON Schema
+
+ ```json
+ {
+   "id": "unique_test_id",
+   "task": "Task description for the agent",
+   "data_node": "latch:///path/to/data.h5ad",
+   "grader": {
+     "type": "numeric_tolerance",
+     "config": {
+       "ground_truth": {"field": 42},
+       "tolerances": {"field": {"type": "absolute", "value": 1}}
+     }
+   }
+ }
+ ```
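
Putting the schema and the grader interface above together, a minimal end-to-end sketch might look like the following. It assumes `get_grader` accepts the `grader.type` string stored in the eval JSON and that the returned grader exposes the `evaluate(agent_answer=..., config=...)` interface shown in the README; the file path and the agent answer are hypothetical stand-ins.

```python
import json
from pathlib import Path

from latch_eval_tools.graders import get_grader

# Hypothetical eval file following the schema above
eval_case = json.loads(Path("evals/count_cells.json").read_text())

# Look up the grader named in the eval and reuse its stored config
grader = get_grader(eval_case["grader"]["type"])
result = grader.evaluate(
    agent_answer={"n_cells": 1523},  # stand-in for the agent's output
    config=eval_case["grader"]["config"],
)
print(result.passed, result.reasoning)
```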
@@ -0,0 +1,35 @@
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [project]
+ name = "latch-eval-tools"
+ version = "0.1.0"
+ description = "Shared eval tools for single-cell bench, spatial bench, and future biology benchmarks."
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "pydantic>=2.0.0",
+     "websockets>=12.0",
+     "aiohttp>=3.0.0",
+     "orjson>=3.0.0",
+     "numpy>=1.24.0",
+     "scipy>=1.10.0",
+     "scikit-learn>=1.3.0",
+     "statsmodels>=0.14.0",
+     "matplotlib>=3.0.0",
+     "openai>=1.0.0",
+     "anthropic>=0.72.0",
+     "latch>=2.0.0",
+     "latch_config>=0.1.0",
+     "mini-swe-agent",
+ ]
+
+ [project.scripts]
+ eval-lint = "latch_eval_tools.cli.eval_lint:main"
+
+ [tool.hatch.metadata]
+ allow-direct-references = true
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/latch_eval_tools"]
@@ -0,0 +1,64 @@
+ from latch_eval_tools.types import Eval, EvalResult, TestCase, TestResult
+ from latch_eval_tools.linter import lint_eval, lint_directory, LintResult
+ from latch_eval_tools.harness import (
+     EvalRunner,
+     run_minisweagent_task,
+     run_claudecode_task,
+     run_plotsagent_task,
+     download_single_dataset,
+     download_data,
+     batch_download_datasets,
+     setup_workspace,
+     cleanup_workspace,
+ )
+ from latch_eval_tools.graders import (
+     BinaryGrader,
+     GraderResult,
+     get_nested_value,
+     NumericToleranceGrader,
+     MarkerGenePrecisionRecallGrader,
+     MarkerGeneSeparationGrader,
+     LabelSetJaccardGrader,
+     DistributionComparisonGrader,
+     SpatialAdjacencyGrader,
+     MultipleChoiceGrader,
+     GRADER_REGISTRY,
+     get_grader,
+ )
+
+ __all__ = [
+     # Types
+     "Eval",
+     "EvalResult",
+     "TestCase",  # Backward compatibility alias
+     "TestResult",  # Backward compatibility alias
+     # Linter
+     "lint_eval",
+     "lint_directory",
+     "LintResult",
+     # Harness
+     "EvalRunner",
+     "run_minisweagent_task",
+     "run_claudecode_task",
+     "run_plotsagent_task",
+     "download_single_dataset",
+     "download_data",
+     "batch_download_datasets",
+     "setup_workspace",
+     "cleanup_workspace",
+     # Graders
+     "BinaryGrader",
+     "GraderResult",
+     "get_nested_value",
+     "NumericToleranceGrader",
+     "MarkerGenePrecisionRecallGrader",
+     "MarkerGeneSeparationGrader",
+     "LabelSetJaccardGrader",
+     "DistributionComparisonGrader",
+     "SpatialAdjacencyGrader",
+     "MultipleChoiceGrader",
+     "GRADER_REGISTRY",
+     "get_grader",
+ ]
+
+ __version__ = "0.1.0"
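
Because `__init__.py` re-exports the grader, harness, and linter symbols, downstream code can import everything from the package root. A small sketch, assuming `GRADER_REGISTRY` is a dict-like mapping of grader type names to grader classes (its exact structure is not shown in this diff):

```python
from latch_eval_tools import GRADER_REGISTRY, get_grader, lint_eval

# List the registered grader type names (assumes a dict-like name -> class registry)
print(sorted(GRADER_REGISTRY))

# The same names are what get_grader() resolves
grader = get_grader("multiple_choice")

# Linting is also re-exported at the package root
print(lint_eval("evals/test.json").passed)
```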
@@ -0,0 +1,35 @@
+ import json
+ import re
+
+
+ def extract_answer_from_conversation(conversation: list[dict]) -> dict | None:
+     """Extract the JSON answer from a conversation history.
+
+     Looks for submit_response tool calls with EVAL_ANSWER tags in the summary.
+
+     Args:
+         conversation: List of message dicts from agent conversation
+
+     Returns:
+         Parsed JSON answer dict, or None if not found
+     """
+     for msg in reversed(conversation):
+         if msg.get("type") != "anthropic_message" or msg.get("role") != "assistant":
+             continue
+
+         content = msg.get("content", [])
+         for block in content:
+             if isinstance(block, dict) and block.get("type") == "tool_use":
+                 if block.get("name") == "submit_response":
+                     tool_input = block.get("input", {})
+                     summary = tool_input.get("summary", "")
+
+                     match = re.search(r'<EVAL_ANSWER>(.*?)</EVAL_ANSWER>', summary, re.DOTALL)
+                     if match:
+                         json_str = match.group(1).strip()
+                         try:
+                             return json.loads(json_str)
+                         except json.JSONDecodeError as e:
+                             print(f"[grader] Failed to parse JSON from EVAL_ANSWER tags: {e}")
+                             return None
+     return None
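
For reference, a minimal conversation that `extract_answer_from_conversation` would accept: a single assistant `anthropic_message` whose `submit_response` tool call wraps the answer in `<EVAL_ANSWER>` tags. The message content below is illustrative, not a real transcript.

```python
from latch_eval_tools.answer_extraction import extract_answer_from_conversation

conversation = [
    {
        "type": "anthropic_message",
        "role": "assistant",
        "content": [
            {
                "type": "tool_use",
                "name": "submit_response",
                "input": {
                    "summary": 'Done. <EVAL_ANSWER>{"n_cells": 1523}</EVAL_ANSWER>',
                },
            }
        ],
    }
]

# The last matching submit_response call wins; malformed JSON returns None
assert extract_answer_from_conversation(conversation) == {"n_cells": 1523}
```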
@@ -0,0 +1,185 @@
+ #!/usr/bin/env python3
+ import argparse
+ import json
+ import sys
+ from pathlib import Path
+
+ from latch_eval_tools.linter import lint_eval, lint_directory, LintResult
+ from latch_eval_tools.linter.explanations import get_explanation
+
+
+ def format_console_rich(results: list[LintResult]) -> str:
+     lines = []
+     total_errors = 0
+     total_warnings = 0
+
+     for result in results:
+         lines.append(f"\nChecking: {result.file_path}")
+         lines.append("─" * 50)
+
+         if not result.issues:
+             lines.append("✓ All checks passed")
+             continue
+
+         for issue in result.issues:
+             prefix = "✗" if issue.level == "error" else "⚠"
+             explanation = get_explanation(issue.code)
+
+             lines.append(f"\n{prefix} {issue.code}: {issue.message}")
+
+             if explanation:
+                 lines.append(f"")
+                 lines.append(f"  Fix: {explanation.example_before} → {explanation.example_after}")
+                 if explanation.doc_link:
+                     lines.append(f"  Docs: {explanation.doc_link}")
+
+             if issue.location:
+                 lines.append(f"  Location: {issue.location}")
+
+         total_errors += result.error_count
+         total_warnings += result.warning_count
+
+     lines.append("")
+     lines.append("─" * 50)
+     lines.append(f"Result: {total_errors} error(s), {total_warnings} warning(s)")
+     lines.append(f"Files: {sum(1 for r in results if r.passed)}/{len(results)} passed")
+
+     return "\n".join(lines)
+
+
+ def format_json_output(results: list[LintResult]) -> str:
+     output = {
+         "summary": {
+             "files_checked": len(results),
+             "files_passed": sum(1 for r in results if r.passed),
+             "total_errors": sum(r.error_count for r in results),
+             "total_warnings": sum(r.warning_count for r in results),
+         },
+         "results": [],
+     }
+
+     for result in results:
+         result_entry = {
+             "file": result.file_path,
+             "passed": result.passed,
+             "errors": result.error_count,
+             "warnings": result.warning_count,
+             "issues": [],
+         }
+
+         for issue in result.issues:
+             issue_entry: dict = {
+                 "level": issue.level,
+                 "code": issue.code,
+                 "message": issue.message,
+             }
+             if issue.location:
+                 issue_entry["location"] = issue.location
+
+             explanation = get_explanation(issue.code)
+             if explanation:
+                 issue_entry["fix"] = {
+                     "before": explanation.example_before,
+                     "after": explanation.example_after,
+                 }
+                 if explanation.doc_link:
+                     issue_entry["docs"] = explanation.doc_link
+
+             result_entry["issues"].append(issue_entry)
+
+         output["results"].append(result_entry)
+
+     return json.dumps(output, indent=2)
+
+
+ VALID_CATEGORIES = ["qc", "normalization", "dimensionality_reduction", "clustering", "cell_typing", "differential_expression", "spatial_analysis"]
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         prog="eval-lint",
+         description="Validate eval JSON files locally (no credentials required)",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   eval-lint path/to/eval.json      # Lint single file
+   eval-lint evals/my_dataset/      # Lint directory
+   eval-lint evals/ --category qc   # Lint only QC evals
+   eval-lint evals/ --format json   # JSON output for CI/CD
+   eval-lint evals/ -q              # Quiet mode (exit code only)
+
+ Exit codes:
+   0  All files passed validation
+   1  One or more files have errors
+ """,
+     )
+     parser.add_argument(
+         "path",
+         type=Path,
+         help="Path to eval JSON file or directory containing eval files",
+     )
+     parser.add_argument(
+         "--category", "-c",
+         choices=VALID_CATEGORIES,
+         help="Only lint evals with this metadata.task category",
+     )
+     parser.add_argument(
+         "--format", "-f",
+         choices=["console", "json"],
+         default="console",
+         help="Output format (default: console)",
+     )
+     parser.add_argument(
+         "--quiet", "-q",
+         action="store_true",
+         help="Quiet mode: only show summary and exit code",
+     )
+     parser.add_argument(
+         "--pattern",
+         default="**/*.json",
+         help="Glob pattern for finding files in directory (default: **/*.json)",
+     )
+
+     args = parser.parse_args()
+
+     if not args.path.exists():
+         print(f"Error: Path not found: {args.path}", file=sys.stderr)
+         sys.exit(1)
+
+     if args.path.is_file():
+         results = [lint_eval(args.path)]
+     else:
+         results = lint_directory(args.path, args.pattern)
+
+     if args.category:
+         filtered_results = []
+         for result in results:
+             try:
+                 with open(result.file_path) as f:
+                     eval_data = json.load(f)
+                 if eval_data.get("metadata", {}).get("task") == args.category:
+                     filtered_results.append(result)
+             except (json.JSONDecodeError, IOError):
+                 filtered_results.append(result)
+         results = filtered_results
+
+     if not results:
+         print("No eval files found", file=sys.stderr)
+         sys.exit(1)
+
+     total_errors = sum(r.error_count for r in results)
+     all_passed = all(r.passed for r in results)
+
+     if args.quiet:
+         passed = sum(1 for r in results if r.passed)
+         print(f"{passed}/{len(results)} files passed, {total_errors} error(s)")
+     elif args.format == "json":
+         print(format_json_output(results))
+     else:
+         print(format_console_rich(results))
+
+     sys.exit(0 if all_passed else 1)
+
+
+ if __name__ == "__main__":
+     main()
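
As a rough sketch of how the `--format json` output could be consumed in CI: the keys mirror `format_json_output` above, the `evals/` path is hypothetical, and `eval-lint` is assumed to be on `PATH` (it exits non-zero when any file has errors, but the JSON report is still printed).

```python
import json
import subprocess

# Run the linter and capture its machine-readable report
proc = subprocess.run(
    ["eval-lint", "evals/", "--format", "json"],
    capture_output=True,
    text=True,
)
report = json.loads(proc.stdout)

summary = report["summary"]
print(f'{summary["files_passed"]}/{summary["files_checked"]} files passed')

# Surface each issue with its suggested fix, when one is available
for entry in report["results"]:
    for issue in entry["issues"]:
        fix = issue.get("fix")
        hint = f' (fix: {fix["before"]} -> {fix["after"]})' if fix else ""
        print(f'{entry["file"]}: [{issue["level"]}] {issue["code"]}: {issue["message"]}{hint}')
```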