PyPI - fhir-mcp-shared - Versions diffs - 0.1.0__tar.gz - Mend

fhir-mcp-shared 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

fhir_mcp_shared-0.1.0/.gitignore +54 -0
fhir_mcp_shared-0.1.0/PKG-INFO +41 -0
fhir_mcp_shared-0.1.0/README.md +12 -0
fhir_mcp_shared-0.1.0/pyproject.toml +48 -0
fhir_mcp_shared-0.1.0/src/fhir_mcp_shared/__init__.py +13 -0
fhir_mcp_shared-0.1.0/src/fhir_mcp_shared/eval/__init__.py +120 -0
fhir_mcp_shared-0.1.0/src/fhir_mcp_shared/langfuse.py +200 -0
fhir_mcp_shared-0.1.0/src/fhir_mcp_shared/logging.py +68 -0
fhir_mcp_shared-0.1.0/src/fhir_mcp_shared/models/__init__.py +16 -0
fhir_mcp_shared-0.1.0/src/fhir_mcp_shared/models/fhir.py +33 -0
fhir_mcp_shared-0.1.0/src/fhir_mcp_shared/models/validation.py +41 -0
fhir_mcp_shared-0.1.0/tests/test_eval_harness.py +52 -0

fhir_mcp_shared-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,54 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+*.egg
+*.egg-info/
+dist/
+build/
+.eggs/
+# uv
+.venv/
+.uv/
+# uv.lock is intentionally committed — pins all transitive deps for CI reproducibility
+# Environments
+.env
+.env.local
+.env.*.local
+# IDE
+.vscode/settings.json
+.idea/
+*.iml
+# Testing
+.pytest_cache/
+.coverage
+coverage.xml
+htmlcov/
+.tox/
+# Mypy
+.mypy_cache/
+.dmypy.json
+# Ruff
+.ruff_cache/
+# MkDocs
+site/
+# Docker
+*.tar
+# OS
+.DS_Store
+Thumbs.db
+# Secrets (belt-and-suspenders; gitleaks is the real guard)
+*.pem
+*.key
+*_secret*

fhir_mcp_shared-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,41 @@
+Metadata-Version: 2.4
+Name: fhir-mcp-shared
+Version: 0.1.0
+Summary: Shared utilities for the fhir-mcp-suite monorepo (logging, LangFuse, base models, eval)
+Project-URL: Homepage, https://github.com/pcmedsinge/fhir-mcp-suite
+Project-URL: Repository, https://github.com/pcmedsinge/fhir-mcp-suite
+Project-URL: Bug Tracker, https://github.com/pcmedsinge/fhir-mcp-suite/issues
+Project-URL: Documentation, https://pcmedsinge.github.io/fhir-mcp-suite/
+Project-URL: Changelog, https://github.com/pcmedsinge/fhir-mcp-suite/releases
+Author-email: Parag Medsinge <pcmedsinge@gmail.com>
+License: Apache-2.0
+Keywords: eval,fhir,healthcare,langfuse,logging,mcp,pydantic
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Healthcare Industry
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Medical Science Apps.
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Typing :: Typed
+Requires-Python: >=3.12
+Requires-Dist: langfuse>=3.0
+Requires-Dist: pydantic-settings>=2.6
+Requires-Dist: pydantic>=2.9
+Requires-Dist: structlog>=24.4
+Description-Content-Type: text/markdown
+# fhir-mcp-shared
+Internal shared utilities for the [fhir-mcp-suite](https://github.com/pcmedsinge/fhir-mcp-suite) monorepo.
+Not published to PyPI. Used as a uv workspace dependency by `mcp-fhir`, `mcp-terminology`, and `mcp-clinical-reasoner`.
+## Contents
+- `logging.py` — structlog configuration (JSON + console renderers)
+- `langfuse.py` — LangFuse v3 wrapper with graceful no-op degradation
+- `models/` — shared Pydantic models (`FhirResource`, `ValidationReport`, etc.)
+- `eval/` — golden-query eval harness (`EvalRunner`, `GoldenCase`, `EvalResult`)

fhir_mcp_shared-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,12 @@
+# fhir-mcp-shared
+Internal shared utilities for the [fhir-mcp-suite](https://github.com/pcmedsinge/fhir-mcp-suite) monorepo.
+Not published to PyPI. Used as a uv workspace dependency by `mcp-fhir`, `mcp-terminology`, and `mcp-clinical-reasoner`.
+## Contents
+- `logging.py` — structlog configuration (JSON + console renderers)
+- `langfuse.py` — LangFuse v3 wrapper with graceful no-op degradation
+- `models/` — shared Pydantic models (`FhirResource`, `ValidationReport`, etc.)
+- `eval/` — golden-query eval harness (`EvalRunner`, `GoldenCase`, `EvalResult`)

fhir_mcp_shared-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,48 @@
+[project]
+name = "fhir-mcp-shared"
+version = "0.1.0"
+description = "Shared utilities for the fhir-mcp-suite monorepo (logging, LangFuse, base models, eval)"
+readme = "README.md"
+requires-python = ">=3.12"
+license = { text = "Apache-2.0" }
+authors = [{ name = "Parag Medsinge", email = "pcmedsinge@gmail.com" }]
+keywords = ["mcp", "fhir", "healthcare", "langfuse", "logging", "pydantic", "eval"]
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Healthcare Industry",
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Scientific/Engineering :: Medical Science Apps.",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+    "Typing :: Typed",
+]
+dependencies = [
+    "pydantic>=2.9",
+    "pydantic-settings>=2.6",
+    "structlog>=24.4",
+    "langfuse>=3.0",
+]
+[project.urls]
+Homepage = "https://github.com/pcmedsinge/fhir-mcp-suite"
+Repository = "https://github.com/pcmedsinge/fhir-mcp-suite"
+"Bug Tracker" = "https://github.com/pcmedsinge/fhir-mcp-suite/issues"
+Documentation = "https://pcmedsinge.github.io/fhir-mcp-suite/"
+Changelog = "https://github.com/pcmedsinge/fhir-mcp-suite/releases"
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.build.targets.wheel]
+packages = ["src/fhir_mcp_shared"]
+[dependency-groups]
+dev = [
+    "pytest>=8.3",
+    "pytest-asyncio>=0.24",
+    "pytest-cov>=5.0",
+]

fhir_mcp_shared-0.1.0/src/fhir_mcp_shared/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+"""fhir-mcp-shared — internal utilities for the fhir-mcp-suite monorepo.
+Public surface (intended for import by package servers):
+    from fhir_mcp_shared.logging import configure_logging
+    from fhir_mcp_shared.langfuse import span, generation, get_client
+    from fhir_mcp_shared.models import ValidationIssue, ValidationReport, FhirResource
+    from fhir_mcp_shared.eval import EvalRunner, GoldenCase, EvalResult
+"""
+from fhir_mcp_shared.logging import configure_logging
+__all__ = ["configure_logging"]

fhir_mcp_shared-0.1.0/src/fhir_mcp_shared/eval/__init__.py ADDED Viewed

@@ -0,0 +1,120 @@
+"""Eval harness skeleton — golden-query runner for all three MCP servers.
+Usage::
+    from fhir_mcp_shared.eval import EvalRunner, GoldenCase, EvalResult
+    runner = EvalRunner.from_file("evals/mcp-fhir/golden_queries.json")
+    results = await runner.run(invoke_fn=my_tool_fn)
+    runner.assert_threshold(results, min_pass_rate=0.90)
+"""
+from __future__ import annotations
+import json
+from collections.abc import Awaitable, Callable
+from pathlib import Path
+from typing import Any
+import structlog
+from pydantic import BaseModel, Field
+log = structlog.get_logger(__name__)
+class GoldenCase(BaseModel):
+    """A single golden-query test case."""
+    # Identifier of the case; EvalRunner copies it into EvalResult.case_id
+    # and into its log events.
+    id: str
+    # Free-text description of the case; EvalRunner does not read it.
+    description: str
+    # Handed to the invoke function as its first argument by EvalRunner.run().
+    tool: str = Field(description="MCP tool name to invoke")
+    # Handed to the invoke function as its second argument by EvalRunner.run().
+    input: dict[str, Any] = Field(description="Tool input arguments")
+    # Compared key-by-key (top level only, using ==) against the tool response;
+    # response keys that are not listed here are ignored.
+    expected: dict[str, Any] = Field(description="Expected fields in the response")
+    # Lets EvalRunner.run(tags=...) select a subset of cases; empty by default.
+    tags: list[str] = Field(default_factory=list)
+class EvalResult(BaseModel):
+    """Outcome of running a single golden case."""
+    # Id of the GoldenCase this result was produced from.
+    case_id: str
+    # True when no expected key mismatched (also true for a case with no
+    # expectations); False when the case raised.
+    passed: bool
+    # Fraction of expected keys that matched; pydantic rejects values outside [0.0, 1.0].
+    score: float = Field(ge=0.0, le=1.0, description="0.0 = fail, 1.0 = full pass")
+    # Response returned by the invoke function; EvalRunner leaves it empty when the case raised.
+    actual: dict[str, Any] = Field(default_factory=dict)
+    # Human-readable summary written by EvalRunner: the mismatches, a success
+    # message, or the exception text.
+    notes: str = ""
+class EvalRunner:
+    """Runs a golden-query suite against an async tool-invoke function."""
+    def __init__(self, cases: list[GoldenCase]) -> None:
+        self.cases = cases
+    @classmethod
+    def from_file(cls, path: str | Path) -> EvalRunner:
+        """Load golden cases from a JSON file."""
+        data = json.loads(Path(path).read_text())
+        return cls(cases=[GoldenCase.model_validate(c) for c in data])
+    async def run(
+        self,
+        invoke_fn: Callable[[str, dict[str, Any]], Awaitable[dict[str, Any]]],
+        tags: list[str] | None = None,
+    ) -> list[EvalResult]:
+        """Run all cases (optionally filtered by tag) and return results.
+        Args:
+            invoke_fn: An async callable ``(tool_name, input) -> dict``.
+            tags:      If provided, only run cases whose tags overlap.
+        """
+        results: list[EvalResult] = []
+        for case in self.cases:
+            if tags and not set(tags) & set(case.tags):
+                continue
+            log.info("eval_case_start", case_id=case.id, tool=case.tool)
+            try:
+                actual = await invoke_fn(case.tool, case.input)
+                passed, score, notes = self._check(case.expected, actual)
+            except Exception as exc:
+                log.warning("eval_case_error", case_id=case.id, error=str(exc))
+                passed, score, notes = False, 0.0, f"exception: {exc}"
+                actual = {}
+            results.append(
+                EvalResult(case_id=case.id, passed=passed, score=score, actual=actual, notes=notes)
+            )
+            log.info("eval_case_done", case_id=case.id, passed=passed, score=score)
+        return results
+    def _check(self, expected: dict[str, Any], actual: dict[str, Any]) -> tuple[bool, float, str]:
+        """Check that all expected keys/values appear in actual (subset match).
+        Returns (passed, score, notes).
+        """
+        if not expected:
+            return True, 1.0, "no assertions defined"
+        hits = 0
+        misses: list[str] = []
+        for key, expected_val in expected.items():
+            actual_val = actual.get(key)
+            if actual_val == expected_val:
+                hits += 1
+            else:
+                misses.append(f"{key}: expected {expected_val!r}, got {actual_val!r}")
+        score = hits / len(expected)
+        passed = len(misses) == 0
+        notes = "; ".join(misses) if misses else "all assertions passed"
+        return passed, score, notes
+    @staticmethod
+    def assert_threshold(results: list[EvalResult], min_pass_rate: float = 0.85) -> None:
+        """Raise AssertionError if pass rate is below threshold (used in CI)."""
+        if not results:
+            raise AssertionError("No eval results to check")
+        pass_rate = sum(r.passed for r in results) / len(results)
+        if pass_rate < min_pass_rate:
+            failures = [r for r in results if not r.passed]
+            details = "\n".join(f"  {r.case_id}: {r.notes}" for r in failures)
+            raise AssertionError(
+                f"Eval pass rate {pass_rate:.1%} < threshold {min_pass_rate:.1%}\n{details}"
+            )

fhir_mcp_shared-0.1.0/src/fhir_mcp_shared/langfuse.py ADDED Viewed

@@ -0,0 +1,200 @@
+"""LangFuse v3 wrapper — gracefully degrades when credentials are absent.
+Usage in any MCP server::
+    from fhir_mcp_shared.langfuse import span, generation, trace
+    # Per-request trace (wraps one MCP tool call)
+    with trace("fhir_read", session_id="session-abc", user_id="user-1") as t:
+        with span("http_get", parent=t, resource_type="Patient") as s:
+            result = await do_read()
+            if s:
+                s.update(output={"bytes": len(result)})
+    # Simpler span without explicit trace:
+    with span("fhir_search", resource_type="Patient") as s:
+        ...
+    # LLM generation (for mcp-clinical-reasoner):
+    with generation("llm_call", model="gpt-4o-mini", input=prompt) as gen:
+        resp = await openai_client.chat.completions.create(...)
+        if gen:
+            gen.update(output=resp.choices[0].message.content,
+                       usage_details={"input": resp.usage.prompt_tokens,
+                                      "output": resp.usage.completion_tokens})
+If ``LANGFUSE_PUBLIC_KEY`` / ``LANGFUSE_SECRET_KEY`` are not set, all
+helpers are no-ops and the server runs without observability.
+"""
+from __future__ import annotations
+import os
+import uuid
+from collections.abc import Generator
+from contextlib import contextmanager, suppress
+from typing import Any
+import structlog
+log = structlog.get_logger(__name__)
+_client: Any | None = None
+_initialized: bool = False
+def get_client() -> Any | None:
+    """Return a singleton Langfuse client, or ``None`` if credentials aren't set."""
+    global _client, _initialized
+    if _initialized:
+        return _client
+    _initialized = True
+    public_key = os.getenv("LANGFUSE_PUBLIC_KEY", "")
+    secret_key = os.getenv("LANGFUSE_SECRET_KEY", "")
+    host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
+    if not public_key or not secret_key:
+        log.debug("langfuse_disabled", reason="LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY not set")
+        return None
+    try:
+        from langfuse import Langfuse  # type: ignore
+    except ImportError as exc:
+        log.warning("langfuse_import_failed", error=str(exc))
+        return None
+    try:
+        _client = Langfuse(public_key=public_key, secret_key=secret_key, host=host)
+        log.info("langfuse_initialized", host=host)
+    except Exception as exc:
+        log.warning("langfuse_init_failed", error=str(exc))
+    return _client
+@contextmanager
+def trace(
+    name: str,
+    *,
+    session_id: str | None = None,
+    user_id: str | None = None,
+    tags: list[str] | None = None,
+    **metadata: Any,
+) -> Generator[Any, None, None]:
+    """Context manager that creates a top-level LangFuse trace.
+    A trace represents one logical request (e.g. one MCP ``call_tool``
+    invocation). Nest ``span()`` calls inside with ``parent=t``.
+    Yields the trace object (or ``None`` if LangFuse is disabled).
+    Example::
+        with trace("fhir_read", session_id=session_id) as t:
+            with span("http_get", parent=t, resource_type="Patient"):
+                ...
+    """
+    client = get_client()
+    if client is None:
+        yield None
+        return
+    trace_id = str(uuid.uuid4())
+    try:
+        tr = client.trace(
+            id=trace_id,
+            name=name,
+            session_id=session_id,
+            user_id=user_id,
+            tags=tags or [],
+            metadata=metadata,
+        )
+        try:
+            yield tr
+        finally:
+            with suppress(Exception):
+                client.flush()
+    except Exception as exc:
+        log.warning("langfuse_trace_error", name=name, error=str(exc))
+        yield None
+@contextmanager
+def span(
+    name: str,
+    *,
+    parent: Any = None,
+    **kwargs: Any,
+) -> Generator[Any, None, None]:
+    """Context manager that wraps a logical step in a LangFuse span.
+    Args:
+        name:   Span name (e.g. ``"fhir_read"``).
+        parent: A trace or span object to nest under (optional).
+        **kwargs: Metadata attached to the span.
+    Yields the span object (or ``None`` if LangFuse is disabled) so callers
+    can attach output metadata::
+        with span("validate", profile=profile_url) as s:
+            result = validate(resource)
+            if s:
+                s.update(output=result.model_dump())
+    """
+    client = get_client()
+    if client is None:
+        yield None
+        return
+    try:
+        kwargs_clean = {k: v for k, v in kwargs.items() if v is not None}
+        if parent is not None:
+            s = parent.span(name=name, metadata=kwargs_clean)
+        else:
+            s = client.start_span(name=name, metadata=kwargs_clean)
+        try:
+            yield s
+        finally:
+            s.end()
+    except Exception as exc:
+        log.warning("langfuse_span_error", name=name, error=str(exc))
+        yield None
+@contextmanager
+def generation(
+    name: str,
+    model: str = "",
+    *,
+    parent: Any = None,
+    **kwargs: Any,
+) -> Generator[Any, None, None]:
+    """Context manager for an LLM generation span.
+    Callers should call ``gen.update(output=..., usage_details=...)`` inside
+    the block for cost/token tracking.
+    Args:
+        name:   Generation name.
+        model:  Model identifier (e.g. ``"gpt-4o-mini"``).
+        parent: Parent trace or span (optional).
+        **kwargs: Extra metadata.
+    """
+    client = get_client()
+    if client is None:
+        yield None
+        return
+    try:
+        kwargs_clean = {k: v for k, v in kwargs.items() if v is not None}
+        if parent is not None:
+            g = parent.generation(name=name, model=model, metadata=kwargs_clean)
+        else:
+            g = client.start_generation(name=name, model=model, metadata=kwargs_clean)
+        try:
+            yield g
+        finally:
+            g.end()
+    except Exception as exc:
+        log.warning("langfuse_generation_error", name=name, error=str(exc))
+        yield None

fhir_mcp_shared-0.1.0/src/fhir_mcp_shared/logging.py ADDED Viewed

@@ -0,0 +1,68 @@
+"""Structured logging setup using structlog.
+Call ``configure_logging()`` once at process startup (e.g. in server.py).
+After that, every module can use::
+    import structlog
+    log = structlog.get_logger(__name__)
+    log.info("event", key="value")
+In development (LOG_FORMAT=console) output is human-readable with colours.
+In production (LOG_FORMAT=json, the default) every line is a JSON object
+suitable for Loki / Azure Monitor ingestion.
+"""
+from __future__ import annotations
+import logging
+import sys
+import structlog
+def configure_logging(level: str = "INFO", fmt: str = "json") -> None:
+    """Configure structlog and stdlib logging.
+    Args:
+        level: Logging level string, e.g. "DEBUG", "INFO", "WARNING".
+        fmt:   "json" (default, for production) or "console" (dev).
+    """
+    log_level = getattr(logging, level.upper(), logging.INFO)
+    shared_processors: list[structlog.types.Processor] = [
+        structlog.contextvars.merge_contextvars,
+        structlog.stdlib.add_logger_name,
+        structlog.stdlib.add_log_level,
+        structlog.processors.TimeStamper(fmt="iso"),
+        structlog.processors.StackInfoRenderer(),
+    ]
+    if fmt == "console":
+        renderer: structlog.types.Processor = structlog.dev.ConsoleRenderer()
+    else:
+        renderer = structlog.processors.JSONRenderer()
+    structlog.configure(
+        processors=[
+            *shared_processors,
+            structlog.stdlib.ProcessorFormatter.wrap_for_formatter,
+        ],
+        logger_factory=structlog.stdlib.LoggerFactory(),
+        wrapper_class=structlog.stdlib.BoundLogger,
+        cache_logger_on_first_use=True,
+    )
+    formatter = structlog.stdlib.ProcessorFormatter(
+        processors=[
+            structlog.stdlib.ProcessorFormatter.remove_processors_meta,
+            renderer,
+        ],
+        foreign_pre_chain=shared_processors,
+    )
+    handler = logging.StreamHandler(sys.stderr)
+    handler.setFormatter(formatter)
+    root_logger = logging.getLogger()
+    root_logger.handlers = [handler]
+    root_logger.setLevel(log_level)

fhir_mcp_shared-0.1.0/src/fhir_mcp_shared/models/__init__.py ADDED Viewed

@@ -0,0 +1,16 @@
+"""Base Pydantic models shared across all three MCP servers."""
+from fhir_mcp_shared.models.fhir import FhirResource, FhirSearchParams
+from fhir_mcp_shared.models.validation import (
+    ValidationIssue,
+    ValidationReport,
+    ValidationSeverity,
+)
+__all__ = [
+    "FhirResource",
+    "FhirSearchParams",
+    "ValidationIssue",
+    "ValidationReport",
+    "ValidationSeverity",
+]

fhir_mcp_shared-0.1.0/src/fhir_mcp_shared/models/fhir.py ADDED Viewed

@@ -0,0 +1,33 @@
+"""FHIR resource and search models."""
+from __future__ import annotations
+from typing import Any
+from pydantic import BaseModel, Field, HttpUrl
+class FhirResource(BaseModel):
+    """A single FHIR resource as returned by the server."""
+    # Read from the JSON key "resourceType" (alias); the Python field name is
+    # accepted as well because populate_by_name is enabled in model_config.
+    resource_type: str = Field(alias="resourceType")
+    # Optional; both default to None when the payload omits them.
+    id: str | None = None
+    meta: dict[str, Any] | None = None
+    # All remaining fields stored as-is; avoids re-modelling the full FHIR spec.
+    # NOTE(review): with "extra": "allow" pydantic keeps unknown keys on the
+    # instance itself (exposed via ``model_extra``), not in this declared
+    # ``extra`` field — it presumably stays an empty dict unless the payload
+    # carries a literal "extra" key. Confirm which of the two is intended.
+    extra: dict[str, Any] = Field(default_factory=dict)
+    # populate_by_name: accept ``resource_type`` as well as the alias.
+    # extra="allow": keep keys that are not declared above instead of rejecting them.
+    model_config = {"populate_by_name": True, "extra": "allow"}
+class FhirSearchParams(BaseModel):
+    """Parameters for a FHIR search request."""
+    # Required; unlike FhirResource this field declares no "resourceType" alias.
+    resource_type: str = Field(description="FHIR resource type, e.g. 'Patient', 'Observation'")
+    # Both keys and values are typed as strings, so numeric parameters such as
+    # _count are written as text (see the example in the description).
+    params: dict[str, str] = Field(
+        default_factory=dict,
+        description="FHIR search parameters, e.g. {'family': 'Smith', '_count': '10'}",
+    )
+    # Validated as an HTTP(S) URL when given; None means no per-request override.
+    base_url: HttpUrl | None = Field(
+        default=None,
+        description="Override the default FHIR server base URL for this request.",
+    )

fhir_mcp_shared-0.1.0/src/fhir_mcp_shared/models/validation.py ADDED Viewed

@@ -0,0 +1,41 @@
+"""Validation report models — HAPI validator output, normalised.
+Ported from P1 (fhir-mapping-agent) with minor adaptations for P3.
+"""
+from __future__ import annotations
+from enum import StrEnum
+from pydantic import BaseModel, Field
+# Severity levels a ValidationIssue can carry. As a StrEnum, each member
+# compares equal to its lowercase string value.
+class ValidationSeverity(StrEnum):
+    # FATAL and ERROR are both tallied in ValidationReport.error_count.
+    FATAL = "fatal"
+    ERROR = "error"
+    # Tallied in ValidationReport.warning_count.
+    WARNING = "warning"
+    # Not included in either ValidationReport counter.
+    INFORMATION = "information"
+# One finding inside a ValidationReport.
+class ValidationIssue(BaseModel):
+    # Decides which ValidationReport counter (if any) this issue contributes to.
+    severity: ValidationSeverity
+    code: str = Field(description="HAPI/FHIR issue code, e.g. 'required', 'code-invalid'")
+    # Optional; None when no location is supplied.
+    location: str | None = Field(default=None, description="FHIRPath of the offending element")
+    # Required message text for the issue.
+    message: str
+class ValidationReport(BaseModel):
+    profile: str
+    resource_type: str
+    is_conformant: bool
+    issues: list[ValidationIssue] = Field(default_factory=list)
+    error_count: int = 0
+    warning_count: int = 0
+    def model_post_init(self, _ctx: object, /) -> None:
+        self.error_count = sum(
+            1
+            for i in self.issues
+            if i.severity in (ValidationSeverity.ERROR, ValidationSeverity.FATAL)
+        )
+        self.warning_count = sum(1 for i in self.issues if i.severity == ValidationSeverity.WARNING)

fhir_mcp_shared-0.1.0/tests/test_eval_harness.py ADDED Viewed

@@ -0,0 +1,52 @@
+"""Placeholder tests for shared library."""
+from fhir_mcp_shared.eval import EvalRunner, GoldenCase
+def test_eval_runner_all_pass() -> None:
+    cases = [
+        GoldenCase(
+            id="t1",
+            description="basic pass",
+            tool="fhir_read",
+            input={"resource_type": "Patient", "resource_id": "1"},
+            expected={"resourceType": "Patient"},
+        )
+    ]
+    runner = EvalRunner(cases=cases)
+    async def invoke(_tool: str, _input: dict) -> dict:  # type: ignore[type-arg]
+        return {"resourceType": "Patient", "id": "1"}
+    import asyncio
+    results = asyncio.run(runner.run(invoke_fn=invoke))
+    assert len(results) == 1
+    assert results[0].passed
+    EvalRunner.assert_threshold(results, min_pass_rate=1.0)
+def test_eval_runner_fail_threshold() -> None:
+    cases = [
+        GoldenCase(
+            id="t1",
+            description="fail case",
+            tool="fhir_read",
+            input={},
+            expected={"resourceType": "Patient"},
+        )
+    ]
+    runner = EvalRunner(cases=cases)
+    async def invoke(_tool: str, _input: dict) -> dict:  # type: ignore[type-arg]
+        return {}
+    import asyncio
+    results = asyncio.run(runner.run(invoke_fn=invoke))
+    assert not results[0].passed
+    import pytest
+    with pytest.raises(AssertionError, match="pass rate"):
+        EvalRunner.assert_threshold(results, min_pass_rate=0.85)