PyPI - agentevals-cli - Versions diffs - 0.9.3__tar.gz → 0.9.5__tar.gz - Mend

agentevals-cli 0.9.3 (tar.gz) → 0.9.5 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (285)

{agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/Dockerfile RENAMED Viewed

@@ -31,7 +31,7 @@ COPY --from=ui /build/ui/dist ./src/agentevals/_static
 ARG VERSION
 ENV SETUPTOOLS_SCM_PRETEND_VERSION=${VERSION}
-RUN uv sync --frozen --no-dev --extra live --extra postgres \
+RUN uv sync --frozen --no-dev --extra live --extra postgres --extra kubernetes \
     && groupadd --gid 1000 app \
     && useradd --uid 1000 --gid app --home-dir /app --no-log-init app \
     && chown -R app:app /app

{agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/PKG-INFO RENAMED Viewed

@@ -1,12 +1,12 @@
 Metadata-Version: 2.4
 Name: agentevals-cli
-Version: 0.9.3
+Version: 0.9.5
 Summary: Standalone framework to evaluate agent correctness based on portable OpenTelemetry traces
 License-File: LICENSE
 Requires-Python: >=3.11
 Requires-Dist: click>=8.0
 Requires-Dist: fastapi>=0.115.0
-Requires-Dist: google-adk[eval]>=1.30.0
+Requires-Dist: google-adk[eval]<2.2,>=2.1.0
 Requires-Dist: httpx>=0.27.0
 Requires-Dist: opentelemetry-proto>=1.36.0
 Requires-Dist: python-dotenv>=1.0.0
@@ -14,6 +14,8 @@ Requires-Dist: python-multipart>=0.0.12
 Requires-Dist: pyyaml>=6.0
 Requires-Dist: tabulate>=0.9.0
 Requires-Dist: uvicorn[standard]>=0.32.0
+Provides-Extra: kubernetes
+Requires-Dist: kubernetes>=36.0.0; extra == 'kubernetes'
 Provides-Extra: live
 Requires-Dist: httpx>=0.27.0; extra == 'live'
 Requires-Dist: mcp>=1.26.0; extra == 'live'

agentevals_cli-0.9.5/charts/agentevals/templates/rbac.yaml ADDED Viewed

@@ -0,0 +1,33 @@
+{{- if .Values.rbac.create -}}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: {{ include "agentevals.fullname" . }}
+  namespace: {{ include "agentevals.namespace" . }}
+  labels:
+    {{- include "agentevals.labels" . | nindent 4 }}
+rules:
+  - apiGroups: [""]
+    resources: ["secrets"]
+    verbs: ["get"]
+    {{- with .Values.rbac.secretNames }}
+    resourceNames:
+      {{- toYaml . | nindent 6 }}
+    {{- end }}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: {{ include "agentevals.fullname" . }}
+  namespace: {{ include "agentevals.namespace" . }}
+  labels:
+    {{- include "agentevals.labels" . | nindent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: {{ include "agentevals.fullname" . }}
+subjects:
+  - kind: ServiceAccount
+    name: {{ include "agentevals.serviceAccountName" . }}
+    namespace: {{ include "agentevals.namespace" . }}
+{{- end }}

{agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/charts/agentevals/values.yaml RENAMED Viewed

@@ -57,6 +57,20 @@ serviceAccount:
   # -- ServiceAccount name override
   name: ""
+# ==============================================================================
+# RBAC
+# ==============================================================================
+# -- Namespaced Role + RoleBinding granting the pod's ServiceAccount read
+# access to Secrets. Enable this when the kubernetes secret resolver reads
+# provider credentials from Secrets via in-cluster config.
+rbac:
+  # -- Create the Role and RoleBinding
+  create: false
+  # -- Restrict the Role to these Secret names. Empty grants get on all
+  # Secrets in the release namespace.
+  secretNames: []
 # ==============================================================================
 # Pod
 # ==============================================================================

{agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/examples/zero-code-examples/adk/run.py RENAMED Viewed

@@ -74,7 +74,7 @@ async def main():
         agent_response = ""
         async for event in runner.run_async(user_id=user_id, session_id=session.id, new_message=content):
-            if event.content.parts and event.content.parts[0].text:
+            if event.content and event.content.parts and event.content.parts[0].text:
                 agent_response = event.content.parts[0].text
         print(f"     Agent: {agent_response}")

{agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/pyproject.toml RENAMED Viewed

@@ -9,7 +9,7 @@ description = "Standalone framework to evaluate agent correctness based on porta
 readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
-    "google-adk[eval]>=1.30.0",
+    "google-adk[eval]>=2.1.0,<2.2",
     "click>=8.0",
     "tabulate>=0.9.0",
     "fastapi>=0.115.0",
@@ -36,10 +36,17 @@ openai = [
 postgres = [
     "asyncpg>=0.30.0",
 ]
+kubernetes = [
+    "kubernetes>=36.0.0",
+]
 [project.scripts]
 agentevals = "agentevals.cli:main"
+[project.entry-points."agentevals.secret_resolvers"]
+env = "agentevals.resolvers:create_env_resolver"
+kubernetes = "agentevals.resolvers.kubernetes:create_kubernetes_resolver"
 [tool.hatch.version]
 source = "vcs"

{agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/models.py RENAMED Viewed

@@ -142,6 +142,15 @@ class EvaluateJsonRequest(CamelModel):
     traces: dict = Field(description="OTLP JSON export with resourceSpans structure.")
     config: EvalParams = Field(default_factory=EvalParams, description="Evaluation parameters.")
     eval_set: dict | None = Field(default=None, description="Optional ADK EvalSet JSON.")
+    credential_refs: dict[str, dict[str, Any]] | None = Field(
+        default=None,
+        description=(
+            "Map of logical credential name to a secret reference dict. Each reference has a "
+            "'kind' (the resolver to use) plus that kind's locator fields. Resolved per call to its "
+            "secret value; never written to the process environment. How a value is used (e.g. which "
+            "judge provider it authenticates) is configured on the consumer, not the reference."
+        ),
+    )
 # ---------------------------------------------------------------------------

{agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/api/routes.py RENAMED Viewed

@@ -9,6 +9,7 @@ import os
 import re
 import shutil
 import tempfile
+from contextlib import contextmanager
 from typing import Any
 from fastapi import APIRouter, File, Form, HTTPException, Request, UploadFile
@@ -23,6 +24,11 @@ from ..converter import convert_traces
 from ..extraction import get_extractor
 from ..loader import load_traces
 from ..loader.otlp import OtlpJsonLoader
+from ..resolvers import (
+    reset_resolved_credentials,
+    resolve_credential_refs,
+    set_resolved_credentials,
+)
 from ..runner import (
     RunResult,
     load_eval_set,
@@ -53,6 +59,57 @@ from .models import (
 logger = logging.getLogger(__name__)
+@contextmanager
+def _scoped_credentials(resolved: dict[str, str] | None):
+    """Scope an already-resolved ``logical-name -> secret value`` map to the current task.
+    Mirrors the async worker's set/reset (``run/worker.py``) so the synchronous evaluate
+    paths populate the same credential ContextVar that judge graders read. A falsy map is a
+    no-op, keeping callers byte-for-byte backward compatible. For streaming endpoints, enter
+    this BEFORE ``asyncio.create_task`` so the eval task inherits the populated context (a
+    child task snapshots its parent's context at creation time). Resolution is done by the
+    caller so its failures surface as request errors rather than scoping concerns.
+    """
+    token = set_resolved_credentials(resolved) if resolved else None
+    try:
+        yield
+    finally:
+        if token is not None:
+            reset_resolved_credentials(token)
+async def _resolve_credentials(refs: dict[str, dict[str, Any]] | None) -> dict[str, str] | None:
+    """Resolve credentialRefs to secret values, mapping bad references to a 400.
+    Resolver ``ValueError``s (missing/unknown ``kind``, missing locator fields, an unset
+    env var) are request/input errors, so surface them as 400s instead of letting them
+    bubble up as 500s. Infrastructure failures from custom resolvers raise other exception
+    types and are left to propagate as 5xx.
+    """
+    if not refs:
+        return None
+    try:
+        return await resolve_credential_refs(refs)
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=f"Could not resolve credentialRefs: {exc}") from exc
+def _parse_credential_refs_form(raw: str | None) -> dict[str, dict[str, Any]] | None:
+    """Parse and validate the multipart ``credential_refs`` form field (a JSON object string).
+    Empty/absent is treated as no credentials. Raises ``ValueError`` (which
+    ``json.JSONDecodeError`` subclasses) on malformed JSON or a non-object shape, so callers
+    map both to the same error they use for a bad ``config``. The JSON request endpoints get
+    this shape check for free from the ``EvaluateJsonRequest`` model.
+    """
+    if not raw:
+        return None
+    refs = json.loads(raw)
+    if not isinstance(refs, dict) or not all(isinstance(ref, dict) for ref in refs.values()):
+        raise ValueError("credentialRefs must be a JSON object mapping each logical name to a reference object")
+    return refs
 def _camel_keys(obj: Any) -> Any:
     """Recursively convert dict keys from snake_case to camelCase."""
     if isinstance(obj, dict):
@@ -462,6 +519,7 @@ async def evaluate_traces(
     trace_files: list[UploadFile] = File(...),
     config: str = Form(...),
     eval_set_file: UploadFile | None = File(None),
+    credential_refs: str | None = Form(None),
 ):
     """
     Evaluate agent traces using the provided evaluator configuration.
@@ -470,6 +528,8 @@ async def evaluate_traces(
         trace_files: List of Jaeger or OTLP JSON trace files
         config: JSON string with evaluation configuration
         eval_set_file: Optional golden eval set file
+        credential_refs: Optional JSON string mapping logical credential names to
+            secret references, resolved so LLM-as-Judge graders can authenticate
     Returns:
         RunResult with trace results and any errors
@@ -481,6 +541,11 @@ async def evaluate_traces(
         except json.JSONDecodeError as exc:
             raise HTTPException(status_code=400, detail=f"Invalid config JSON: {exc}") from exc
+        try:
+            cred_refs = _parse_credential_refs_form(credential_refs)
+        except ValueError as exc:
+            raise HTTPException(status_code=400, detail=f"Invalid credentialRefs: {exc}") from exc
         trace_paths = []
         for trace_file in trace_files:
             if not trace_file.filename:
@@ -548,7 +613,9 @@ async def evaluate_traces(
             len(trace_paths),
             [e.name for e in eval_config.evaluators],
         )
-        result = await run_evaluation(eval_config)
+        resolved_creds = await _resolve_credentials(cred_refs)
+        with _scoped_credentials(resolved_creds):
+            result = await run_evaluation(eval_config)
         run_id = await _maybe_persist_evaluate_run(
             request,
@@ -580,6 +647,7 @@ async def evaluate_traces_stream(
     trace_files: list[UploadFile] = File(...),
     config: str = Form(...),
     eval_set_file: UploadFile | None = File(None),
+    credential_refs: str | None = Form(None),
 ):
     """Evaluate traces with real-time progress via SSE."""
     temp_dir = tempfile.mkdtemp()
@@ -593,6 +661,12 @@ async def evaluate_traces_stream(
                 yield f"data: {SSEErrorEvent(error=f'Invalid config JSON: {exc}').model_dump_json(by_alias=True)}\n\n"
                 return
+            try:
+                cred_refs = _parse_credential_refs_form(credential_refs)
+            except ValueError as exc:
+                yield f"data: {SSEErrorEvent(error=f'Invalid credentialRefs: {exc}').model_dump_json(by_alias=True)}\n\n"
+                return
             trace_paths = []
             for trace_file in trace_files:
                 if not trace_file.filename:
@@ -674,47 +748,54 @@ async def evaluate_traces_stream(
                 result = await run_evaluation(eval_config, progress_callback, trace_progress_callback)
                 await queue.put(("done", result))
-            eval_task = asyncio.create_task(run_with_progress())
             try:
-                while True:
-                    msg = await queue.get()
-                    tag, payload = msg
-                    if tag == "done":
-                        run_id = await _maybe_persist_evaluate_run(
-                            request,
-                            params=eval_config,
-                            eval_set_dict=_load_eval_set_dict(eval_set_path),
-                            trace_format=eval_config.trace_format,
-                            upload_filenames=upload_filenames,
-                            run_result=payload,
-                        )
-                        if run_id:
-                            payload.run_id = run_id
-                        evt = SSEDoneEvent(
-                            result=_camel_keys(payload.model_dump(by_alias=True)),
-                        )
-                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
-                        break
-                    elif tag == "trace_progress":
-                        evt = SSETraceProgressEvent(
-                            trace_progress=SSETraceProgress(
-                                trace_id=payload.trace_id,
-                                partial_result=_camel_keys(payload.model_dump(by_alias=True)),
+                resolved_creds = await resolve_credential_refs(cred_refs) if cred_refs else None
+            except ValueError as exc:
+                yield f"data: {SSEErrorEvent(error=f'Could not resolve credentialRefs: {exc}').model_dump_json(by_alias=True)}\n\n"
+                return
+            with _scoped_credentials(resolved_creds):
+                eval_task = asyncio.create_task(run_with_progress())
+                try:
+                    while True:
+                        msg = await queue.get()
+                        tag, payload = msg
+                        if tag == "done":
+                            run_id = await _maybe_persist_evaluate_run(
+                                request,
+                                params=eval_config,
+                                eval_set_dict=_load_eval_set_dict(eval_set_path),
+                                trace_format=eval_config.trace_format,
+                                upload_filenames=upload_filenames,
+                                run_result=payload,
                             )
-                        )
-                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
-                    elif tag == "progress":
-                        evt = SSEProgressEvent(message=payload)
-                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
-            finally:
-                if not eval_task.done():
-                    eval_task.cancel()
-                    try:
-                        await eval_task
-                    except asyncio.CancelledError:
-                        pass
+                            if run_id:
+                                payload.run_id = run_id
+                            evt = SSEDoneEvent(
+                                result=_camel_keys(payload.model_dump(by_alias=True)),
+                            )
+                            yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
+                            break
+                        elif tag == "trace_progress":
+                            evt = SSETraceProgressEvent(
+                                trace_progress=SSETraceProgress(
+                                    trace_id=payload.trace_id,
+                                    partial_result=_camel_keys(payload.model_dump(by_alias=True)),
+                                )
+                            )
+                            yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
+                        elif tag == "progress":
+                            evt = SSEProgressEvent(message=payload)
+                            yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
+                finally:
+                    if not eval_task.done():
+                        eval_task.cancel()
+                        try:
+                            await eval_task
+                        except asyncio.CancelledError:
+                            pass
         except Exception as exc:
             logger.exception("Evaluation stream failed")
@@ -775,13 +856,15 @@ async def evaluate_traces_json(request: EvaluateJsonRequest, raw_request: Reques
     """Evaluate OTLP JSON traces passed in the request body."""
     _check_json_body_size(raw_request)
     traces, eval_set = _parse_json_request(request)
+    resolved_creds = await _resolve_credentials(request.credential_refs)
     try:
-        result = await run_evaluation_from_traces(
-            traces=traces,
-            config=request.config,
-            eval_set=eval_set,
-        )
+        with _scoped_credentials(resolved_creds):
+            result = await run_evaluation_from_traces(
+                traces=traces,
+                config=request.config,
+                eval_set=eval_set,
+            )
         run_id = await _maybe_persist_evaluate_run(
             raw_request,
             params=request.config,
@@ -793,6 +876,8 @@ async def evaluate_traces_json(request: EvaluateJsonRequest, raw_request: Reques
         if run_id:
             result.run_id = run_id
         return StandardResponse(data=_camel_keys(result.model_dump(by_alias=True)))
+    except HTTPException:
+        raise
     except Exception as exc:
         logger.exception("JSON evaluation failed")
         raise HTTPException(status_code=500, detail=f"Internal error: {exc!s}") from exc
@@ -843,47 +928,56 @@ async def evaluate_traces_json_stream(request: EvaluateJsonRequest, raw_request:
                 )
                 await queue.put(("done", result))
-            eval_task = asyncio.create_task(run_with_progress())
             try:
-                while True:
-                    msg = await queue.get()
-                    tag, payload = msg
-                    if tag == "done":
-                        run_id = await _maybe_persist_evaluate_run(
-                            raw_request,
-                            params=request.config,
-                            eval_set_dict=request.eval_set,
-                            trace_format=None,
-                            upload_filenames=None,
-                            run_result=payload,
-                        )
-                        if run_id:
-                            payload.run_id = run_id
-                        evt = SSEDoneEvent(
-                            result=_camel_keys(payload.model_dump(by_alias=True)),
-                        )
-                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
-                        break
-                    elif tag == "trace_progress":
-                        evt = SSETraceProgressEvent(
-                            trace_progress=SSETraceProgress(
-                                trace_id=payload.trace_id,
-                                partial_result=_camel_keys(payload.model_dump(by_alias=True)),
+                resolved_creds = (
+                    await resolve_credential_refs(request.credential_refs) if request.credential_refs else None
+                )
+            except ValueError as exc:
+                yield _sse_error(f"Could not resolve credentialRefs: {exc}")
+                return
+            with _scoped_credentials(resolved_creds):
+                eval_task = asyncio.create_task(run_with_progress())
+                try:
+                    while True:
+                        msg = await queue.get()
+                        tag, payload = msg
+                        if tag == "done":
+                            run_id = await _maybe_persist_evaluate_run(
+                                raw_request,
+                                params=request.config,
+                                eval_set_dict=request.eval_set,
+                                trace_format=None,
+                                upload_filenames=None,
+                                run_result=payload,
                             )
-                        )
-                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
-                    elif tag == "progress":
-                        evt = SSEProgressEvent(message=payload)
-                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
-            finally:
-                if not eval_task.done():
-                    eval_task.cancel()
-                    try:
-                        await eval_task
-                    except asyncio.CancelledError:
-                        pass
+                            if run_id:
+                                payload.run_id = run_id
+                            evt = SSEDoneEvent(
+                                result=_camel_keys(payload.model_dump(by_alias=True)),
+                            )
+                            yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
+                            break
+                        elif tag == "trace_progress":
+                            evt = SSETraceProgressEvent(
+                                trace_progress=SSETraceProgress(
+                                    trace_id=payload.trace_id,
+                                    partial_result=_camel_keys(payload.model_dump(by_alias=True)),
+                                )
+                            )
+                            yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
+                        elif tag == "progress":
+                            evt = SSEProgressEvent(message=payload)
+                            yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
+                finally:
+                    if not eval_task.done():
+                        eval_task.cancel()
+                        try:
+                            await eval_task
+                        except asyncio.CancelledError:
+                            pass
         except Exception as exc:
             logger.exception("JSON evaluation stream failed")

{agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/builtin_metrics.py RENAMED Viewed

@@ -27,6 +27,8 @@ from google.adk.evaluation.eval_metrics import (
 from google.adk.evaluation.eval_rubrics import Rubric, RubricContent
 from google.adk.evaluation.evaluator import EvaluationResult, Evaluator
+from .resolvers import get_resolved_credential
 logger = logging.getLogger(__name__)
 METRICS_NEEDING_EXPECTED = {
@@ -267,6 +269,67 @@ def get_evaluator(eval_metric: EvalMetric) -> Evaluator:
     return DEFAULT_METRIC_EVALUATOR_REGISTRY.get_evaluator(eval_metric)
+def _build_judge_model(model_id: str, api_key: str, base_url: str | None = None):
+    """Build a judge ``BaseLlm`` carrying *api_key* directly, instead of reading it from env.
+    LiteLlm-backed providers take ``api_key`` (and optional ``base_url``) as constructor
+    kwargs that forward into every ``litellm.acompletion`` call. The Gemini-native model
+    class takes no ``api_key``; its cached ``google.genai`` client is replaced with one
+    built from the resolved key.
+    Routing is by ADK's ``LLMRegistry`` class resolution, which is authoritative: the
+    evaluator already resolved this same *model_id* to a model class when ``_setup_auto_rater``
+    ran at construction, so this lookup cannot disagree or fail here.
+    """
+    from google.adk.models.lite_llm import LiteLlm
+    from google.adk.models.registry import LLMRegistry
+    if issubclass(LLMRegistry().resolve(model_id), LiteLlm):
+        kwargs: dict[str, Any] = {"api_key": api_key}
+        if base_url:
+            kwargs["base_url"] = base_url
+        return LiteLlm(model=model_id, **kwargs)
+    from google.adk.models.google_llm import Gemini
+    from google.genai import Client
+    from google.genai import types as genai_types
+    model = Gemini(model=model_id)
+    client_kwargs: dict[str, Any] = {"api_key": api_key}
+    if base_url:
+        client_kwargs["http_options"] = genai_types.HttpOptions(base_url=base_url)
+    # api_client is a functools.cached_property that memoizes into the instance __dict__;
+    # seeding that slot pre-empts the lazily-built client so the judge uses the resolved key.
+    model.__dict__["api_client"] = Client(**client_kwargs)
+    return model
+def _inject_judge_credential(evaluator: Evaluator, api_key: str, base_url: str | None = None) -> None:
+    """Replace a judge evaluator's auto-rater model with one built from *api_key*.
+    Keyed on the ADK private seam (``_judge_model_options`` / ``_judge_model``, set by
+    ``LlmAsJudge._setup_auto_rater``) rather than on a class, so this single path covers
+    ``FinalResponseMatchV2Evaluator``, the ``rubric_based_*_v1`` evaluators, and
+    ``HallucinationsV1Evaluator`` (which exposes the same attributes without subclassing
+    ``LlmAsJudge``). ``get_evaluator`` returns a fresh instance per evaluation, so mutating
+    it here carries no shared state and is safe across concurrent runs.
+    TODO(upstream): propose that ADK ``JudgeModelOptions`` carry a credential or a prebuilt
+    model instance, so judge auth no longer depends on this private seam or process env.
+    """
+    opts = getattr(evaluator, "_judge_model_options", None)
+    if opts is None or not hasattr(evaluator, "_judge_model"):
+        logger.warning("evaluator %s is not judge-backed; cannot inject credential", type(evaluator).__name__)
+        return
+    model_id = getattr(opts, "judge_model", None)
+    if not model_id:
+        logger.warning(
+            "evaluator %s has no resolved judge_model; skipping credential injection", type(evaluator).__name__
+        )
+        return
+    evaluator._judge_model = _build_judge_model(model_id, api_key, base_url)
 def extract_trajectory_details(eval_result: EvaluationResult) -> dict[str, Any]:
     """Extract expected vs actual tool call details from trajectory evaluation."""
     comparisons = []
@@ -305,6 +368,8 @@ async def evaluate_builtin_metric(
     judge_model: str | None,
     threshold: float | None,
     match_type: str | None = None,
+    credential_ref: str | None = None,
+    judge_base_url: str | None = None,
 ) -> dict[str, Any]:
     """Evaluate a single built-in ADK metric.
@@ -326,6 +391,18 @@ async def evaluate_builtin_metric(
         eval_metric = build_eval_metric(metric_name, judge_model, threshold, match_type=match_type)
         evaluator: Evaluator = get_evaluator(eval_metric)
+        if credential_ref:
+            api_key = get_resolved_credential(credential_ref)
+            if api_key is None:
+                return MetricResult(
+                    metric_name=metric_name,
+                    error=(
+                        f"Metric '{metric_name}' references credential '{credential_ref}', "
+                        f"which was not provided in the run's credentialRefs."
+                    ),
+                )
+            _inject_judge_credential(evaluator, api_key, judge_base_url)
         if metric_name in _METRICS_NEEDING_INVOCATION_EVENTS:
             actual_invocations = _enrich_app_details([_to_invocation_events(inv) for inv in actual_invocations])
             if expected_invocations is not None:

{agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/config.py RENAMED Viewed

@@ -27,6 +27,14 @@ class BuiltinMetricDef(BaseModel):
     threshold: float | None = Field(default=None, ge=0, le=1)
     judge_model: str | None = None
     trajectory_match_type: str | None = None
+    credential_ref: str | None = Field(
+        default=None,
+        description="Logical name of a RunSpec.credential_refs entry whose resolved value is the judge API key.",
+    )
+    judge_base_url: str | None = Field(
+        default=None,
+        description="Optional base URL for the judge endpoint (e.g. an OpenAI-compatible proxy).",
+    )
     @field_validator("trajectory_match_type")
     @classmethod

{agentevals_cli-0.9.3 → agentevals_cli-0.9.5}/src/agentevals/custom_evaluators.py RENAMED Viewed

@@ -453,6 +453,8 @@ async def evaluate_custom_evaluator(
             judge_model=evaluator_def.judge_model,
             threshold=evaluator_def.threshold,
             match_type=evaluator_def.trajectory_match_type,
+            credential_ref=evaluator_def.credential_ref,
+            judge_base_url=evaluator_def.judge_base_url,
         )
     if isinstance(evaluator_def, OpenAIEvalDef):

agentevals-cli 0.9.3__tar.gz → 0.9.5__tar.gz

agentevals-cli 0.9.3 (tar.gz) → 0.9.5 (tar.gz)