PyPI - agentops-toolkit - Versions diffs - 0.2.3__tar.gz → 0.2.4__tar.gz - Mend

agentops-toolkit 0.2.3.tar.gz → 0.2.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

{agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: agentops-toolkit
-Version: 0.2.3
+Version: 0.2.4
 Summary: CLI toolkit for evaluating, tracing, and monitoring AI agents on Azure AI Foundry
 Keywords: ai,agent,evaluation,azure,foundry,observability
 Author: DB Lee

{agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "agentops-toolkit"
-version = "0.2.3"
+version = "0.2.4"
 description = "CLI toolkit for evaluating, tracing, and monitoring AI agents on Azure AI Foundry"
 readme = "README.md"
 license = { text = "MIT" }

{agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """AgentOps Toolkit — Evaluate, trace, and monitor AI agents."""
-__version__ = "0.2.3"
+__version__ = "0.2.4"

{agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/cli/eval_cmd.py RENAMED Viewed

@@ -72,7 +72,7 @@ def eval_run(
     # Resolve bundle
     bundle_name = bundle or (cfg.bundles.default if cfg else "rag_quality")
     project_connection = cfg.foundry.project_connection if cfg else ""
-    model_deployment = getattr(cfg.foundry, "model_deployment", "gpt-4o") if cfg else "gpt-4o"
+    model_deployment = getattr(cfg.foundry, "model_deployment", "") if cfg else ""
     output_dir = cfg.runs.output_dir if cfg else "agentops/runs"
     # Run evaluation

{agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/cli/report_cmd.py RENAMED Viewed

@@ -150,9 +150,7 @@ def report_show(
             if evaluator:
                 results = [er for er in results if er.evaluator_name == evaluator]
             scores_str = ", ".join(
-                f"{er.evaluator_name}={er.score}"
-                for er in results
-                if er.score is not None
+                f"{er.evaluator_name}={er.score}" for er in results if er.score is not None
             )
             console.print(f"  [{entry.status.value}] {entry.dataset_entry_id}: {scores_str}")

{agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/core/pipeline.py RENAMED Viewed

@@ -186,18 +186,139 @@ def compute_summary(entries: list[RunEntry], total_duration_ms: float) -> RunSum
     )
+def _derive_ai_services_endpoint(project_endpoint: str) -> str:
+    """Derive the AI Services base endpoint from a project endpoint.
+    Project endpoint format:
+        https://<account>.services.ai.azure.com/api/projects/<project>
+    AI Services base endpoint:
+        https://<account>.services.ai.azure.com
+    """
+    from urllib.parse import urlparse
+    parsed = urlparse(project_endpoint)
+    return f"{parsed.scheme}://{parsed.netloc}"
+def _is_reasoning_model(model_name: str) -> bool:
+    """Check if a model is an o-series reasoning model.
+    Reasoning models (o1, o3, o4-mini, etc.) require API version
+    2024-12-01-preview or later, which is incompatible with the
+    azure-ai-evaluation SDK's default of 2024-02-15-preview.
+    """
+    import re
+    name = model_name.lower().strip()
+    # Match o-series: o1, o1-mini, o3, o4-mini, etc.
+    return bool(re.match(r"^o\d", name))
+# The azure-ai-evaluation SDK uses "2024-02-15-preview" by default.
+# Reasoning models require "2024-12-01-preview" or later.
+_EVAL_SDK_API_VERSION = "2024-02-15-preview"
+_REASONING_MODEL_API_VERSION = "2024-12-01-preview"
+def _discover_chat_deployment(
+    project_endpoint: str,
+) -> tuple[str, str] | None:
+    """Auto-discover a chat-capable model deployment via AI Foundry SDK.
+    Returns a ``(deployment_name, api_version)`` tuple for the best candidate,
+    or *None* when no suitable deployment is found.
+    Selection criteria:
+      1. Must have ``chat_completion`` capability.
+      2. Prefers models compatible with the evaluation SDK's default API
+         version (``2024-02-15-preview``).  Reasoning / o-series models that
+         require a newer version are ranked lower but still usable.
+      3. Among compatible models, smaller / cheaper variants are preferred
+         (``mini`` → ``gpt-4.1`` → others).
+    """
+    try:
+        from azure.ai.projects import AIProjectClient
+        from azure.ai.projects.models import ModelDeployment
+        from azure.identity import DefaultAzureCredential
+        client = AIProjectClient(
+            credential=DefaultAzureCredential(),
+            endpoint=project_endpoint,
+        )
+        # Partition into compatible and reasoning-only buckets
+        compatible: list[tuple[str, str]] = []  # (name, model_name)
+        reasoning_only: list[tuple[str, str]] = []
+        for deployment in client.deployments.list():
+            if not isinstance(deployment, ModelDeployment):
+                continue
+            caps = deployment.capabilities or {}
+            if caps.get("chat_completion") != "true":
+                continue
+            model_name = deployment.model_name or deployment.name
+            if _is_reasoning_model(model_name):
+                reasoning_only.append((deployment.name, model_name))
+            else:
+                compatible.append((deployment.name, model_name))
+        def _pick_preferred(
+            candidates: list[tuple[str, str]],
+        ) -> str | None:
+            """Return the best deployment name from *candidates*."""
+            preferred = ["mini", "4.1-mini", "4o-mini", "gpt-4.1"]
+            for pref in preferred:
+                for dep_name, mdl_name in candidates:
+                    if pref in dep_name.lower() or pref in mdl_name.lower():
+                        return dep_name
+            return candidates[0][0] if candidates else None
+        # First try compatible models (work with eval SDK default API version)
+        if compatible:
+            chosen = _pick_preferred(compatible)
+            if chosen:
+                logger.info(
+                    "Auto-discovered deployment '%s' (api_version=%s)",
+                    chosen,
+                    _EVAL_SDK_API_VERSION,
+                )
+                return (chosen, _EVAL_SDK_API_VERSION)
+        # Fall back to reasoning models with a newer API version
+        if reasoning_only:
+            chosen = _pick_preferred(reasoning_only)
+            if chosen:
+                logger.warning(
+                    "Only reasoning model deployments available. Using '%s' with api_version=%s",
+                    chosen,
+                    _REASONING_MODEL_API_VERSION,
+                )
+                return (chosen, _REASONING_MODEL_API_VERSION)
+        return None
+    except Exception as e:
+        logger.warning("Failed to auto-discover deployments: %s", e)
+        return None
 async def run_evaluation(
     dataset_path: str | Path,
     bundle_name: str,
     output_dir: str | Path = "agentops/runs",
     run_name: str = "default",
     project_connection: str = "",
-    model_deployment: str = "gpt-4o",
+    model_deployment: str = "",
 ) -> Run:
     """Execute the full evaluation pipeline.
     For Sprint 1, this evaluates a pre-populated dataset (agent responses
     already in the JSONL) against the specified bundle's evaluators.
+    Args:
+        project_connection: AI Foundry project endpoint URL.
+        model_deployment: Model deployment name for LLM-judge evaluators.
+            If empty, auto-discovers a suitable chat model from the project.
     """
     output_dir = Path(output_dir)
@@ -213,16 +334,47 @@ async def run_evaluation(
     # Build model_config for LLM-judge evaluators
     model_config: dict[str, Any] | None = None
     if project_connection:
-        try:
-            from azure.identity import DefaultAzureCredential
+        # Derive the AI Services base endpoint from the project endpoint
+        azure_endpoint = _derive_ai_services_endpoint(project_connection)
+        # Resolve deployment + API version
+        deployment: str | None = None
+        api_version: str = _EVAL_SDK_API_VERSION
+        if model_deployment:
+            # Explicit deployment — pick correct API version
+            deployment = model_deployment
+            if _is_reasoning_model(model_deployment):
+                api_version = _REASONING_MODEL_API_VERSION
+                logger.info(
+                    "Reasoning model detected; using api_version=%s",
+                    api_version,
+                )
+        else:
+            # Auto-discover
+            result = _discover_chat_deployment(project_connection)
+            if result:
+                deployment, api_version = result
+        if deployment:
             model_config = {
-                "azure_endpoint": project_connection,
-                "azure_deployment": model_deployment,
-                "credential": DefaultAzureCredential(),
+                "azure_endpoint": azure_endpoint,
+                "azure_deployment": deployment,
+                "api_version": api_version,
             }
-        except ImportError:
-            logger.warning("azure-identity not installed; LLM-judge evaluators may fail")
+            logger.info(
+                "Using model deployment '%s' at %s (api_version=%s)",
+                deployment,
+                azure_endpoint,
+                api_version,
+            )
+        else:
+            logger.warning(
+                "No chat-capable model deployment found. "
+                "LLM-judge evaluators will fail. "
+                "Deploy a chat model in your AI Foundry project or set "
+                "model_deployment in agentops.yaml."
+            )
     # Build evaluators
     evaluators = [

{agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/evaluators/base.py RENAMED Viewed

@@ -39,8 +39,16 @@ class BaseEvaluator(ABC):
 # Evaluators that use an LLM judge and require model_config
 _LLM_JUDGE_EVALUATORS: set[str] = {
-    "groundedness", "relevance", "coherence", "fluency", "similarity",
-    "hate_unfairness", "sexual", "violence", "self_harm", "protected_material",
+    "groundedness",
+    "relevance",
+    "coherence",
+    "fluency",
+    "similarity",
+    "hate_unfairness",
+    "sexual",
+    "violence",
+    "self_harm",
+    "protected_material",
 }

{agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/models/config.py RENAMED Viewed

@@ -74,8 +74,9 @@ class FoundryConnection(BaseModel):
         description="Foundry project endpoint URL or ${ENV_VAR}",
     )
     model_deployment: str = Field(
-        default="gpt-4o",
-        description="Model deployment name for LLM-judge evaluators",
+        default="",
+        description="Model deployment name for LLM-judge evaluators. "
+        "If empty, auto-discovers a suitable chat model from the project.",
     )
     credential: CredentialType = CredentialType.DEFAULT
     rate_limit: RateLimitConfig = Field(default_factory=RateLimitConfig)