ibm-watsonx-orchestrate-evaluation-framework 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
- wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
- wxo_agentic_evaluation/analytics/tools/main.py +18 -7
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +69 -48
- wxo_agentic_evaluation/annotate.py +6 -4
- wxo_agentic_evaluation/arg_configs.py +8 -2
- wxo_agentic_evaluation/batch_annotate.py +78 -25
- wxo_agentic_evaluation/data_annotator.py +18 -13
- wxo_agentic_evaluation/description_quality_checker.py +20 -14
- wxo_agentic_evaluation/evaluation_package.py +114 -70
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
- wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
- wxo_agentic_evaluation/external_agent/types.py +12 -5
- wxo_agentic_evaluation/inference_backend.py +158 -73
- wxo_agentic_evaluation/llm_matching.py +4 -3
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_user.py +7 -3
- wxo_agentic_evaluation/main.py +175 -67
- wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
- wxo_agentic_evaluation/metrics/metrics.py +26 -12
- wxo_agentic_evaluation/prompt/template_render.py +32 -11
- wxo_agentic_evaluation/quick_eval.py +49 -23
- wxo_agentic_evaluation/record_chat.py +70 -33
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
- wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
- wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
- wxo_agentic_evaluation/resource_map.py +2 -1
- wxo_agentic_evaluation/service_instance.py +24 -11
- wxo_agentic_evaluation/service_provider/__init__.py +33 -13
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
- wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
- wxo_agentic_evaluation/service_provider/provider.py +0 -1
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
- wxo_agentic_evaluation/tool_planner.py +128 -44
- wxo_agentic_evaluation/type.py +12 -9
- wxo_agentic_evaluation/utils/__init__.py +1 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
- wxo_agentic_evaluation/utils/rich_utils.py +23 -9
- wxo_agentic_evaluation/utils/utils.py +83 -52
- ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/METADATA +0 -385
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
@@ -2,17 +2,10 @@ from __future__ import annotations
 
 import json
 from types import NoneType
-from typing import (
-    Any,
-    Dict,
-    List,
-    Literal,
-    Optional,
-    Union,
-)
-from typing_extensions import Self
+from typing import Any, Dict, List, Literal, Optional, Union
 
 from pydantic import BaseModel, Field, ValidationError, model_validator
+from typing_extensions import Self
 
 from wxo_agentic_evaluation.referenceless_eval.metrics import MetricRunResult
 
@@ -32,12 +25,14 @@ class FunctionCallMetric(BaseModel):
     jsonschema: Dict[str, Any] = Field(
         ..., description="JSON Schema dict for this metric's output."
     )
-    examples: Optional[List[Dict[Literal["user_kwargs", "output"], Any]]] =
-
-
-
-
-
+    examples: Optional[List[Dict[Literal["user_kwargs", "output"], Any]]] = (
+        Field(
+            None,
+            description=(
+                "List of example inputs and outputs for this metric; "
+                "each example is a dict with 'user_kwargs' and 'output' keys."
+            ),
+        )
     )
 
 
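For orientation, the reshaped `examples` field above holds few-shot examples keyed by `user_kwargs` and `output`. A hypothetical value (the metric inputs and output keys below are invented for illustration, not taken from the package) could look like:

```python
# Hypothetical illustration of the `examples` shape described above; the
# tool name, arguments, and output fields are placeholders.
examples = [
    {
        "user_kwargs": {"tool_name": "get_weather", "arguments": {"city": "Paris"}},
        "output": {"is_correct": True, "confidence": 0.9},
    }
]
```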
@@ -52,7 +47,8 @@ class StaticMetricResult(BaseModel):
     """
 
     description: str = Field(
-        ...,
+        ...,
+        description="Human-readable description of this static validation check.",
     )
     valid: bool = Field(
         ..., description="True if this static check passed; False otherwise."
@@ -73,7 +69,9 @@ class StaticResult(BaseModel):
 
     metrics: Dict[str, StaticMetricResult] = Field(
         ...,
-        description=(
+        description=(
+            "Mapping from each static-check name to its StaticMetricResult."
+        ),
     )
     final_decision: bool = Field(
         ...,
@@ -133,7 +131,8 @@ class SemanticMetricResult(BaseModel):
     error: Optional[str] = Field(
         None,
         description=(
-            "Error message if prompt generation or parsing failed; "
+            "Error message if prompt generation or parsing failed; "
+            "otherwise None."
         ),
     )
     is_correct: bool = Field(
@@ -157,11 +156,11 @@ class SemanticMetricResult(BaseModel):
         ),
     )
 
-    @model_validator(mode=
+    @model_validator(mode="after")
     def raw_response_json(self) -> Self:
         if isinstance(self.raw_response, str):
             self.raw_response = json.loads(self.raw_response)
-
+
         return self
 
     @classmethod
@@ -211,7 +210,9 @@ class SemanticCategoryResult(BaseModel):
 
     metrics: Optional[Dict[str, SemanticMetricResult]] = Field(
         None,
-        description=(
+        description=(
+            "Mapping metric_name -> SemanticMetricResult for this category."
+        ),
     )
     avg_score: Optional[float] = Field(
         None,
@@ -222,7 +223,9 @@ class SemanticCategoryResult(BaseModel):
     )
 
     @classmethod
-    def from_results(
+    def from_results(
+        cls, results: List[MetricRunResult]
+    ) -> "SemanticCategoryResult":
         """
         Build a category result from a list of MetricRunResult objects.
         """
@@ -249,11 +252,15 @@ class SemanticResult(BaseModel):
 
     general: Optional[SemanticCategoryResult] = Field(
         None,
-        description=(
+        description=(
+            "Results of general tool-call metrics, if any; otherwise None."
+        ),
     )
     function_selection: Optional[SemanticCategoryResult] = Field(
         None,
-        description=(
+        description=(
+            "Results of function-selection metrics, if any; otherwise None."
+        ),
     )
     parameter: Optional[Dict[str, SemanticCategoryResult]] = Field(
         None,
@@ -302,7 +309,8 @@ class TransformResult(BaseModel):
         ),
     )
     execution_output: Any = Field(
-        None,
+        None,
+        description="The actual output of executing the transformation code.",
     )
     correction: Optional[str] = Field(
         None,
@@ -311,7 +319,8 @@ class TransformResult(BaseModel):
     error: Optional[str] = Field(
         None,
         description=(
-            "Error message if code generation or execution failed; "
+            "Error message if code generation or execution failed; "
+            "otherwise None."
        ),
    )
 
@@ -356,7 +365,9 @@ class PipelineResult(BaseModel):
     Final output of the function-calling pipeline for one tool call.
     """
 
-    inputs: FunctionCallInput = Field(
+    inputs: FunctionCallInput = Field(
+        ..., description="Echo of the pipeline inputs."
+    )
     static: Optional[StaticResult] = Field(
         None, description="Static schema-validation results, if enabled."
     )
@@ -430,7 +441,9 @@ class PipelineResult(BaseModel):
         if param_avgs:
             cat_avgs.append(sum(param_avgs) / len(param_avgs))
 
-        values.overall_avg_score =
+        values.overall_avg_score = (
+            sum(cat_avgs) / len(cat_avgs) if cat_avgs else None
+        )
         values.overall_valid = ok
         return values
 
@@ -506,7 +519,9 @@ class ToolFunctionCall(BaseModel):
     Parsed representation of an LLM's function call response.
     """
 
-    name: str = Field(
+    name: str = Field(
+        ..., description="Name of the function the LLM chose to call."
+    )
     arguments: str = Field(
         ..., description="JSON-encoded string of the call's arguments."
     )
@@ -60,7 +60,9 @@ class BaseField(BaseModel, ABC):
             if field_cls.can_handle(name, schema):
                 desc = schema.get("description", "")
                 extra = {
-                    k: v
+                    k: v
+                    for k, v in schema.items()
+                    if k not in ("type", "description")
                 }
                 return field_cls(
                     name=name,
@@ -74,7 +76,9 @@ class BaseField(BaseModel, ABC):
             json_type=schema.get("type", "string"),
             description=schema.get("description", ""),
             jsonschema_extra={
-                k: v
+                k: v
+                for k, v in schema.items()
+                if k not in ("type", "description")
             },
             extra_params={},
         )
@@ -122,10 +126,12 @@ class NumericField(BaseField):
     """
 
     threshold_low: Optional[float] = PydanticField(
-        None,
+        None,
+        description="Lower bound for correctness checks (not in JSONSchema).",
     )
     threshold_high: Optional[float] = PydanticField(
-        None,
+        None,
+        description="Upper bound for correctness checks (not in JSONSchema).",
     )
 
     __abstract__ = False
@@ -153,7 +159,9 @@ class NumericField(BaseField):
             json_type=schema.get("type", "number"),
             description=schema.get("description", ""),
             jsonschema_extra={
-                k: v
+                k: v
+                for k, v in schema.items()
+                if k not in ("type", "description")
             },
             extra_params={},
         )
@@ -131,7 +131,9 @@ class Metric:
             additional_properties=additional_props,
         )
 
-    def is_important(
+    def is_important(
+        self, result: Dict[str, Any]
+    ) -> Tuple[bool, Optional[str]]:
         """
         A result is 'important' if its confidence lies within the defined confidence thresholds.
 
@@ -146,7 +148,9 @@ class Metric:
         except (TypeError, ValueError):
             return False, "Invalid confidence value"
         # locate the confidence field
-        conf_field = next(
+        conf_field = next(
+            (f for f in self.fields if f.name == "confidence"), None
+        )
         if isinstance(conf_field, NumericField):
             ok = conf_field.is_within_threshold(conf)
             reason = (
@@ -266,7 +270,10 @@ class StandardMetric(Metric):
             json_type="number",
             description=f"Confidence in the output value (range {min_conf} to {max_conf}).",
             jsonschema_extra={"minimum": min_conf, "maximum": max_conf},
-            extra_params={
+            extra_params={
+                "threshold_low": min_conf,
+                "threshold_high": max_conf,
+            },
         )
         correction = CorrectionField(
             name="correction",
@@ -277,7 +284,9 @@ class StandardMetric(Metric):
         fields = [explanation, evidence, output, confidence, correction]
         super().__init__(name=name, description=description, fields=fields)
 
-    def is_important(
+    def is_important(
+        self, result: Dict[str, Any]
+    ) -> Tuple[bool, Optional[str]]:
         """
         A result is 'important' if its confidence lies within the defined confidence thresholds.
 
@@ -292,7 +301,9 @@ class StandardMetric(Metric):
         except (TypeError, ValueError):
             return False, "Invalid confidence value"
         # locate the confidence field
-        conf_field = next(
+        conf_field = next(
+            (f for f in self.fields if f.name == "confidence"), None
+        )
         if isinstance(conf_field, NumericField):
             ok = conf_field.is_within_threshold(conf)
             reason = (
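The `is_important` hunks above only re-wrap the signatures, and the `extra_params` change now feeds the confidence bounds into the confidence `NumericField`. Per the docstring, a result counts as important when its confidence falls within the configured thresholds; a rough sketch of that check, with hypothetical helper names rather than the package's own API:

```python
# Rough sketch of the confidence-threshold idea described above, not the
# package's code; helper names and default bounds are illustrative only.
from typing import Optional, Tuple


def is_within_threshold(conf: float, low: Optional[float], high: Optional[float]) -> bool:
    # Missing bounds are treated as open-ended (assumption for illustration).
    if low is not None and conf < low:
        return False
    if high is not None and conf > high:
        return False
    return True


def is_important(conf: float, low: float = 0.0, high: float = 1.0) -> Tuple[bool, Optional[str]]:
    ok = is_within_threshold(conf, low, high)
    reason = None if ok else f"confidence {conf} outside [{low}, {high}]"
    return ok, reason
```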
@@ -5,7 +5,9 @@ from pydantic import BaseModel
 
 from wxo_agentic_evaluation.referenceless_eval.metrics.field import NumericField
 from wxo_agentic_evaluation.referenceless_eval.metrics.metric import Metric
-from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import
+from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import (
+    MetricPrompt,
+)
 from wxo_agentic_evaluation.referenceless_eval.prompt.runner import (
     AsyncGen,
     Prompt,
@@ -40,7 +42,8 @@ class MetricRunner:
     """
 
     def __init__(
-        self,
+        self,
+        entries: Optional[List[Tuple[MetricPrompt, Dict[str, Any]]]] = None,
     ) -> None:
         """
         Args:
@@ -51,7 +54,9 @@ class MetricRunner:
             for mp, kw in entries:
                 self.add(mp, kw)
 
-    def add(
+    def add(
+        self, metric_prompt: MetricPrompt, user_kwargs: Dict[str, Any]
+    ) -> None:
         """
         Add a metric to run.
 
@@ -68,7 +68,9 @@ class MetricPrompt:
         # Store defaults for system context
         # This allows overriding system context without modifying the template
        # during prompt building
-        self.system_kwargs_defaults: Dict[str, Any] =
+        self.system_kwargs_defaults: Dict[str, Any] = (
+            system_kwargs_defaults.copy()
+        )
 
         # Initialize examples list
         # This will hold (user_kwargs, output) pairs for few-shot prompting
@@ -104,7 +106,9 @@ class MetricPrompt:
 
     # --- Example Management ---
 
-    def add_example(
+    def add_example(
+        self, user_kwargs: Dict[str, Any], output: Dict[str, Any]
+    ) -> None:
         """
         Add a few-shot example.
 
@@ -17,7 +17,11 @@ def remove_threshold_fields(schema: dict) -> dict:
             schema[key] = remove_threshold_fields(value)
         elif isinstance(value, list):
             schema[key] = [
-
+                (
+                    remove_threshold_fields(item)
+                    if isinstance(item, dict)
+                    else item
+                )
                 for item in value
             ]
     return schema
@@ -1,10 +1,22 @@
 import asyncio
-from typing import
+from typing import (
+    Any,
+    Awaitable,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    TypeVar,
+    Union,
+)
 
 from pydantic import BaseModel
 
 Prompt = Union[str, List[Dict[str, Any]]]
-PromptAndSchema = Tuple[
+PromptAndSchema = Tuple[
+    Union[str, List[Dict[str, Any]]], Optional[Dict[str, Any]]
+]
 SyncGen = Callable[[Prompt], Union[str, Any]]
 BatchGen = Callable[[List[Prompt]], List[Union[str, Any]]]
 AsyncGen = Callable[[Prompt], Awaitable[Union[str, Any]]]
@@ -137,7 +149,8 @@ class PromptRunner:
                 return index, PromptResult(prompt=prompt, error=str(e))
 
         tasks = [
-            asyncio.create_task(_run_one(i, p))
+            asyncio.create_task(_run_one(i, p))
+            for i, p in enumerate(self.prompts)
         ]
         indexed_results = await asyncio.gather(*tasks)
         # Sort results to match original order
@@ -14,30 +14,37 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
     ToolCall,
     ToolSpec,
 )
-from wxo_agentic_evaluation.type import Message
 from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.type import Message
+
 
 class ReferencelessEvaluation:
     """
-
-
-
-
-
-
+    Note: static.final_decison, if `True` -> then all static metrics were valid. If false, atleast one of the static metrics failed. Look at explanation for reasoning
+    Note: if static.final_decision == True, check semantic metrics. Semantic metrics **not** run if static.final_decision is False.
+    ---
+    Note: For semantic metrics, check agentic constraints. If agent-constraints == False, no point in checking others. If true, check others.
+    Note: METRIC_FUNCTION_SELECTION_APPROPRIATENESS == False, implies that the LLM should have called some other function/tool before *OR* it is a redundant call.
+    Note: When parsing the semantic metrics, check for `is_correct` field. if `false` there is some mistake that the LLMaJ found in that tool call.
     """
+
     def __init__(
         self,
         api_spec: List[Mapping[str, Any]],
         messages: List[Message],
         model_id: str,
         task_n: str,
-        dataset_name: str,
-
+        dataset_name: str,
+    ):
+
         self.metrics_client = get_provider(
             model_id=model_id,
-            params={
-
+            params={
+                "min_new_tokens": 0,
+                "decoding_method": "greedy",
+                "max_new_tokens": 4096,
+            },
+            referenceless_eval=True,
         )
 
         self.pipeline = ReflectionPipeline(
@@ -72,7 +79,11 @@ class ReferencelessEvaluation:
         examples = []
 
         processed_data = [
-            {
+            {
+                k: msg.model_dump().get(k)
+                for k in ["role", "content", "type"]
+                if k in msg.model_dump()
+            }
             for msg in self.messages
         ]
 
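The docstring notes added above spell out how to read one pipeline result: check `static.final_decision` first (semantic metrics are only run when it is True), then inspect each semantic metric's `is_correct` flag. A rough sketch of that reading order, treating the result as a plain dict (an assumption made purely for illustration, not the package's API):

```python
# Rough sketch of the reading order described in the docstring notes above;
# the dict layout is assumed for illustration only.
def summarize(result: dict) -> str:
    static = result.get("static") or {}
    if not static.get("final_decision", False):
        # At least one static metric failed; its explanation holds the reasoning.
        return "failed static checks"
    semantic = result.get("semantic") or {}
    failed = [
        name
        for category in semantic.values()
        if isinstance(category, dict)
        for name, metric in (category.get("metrics") or {}).items()
        if isinstance(metric, dict) and metric.get("is_correct") is False
    ]
    return "ok" if not failed else "semantic issues: " + ", ".join(failed)
```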
@@ -1,4 +1,5 @@
 from collections import defaultdict
+
 from wxo_agentic_evaluation.inference_backend import WXOClient, is_saas_url
 
 
@@ -44,4 +45,4 @@ class ResourceMap:
 
         agent2tools = dict(agent2tools)
         tools2agents = dict(tools2agents)
-        return agent2tools, tools2agents
+        return agent2tools, tools2agents
@@ -1,8 +1,10 @@
 import logging
-import yaml
 import os
+
 import requests
-
+import yaml
+
+from wxo_agentic_evaluation.utils.utils import is_ibm_cloud_url, is_saas_url
 
 logger = logging.getLogger(__name__)
 
@@ -11,13 +13,15 @@ USER = {"username": "wxo.archer@ibm.com", "password": "watsonx"}
 
 class ServiceInstance:
     def __init__(
-        self,
+        self,
+        service_url,
+        tenant_name,
+        is_saas: bool = None,
+        is_ibm_cloud: bool = None,
     ) -> None:
         self.service_url = service_url
         self.tenant_name = tenant_name
-        STAGING_AUTH_ENDPOINT = (
-            "https://iam.platform.test.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
-        )
+        STAGING_AUTH_ENDPOINT = "https://iam.platform.test.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
         PROD_AUTH_ENDPOINT = (
             "https://iam.platform.saas.ibm.com/siusermgr/api/1.0/apikeys/token"
         )
@@ -25,7 +29,9 @@ class ServiceInstance:
 
         self.is_saas = is_saas_url(service_url) if is_saas is None else is_saas
         self.is_ibm_cloud = (
-            is_ibm_cloud_url(service_url)
+            is_ibm_cloud_url(service_url)
+            if is_ibm_cloud is None
+            else is_ibm_cloud
         )
 
         if self.is_saas:
@@ -88,7 +94,8 @@ class ServiceInstance:
 
     def _get_tenant_token(self, tenant_id: str):
         resp = requests.post(
-            self.tenant_auth_endpoint.format(self.service_url, tenant_id),
+            self.tenant_auth_endpoint.format(self.service_url, tenant_id),
+            data=USER,
         )
         if resp.status_code == 200:
             return resp.json()["access_token"]
@@ -122,7 +129,9 @@ class ServiceInstance:
             "tags": ["test"],
         }
 
-        resp = requests.post(
+        resp = requests.post(
+            self.tenant_url, headers=headers, json=tenant_config
+        )
         if resp.status_code == 201:
             return True
         else:
@@ -159,8 +168,12 @@ def tenant_setup(service_url: str, tenant_name: str):
     # else:
     #     tenant_token = service_instance._get_tenant_token(tenant_id)
 
-    auth_config_path =
-
+    auth_config_path = (
+        f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
+    )
+    env_config_path = (
+        f"{os.path.expanduser('~')}/.config/orchestrate/config.yaml"
+    )
 
     # TO-DO: update SDK and use SDK to manage this
     with open(auth_config_path, "r") as f:
@@ -1,12 +1,24 @@
-
-
-from wxo_agentic_evaluation.service_provider.model_proxy_provider import ModelProxyProvider
-from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import ModelProxyProviderLLMKitWrapper, WatsonXLLMKitWrapper
+import os
+
 from wxo_agentic_evaluation.arg_configs import ProviderConfig
+from wxo_agentic_evaluation.service_provider.model_proxy_provider import (
+    ModelProxyProvider,
+)
+from wxo_agentic_evaluation.service_provider.ollama_provider import (
+    OllamaProvider,
+)
+from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import (
+    ModelProxyProviderLLMKitWrapper,
+    WatsonXLLMKitWrapper,
+)
+from wxo_agentic_evaluation.service_provider.watsonx_provider import (
+    WatsonXProvider,
+)
 
-import os
 
-def _instantiate_provider(
+def _instantiate_provider(
+    config: ProviderConfig, is_referenceless_eval: bool = False, **kwargs
+):
     if config.provider == "watsonx":
         if is_referenceless_eval:
             provider = WatsonXLLMKitWrapper
@@ -22,12 +34,17 @@ def _instantiate_provider(config: ProviderConfig, is_referenceless_eval: bool =
             provider = ModelProxyProvider
         return provider(model_id=config.model_id, **kwargs)
     else:
-        raise RuntimeError(
-
-
-
-        return _instantiate_provider(config, **kwargs)
+        raise RuntimeError(
+            f"target provider is not supported {config.provider}"
+        )
+
 
+def get_provider(
+    config: ProviderConfig = None,
+    model_id: str = None,
+    referenceless_eval: bool = False,
+    **kwargs,
+):
     if not model_id:
         raise ValueError("model_id must be provided if config is not supplied")
 
@@ -35,10 +52,13 @@ def get_provider(config: ProviderConfig = None, model_id: str = None, referencel
         config = ProviderConfig(provider="watsonx", model_id=model_id)
         return _instantiate_provider(config, referenceless_eval, **kwargs)
 
-    if "
+    if "WO_INSTANCE" in os.environ:
         config = ProviderConfig(provider="model_proxy", model_id=model_id)
         return _instantiate_provider(config, referenceless_eval, **kwargs)
+
+    if config:
+        return _instantiate_provider(config, **kwargs)
 
     raise RuntimeError(
         "No provider found. Please either provide a config or set the required environment variables."
-    )
+    )