ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/METADATA +35 -0
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/RECORD +65 -60
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +18 -7
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +69 -48
  8. wxo_agentic_evaluation/annotate.py +6 -4
  9. wxo_agentic_evaluation/arg_configs.py +9 -3
  10. wxo_agentic_evaluation/batch_annotate.py +78 -25
  11. wxo_agentic_evaluation/data_annotator.py +18 -13
  12. wxo_agentic_evaluation/description_quality_checker.py +20 -14
  13. wxo_agentic_evaluation/evaluation.py +42 -0
  14. wxo_agentic_evaluation/evaluation_package.py +117 -70
  15. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  16. wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
  17. wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
  18. wxo_agentic_evaluation/external_agent/types.py +12 -5
  19. wxo_agentic_evaluation/inference_backend.py +183 -79
  20. wxo_agentic_evaluation/llm_matching.py +4 -3
  21. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  22. wxo_agentic_evaluation/llm_user.py +7 -3
  23. wxo_agentic_evaluation/main.py +175 -67
  24. wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
  25. wxo_agentic_evaluation/metrics/metrics.py +26 -12
  26. wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
  27. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +176 -0
  28. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
  29. wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
  30. wxo_agentic_evaluation/prompt/template_render.py +32 -11
  31. wxo_agentic_evaluation/quick_eval.py +49 -23
  32. wxo_agentic_evaluation/record_chat.py +70 -33
  33. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
  34. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
  35. wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
  38. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
  39. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
  40. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
  41. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
  42. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
  43. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
  44. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
  45. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
  46. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
  47. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
  48. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
  49. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
  50. wxo_agentic_evaluation/resource_map.py +2 -1
  51. wxo_agentic_evaluation/service_instance.py +103 -21
  52. wxo_agentic_evaluation/service_provider/__init__.py +33 -13
  53. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +216 -34
  54. wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
  55. wxo_agentic_evaluation/service_provider/provider.py +0 -1
  56. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
  57. wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
  58. wxo_agentic_evaluation/tool_planner.py +128 -44
  59. wxo_agentic_evaluation/type.py +12 -9
  60. wxo_agentic_evaluation/utils/__init__.py +1 -0
  61. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
  62. wxo_agentic_evaluation/utils/rich_utils.py +23 -9
  63. wxo_agentic_evaluation/utils/utils.py +83 -52
  64. ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA +0 -386
  65. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/WHEEL +0 -0
  66. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py

@@ -1,8 +1,6 @@
 from typing import Dict, List

-from jsonschema import (
-    Draft7Validator,
-)
+from jsonschema import Draft7Validator

 from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
     StaticMetricResult,
@@ -27,7 +25,9 @@ _STATIC_CHECKS: Dict[str, str] = {
 }


-def evaluate_static(apis_specs: List[ToolSpec], api_call: ToolCall) -> StaticResult:
+def evaluate_static(
+    apis_specs: List[ToolSpec], api_call: ToolCall
+) -> StaticResult:
     """
     Perform static validation on a single tool call.

@@ -97,7 +97,9 @@ def _check_tool_call(specs: List[ToolSpec], call: ToolCall) -> Dict[str, str]:
     errors: Dict[str, str] = {}

     # 1) Function existence
-    spec = next((s for s in specs if s.function.name == call.function.name), None)
+    spec = next(
+        (s for s in specs if s.function.name == call.function.name), None
+    )
     if not spec:
         errors["non_existent_function"] = (
             f"Function '{call.function.name}' does not exist in the provided API specifications:"
@@ -110,7 +112,9 @@ def _check_tool_call(specs: List[ToolSpec], call: ToolCall) -> Dict[str, str]:
     parsed_arguments = call.function.parsed_arguments

     # 2) Parameter existence check
-    if non_existent_params := set(parsed_arguments.keys()) - set(properties.keys()):
+    if non_existent_params := set(parsed_arguments.keys()) - set(
+        properties.keys()
+    ):
         errors["non_existent_parameter"] = (
             f"Parameters not defined in function '{call.function.name}': "
             f"{', '.join(sorted(non_existent_params))}. "
@@ -126,7 +130,9 @@ def _check_tool_call(specs: List[ToolSpec], call: ToolCall) -> Dict[str, str]:
     other_errors = []

     for error in validator.iter_errors(parsed_arguments):
-        field = ".".join(str(x) for x in error.path) if error.path else "unknown"
+        field = (
+            ".".join(str(x) for x in error.path) if error.path else "unknown"
+        )
         if error.validator == "required":
             missing_required.append(error.message)
         elif error.validator == "type":
@@ -145,12 +151,12 @@ def _check_tool_call(specs: List[ToolSpec], call: ToolCall) -> Dict[str, str]:
             "Incorrect parameter type(s): " + "; ".join(incorrect_types)
         )
     if invalid_enum:
-        errors["allowed_values_violation"] = "Invalid parameter value(s): " + "; ".join(
-            invalid_enum
+        errors["allowed_values_violation"] = (
+            "Invalid parameter value(s): " + "; ".join(invalid_enum)
         )
     if other_errors:
-        errors["json_schema_validation"] = "Other validation error(s): " + "; ".join(
-            other_errors
+        errors["json_schema_validation"] = (
+            "Other validation error(s): " + "; ".join(other_errors)
         )

     return errors
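
The static_checker.py hunks above are formatting-only, but they make the module's validation pattern easy to see: resolve the tool spec by function name, flag arguments that are not in the spec's properties, then run the parsed arguments through jsonschema's Draft7Validator and bucket each error by its validator keyword. Below is a minimal standalone sketch of that bucketing step; the enum branch and the error keys other than allowed_values_violation and json_schema_validation are assumptions for illustration, not lines taken from the package.

# Minimal sketch, not the package's code: categorize jsonschema validation
# errors the way the static checker above does. Keys marked "assumed" are
# illustrative only.
from typing import Dict, List

from jsonschema import Draft7Validator


def categorize_argument_errors(
    parameters_schema: dict, parsed_arguments: dict
) -> Dict[str, str]:
    validator = Draft7Validator(parameters_schema)
    missing_required: List[str] = []
    incorrect_types: List[str] = []
    invalid_enum: List[str] = []
    other_errors: List[str] = []

    for error in validator.iter_errors(parsed_arguments):
        field = ".".join(str(x) for x in error.path) if error.path else "unknown"
        if error.validator == "required":
            missing_required.append(error.message)
        elif error.validator == "type":
            incorrect_types.append(f"{field}: {error.message}")
        elif error.validator == "enum":  # assumed branch; not shown in the hunks
            invalid_enum.append(f"{field}: {error.message}")
        else:
            other_errors.append(f"{field}: {error.message}")

    errors: Dict[str, str] = {}
    if missing_required:  # key name assumed
        errors["missing_required_parameter"] = "; ".join(missing_required)
    if incorrect_types:  # key name assumed
        errors["incorrect_parameter_type"] = (
            "Incorrect parameter type(s): " + "; ".join(incorrect_types)
        )
    if invalid_enum:
        errors["allowed_values_violation"] = (
            "Invalid parameter value(s): " + "; ".join(invalid_enum)
        )
    if other_errors:
        errors["json_schema_validation"] = (
            "Other validation error(s): " + "; ".join(other_errors)
        )
    return errors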
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py

@@ -2,17 +2,10 @@ from __future__ import annotations

 import json
 from types import NoneType
-from typing import (
-    Any,
-    Dict,
-    List,
-    Literal,
-    Optional,
-    Union,
-)
-from typing_extensions import Self
+from typing import Any, Dict, List, Literal, Optional, Union

 from pydantic import BaseModel, Field, ValidationError, model_validator
+from typing_extensions import Self

 from wxo_agentic_evaluation.referenceless_eval.metrics import MetricRunResult

@@ -32,12 +25,14 @@ class FunctionCallMetric(BaseModel):
     jsonschema: Dict[str, Any] = Field(
         ..., description="JSON Schema dict for this metric's output."
     )
-    examples: Optional[List[Dict[Literal["user_kwargs", "output"], Any]]] = Field(
-        None,
-        description=(
-            "List of example inputs and outputs for this metric; "
-            "each example is a dict with 'user_kwargs' and 'output' keys."
-        ),
+    examples: Optional[List[Dict[Literal["user_kwargs", "output"], Any]]] = (
+        Field(
+            None,
+            description=(
+                "List of example inputs and outputs for this metric; "
+                "each example is a dict with 'user_kwargs' and 'output' keys."
+            ),
+        )
     )


@@ -52,7 +47,8 @@ class StaticMetricResult(BaseModel):
     """

     description: str = Field(
-        ..., description="Human-readable description of this static validation check."
+        ...,
+        description="Human-readable description of this static validation check.",
     )
     valid: bool = Field(
         ..., description="True if this static check passed; False otherwise."
@@ -73,7 +69,9 @@ class StaticResult(BaseModel):

     metrics: Dict[str, StaticMetricResult] = Field(
         ...,
-        description=("Mapping from each static-check name to its StaticMetricResult."),
+        description=(
+            "Mapping from each static-check name to its StaticMetricResult."
+        ),
     )
     final_decision: bool = Field(
         ...,
@@ -133,7 +131,8 @@ class SemanticMetricResult(BaseModel):
     error: Optional[str] = Field(
         None,
         description=(
-            "Error message if prompt generation or parsing failed; " "otherwise None."
+            "Error message if prompt generation or parsing failed; "
+            "otherwise None."
         ),
     )
     is_correct: bool = Field(
@@ -157,11 +156,11 @@ class SemanticMetricResult(BaseModel):
         ),
     )

-    @model_validator(mode='after')
+    @model_validator(mode="after")
     def raw_response_json(self) -> Self:
         if isinstance(self.raw_response, str):
             self.raw_response = json.loads(self.raw_response)
-
+
         return self

     @classmethod
@@ -211,7 +210,9 @@ class SemanticCategoryResult(BaseModel):

     metrics: Optional[Dict[str, SemanticMetricResult]] = Field(
         None,
-        description=("Mapping metric_name -> SemanticMetricResult for this category."),
+        description=(
+            "Mapping metric_name -> SemanticMetricResult for this category."
+        ),
     )
     avg_score: Optional[float] = Field(
         None,
@@ -222,7 +223,9 @@
     )

     @classmethod
-    def from_results(cls, results: List[MetricRunResult]) -> "SemanticCategoryResult":
+    def from_results(
+        cls, results: List[MetricRunResult]
+    ) -> "SemanticCategoryResult":
         """
         Build a category result from a list of MetricRunResult objects.
         """
@@ -249,11 +252,15 @@ class SemanticResult(BaseModel):

     general: Optional[SemanticCategoryResult] = Field(
         None,
-        description=("Results of general tool-call metrics, if any; otherwise None."),
+        description=(
+            "Results of general tool-call metrics, if any; otherwise None."
+        ),
     )
     function_selection: Optional[SemanticCategoryResult] = Field(
         None,
-        description=("Results of function-selection metrics, if any; otherwise None."),
+        description=(
+            "Results of function-selection metrics, if any; otherwise None."
+        ),
     )
     parameter: Optional[Dict[str, SemanticCategoryResult]] = Field(
         None,
@@ -302,7 +309,8 @@ class TransformResult(BaseModel):
         ),
     )
     execution_output: Any = Field(
-        None, description="The actual output of executing the transformation code."
+        None,
+        description="The actual output of executing the transformation code.",
     )
     correction: Optional[str] = Field(
         None,
@@ -311,7 +319,8 @@
     error: Optional[str] = Field(
         None,
         description=(
-            "Error message if code generation or execution failed; " "otherwise None."
+            "Error message if code generation or execution failed; "
+            "otherwise None."
        ),
     )

@@ -356,7 +365,9 @@ class PipelineResult(BaseModel):
     Final output of the function-calling pipeline for one tool call.
     """

-    inputs: FunctionCallInput = Field(..., description="Echo of the pipeline inputs.")
+    inputs: FunctionCallInput = Field(
+        ..., description="Echo of the pipeline inputs."
+    )
     static: Optional[StaticResult] = Field(
         None, description="Static schema-validation results, if enabled."
     )
@@ -430,7 +441,9 @@ class PipelineResult(BaseModel):
         if param_avgs:
             cat_avgs.append(sum(param_avgs) / len(param_avgs))

-        values.overall_avg_score = sum(cat_avgs) / len(cat_avgs) if cat_avgs else None
+        values.overall_avg_score = (
+            sum(cat_avgs) / len(cat_avgs) if cat_avgs else None
+        )
         values.overall_valid = ok
         return values

@@ -506,7 +519,9 @@ class ToolFunctionCall(BaseModel):
     Parsed representation of an LLM's function call response.
     """

-    name: str = Field(..., description="Name of the function the LLM chose to call.")
+    name: str = Field(
+        ..., description="Name of the function the LLM chose to call."
+    )
     arguments: str = Field(
         ..., description="JSON-encoded string of the call's arguments."
     )
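
One behavioral detail worth noting in the types.py hunks: SemanticMetricResult uses a pydantic @model_validator(mode="after") hook (raw_response_json) to parse a JSON-encoded raw_response string into a dict after the model is constructed. A minimal sketch of that pydantic v2 pattern follows, using a toy model rather than the package's class.

# Minimal sketch of the @model_validator(mode="after") pattern used by
# SemanticMetricResult above; this toy model is not the package's class.
import json
from typing import Union

from pydantic import BaseModel, model_validator
from typing_extensions import Self


class ToyMetricResult(BaseModel):
    raw_response: Union[str, dict, None] = None

    @model_validator(mode="after")
    def parse_raw_response(self) -> Self:
        # After validation, coerce a JSON-encoded string into a dict.
        if isinstance(self.raw_response, str):
            self.raw_response = json.loads(self.raw_response)
        return self


result = ToyMetricResult(raw_response='{"is_correct": true, "confidence": 0.9}')
assert isinstance(result.raw_response, dict)

Running validation in "after" mode means the hook sees the fully constructed model, so it can normalize one field based on another without re-triggering field validation.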
wxo_agentic_evaluation/referenceless_eval/metrics/field.py

@@ -60,7 +60,9 @@ class BaseField(BaseModel, ABC):
             if field_cls.can_handle(name, schema):
                 desc = schema.get("description", "")
                 extra = {
-                    k: v for k, v in schema.items() if k not in ("type", "description")
+                    k: v
+                    for k, v in schema.items()
+                    if k not in ("type", "description")
                 }
                 return field_cls(
                     name=name,
@@ -74,7 +76,9 @@ class BaseField(BaseModel, ABC):
             json_type=schema.get("type", "string"),
             description=schema.get("description", ""),
             jsonschema_extra={
-                k: v for k, v in schema.items() if k not in ("type", "description")
+                k: v
+                for k, v in schema.items()
+                if k not in ("type", "description")
             },
             extra_params={},
         )
@@ -122,10 +126,12 @@ class NumericField(BaseField):
     """

     threshold_low: Optional[float] = PydanticField(
-        None, description="Lower bound for correctness checks (not in JSONSchema)."
+        None,
+        description="Lower bound for correctness checks (not in JSONSchema).",
     )
     threshold_high: Optional[float] = PydanticField(
-        None, description="Upper bound for correctness checks (not in JSONSchema)."
+        None,
+        description="Upper bound for correctness checks (not in JSONSchema).",
     )

     __abstract__ = False
@@ -153,7 +159,9 @@ class NumericField(BaseField):
             json_type=schema.get("type", "number"),
             description=schema.get("description", ""),
             jsonschema_extra={
-                k: v for k, v in schema.items() if k not in ("type", "description")
+                k: v
+                for k, v in schema.items()
+                if k not in ("type", "description")
             },
             extra_params={},
         )
wxo_agentic_evaluation/referenceless_eval/metrics/metric.py

@@ -131,7 +131,9 @@ class Metric:
             additional_properties=additional_props,
         )

-    def is_important(self, result: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+    def is_important(
+        self, result: Dict[str, Any]
+    ) -> Tuple[bool, Optional[str]]:
         """
         A result is 'important' if its confidence lies within the defined confidence thresholds.

@@ -146,7 +148,9 @@ class Metric:
         except (TypeError, ValueError):
             return False, "Invalid confidence value"
         # locate the confidence field
-        conf_field = next((f for f in self.fields if f.name == "confidence"), None)
+        conf_field = next(
+            (f for f in self.fields if f.name == "confidence"), None
+        )
         if isinstance(conf_field, NumericField):
             ok = conf_field.is_within_threshold(conf)
             reason = (
@@ -266,7 +270,10 @@ class StandardMetric(Metric):
             json_type="number",
             description=f"Confidence in the output value (range {min_conf} to {max_conf}).",
             jsonschema_extra={"minimum": min_conf, "maximum": max_conf},
-            extra_params={"threshold_low": min_conf, "threshold_high": max_conf},
+            extra_params={
+                "threshold_low": min_conf,
+                "threshold_high": max_conf,
+            },
         )
         correction = CorrectionField(
             name="correction",
@@ -277,7 +284,9 @@ class StandardMetric(Metric):
         fields = [explanation, evidence, output, confidence, correction]
         super().__init__(name=name, description=description, fields=fields)

-    def is_important(self, result: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+    def is_important(
+        self, result: Dict[str, Any]
+    ) -> Tuple[bool, Optional[str]]:
         """
         A result is 'important' if its confidence lies within the defined confidence thresholds.

@@ -292,7 +301,9 @@ class StandardMetric(Metric):
         except (TypeError, ValueError):
             return False, "Invalid confidence value"
         # locate the confidence field
-        conf_field = next((f for f in self.fields if f.name == "confidence"), None)
+        conf_field = next(
+            (f for f in self.fields if f.name == "confidence"), None
+        )
         if isinstance(conf_field, NumericField):
             ok = conf_field.is_within_threshold(conf)
             reason = (
wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py

@@ -5,7 +5,9 @@ from pydantic import BaseModel

 from wxo_agentic_evaluation.referenceless_eval.metrics.field import NumericField
 from wxo_agentic_evaluation.referenceless_eval.metrics.metric import Metric
-from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import MetricPrompt
+from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import (
+    MetricPrompt,
+)
 from wxo_agentic_evaluation.referenceless_eval.prompt.runner import (
     AsyncGen,
     Prompt,
@@ -40,7 +42,8 @@ class MetricRunner:
     """

     def __init__(
-        self, entries: Optional[List[Tuple[MetricPrompt, Dict[str, Any]]]] = None
+        self,
+        entries: Optional[List[Tuple[MetricPrompt, Dict[str, Any]]]] = None,
     ) -> None:
         """
         Args:
@@ -51,7 +54,9 @@ class MetricRunner:
             for mp, kw in entries:
                 self.add(mp, kw)

-    def add(self, metric_prompt: MetricPrompt, user_kwargs: Dict[str, Any]) -> None:
+    def add(
+        self, metric_prompt: MetricPrompt, user_kwargs: Dict[str, Any]
+    ) -> None:
         """
         Add a metric to run.

wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py

@@ -68,7 +68,9 @@ class MetricPrompt:
         # Store defaults for system context
         # This allows overriding system context without modifying the template
         # during prompt building
-        self.system_kwargs_defaults: Dict[str, Any] = system_kwargs_defaults.copy()
+        self.system_kwargs_defaults: Dict[str, Any] = (
+            system_kwargs_defaults.copy()
+        )

         # Initialize examples list
         # This will hold (user_kwargs, output) pairs for few-shot prompting
@@ -104,7 +106,9 @@ class MetricPrompt:

     # --- Example Management ---

-    def add_example(self, user_kwargs: Dict[str, Any], output: Dict[str, Any]) -> None:
+    def add_example(
+        self, user_kwargs: Dict[str, Any], output: Dict[str, Any]
+    ) -> None:
         """
         Add a few-shot example.

wxo_agentic_evaluation/referenceless_eval/metrics/utils.py

@@ -17,7 +17,11 @@ def remove_threshold_fields(schema: dict) -> dict:
             schema[key] = remove_threshold_fields(value)
         elif isinstance(value, list):
             schema[key] = [
-                remove_threshold_fields(item) if isinstance(item, dict) else item
+                (
+                    remove_threshold_fields(item)
+                    if isinstance(item, dict)
+                    else item
+                )
                 for item in value
             ]
     return schema
wxo_agentic_evaluation/referenceless_eval/prompt/runner.py

@@ -1,10 +1,22 @@
 import asyncio
-from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple, TypeVar, Union
+from typing import (
+    Any,
+    Awaitable,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    TypeVar,
+    Union,
+)

 from pydantic import BaseModel

 Prompt = Union[str, List[Dict[str, Any]]]
-PromptAndSchema = Tuple[Union[str, List[Dict[str, Any]]], Optional[Dict[str, Any]]]
+PromptAndSchema = Tuple[
+    Union[str, List[Dict[str, Any]]], Optional[Dict[str, Any]]
+]
 SyncGen = Callable[[Prompt], Union[str, Any]]
 BatchGen = Callable[[List[Prompt]], List[Union[str, Any]]]
 AsyncGen = Callable[[Prompt], Awaitable[Union[str, Any]]]
@@ -137,7 +149,8 @@ class PromptRunner:
                 return index, PromptResult(prompt=prompt, error=str(e))

         tasks = [
-            asyncio.create_task(_run_one(i, p)) for i, p in enumerate(self.prompts)
+            asyncio.create_task(_run_one(i, p))
+            for i, p in enumerate(self.prompts)
         ]
         indexed_results = await asyncio.gather(*tasks)
         # Sort results to match original order
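
The PromptRunner hunk only rewraps a comprehension, but it shows the concurrency pattern the runner relies on: schedule each prompt with asyncio.create_task alongside its index, await the tasks with asyncio.gather, and re-sort by index so outputs line up with inputs. A self-contained sketch of that pattern, with a stand-in coroutine instead of the real generator call:

# Minimal sketch of the index-tagged create_task / gather / re-sort pattern
# shown in the PromptRunner hunk; run_one() is a stand-in for the real call.
import asyncio
from typing import List, Tuple


async def run_one(index: int, prompt: str) -> Tuple[int, str]:
    await asyncio.sleep(0.01 * (len(prompt) % 3))  # simulate variable latency
    return index, f"result for {prompt!r}"


async def run_all(prompts: List[str]) -> List[str]:
    tasks = [
        asyncio.create_task(run_one(i, p))
        for i, p in enumerate(prompts)
    ]
    indexed_results = await asyncio.gather(*tasks)
    # Sort by the original index so results line up with the input prompts.
    return [result for _, result in sorted(indexed_results)]


print(asyncio.run(run_all(["a", "bb", "ccc"])))

asyncio.gather already preserves argument order, so carrying the index is mostly a safeguard for flows that collect results out of order, for example via asyncio.as_completed.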
wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py

@@ -14,30 +14,37 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
     ToolCall,
     ToolSpec,
 )
-from wxo_agentic_evaluation.type import Message
 from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.type import Message
+

 class ReferencelessEvaluation:
     """
-    Note: static.final_decison, if `True` -> then all static metrics were valid. If false, atleast one of the static metrics failed. Look at explanation for reasoning
-    Note: if static.final_decision == True, check semantic metrics. Semantic metrics **not** run if static.final_decision is False.
-    ---
-    Note: For semantic metrics, check agentic constraints. If agent-constraints == False, no point in checking others. If true, check others.
-    Note: METRIC_FUNCTION_SELECTION_APPROPRIATENESS == False, implies that the LLM should have called some other function/tool before *OR* it is a redundant call.
-    Note: When parsing the semantic metrics, check for `is_correct` field. if `false` there is some mistake that the LLMaJ found in that tool call.
+    Note: static.final_decison, if `True` -> then all static metrics were valid. If false, atleast one of the static metrics failed. Look at explanation for reasoning
+    Note: if static.final_decision == True, check semantic metrics. Semantic metrics **not** run if static.final_decision is False.
+    ---
+    Note: For semantic metrics, check agentic constraints. If agent-constraints == False, no point in checking others. If true, check others.
+    Note: METRIC_FUNCTION_SELECTION_APPROPRIATENESS == False, implies that the LLM should have called some other function/tool before *OR* it is a redundant call.
+    Note: When parsing the semantic metrics, check for `is_correct` field. if `false` there is some mistake that the LLMaJ found in that tool call.
     """
+
     def __init__(
         self,
         api_spec: List[Mapping[str, Any]],
         messages: List[Message],
         model_id: str,
         task_n: str,
-        dataset_name: str,):
-
+        dataset_name: str,
+    ):
+
         self.metrics_client = get_provider(
             model_id=model_id,
-            params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 4096},
-            referenceless_eval=True
+            params={
+                "min_new_tokens": 0,
+                "decoding_method": "greedy",
+                "max_new_tokens": 4096,
+            },
+            referenceless_eval=True,
         )

         self.pipeline = ReflectionPipeline(
@@ -72,7 +79,11 @@ class ReferencelessEvaluation:
         examples = []

         processed_data = [
-            {k: msg.model_dump().get(k) for k in ["role", "content", "type"] if k in msg.model_dump()}
+            {
+                k: msg.model_dump().get(k)
+                for k in ["role", "content", "type"]
+                if k in msg.model_dump()
+            }
             for msg in self.messages
         ]

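
The ReferencelessEvaluation docstring restored above describes the intended reading order: semantic metrics are only populated when static.final_decision is True, agentic-constraint checks come first among the semantic metrics, and each SemanticMetricResult carries an is_correct flag. Below is a rough consumer-side sketch of that reading order, using only field names visible in the types.py hunks; the helper itself is illustrative and not an API exposed by the framework.

# Hedged sketch: read referenceless results in the order the docstring above
# describes. Field names (final_decision, metrics, valid, description,
# general, function_selection, parameter, is_correct) come from the types.py
# hunks; this helper is illustrative, not part of the framework.
from typing import List


def summarize_referenceless_result(static, semantic) -> List[str]:
    findings: List[str] = []

    # 1) Static checks gate everything else: if final_decision is False,
    #    at least one static check failed and semantic metrics were not run.
    if static is not None and not static.final_decision:
        for name, check in static.metrics.items():
            if not check.valid:
                findings.append(f"static check failed: {name} ({check.description})")
        return findings

    # 2) Semantic metrics: flag anything the LLM-as-judge marked incorrect.
    if semantic is not None:
        categories = [semantic.general, semantic.function_selection]
        categories += list((semantic.parameter or {}).values())
        for category in categories:
            if category is None or category.metrics is None:
                continue
            for name, metric in category.metrics.items():
                if not metric.is_correct:
                    findings.append(f"semantic issue flagged: {name}")

    return findings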
wxo_agentic_evaluation/resource_map.py

@@ -1,4 +1,5 @@
 from collections import defaultdict
+
 from wxo_agentic_evaluation.inference_backend import WXOClient, is_saas_url


@@ -44,4 +45,4 @@ class ResourceMap:

         agent2tools = dict(agent2tools)
         tools2agents = dict(tools2agents)
-        return agent2tools, tools2agents
+        return agent2tools, tools2agents