ibm-watsonx-orchestrate-evaluation-framework 1.1.4__py3-none-any.whl → 1.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (35)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/METADATA +1 -1
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/RECORD +35 -31
  3. wxo_agentic_evaluation/analyze_run.py +805 -344
  4. wxo_agentic_evaluation/arg_configs.py +10 -1
  5. wxo_agentic_evaluation/description_quality_checker.py +11 -2
  6. wxo_agentic_evaluation/evaluation_package.py +8 -3
  7. wxo_agentic_evaluation/external_agent/external_validate.py +5 -5
  8. wxo_agentic_evaluation/external_agent/types.py +3 -9
  9. wxo_agentic_evaluation/inference_backend.py +46 -79
  10. wxo_agentic_evaluation/llm_matching.py +14 -2
  11. wxo_agentic_evaluation/main.py +1 -1
  12. wxo_agentic_evaluation/metrics/__init__.py +1 -0
  13. wxo_agentic_evaluation/metrics/llm_as_judge.py +4 -3
  14. wxo_agentic_evaluation/metrics/metrics.py +43 -1
  15. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  16. wxo_agentic_evaluation/prompt/template_render.py +4 -2
  17. wxo_agentic_evaluation/quick_eval.py +7 -9
  18. wxo_agentic_evaluation/record_chat.py +22 -29
  19. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +139 -100
  20. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -34
  21. wxo_agentic_evaluation/red_teaming/attack_list.py +89 -18
  22. wxo_agentic_evaluation/red_teaming/attack_runner.py +51 -11
  23. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  24. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  25. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  26. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +77 -39
  27. wxo_agentic_evaluation/resource_map.py +3 -1
  28. wxo_agentic_evaluation/service_instance.py +7 -0
  29. wxo_agentic_evaluation/type.py +1 -1
  30. wxo_agentic_evaluation/utils/__init__.py +3 -0
  31. wxo_agentic_evaluation/utils/parsers.py +71 -0
  32. wxo_agentic_evaluation/utils/utils.py +131 -16
  33. wxo_agentic_evaluation/wxo_client.py +80 -0
  34. {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/WHEEL +0 -0
  35. {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/top_level.txt +0 -0
@@ -390,18 +390,18 @@ class PipelineResult(BaseModel):
     )

     @model_validator(mode="after")
-    def compute_overall(cls, values: PipelineResult) -> PipelineResult:
+    def compute_overall(self) -> Self:
         """
         After validation, compute overall_valid as AND of:
           • all semantic is_correct flags
           • if transform exists: all execution_success flags
         """
-        static: StaticResult = values.static
+        static: StaticResult = self.static
         if static:
             # static checks
             ok = static.final_decision

-        sem: SemanticResult = values.semantic
+        sem: SemanticResult = self.semantic
         if sem:
             # semantic checks
             if sem.general and sem.general.metrics:
@@ -441,11 +441,11 @@ class PipelineResult(BaseModel):
             if param_avgs:
                 cat_avgs.append(sum(param_avgs) / len(param_avgs))

-        values.overall_avg_score = (
+        self.overall_avg_score = (
             sum(cat_avgs) / len(cat_avgs) if cat_avgs else None
         )
-        values.overall_valid = ok
-        return values
+        self.overall_valid = ok
+        return self


 # ----------------------------------------------------------------------
@@ -531,17 +531,17 @@ class ToolFunctionCall(BaseModel):
     )

     @model_validator(mode="after")
-    def _parse_arguments(cls, values: ToolFunctionCall) -> ToolFunctionCall:
+    def _parse_arguments(self) -> Self:
         """
         After model construction, parse the `arguments` JSON string
         into `parsed_arguments`, or raise a ValidationError.
         """
         try:
-            raw = values.arguments
-            values.parsed_arguments = json.loads(raw)
+            raw = self.arguments
+            self.parsed_arguments = json.loads(raw)
         except json.JSONDecodeError as e:
             raise ValidationError(f"Invalid JSON in arguments: {e}") from e
-        return values
+        return self


 class ToolCall(BaseModel):
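
The three hunks above move the `mode="after"` validators from the Pydantic v1-style classmethod signature (`cls, values`) to Pydantic v2's instance form, which receives the already-validated model as `self` and returns `Self`. A minimal standalone sketch of that pattern (illustrative only; the `Example` model and its fields are not from the package):

from typing import Self  # Python 3.11+; use typing_extensions.Self on older versions

from pydantic import BaseModel, model_validator


class Example(BaseModel):
    a: int
    b: int
    total: int | None = None

    @model_validator(mode="after")
    def compute_total(self) -> Self:
        # Runs after field validation; mutate the instance and return it.
        self.total = self.a + self.b
        return self


print(Example(a=1, b=2).total)  # 3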
@@ -17,6 +17,11 @@ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types i
 from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.type import Message

+DEFAULT_GENERATION_PARAMS= {
+    "min_new_tokens": 0,
+    "decoding_method": "greedy",
+    "max_new_tokens": 4096,
+}

 class ReferencelessEvaluation:
     """
@@ -31,19 +36,16 @@ class ReferencelessEvaluation:
     def __init__(
         self,
         api_spec: List[Mapping[str, Any]],
-        messages: List[Message],
         model_id: str,
         task_n: str,
         dataset_name: str,
+        runtime_pipeline: bool = True,
+        generation_params = DEFAULT_GENERATION_PARAMS
     ):

         self.metrics_client = get_provider(
             model_id=model_id,
-            params={
-                "min_new_tokens": 0,
-                "decoding_method": "greedy",
-                "max_new_tokens": 4096,
-            },
+            params=generation_params,
             referenceless_eval=True,
         )

@@ -52,39 +54,45 @@ class ReferencelessEvaluation:
             general_metrics=[METRIC_GENERAL_HALLUCINATION_CHECK],
             function_metrics=[METRIC_FUNCTION_SELECTION_APPROPRIATENESS],
             parameter_metrics=None,
+            runtime_pipeline=runtime_pipeline,
         )

         self.task_n = task_n
         self.dataset_name = dataset_name

         self.apis_specs = [ToolSpec.model_validate(spec) for spec in api_spec]
-        self.messages = messages
-
-    def _run_pipeline(self, examples: List[Mapping[str, Any]]):
-        results = []
-        for example in examples:
-            result = self.pipeline.run_sync(
-                conversation=example["context"],
-                inventory=self.apis_specs,
-                call=example["call"],
-                continue_on_static=False,
-                retries=2,
-            )
-            result_dict = result.model_dump()
-            results.append(result_dict)

-        return results
+    @staticmethod
+    def fmt_tool_call(tool_id, tool_call_name, arguments, context):
+        call = {
+            "call": {
+                "id": tool_id,
+                "type": "function",
+                "function": {
+                    "name": tool_call_name,
+                    "arguments": arguments,
+                },
+            },
+            "context": context,
+        }
+
+        return call

-    def run(self):
+    @staticmethod
+    def fmt_msgs_referenceless(
+        messages: List[Message],
+    ) -> List[Mapping[str, Any]]:
+        """Assume that the last item in the `messages` array is the tool call, and preceding items
+        in the messages array is the context.
+        """
         examples = []
-
         processed_data = [
             {
                 k: msg.model_dump().get(k)
                 for k in ["role", "content", "type"]
                 if k in msg.model_dump()
             }
-            for msg in self.messages
+            for msg in messages
         ]

         for idx, message in enumerate(processed_data):
@@ -96,23 +104,48 @@ class ReferencelessEvaluation:
                 tool_call_msg = json.loads(content)
                 if tool_call_msg["name"].startswith("transfer_to"):
                     continue
-
-                call = {
-                    "call": {
-                        "id": tool_call_msg.get("id", "1"),
-                        "type": "function",
-                        "function": {
-                            "name": tool_call_msg["name"],
-                            "arguments": json.dumps(tool_call_msg["args"]),
-                        },
-                    },
-                    "context": context,
-                }
+
+                call = ReferencelessEvaluation.fmt_tool_call(
+                    tool_id=tool_call_msg.get("id", "1"),
+                    tool_call_name=tool_call_msg["name"],
+                    arguments=json.dumps(tool_call_msg["args"]),
+                    context=context
+                )
                 examples.append(call)

-        rich.print(
-            f"[yellow][b][Task-{self.task_n}] There are {len(examples)} examples to analyze"
-        )
+        return examples
+
+    def _run_pipeline(self, examples: List[Mapping[str, Any]]):
+        results = []
+        for example in examples:
+            result = self.pipeline.run_sync(
+                conversation=example["context"],
+                inventory=self.apis_specs,
+                call=example["call"],
+                continue_on_static=False,
+                retries=2,
+            )
+            result_dict = result.model_dump()
+            results.append(result_dict)
+
+        return results
+
+    def run(self, examples: List[Mapping[str, str]], verbose=False):
+        """`examples` should be an array where each element is formatted:
+
+        call = {
+            "call": {
+                "id": tool_call_msg.get("id", "1"),
+                "type": "function",
+                "function": {
+                    "name": tool_call_msg["name"],
+                    "arguments": json.dumps(tool_call_msg["args"]),
+                },
+            },
+            "context": context,
+        }
+        """
+
         examples = [
             {
                 "call": ToolCall.model_validate(ex["call"]),
@@ -120,6 +153,11 @@ class ReferencelessEvaluation:
             }
             for ex in examples
         ]
+
+        if verbose:
+            rich.print(
+                f"[yellow][b][Task-{self.task_n}] There are {len(examples)} examples to analyze"
+            )
         results = self._run_pipeline(examples)

         return results
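
Taken together, these hunks decouple message formatting from evaluation: the constructor no longer takes `messages`, formatting lives in the static `fmt_msgs_referenceless` / `fmt_tool_call` helpers, and `run()` now receives pre-built examples. A hedged usage sketch of the new flow (the import path is inferred from the file list; `api_spec`, `messages`, and the model id are placeholders):

from wxo_agentic_evaluation.referenceless_eval.referenceless_eval import (
    ReferencelessEvaluation,
)

# api_spec: list of tool specs; messages: List[Message] ending in a tool call.
evaluator = ReferencelessEvaluation(
    api_spec=api_spec,
    model_id="your-judge-model-id",   # placeholder
    task_n="1",
    dataset_name="demo",
    runtime_pipeline=True,            # new flag in this release
)

examples = ReferencelessEvaluation.fmt_msgs_referenceless(messages)
results = evaluator.run(examples, verbose=True)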
@@ -1,6 +1,7 @@
 from collections import defaultdict

-from wxo_agentic_evaluation.inference_backend import WXOClient, is_saas_url
+from wxo_agentic_evaluation.utils.utils import is_saas_url
+from wxo_agentic_evaluation.wxo_client import WXOClient


 class ResourceMap:
@@ -34,6 +35,7 @@ class ResourceMap:

         if resp.status_code == 200:
             agents = resp.json()
+            self.all_agent_objs = agents
             for agent in agents:
                 agent_name = agent["name"]
                 tools = [tool_map[id] for id in agent["tools"]]
@@ -247,6 +247,13 @@ def tenant_setup(

     context["active_environment"] = tenant_name

+    # Ensure parent directories exist so tests (which may run in clean envs)
+    # can write these files without raising FileNotFoundError.
+    auth_dir = os.path.dirname(auth_config_path)
+    env_dir = os.path.dirname(env_config_path)
+    os.makedirs(auth_dir, exist_ok=True)
+    os.makedirs(env_dir, exist_ok=True)
+
     with open(auth_config_path, "w") as f:
         yaml.dump(auth_config, f)
     with open(env_config_path, "w") as f:
@@ -131,7 +131,7 @@ class AttackData(BaseModel):

 class AttackData(BaseModel):
     agent: str
-    agents_path: str
+    agents_list_or_path: Union[List[str], str]
     attack_data: AttackData
     story: str
     starting_sentence: str
@@ -1,4 +1,7 @@
 import json
+from wxo_agentic_evaluation.utils.utils import TestCaseResources, add_line_seperator, list_run_files, load_run_metrics, N_A
+from wxo_agentic_evaluation.utils.open_ai_tool_extractor import ToolExtractionOpenAIFormat
+from wxo_agentic_evaluation.utils.parsers import ReferencelessEvalParser


 def json_dump(output_path, object):
@@ -0,0 +1,71 @@
+from typing import Any, List, Mapping, Optional
+
+from wxo_agentic_evaluation.metrics import (
+    Annotation,
+    FailedSemanticTestCases,
+    FailedStaticTestCases,
+)
+
+
+class ReferencelessEvalParser:
+    @staticmethod
+    def static_parser(
+        static_metrics: Mapping[str, Mapping[str, Any]],
+    ) -> List[FailedStaticTestCases]:
+        """
+        static.metrics
+        """
+
+        failed_test_cases = []
+
+        for metric, metric_data in static_metrics.items():
+            if not metric_data.get("valid", False):
+                fail = FailedStaticTestCases(
+                    metric_name=metric,
+                    description=metric_data.get("description"),
+                    explanation=metric_data.get("explanation"),
+                )
+
+                failed_test_cases.append(fail)
+
+        return failed_test_cases
+
+    @staticmethod
+    def parse_annotations(
+        actionable_reccomendations, filters: List[str]
+    ) -> Optional[List[Annotation]]:
+        annotations = [
+            Annotation(
+                parameter_name=recc.get("parameter_name"),
+                recommendation=recc.get("recommendation"),
+                details=recc.get("details"),
+                quote=recc.get("quote"),
+            )
+            for recc in actionable_reccomendations
+            if recc.get("recommendation") in filters
+        ]
+
+        annotations = annotations if annotations else None
+
+        return annotations
+
+    @staticmethod
+    def semantic_parser(
+        metric_name, data, annotation_filters: Optional[List[str]]
+    ):
+        semantic_metric = FailedSemanticTestCases(
+            metric_name=metric_name,
+            evidence=data.get("evidence"),
+            explanation=data.get("explanation"),
+            output=data.get("output"),
+            confidence=data.get("confidence"),
+        )
+
+        if annotation_filters and (
+            annotations := ReferencelessEvalParser.parse_annotations(
+                data.get("actionable_recommendations"), annotation_filters
+            )
+        ):
+            semantic_metric.annotations = annotations
+
+        return semantic_metric
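
For orientation, a small sketch of feeding a `static.metrics`-style mapping through `static_parser`; the input shape (per-metric `valid`, `description`, `explanation` keys) is inferred from the `.get(...)` calls above, and the values are made up:

from wxo_agentic_evaluation.utils.parsers import ReferencelessEvalParser

static_metrics = {
    "schema_check": {"valid": True},
    "required_params_present": {
        "valid": False,
        "description": "All required parameters must be supplied.",
        "explanation": "Parameter 'user_id' is missing from the call.",
    },
}

failed = ReferencelessEvalParser.static_parser(static_metrics)
# -> one FailedStaticTestCases entry, for "required_params_present"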
@@ -2,10 +2,14 @@ import glob
 import json
 import os
 import re
-from typing import List, Optional, Union
+import csv
+from collections import defaultdict
+from pathlib import Path
+from typing import List, Optional, Union, Mapping, Tuple, Any
 from urllib.parse import urlparse

 import yaml
+import rich
 from rich import box, print
 from rich.console import Console, Group
 from rich.panel import Panel
@@ -18,14 +22,20 @@ from wxo_agentic_evaluation.metrics.metrics import (
     KnowledgeBaseMetricSummary,
     ReferenceLessEvalMetrics,
     ToolCallAndRoutingMetrics,
+    EnhancedAnalyzeMetrics,
 )
 from wxo_agentic_evaluation.type import (
     ConversationalConfidenceThresholdScore,
     Message,
+    ExtendedMessage,
 )

 console = Console()

+RUN_FILE_RE = re.compile(
+    r"^(?P<base>.+)\.run(?P<run>\d+)\.(?P<kind>messages(?:\.analyze)?|metrics)\.json$"
+)
+N_A = "N/A"

 class AttackResultsTable:
     def __init__(self, attack_results: dict):
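
The now module-level `RUN_FILE_RE` captures the dataset base, run number, and artifact kind from per-run file names. A quick illustration against hypothetical file names:

import re

RUN_FILE_RE = re.compile(
    r"^(?P<base>.+)\.run(?P<run>\d+)\.(?P<kind>messages(?:\.analyze)?|metrics)\.json$"
)

m = RUN_FILE_RE.match("data3.run2.messages.analyze.json")
assert m and m.group("base") == "data3"
assert m.group("run") == "2" and m.group("kind") == "messages.analyze"

assert RUN_FILE_RE.match("data3.run1.metrics.json").group("kind") == "metrics"
assert RUN_FILE_RE.match("data3.summary.json") is None  # not a per-run artifact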
@@ -65,6 +75,94 @@ class AttackResultsTable:
         console.print(self.table)


+class TestCaseResources:
+    def __init__(self, output_dir: str):
+        """Todo flesh out for all resources that are saved"""
+        self.output_dir = Path(output_dir)
+
+    @property
+    def get_summary(self):
+        summary = []
+
+        with open(self.output_dir / "summary_metrics.csv", "r") as f:
+            reader = csv.reader(f)
+            header = next(reader)
+            for row in reader:
+                summary.append(dict(zip(header, row)))
+
+        return summary
+
+    def get_analyze_messages(
+        self, test_case_name=None, path=None
+    ) -> Tuple[List[ExtendedMessage], Mapping[str, Any]]:
+        test_messages = []
+
+        if test_case_name:
+            path = os.path.join(
+                self.output_dir,
+                "messages",
+                f"{test_case_name}.messages.analyze.json",
+            )
+
+        if not Path(str(path)).is_file():
+            rich.print(f"[r]No analyze file found at {path}")
+            raise Exception(f"No analyze file found at {path}")
+
+        with open(path, "r", encoding="utf-8") as f:
+            temp = json.load(f)
+            meta = None
+            if temp and isinstance(temp[-1], dict) and "meta" in temp[-1]:
+                meta = temp[-1]["meta"]
+                temp = temp[:-1]
+
+            for entry in temp:
+                msg = ExtendedMessage(**entry)
+                test_messages.append(msg)
+
+        return test_messages, meta
+
+    def get_messages(self, test_case_name=None, path=None) -> List[Message]:
+        test_messages = []
+
+        if test_case_name:
+            path = os.path.join(
+                self.output_dir,
+                "messages",
+                f"{test_case_name}.messages.json",
+            )
+
+        if not Path(str(path)).is_file():
+            rich.print(f"[r]No messages file found at {path}")
+            raise Exception(f"No messages file found at {path}")
+
+        with open(path, "r", encoding="utf-8") as f:
+            temp = json.load(f)
+            for entry in temp:
+                msg = Message(**entry)
+                test_messages.append(msg)
+
+        return test_messages
+
+    def get_test_metrics(
+        self, test_case_name=None, path=None
+    ) -> ToolCallAndRoutingMetrics:
+        if test_case_name:
+            path = os.path.join(
+                self.output_dir,
+                "messages",
+                f"{test_case_name}.metrics.json",
+            )
+
+        if not Path(str(path)).is_file():
+            rich.print(f"[r]No metrics file found at {path}")
+            raise Exception(f"No metrics file found at {path}")
+
+        with open(path, "r", encoding="utf-8") as f:
+            metrics = ToolCallAndRoutingMetrics(**json.load(f))
+
+        return metrics
+
+
 class AgentMetricsTable:
     def __init__(self, data):
         self.table = Table(
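
A short usage sketch for the new `TestCaseResources` helper; the expected layout (a `summary_metrics.csv` at the top level and per-test files under `messages/`) is read off the hard-coded paths above, and the directory and test-case names are placeholders:

from wxo_agentic_evaluation.utils.utils import TestCaseResources

resources = TestCaseResources(output_dir="results/run_2024_06_01")

summary_rows = resources.get_summary                      # a property, not a method
messages = resources.get_messages(test_case_name="data3.run1")
analyzed, meta = resources.get_analyze_messages(test_case_name="data3.run1")
metrics = resources.get_test_metrics(test_case_name="data3.run1")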
@@ -123,18 +221,27 @@ def is_ibm_cloud_url(service_url: str) -> bool:

 def add_line_seperator(
     style_config: Optional[Union[str, Style]] = None,
+    print=True,
 ):
+    """
+    Adds a lined seperator provided the style config.
+    `print` is a boolean to indicate if the lined seperator should go to stdout immeadiatly or returned as an object.
+    Set `print` to False, the lined seperator is printed later as part of the pager view for example.
+    """

     if not style_config:
         style = "grey42"
     else:
         style = style_config

-    console.print(
-        Rule(
-            style=style,
+    if print:
+        console.print(
+            Rule(
+                style=style,
+            )
         )
-    )
+    else:
+        return Rule(style=style, characters="==")


 class FaithfulnessTable:
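
With `print=False` the separator is returned as a `rich.rule.Rule` instead of being printed immediately, so it can be composed into a larger renderable such as a pager view. A minimal sketch, assuming the helper is imported via `wxo_agentic_evaluation.utils.utils`:

from rich.console import Console, Group

from wxo_agentic_evaluation.utils.utils import add_line_seperator

console = Console()

add_line_seperator()                    # prints a grey rule straight to stdout
rule = add_line_seperator(print=False)  # returns a Rule("==") for later rendering

with console.pager():
    console.print(Group("section one", rule, "section two"))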
@@ -347,6 +454,7 @@ class ReferencelessEvalPanel:

 # Function to load messages from JSON file
 def load_messages(file_path):
+    """TODO: replace in favor of TestCaseResources.get_messages(...)"""
     with open(file_path, "r") as f:
         try:
             message_data = json.load(f)
@@ -362,7 +470,7 @@ def load_messages(file_path):
             return None


-def load_agents(agents_path: str):
+def load_agents_from_disk(agents_path: str):
     agents_json = glob.glob(os.path.join(agents_path, "*.json"))
     agents_yaml = glob.glob(os.path.join(agents_path, "*.yaml"))

@@ -379,32 +487,39 @@ def load_agents(agents_path: str):
     return agents


-RUN_FILE_RE = re.compile(
-    r"^(?P<base>.+)\.run(?P<run>\d+)\.(?P<kind>messages(?:\.analyze)?|metrics)\.json$"
-)
-
-
-def list_run_files(messages_dir: str, dataset_base: str):
+def list_run_files(messages_dir: str, dataset_base: str, filter_run: int = -1):
     """
     Returns: dict[run_id] -> {"analyze": path|None, "metrics": path|None}
     (We only need analyze+metrics for this feature.)
+
+    `filter_run` only get gets the runs files for that run. If it is -1, then all run files are retrieved
+    For example, if there is `data3.run1.messages.json`, `data3.run2.messages.json`, and filter_run is 2, then,
+    the files related to only the second run are retrieved.
+
     """
-    runs = {}
+    runs = defaultdict(
+        lambda: {"analyze": None, "metrics": None, "messages": None}
+    )
     for fn in os.listdir(messages_dir):
         m = RUN_FILE_RE.match(fn)
         if not m or m.group("base") != dataset_base:
             continue
         run_id = int(m.group("run"))
+        if filter_run != -1 and run_id != filter_run:
+            continue
+
         kind = m.group("kind")
-        entry = runs.setdefault(run_id, {"analyze": None, "metrics": None})
         full = os.path.join(messages_dir, fn)
         if kind == "messages.analyze":
-            entry["analyze"] = full
+            runs[run_id]["analyze"] = full
         elif kind == "metrics":
-            entry["metrics"] = full
+            runs[run_id]["metrics"] = full
+        elif kind == "messages":
+            runs[run_id]["messages"] = full
     return runs


 def load_run_metrics(metrics_path: str) -> ToolCallAndRoutingMetrics:
+    """Todo remove in a later PR"""
     with open(metrics_path, "r", encoding="utf-8") as f:
         return ToolCallAndRoutingMetrics(**json.load(f))
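
Following the docstring's own example, a sketch of the new `filter_run` parameter (directory and file names are hypothetical):

from wxo_agentic_evaluation.utils.utils import list_run_files

# messages/ holds data3.run1.messages.json, data3.run1.metrics.json,
#                 data3.run2.messages.json, data3.run2.metrics.json, ...
all_runs = list_run_files("results/messages", "data3")                 # every run
second_run = list_run_files("results/messages", "data3", filter_run=2)
# -> {2: {"analyze": None,
#         "metrics": "results/messages/data3.run2.metrics.json",
#         "messages": "results/messages/data3.run2.messages.json"}}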
@@ -0,0 +1,80 @@
+import os
+import requests
+import urllib3
+from urllib3.exceptions import InsecureRequestWarning
+from typing import Dict, Any, Optional
+
+from wxo_agentic_evaluation.service_instance import tenant_setup
+
+
+class WXOClient:
+    def __init__(
+        self, service_url, api_key, env: Optional[Dict[str, Any]] = None
+    ):
+        self.service_url = service_url
+        self.api_key = api_key
+
+        ov = os.getenv("WO_SSL_VERIFY")
+        if ov and ov.strip().lower() in ("true", "false"):
+            self._verify_ssl = ov.strip().lower() == "true"
+        else:
+            v, bs = (env.get("verify") if env else None), (
+                env.get("bypass_ssl") if env else None
+            )
+            self._verify_ssl = (
+                False
+                if (
+                    (bs is True)
+                    or (isinstance(bs, str) and bs.strip().lower() == "true")
+                    or (v is None)
+                    or (
+                        isinstance(v, str)
+                        and v.strip().lower() in {"none", "null"}
+                    )
+                )
+                else (v if isinstance(v, bool) else True)
+            )
+
+        if not self._verify_ssl:
+            urllib3.disable_warnings(InsecureRequestWarning)
+
+    def _get_headers(self) -> dict:
+        headers = {}
+        if self.api_key:
+            headers["Authorization"] = f"Bearer {self.api_key}"
+        return headers
+
+    def post(self, payload: dict, path: str, stream=False):
+        url = f"{self.service_url}/{path}"
+        return requests.post(
+            url=url,
+            headers=self._get_headers(),
+            json=payload,
+            stream=stream,
+            verify=self._verify_ssl,
+        )
+
+    def get(self, path: str, params: dict = None):
+        url = f"{self.service_url}/{path}"
+        return requests.get(
+            url,
+            params=params,
+            headers=self._get_headers(),
+            verify=self._verify_ssl,
+        )
+
+
+def get_wxo_client(
+    service_url: Optional[str], tenant_name: str, token: Optional[str] = None
+) -> WXOClient:
+
+    token, resolved_url, env = tenant_setup(service_url, tenant_name)
+    service_url = service_url or resolved_url
+
+    if not (service_url and str(service_url).strip()):
+        raise ValueError(
+            f"service_url not provided and not found in config for tenant '{tenant_name}'"
+        )
+
+    wxo_client = WXOClient(service_url=service_url, api_key=token, env=env)
+    return wxo_client
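
A closing usage sketch for the new module: `get_wxo_client` resolves the token, service URL, and environment through `tenant_setup`, and the `WO_SSL_VERIFY` environment variable overrides any `verify`/`bypass_ssl` values from the tenant config. The tenant name and request path below are placeholders, not documented endpoints:

import os

from wxo_agentic_evaluation.wxo_client import get_wxo_client

os.environ["WO_SSL_VERIFY"] = "false"   # optional; wins over the env config's verify/bypass_ssl

client = get_wxo_client(service_url=None, tenant_name="dev-tenant")

resp = client.get("v1/orchestrate/agents", params={"limit": 100})  # illustrative path only
resp.raise_for_status()
agents = resp.json()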