PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (49) hide show

wxo_agentic_evaluation/utils/__init__.py CHANGED Viewed

@@ -1,6 +1,47 @@
 import json
+import os
+import tempfile
+from pathlib import Path
+from wxo_agentic_evaluation.utils.open_ai_tool_extractor import (
+    ToolExtractionOpenAIFormat,
+)
+from wxo_agentic_evaluation.utils.parsers import ReferencelessEvalParser
+from wxo_agentic_evaluation.utils.utils import (
+    N_A,
+    TestCaseResources,
+    add_line_seperator,
+    list_run_files,
+    load_run_metrics,
+)
-def json_dump(output_path, object):
-    with open(output_path, "w", encoding="utf-8") as f:
-        json.dump(object, f, indent=4)
+def json_dump(output_path, obj):
+    """
+    Atomically dump `obj` as indented UTF-8 JSON to `output_path`.
+    - Creates the parent directory if it does not exist
+    - Writes to a temporary file in the same directory first
+    - Then atomically replaces the target file with `os.replace`
+    - Prevents corrupted/half-written JSON if the process is interrupted
+    On any failure (including KeyboardInterrupt / SystemExit) the temporary
+    file is removed, the existing target is left untouched, and the original
+    exception is re-raised.
+    """
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    # The temp file lives next to the target so the final os.replace never
+    # has to cross a filesystem boundary.
+    fd, tmp_path = tempfile.mkstemp(
+        dir=output_path.parent,
+        prefix=output_path.stem,
+        suffix=".tmp",
+        text=True,
+    )
+    try:
+        with os.fdopen(fd, "w", encoding="utf-8") as f:
+            json.dump(obj, f, indent=4, ensure_ascii=False)
+            # Push the data to disk before the rename makes it visible.
+            f.flush()
+            os.fsync(f.fileno())
+        os.replace(tmp_path, output_path)
+    except BaseException:
+        # BaseException rather than Exception: an interrupt (Ctrl-C,
+        # sys.exit) is exactly the case this function guards against, and
+        # `except Exception` would leave the temp file behind for those.
+        try:
+            os.remove(tmp_path)
+        except OSError:
+            # Best effort: the temp file may already be gone or replaced.
+            pass
+        raise

wxo_agentic_evaluation/utils/evaluation_discovery.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""
+Evaluation discovery mechanism.
+This module provides functionality for discovering classes that inherit from Evaluation.
+"""
+import importlib.util
+import inspect
+import os
+def find_evaluation_subclasses(directory: str, base_class_name="Evaluation"):
+    """
+    Dynamically import Python files under 'directory' and find classes that
+    inherit from a class named `base_class_name` (default 'Evaluation').
+    Files whose name starts with "__" (e.g. __init__.py) are not imported.
+    A file that raises while being imported is reported on stdout and skipped.
+    Returns a list of non-abstract class objects. Each class object is listed
+    once, even when a module binds it to more than one name.
+    """
+    subclasses = []
+    for root, _, files in os.walk(directory):
+        for file in files:
+            if not file.endswith(".py") or file.startswith("__"):
+                continue
+            filepath = os.path.join(root, file)
+            module_name = os.path.splitext(file)[0]
+            spec = importlib.util.spec_from_file_location(
+                module_name, filepath
+            )
+            if not (spec and spec.loader):
+                continue
+            module = importlib.util.module_from_spec(spec)
+            try:
+                spec.loader.exec_module(module)
+            except Exception as e:
+                print(f"Skipping {filepath} due to import error: {e}")
+                continue
+            # Inspect for subclasses
+            for _name, obj in inspect.getmembers(module, inspect.isclass):
+                # Matching is by base-class *name* anywhere in the MRO
+                # (excluding the class itself), not by class identity.
+                inherits_base = any(
+                    base.__name__ == base_class_name
+                    for base in obj.__mro__[1:]
+                )
+                if not inherits_base or inspect.isabstract(obj):
+                    continue
+                # getmembers yields one entry per bound name, so an alias
+                # (`Alias = MyEval`) would otherwise add the class twice.
+                if not any(obj is seen for seen in subclasses):
+                    subclasses.append(obj)
+    return subclasses

wxo_agentic_evaluation/utils/gateway_provider_utils.py ADDED Viewed

@@ -0,0 +1,39 @@
+import os
+from functools import lru_cache
+from wxo_agentic_evaluation.arg_configs import AuthConfig
+from wxo_agentic_evaluation.service_provider import USE_GATEWAY_MODEL_PROVIDER
+from wxo_agentic_evaluation.wxo_client import get_wxo_client
+# Default connection settings, read from the environment once at import time:
+# WXO_URL (fallback "http://localhost:4321"), WXO_TENANT (fallback "wxo-dev")
+# and WXO_TOKEN (no fallback, so it is None when the variable is unset).
+WXO_AUTH_CONFIG_DEFAULTS = AuthConfig(
+    url=os.getenv("WXO_URL", "http://localhost:4321"),
+    tenant_name=os.getenv("WXO_TENANT", "wxo-dev"),
+    token=os.getenv("WXO_TOKEN", None),
+)
+@lru_cache(maxsize=1)
+def _get_cached_wxo_client():
+    """
+    Return the WXO client built from `WXO_AUTH_CONFIG_DEFAULTS`.
+    The function takes no arguments, so `lru_cache` keeps the one client
+    returned by the first successful call and hands it back on later calls.
+    """
+    # TODO: remove this once the client is implemented as a Singleton.
+    return get_wxo_client(
+        WXO_AUTH_CONFIG_DEFAULTS.url,
+        WXO_AUTH_CONFIG_DEFAULTS.tenant_name,
+        WXO_AUTH_CONFIG_DEFAULTS.token,
+    )
+# NOTE(review): `**base_kwargs: dict` annotates every keyword *value* as a
+# dict, yet values such as "instance_url" and "token" are handled below;
+# `Any` is probably what was meant - confirm before changing.
+def get_provider_kwargs(**base_kwargs: dict) -> dict:
+    """
+    Return the keyword arguments to use for a model provider.
+    The arguments are returned unchanged when `USE_GATEWAY_MODEL_PROVIDER` is
+    falsy, or when the caller already supplied both "instance_url" and
+    "token". Otherwise "instance_url" and "token" are taken from the cached
+    default WXO client (its `service_url` and `api_key`) and override any
+    value of the same name in `base_kwargs`.
+    """
+    if not USE_GATEWAY_MODEL_PROVIDER:
+        return base_kwargs
+    if "instance_url" in base_kwargs and "token" in base_kwargs:
+        return base_kwargs
+    wxo_client = _get_cached_wxo_client()
+    # Later keys win in a dict display, so the client values replace a
+    # caller-supplied "instance_url" or "token" when only one was given.
+    return {
+        **base_kwargs,
+        "instance_url": wxo_client.service_url,
+        "token": wxo_client.api_key,
+    }

wxo_agentic_evaluation/utils/messages_parser.py ADDED Viewed

@@ -0,0 +1,30 @@
+from typing import Optional
+from pydantic import BaseModel, Field
+from wxo_agentic_evaluation.type import ContentType, Message
+class ParsedMessages(BaseModel):
+    """
+    A parsed history of messages.
+    Exposes the first user text message and the last assistant text message
+    of the history as convenience properties.
+    """
+    # The properties below scan this list in its stored order.
+    messages: list[Message] = Field(description="The list of messages")
+    @property
+    def user_input(self) -> Optional[str]:
+        """
+        Find the original user message.
+        Returns the content (as `str`) of the first message whose role is
+        "user" and whose type is `ContentType.text`, or None if there is none.
+        """
+        for message in self.messages:
+            if message.role == "user" and message.type == ContentType.text:
+                return str(message.content)
+        return None
+    @property
+    def agent_response(self) -> Optional[str]:
+        """
+        Find the most recent assistant message.
+        Returns the content (as `str`) of the last message whose role is
+        "assistant" and whose type is `ContentType.text`, or None if there is
+        none.
+        """
+        # Walk backwards so the first match is the latest assistant reply.
+        messages_in_reverse = reversed(self.messages)
+        for message in messages_in_reverse:
+            if message.role == "assistant" and message.type == ContentType.text:
+                return str(message.content)
+        return None

wxo_agentic_evaluation/utils/parsers.py ADDED Viewed

@@ -0,0 +1,71 @@
+from typing import Any, List, Mapping, Optional
+from wxo_agentic_evaluation.metrics import (
+    Annotation,
+    FailedSemanticTestCases,
+    FailedStaticTestCases,
+)
+class ReferencelessEvalParser:
+    """Static helpers that convert raw referenceless-evaluation output into metric objects."""
+    @staticmethod
+    def static_parser(
+        static_metrics: Mapping[str, Mapping[str, Any]],
+    ) -> List[FailedStaticTestCases]:
+        """
+        Collect the static metrics that did not pass.
+        A metric counts as failed when its "valid" entry is missing or falsy.
+        Returns one `FailedStaticTestCases` per failed metric, in the
+        iteration order of `static_metrics`; the list is empty when every
+        metric is valid.
+        """
+        return [
+            FailedStaticTestCases(
+                metric_name=metric,
+                description=metric_data.get("description"),
+                explanation=metric_data.get("explanation"),
+            )
+            for metric, metric_data in static_metrics.items()
+            if not metric_data.get("valid", False)
+        ]
+    @staticmethod
+    def parse_annotations(
+        actionable_reccomendations, filters: List[str]
+    ) -> Optional[List[Annotation]]:
+        """
+        Build `Annotation` objects for the recommendations named in `filters`.
+        `actionable_reccomendations` is an iterable of mappings, or None.
+        Returns the matching annotations, or None when nothing matches
+        (including when `actionable_reccomendations` is None or empty).
+        """
+        annotations = [
+            Annotation(
+                parameter_name=recc.get("parameter_name"),
+                recommendation=recc.get("recommendation"),
+                details=recc.get("details"),
+                quote=recc.get("quote"),
+            )
+            # `or []`: semantic_parser passes the result of a dict `.get`,
+            # which is None when the key is absent; iterating None raised
+            # TypeError before.
+            for recc in actionable_reccomendations or []
+            if recc.get("recommendation") in filters
+        ]
+        return annotations or None
+    @staticmethod
+    def semantic_parser(
+        metric_name, data, annotation_filters: Optional[List[str]]
+    ):
+        """
+        Build a `FailedSemanticTestCases` from one semantic metric result.
+        `data` is read with `.get`, so missing keys become None. Annotations
+        are attached only when `annotation_filters` is non-empty and at least
+        one entry of `data["actionable_recommendations"]` matches a filter.
+        """
+        semantic_metric = FailedSemanticTestCases(
+            metric_name=metric_name,
+            evidence=data.get("evidence"),
+            explanation=data.get("explanation"),
+            output=data.get("output"),
+            confidence=data.get("confidence"),
+        )
+        if annotation_filters and (
+            annotations := ReferencelessEvalParser.parse_annotations(
+                data.get("actionable_recommendations"), annotation_filters
+            )
+        ):
+            semantic_metric.annotations = annotations
+        return semantic_metric

wxo_agentic_evaluation/utils/utils.py CHANGED Viewed

@@ -1,10 +1,14 @@
+import csv
 import glob
 import json
 import os
 import re
-from typing import List, Optional, Union
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, List, Mapping, Optional, Tuple, Union
 from urllib.parse import urlparse
+import rich
 import yaml
 from rich import box, print
 from rich.console import Console, Group
@@ -15,17 +19,24 @@ from rich.table import Table
 from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness
 from wxo_agentic_evaluation.metrics.metrics import (
+    EnhancedAnalyzeMetrics,
     KnowledgeBaseMetricSummary,
     ReferenceLessEvalMetrics,
     ToolCallAndRoutingMetrics,
 )
 from wxo_agentic_evaluation.type import (
     ConversationalConfidenceThresholdScore,
+    ExtendedMessage,
     Message,
 )
 console = Console()
+RUN_FILE_RE = re.compile(
+    r"^(?P<base>.+)\.run(?P<run>\d+)\.(?P<kind>messages(?:\.analyze)?|metrics)\.json$"
+)
+N_A = "N/A"
 class AttackResultsTable:
     def __init__(self, attack_results: dict):
@@ -65,10 +76,100 @@ class AttackResultsTable:
         console.print(self.table)
+class TestCaseResources:
+    """Read access to the result files saved under one evaluation output directory."""
+    def __init__(self, output_dir: str):
+        """Todo flesh out for all resources that are saved"""
+        self.output_dir = Path(output_dir)
+    def _resolve_path(self, test_case_name, path, suffix, label):
+        """
+        Return the file to load for one resource kind.
+        When `test_case_name` is given it takes precedence over `path` and
+        the file is `<output_dir>/messages/<test_case_name>.<suffix>`.
+        Raises FileNotFoundError (after printing the same message) when the
+        resulting path is not an existing file.
+        """
+        if test_case_name:
+            path = os.path.join(
+                self.output_dir,
+                "messages",
+                f"{test_case_name}.{suffix}",
+            )
+        if not Path(str(path)).is_file():
+            rich.print(f"[r]No {label} file found at {path}")
+            # FileNotFoundError is still an Exception, so callers that catch
+            # the previously raised bare Exception keep working.
+            raise FileNotFoundError(f"No {label} file found at {path}")
+        return path
+    @property
+    def get_summary(self):
+        """
+        Rows of `<output_dir>/summary_metrics.csv` as a list of dicts keyed
+        by the header row. Returns an empty list when the file is empty.
+        """
+        # newline="" is what the csv module asks for when reading files.
+        with open(
+            self.output_dir / "summary_metrics.csv", "r", newline=""
+        ) as f:
+            reader = csv.reader(f)
+            # An empty file used to leak a bare StopIteration from here.
+            header = next(reader, None)
+            if header is None:
+                return []
+            return [dict(zip(header, row)) for row in reader]
+    def get_analyze_messages(
+        self, test_case_name=None, path=None
+    ) -> Tuple[List[ExtendedMessage], Mapping[str, Any]]:
+        """
+        Load a `*.messages.analyze.json` file.
+        Returns `(messages, meta)`. `meta` is the "meta" value of the last
+        entry when that entry is a dict containing a "meta" key (the entry is
+        then not turned into a message); otherwise `meta` is None.
+        Raises FileNotFoundError when the file does not exist.
+        """
+        path = self._resolve_path(
+            test_case_name, path, "messages.analyze.json", "analyze"
+        )
+        with open(path, "r", encoding="utf-8") as f:
+            entries = json.load(f)
+        meta = None
+        if entries and isinstance(entries[-1], dict) and "meta" in entries[-1]:
+            meta = entries[-1]["meta"]
+            entries = entries[:-1]
+        return [ExtendedMessage(**entry) for entry in entries], meta
+    def get_messages(self, test_case_name=None, path=None) -> List[Message]:
+        """
+        Load a `*.messages.json` file as a list of `Message` objects.
+        Raises FileNotFoundError when the file does not exist.
+        """
+        path = self._resolve_path(
+            test_case_name, path, "messages.json", "messages"
+        )
+        with open(path, "r", encoding="utf-8") as f:
+            entries = json.load(f)
+        return [Message(**entry) for entry in entries]
+    def get_test_metrics(
+        self, test_case_name=None, path=None
+    ) -> ToolCallAndRoutingMetrics:
+        """
+        Load a `*.metrics.json` file as a `ToolCallAndRoutingMetrics`.
+        Raises FileNotFoundError when the file does not exist.
+        """
+        path = self._resolve_path(
+            test_case_name, path, "metrics.json", "metrics"
+        )
+        with open(path, "r", encoding="utf-8") as f:
+            return ToolCallAndRoutingMetrics(**json.load(f))
 class AgentMetricsTable:
-    def __init__(self, data):
+    def __init__(self, data, title: Optional[str] = None):
+        if title is None:
+            title = "Agent Metrics"
         self.table = Table(
-            title="Agent Metrics",
+            title=title,
             box=box.ROUNDED,
             show_lines=True,
         )
@@ -89,7 +190,9 @@ class AgentMetricsTable:
         console.print(self.table)
-def create_table(data: List[dict]) -> AgentMetricsTable:
+def create_table(
+    data: List[dict], title: Optional[str] = None
+) -> AgentMetricsTable:
     """
     Generate a Rich table from a list of dictionaries.
     Returns the AgentMetricsTable instance.
@@ -101,7 +204,7 @@ def create_table(data: List[dict]) -> AgentMetricsTable:
         print("create_table() received an empty dataset. No table generated.")
         return None
-    return AgentMetricsTable(data)
+    return AgentMetricsTable(data, title=title)
 def safe_divide(nom, denom):
@@ -123,18 +226,27 @@ def is_ibm_cloud_url(service_url: str) -> bool:
 def add_line_seperator(
     style_config: Optional[Union[str, Style]] = None,
+    print=True,
 ):
+    """
+    Builds a horizontal rule (line separator) in the given style.
+    `style_config` falls back to "grey42" when it is not provided.
+    `print` is a boolean: when True the rule is printed immediately on the module console and nothing is returned;
+    when False the rule is returned as a `Rule` object so it can be printed later, for example as part of the pager view.
+    """
     if not style_config:
         style = "grey42"
     else:
         style = style_config
-    console.print(
-        Rule(
-            style=style,
+    if print:
+        console.print(
+            Rule(
+                style=style,
+            )
         )
-    )
+    else:
+        # The returned rule is drawn with "==", the printed one with Rule's default characters.
+        return Rule(style=style, characters="==")
 class FaithfulnessTable:
@@ -347,6 +459,7 @@ class ReferencelessEvalPanel:
 # Function to load messages from JSON file
 def load_messages(file_path):
+    """TODO: replace in favor of TestCaseResources.get_messages(...)"""
     with open(file_path, "r") as f:
         try:
             message_data = json.load(f)
@@ -362,7 +475,7 @@ def load_messages(file_path):
             return None
-def load_agents(agents_path: str):
+def load_agents_from_disk(agents_path: str):
     agents_json = glob.glob(os.path.join(agents_path, "*.json"))
     agents_yaml = glob.glob(os.path.join(agents_path, "*.yaml"))
@@ -379,32 +492,39 @@ def load_agents(agents_path: str):
     return agents
-RUN_FILE_RE = re.compile(
-    r"^(?P<base>.+)\.run(?P<run>\d+)\.(?P<kind>messages(?:\.analyze)?|metrics)\.json$"
-)
-def list_run_files(messages_dir: str, dataset_base: str):
+def list_run_files(messages_dir: str, dataset_base: str, filter_run: int = -1):
     """
     Returns: dict[run_id] -> {"analyze": path|None, "metrics": path|None}
     (We only need analyze+metrics for this feature.)
+    Each entry now also has a "messages" key (path|None), and the result is a
+    defaultdict, so looking up a run_id that was not found inserts an all-None entry.
+    `filter_run` restricts the result to the files of that run. If it is -1, all run files are retrieved.
+    For example, if there are `data3.run1.messages.json` and `data3.run2.messages.json`, and filter_run is 2,
+    then only the files related to the second run are retrieved.
     """
-    runs = {}
+    runs = defaultdict(
+        lambda: {"analyze": None, "metrics": None, "messages": None}
+    )
     for fn in os.listdir(messages_dir):
         m = RUN_FILE_RE.match(fn)
         if not m or m.group("base") != dataset_base:
             continue
         run_id = int(m.group("run"))
+        if filter_run != -1 and run_id != filter_run:
+            continue
         kind = m.group("kind")
-        entry = runs.setdefault(run_id, {"analyze": None, "metrics": None})
         full = os.path.join(messages_dir, fn)
         if kind == "messages.analyze":
-            entry["analyze"] = full
+            runs[run_id]["analyze"] = full
         elif kind == "metrics":
-            entry["metrics"] = full
+            runs[run_id]["metrics"] = full
+        elif kind == "messages":
+            runs[run_id]["messages"] = full
     return runs
 def load_run_metrics(metrics_path: str) -> ToolCallAndRoutingMetrics:
+    """Todo remove in a later PR"""
     with open(metrics_path, "r", encoding="utf-8") as f:
         return ToolCallAndRoutingMetrics(**json.load(f))

wxo_agentic_evaluation/wxo_client.py ADDED Viewed

@@ -0,0 +1,81 @@
+import os
+from typing import Any, Dict, Optional
+import requests
+import urllib3
+from urllib3.exceptions import InsecureRequestWarning
+from wxo_agentic_evaluation.service_instance import tenant_setup
+class WXOClient:
+    """Thin HTTP client that sends bearer-authenticated requests to `service_url`."""
+    def __init__(
+        self, service_url, api_key, env: Optional[Dict[str, Any]] = None
+    ):
+        """
+        Store the connection settings and decide whether TLS certificates
+        are verified.
+        The WO_SSL_VERIFY environment variable wins when it is "true" or
+        "false" (case-insensitive, surrounding whitespace ignored). Otherwise
+        the decision comes from the "verify" and "bypass_ssl" keys of `env`.
+        """
+        self.service_url = service_url
+        self.api_key = api_key
+        ov = os.getenv("WO_SSL_VERIFY")
+        if ov and ov.strip().lower() in ("true", "false"):
+            self._verify_ssl = ov.strip().lower() == "true"
+        else:
+            v, bs = (env.get("verify") if env else None), (
+                env.get("bypass_ssl") if env else None
+            )
+            # Verification is turned OFF when bypass_ssl is True/"true", when
+            # "verify" is missing (including when no env is given at all), or
+            # when "verify" is the string "none"/"null". Otherwise a boolean
+            # "verify" is used as is and any other value means True.
+            # NOTE(review): a missing "verify" therefore disables certificate
+            # checks by default - confirm that this is intended.
+            # NOTE(review): a string "verify" such as a CA-bundle path is
+            # collapsed to True, so the path itself is not passed to requests
+            # - confirm.
+            self._verify_ssl = (
+                False
+                if (
+                    (bs is True)
+                    or (isinstance(bs, str) and bs.strip().lower() == "true")
+                    or (v is None)
+                    or (
+                        isinstance(v, str)
+                        and v.strip().lower() in {"none", "null"}
+                    )
+                )
+                else (v if isinstance(v, bool) else True)
+            )
+        if not self._verify_ssl:
+            # Silences urllib3's InsecureRequestWarning for unverified requests.
+            urllib3.disable_warnings(InsecureRequestWarning)
+    def _get_headers(self) -> dict:
+        """Return the request headers: a Bearer Authorization header when an api_key is set, else none."""
+        headers = {}
+        if self.api_key:
+            headers["Authorization"] = f"Bearer {self.api_key}"
+        return headers
+    def post(self, payload: dict, path: str, stream=False):
+        """POST `payload` as JSON to `<service_url>/<path>` and return the `requests` response."""
+        # NOTE(review): the URL is joined with a plain "/", so a trailing slash
+        # on service_url or a leading one on path yields "//"; no timeout is
+        # passed to requests either.
+        url = f"{self.service_url}/{path}"
+        return requests.post(
+            url=url,
+            headers=self._get_headers(),
+            json=payload,
+            stream=stream,
+            verify=self._verify_ssl,
+        )
+    def get(self, path: str, params: dict = None):
+        """GET `<service_url>/<path>` with optional query `params` and return the `requests` response."""
+        url = f"{self.service_url}/{path}"
+        return requests.get(
+            url,
+            params=params,
+            headers=self._get_headers(),
+            verify=self._verify_ssl,
+        )
+def get_wxo_client(
+    service_url: Optional[str], tenant_name: str, token: Optional[str] = None
+) -> WXOClient:
+    """
+    Build a `WXOClient` for `tenant_name`.
+    `tenant_setup` supplies a token, a URL and an env mapping for the tenant.
+    An explicitly passed `service_url` or `token` takes precedence over the
+    value resolved by `tenant_setup`; the resolved value is the fallback.
+    Raises ValueError when no non-blank service URL is available.
+    """
+    resolved_token, resolved_url, env = tenant_setup(service_url, tenant_name)
+    # The unpacking used to rebind `token` directly, which silently discarded
+    # a token supplied by the caller. With the default token=None the
+    # behaviour is unchanged.
+    token = token or resolved_token
+    service_url = service_url or resolved_url
+    if not (service_url and str(service_url).strip()):
+        raise ValueError(
+            f"service_url not provided and not found in config for tenant '{tenant_name}'"
+        )
+    return WXOClient(service_url=service_url, api_key=token, env=env)

{ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/WHEEL RENAMED Viewed

File without changes

{ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/top_level.txt RENAMED Viewed

File without changes

ibm-watsonx-orchestrate-evaluation-framework 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl

Potentially problematic release.

ibm-watsonx-orchestrate-evaluation-framework 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl