ibm-watsonx-orchestrate-evaluation-framework 1.1.2__py3-none-any.whl → 1.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework has been flagged by the registry; details are available on the registry page.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/METADATA +10 -3
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/RECORD +27 -19
- wxo_agentic_evaluation/analyze_run.py +357 -28
- wxo_agentic_evaluation/arg_configs.py +2 -1
- wxo_agentic_evaluation/evaluation.py +42 -0
- wxo_agentic_evaluation/evaluation_package.py +132 -13
- wxo_agentic_evaluation/inference_backend.py +52 -14
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/main.py +202 -66
- wxo_agentic_evaluation/main_v2.py +426 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +25 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +67 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +21 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1226 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/template_render.py +14 -0
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +83 -3
- wxo_agentic_evaluation/red_teaming/attack_list.py +18 -0
- wxo_agentic_evaluation/service_instance.py +79 -10
- wxo_agentic_evaluation/service_provider/__init__.py +1 -1
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +114 -35
- wxo_agentic_evaluation/utils/utils.py +32 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info}/top_level.txt +0 -0
--- a/wxo_agentic_evaluation/evaluation_package.py
+++ b/wxo_agentic_evaluation/evaluation_package.py
@@ -8,6 +8,11 @@ from wxo_agentic_evaluation import __file__
 from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
 from wxo_agentic_evaluation.llm_matching import LLMMatcher
 from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
+from wxo_agentic_evaluation.llm_safety_eval import LLMSafetyJudge
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerDerailment,
+    AnswerUnsafeTopic,
+)
 from wxo_agentic_evaluation.metrics.metrics import (
     KeywordSemanticSearchMetric,
     KnowledgeBaseMetrics,
@@ -16,9 +21,11 @@ from wxo_agentic_evaluation.metrics.metrics import (
 )
 from wxo_agentic_evaluation.prompt.template_render import (
     AnswerRelevancyTemplateRenderer,
+    DerailmentTemplateRenderer,
     FaithfulnessTemplateRenderer,
     KeywordMatchingTemplateRenderer,
     SemanticMatchingTemplateRenderer,
+    UnsafeTopicTemplateRenderer,
 )
 from wxo_agentic_evaluation.resource_map import ResourceMap
 from wxo_agentic_evaluation.service_provider import get_provider
@@ -49,6 +56,14 @@ RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS = os.getenv(
     "RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS", "<IGNORE>"
 )
 
+DERAILMENT_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "derailment_prompt.jinja2"
+)
+
+UNSAFE_TOPIC_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "unsafe_topic_prompt.jinja2"
+)
+
 """
 - hyphens are not allowed in python function names, so it is safe to use as a dummy function name
 - purpose behind `DUMMY_GRAPH_NODE_NAME` is to append
@@ -68,16 +83,26 @@ class EvaluationPackage:
         resource_map: ResourceMap = None,
         is_attack_evaluation: bool = False,
     ):
-        self.tool_dictionary = <...>
-        <9 further removed lines (the old tool_dictionary/text_list construction) not preserved in this rendering>
+        self.tool_dictionary = (
+            {
+                goal_detail.name: goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.tool_call
+            }
+            if ground_truth.goal_details
+            else {}
+        )
+
+        self.text_list = (
+            [
+                goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.text
+            ]
+            if ground_truth.goal_details
+            else []
+        )
+
         self.messages = messages
         self.conversational_search_data = conversational_search_data
         self.is_attack_evaluation = is_attack_evaluation
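
In the new form, both collections explicitly fall back to empty when ground_truth.goal_details is None or empty. A standalone sketch of that guard, with an illustrative GoalDetail stand-in rather than the package's real model:

    from dataclasses import dataclass

    @dataclass
    class GoalDetail:                 # stand-in for the package's goal model
        name: str
        type: str                     # "tool_call" or "text" in the real ContentType

    def split_goals(goal_details):
        # dict of tool-call goals keyed by name; empty when goal_details is falsy
        tool_dictionary = (
            {g.name: g for g in goal_details if g.type == "tool_call"}
            if goal_details
            else {}
        )
        # list of free-text goals, same guard
        text_list = [g for g in goal_details if g.type == "text"] if goal_details else []
        return tool_dictionary, text_list

    print(split_goals(None))                                      # ({}, [])
    print(split_goals([GoalDetail("get_weather", "tool_call")]))  # one tool goal, no text goals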
@@ -118,6 +143,22 @@ class EvaluationPackage:
                 ANSWER_RELEVANCY_PROMPT_PATH
             ),
         )
+        self.safety_llm_as_a_judge = LLMSafetyJudge(
+            llm_client=get_provider(
+                model_id="meta-llama/llama-3-405b-instruct",
+                params={
+                    "min_new_tokens": 0,
+                    "decoding_method": "greedy",
+                    "max_new_tokens": 4096,
+                },
+            ),
+            answer_derailment=DerailmentTemplateRenderer(
+                DERAILMENT_PROMPT_PATH
+            ),
+            answer_unsafe_topic=UnsafeTopicTemplateRenderer(
+                UNSAFE_TOPIC_PROMPT_PATH
+            ),
+        )
 
     @staticmethod
     def find_ground_node(graph, start_node):
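
The two renderers are constructed from the bundled jinja2 prompt files; their real implementation lives in prompt/template_render.py (+14 in this release). As a rough stand-in, a renderer of this shape would satisfy the `render(question=..., instructions=..., answer=...)` calls that LLMSafetyJudge makes:

    from jinja2 import Template

    class MinimalPromptRenderer:      # illustrative stand-in, not the package's class
        def __init__(self, template_path: str):
            with open(template_path, encoding="utf-8") as f:
                self.template = Template(f.read())

        def render(self, **kwargs) -> str:
            # e.g. render(question=..., instructions=..., answer=...)
            return self.template.render(**kwargs)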
@@ -238,6 +279,29 @@ class EvaluationPackage:
                 f"[green][SUCCESS] Text message matched: Summary - {keyword_semantic_match.message}[/green]"
             )
 
+    @staticmethod
+    def normalize_args(data):
+        if isinstance(data, dict):
+            # normalize keys (case-sensitive) and values
+            return {
+                str(k): EvaluationPackage.normalize_args(v)
+                for k, v in data.items()
+            }
+
+        elif isinstance(data, list):
+            normalized_list = [
+                EvaluationPackage.normalize_args(v) for v in data
+            ]
+            return sorted(
+                normalized_list, key=lambda v: json.dumps(v, sort_keys=True)
+            )
+
+        else:
+            # don't lowercase reserved keyword
+            if str(data) == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
+                return str(data)
+            return str(data).lower()
+
     @staticmethod
     def _check_if_args_match_with_ignore(
         actual_args: dict[str, str], expected_args: dict[str, str]
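
normalize_args recurses through nested structures: dict keys are stringified but kept case-sensitive, leaf values are stringified and lowercased, lists are sorted by their JSON encoding so ordering no longer matters, and the reserved sentinel passes through unchanged. A standalone restatement with a few spot checks, assuming the default "<IGNORE>" value of RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:

    import json

    RESERVED = "<IGNORE>"  # default of RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS

    def normalize_args(data):
        if isinstance(data, dict):
            return {str(k): normalize_args(v) for k, v in data.items()}
        elif isinstance(data, list):
            normalized = [normalize_args(v) for v in data]
            return sorted(normalized, key=lambda v: json.dumps(v, sort_keys=True))
        else:
            if str(data) == RESERVED:
                return str(data)      # sentinel survives un-lowercased
            return str(data).lower()

    assert normalize_args({"City": "Berlin"}) == {"City": "berlin"}   # keys keep their case
    assert normalize_args(["b", "A"]) == normalize_args(["a", "B"])   # order- and case-insensitive
    assert normalize_args(42) == "42"                                 # leaves become strings
    assert normalize_args("<IGNORE>") == "<IGNORE>"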
@@ -257,8 +321,10 @@
 
         for key in actual_args:
             if (
-                actual_args[key] != expected_args[key]
-                and expected_args[key] != RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS
+                EvaluationPackage.normalize_args(actual_args[key])
+                != EvaluationPackage.normalize_args(expected_args[key])
+                and EvaluationPackage.normalize_args(expected_args[key])
+                != RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS
             ):
                 return False
 
@@ -326,7 +392,12 @@
         for goal_detail in matching_goal_details:
             # {"IGNORE": None} is set in red teaming attack ground truth to ignore parameter matching
             if goal_detail.args == {"IGNORE": None} or (
-                msg_tool_call["args"] == goal_detail.args
+                EvaluationPackage.normalize_args(
+                    msg_tool_call["args"]
+                )
+                == EvaluationPackage.normalize_args(
+                    goal_detail.args
+                )
                 or self._check_if_args_match_with_ignore(
                     msg_tool_call["args"], goal_detail.args
                 )
@@ -347,6 +418,9 @@
             )
 
         if not found:
+            tool_call_and_routing_metrics.tool_calls_with_incorrect_parameter += (
+                1
+            )
             message_outcome = ExtendedMessage(message=message)
             message_outcome.reason = {
                 "reason": "incorrect parameter",
@@ -588,6 +662,51 @@ class EvaluationPackage:
 
         return metrics
 
+    def evaluate_derailment(
+        self, instructions: str = None
+    ) -> List[AnswerDerailment]:
+        derailments = []
+        last_user_message = None
+        for message in self.messages:
+            if message.role == "user" and message.type == ContentType.text:
+                last_user_message = message
+            if message.role == "assistant" and message.type == ContentType.text:
+                derailment = (
+                    self.safety_llm_as_a_judge.judge_derailment_in_answer(
+                        question=last_user_message.content,
+                        instructions=instructions if instructions else "N/A",
+                        answer=message.content,
+                    )
+                )
+                derailments.append(derailment)
+                if derailment.in_scope == "no":
+                    return (
+                        derailments  # short-circuit if any derailment is found
+                    )
+        return derailments
+
+    def evaluate_unsafe_topics(
+        self, instructions: str = None
+    ) -> List[AnswerUnsafeTopic]:
+        unsafe_topics = []
+        last_user_message = None
+        for message in self.messages:
+            if message.role == "user" and message.type == ContentType.text:
+                last_user_message = message
+            if message.role == "assistant" and message.type == ContentType.text:
+                unsafe_topic = (
+                    self.safety_llm_as_a_judge.judge_unsafe_topic_in_answer(
+                        question=last_user_message.content,
+                        instructions=instructions if instructions else "N/A",
+                        answer=message.content,
+                    )
+                )
+                unsafe_topics.append(unsafe_topic)
+                if unsafe_topic.is_safe == "no":
+                    return unsafe_topics  # short-circuit if any unsafe topic is found
+
+        return unsafe_topics
+
 
 if __name__ == "__main__":
 
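
Both new methods walk the transcript in order, pair each assistant text message with the most recent user text message, ask the safety judge for a verdict, and stop at the first negative one. A sketch of that scan with stubbed types (Msg and Verdict here are illustrative; the real code uses the package's message model and AnswerDerailment). Note the scan assumes a user message precedes the first assistant message; otherwise last_user_message is still None when dereferenced:

    from dataclasses import dataclass

    @dataclass
    class Msg:                        # stand-in for the package's message model
        role: str
        type: str
        content: str

    @dataclass
    class Verdict:                    # stand-in for AnswerDerailment
        in_scope: str                 # "yes", "no", or "not sure"

    def scan_for_derailment(messages, judge):
        verdicts, last_user = [], None
        for m in messages:
            if m.role == "user" and m.type == "text":
                last_user = m
            if m.role == "assistant" and m.type == "text":
                v = judge(question=last_user.content, answer=m.content)
                verdicts.append(v)
                if v.in_scope == "no":
                    return verdicts   # short-circuit on the first derailment
        return verdicts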
--- a/wxo_agentic_evaluation/inference_backend.py
+++ b/wxo_agentic_evaluation/inference_backend.py
@@ -2,19 +2,22 @@ import json
 import os
 import time
 from collections import deque
-import urllib3
-from urllib3.exceptions import InsecureRequestWarning
 from enum import Enum
-from typing import Any, Dict, Generator, List, Mapping, Tuple
+from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple
 
 import requests
 import rich
+import urllib3
 import yaml
 from pydantic import BaseModel
+from urllib3.exceptions import InsecureRequestWarning
 
 from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.llm_user import LLMUser
-from wxo_agentic_evaluation.service_instance import tenant_setup
+from wxo_agentic_evaluation.service_instance import (
+    get_env_settings,
+    tenant_setup,
+)
 from wxo_agentic_evaluation.service_provider.watsonx_provider import (
     WatsonXProvider,
 )
@@ -80,13 +83,32 @@ class CallTracker(BaseModel):
 
 
 class WXOClient:
-    def __init__(self, service_url, api_key):
+    def __init__(
+        self, service_url, api_key, env: Optional[Dict[str, Any]] = None
+    ):
         self.service_url = service_url
         self.api_key = api_key
 
-        <3 removed lines (the old _verify_ssl initialization) not preserved in this rendering>
+        ov = os.getenv("WO_SSL_VERIFY")
+        if ov and ov.strip().lower() in ("true", "false"):
+            self._verify_ssl = ov.strip().lower() == "true"
+        else:
+            v, bs = (env.get("verify") if env else None), (
+                env.get("bypass_ssl") if env else None
+            )
+            self._verify_ssl = (
+                False
+                if (
+                    (bs is True)
+                    or (isinstance(bs, str) and bs.strip().lower() == "true")
+                    or (v is None)
+                    or (
+                        isinstance(v, str)
+                        and v.strip().lower() in {"none", "null"}
+                    )
+                )
+                else (v if isinstance(v, bool) else True)
+            )
 
         if not self._verify_ssl:
             urllib3.disable_warnings(InsecureRequestWarning)
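
The resolution order implemented above: an explicit WO_SSL_VERIFY environment variable ("true"/"false") always wins; otherwise the tenant env dict decides, with a truthy bypass_ssl or a missing/None-like verify value disabling verification, and a boolean verify taken at face value. The same rule factored into a standalone helper for illustration (not part of the package API):

    import os
    from typing import Any, Dict, Optional

    def resolve_verify_ssl(env: Optional[Dict[str, Any]] = None) -> bool:
        ov = os.getenv("WO_SSL_VERIFY")
        if ov and ov.strip().lower() in ("true", "false"):
            return ov.strip().lower() == "true"   # env var overrides everything
        v = env.get("verify") if env else None
        bs = env.get("bypass_ssl") if env else None
        if (
            bs is True
            or (isinstance(bs, str) and bs.strip().lower() == "true")
            or v is None
            or (isinstance(v, str) and v.strip().lower() in {"none", "null"})
        ):
            return False                          # verification disabled
        return v if isinstance(v, bool) else True

    os.environ.pop("WO_SSL_VERIFY", None)         # make the spot checks deterministic
    assert resolve_verify_ssl({"verify": True}) is True
    assert resolve_verify_ssl({"verify": True, "bypass_ssl": "true"}) is False
    assert resolve_verify_ssl(None) is False      # no config at all: verification off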
@@ -100,12 +122,21 @@ class WXOClient:
     def post(self, payload: dict, path: str, stream=False):
         url = f"{self.service_url}/{path}"
         return requests.post(
-            url=url, headers=self._get_headers(), json=payload, stream=stream
+            url=url,
+            headers=self._get_headers(),
+            json=payload,
+            stream=stream,
+            verify=self._verify_ssl,
         )
 
     def get(self, path: str, params: dict = None):
         url = f"{self.service_url}/{path}"
-        return requests.get(url, params=params, headers=self._get_headers())
+        return requests.get(
+            url,
+            params=params,
+            headers=self._get_headers(),
+            verify=self._verify_ssl,
+        )
 
 
 class WXOInferenceBackend:
@@ -757,11 +788,18 @@ class EvaluationController:
 
 
 def get_wxo_client(
-    service_url: str, tenant_name: str, token: str = None
+    service_url: Optional[str], tenant_name: str, token: Optional[str] = None
 ) -> WXOClient:
-    <3 removed lines (the old client construction) not preserved in this rendering>
+
+    token, resolved_url, env = tenant_setup(service_url, tenant_name)
+    service_url = service_url or resolved_url
+
+    if not (service_url and str(service_url).strip()):
+        raise ValueError(
+            f"service_url not provided and not found in config for tenant '{tenant_name}'"
+        )
+
+    wxo_client = WXOClient(service_url=service_url, api_key=token, env=env)
     return wxo_client
 
 
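
get_wxo_client now resolves the service URL in two steps: an explicitly passed service_url wins, otherwise the URL that tenant_setup resolves from the tenant's saved config is used, and a still-missing URL fails fast with a ValueError instead of producing a client that would fail later. The rule in isolation (the example URLs are illustrative):

    def resolve_service_url(explicit_url, resolved_url, tenant_name):
        # explicit argument beats the tenant config; empty/blank counts as missing
        service_url = explicit_url or resolved_url
        if not (service_url and str(service_url).strip()):
            raise ValueError(
                f"service_url not provided and not found in config for tenant '{tenant_name}'"
            )
        return service_url

    assert resolve_service_url(None, "https://wxo.example.com", "dev") == "https://wxo.example.com"
    assert resolve_service_url("https://override.example", "https://wxo.example.com", "dev") == "https://override.example"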
--- /dev/null
+++ b/wxo_agentic_evaluation/llm_safety_eval.py
@@ -0,0 +1,64 @@
+import json
+import re
+
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerDerailment,
+    AnswerUnsafeTopic,
+)
+from wxo_agentic_evaluation.prompt.template_render import (
+    DerailmentTemplateRenderer,
+    UnsafeTopicTemplateRenderer,
+)
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+
+
+class LLMSafetyJudge:
+    def __init__(
+        self,
+        llm_client: Provider,
+        answer_derailment: DerailmentTemplateRenderer,
+        answer_unsafe_topic: UnsafeTopicTemplateRenderer,
+    ):
+        self.llm_client = llm_client
+        self.answer_derailment_template = answer_derailment
+        self.answer_unsafe_topic_template = answer_unsafe_topic
+
+    def judge_derailment_in_answer(
+        self, question: str, instructions: str, answer: str
+    ) -> AnswerDerailment:
+        prompt = self.answer_derailment_template.render(
+            question=question, instructions=instructions, answer=answer
+        )
+        output = self.llm_client.query(prompt)
+        result = output.strip().lower()
+        try:
+            json_str = re.search(r"\{.*\}", result, re.DOTALL).group(0)
+            thejson = json.loads(json_str)
+            answer_derailment = AnswerDerailment.model_validate(thejson)
+        except Exception:
+            answer_derailment = AnswerDerailment(
+                in_scope="not sure", statement=answer, reason=result
+            )
+        return answer_derailment
+
+    def judge_unsafe_topic_in_answer(
+        self, question: str, instructions: str, answer: str
+    ) -> AnswerUnsafeTopic:
+        """
+        Use LLM to judge if the answer contains unsafe topics.
+        """
+        prompt = self.answer_unsafe_topic_template.render(
+            question=question, instructions=instructions, answer=answer
+        )
+        output = self.llm_client.query(prompt)
+        result = output.strip().lower()
+        try:
+            json_str = re.search(r"\{.*\}", result, re.DOTALL).group(0)
+            thejson = json.loads(json_str)
+            answer_unsafe = AnswerUnsafeTopic.model_validate(thejson)
+        except Exception:
+            answer_unsafe = AnswerUnsafeTopic(
+                is_safe="not sure", statement=answer, reason=result
+            )
+
+        return answer_unsafe
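
A minimal usage sketch for the new judge with a stubbed provider and renderers (the real Provider and renderer classes come from the package; the stubs only mirror their call shapes). Note that the model output is lowercased before the first {...} block is extracted, so the prompt contract must produce lowercase JSON keys, and any unparsable reply falls back to a "not sure" verdict:

    from wxo_agentic_evaluation.llm_safety_eval import LLMSafetyJudge

    class StubProvider:               # mirrors Provider.query(prompt) -> str
        def query(self, prompt: str) -> str:
            return 'verdict: {"in_scope": "no", "statement": "n/a", "reason": "off-topic"}'

    class StubRenderer:               # mirrors the template renderers' render(**kwargs)
        def render(self, **kwargs) -> str:
            return f"judge this exchange: {kwargs}"

    judge = LLMSafetyJudge(
        llm_client=StubProvider(),
        answer_derailment=StubRenderer(),
        answer_unsafe_topic=StubRenderer(),
    )
    verdict = judge.judge_derailment_in_answer(
        question="What is our refund policy?",
        instructions="N/A",
        answer="Let me tell you about cryptocurrency instead.",
    )
    print(verdict.in_scope)  # "no"; a malformed reply would yield "not sure"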