PyPI - ibm-watsonx-orchestrate-evaluation-framework - Versions diffs - 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl - Mend

ibm-watsonx-orchestrate-evaluation-framework 1.1.3py3-none-any.whl → 1.1.8b0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (97) hide show

{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
wxo_agentic_evaluation/analyze_run.py +1025 -220
wxo_agentic_evaluation/annotate.py +2 -2
wxo_agentic_evaluation/arg_configs.py +60 -2
wxo_agentic_evaluation/base_user.py +25 -0
wxo_agentic_evaluation/batch_annotate.py +19 -2
wxo_agentic_evaluation/clients.py +103 -0
wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
wxo_agentic_evaluation/compare_runs/diff.py +554 -0
wxo_agentic_evaluation/compare_runs/model.py +193 -0
wxo_agentic_evaluation/data_annotator.py +25 -7
wxo_agentic_evaluation/description_quality_checker.py +29 -6
wxo_agentic_evaluation/evaluation.py +16 -8
wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
wxo_agentic_evaluation/evaluation_package.py +414 -69
wxo_agentic_evaluation/external_agent/__init__.py +1 -1
wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
wxo_agentic_evaluation/external_agent/types.py +3 -9
wxo_agentic_evaluation/extractors/__init__.py +3 -0
wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
wxo_agentic_evaluation/langfuse_collection.py +60 -0
wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
wxo_agentic_evaluation/llm_matching.py +104 -2
wxo_agentic_evaluation/llm_safety_eval.py +64 -0
wxo_agentic_evaluation/llm_user.py +5 -4
wxo_agentic_evaluation/llm_user_v2.py +114 -0
wxo_agentic_evaluation/main.py +112 -343
wxo_agentic_evaluation/metrics/__init__.py +15 -0
wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
wxo_agentic_evaluation/metrics/evaluations.py +107 -0
wxo_agentic_evaluation/metrics/journey_success.py +137 -0
wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
wxo_agentic_evaluation/metrics/metrics.py +276 -8
wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
wxo_agentic_evaluation/otel_parser/parser.py +163 -0
wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
wxo_agentic_evaluation/otel_parser/utils.py +15 -0
wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
wxo_agentic_evaluation/prompt/template_render.py +103 -4
wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
wxo_agentic_evaluation/quick_eval.py +33 -17
wxo_agentic_evaluation/record_chat.py +38 -32
wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
wxo_agentic_evaluation/resource_map.py +3 -1
wxo_agentic_evaluation/runner.py +329 -0
wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
wxo_agentic_evaluation/scheduler.py +247 -0
wxo_agentic_evaluation/service_instance.py +26 -17
wxo_agentic_evaluation/service_provider/__init__.py +145 -9
wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
wxo_agentic_evaluation/service_provider/provider.py +130 -10
wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
wxo_agentic_evaluation/simluation_runner.py +125 -0
wxo_agentic_evaluation/test_prompt.py +4 -4
wxo_agentic_evaluation/type.py +185 -16
wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
wxo_agentic_evaluation/utils/__init__.py +44 -3
wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
wxo_agentic_evaluation/utils/messages_parser.py +30 -0
wxo_agentic_evaluation/utils/parsers.py +71 -0
wxo_agentic_evaluation/utils/utils.py +313 -9
wxo_agentic_evaluation/wxo_client.py +81 -0
ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
{ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0

wxo_agentic_evaluation/evaluation_package.py CHANGED Viewed

@@ -1,14 +1,27 @@
 import json
 import os
-from typing import List
+from collections import defaultdict
+from typing import Any, Dict, List, Optional
 import rich
+from dateutil import parser
 from wxo_agentic_evaluation import __file__
 from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
 from wxo_agentic_evaluation.llm_matching import LLMMatcher
 from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
+from wxo_agentic_evaluation.llm_safety_eval import LLMSafetyJudge
+from wxo_agentic_evaluation.metrics.evaluations import (
+    Evaluation,
+    Metric,
+)
+from wxo_agentic_evaluation.extractors.extractor_base import Extractor
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerDerailment,
+    AnswerUnsafeTopic,
+)
 from wxo_agentic_evaluation.metrics.metrics import (
+    CustomEvalMetrics,
     KeywordSemanticSearchMetric,
     KnowledgeBaseMetrics,
     TextMatchType,
@@ -16,19 +29,27 @@ from wxo_agentic_evaluation.metrics.metrics import (
 )
 from wxo_agentic_evaluation.prompt.template_render import (
     AnswerRelevancyTemplateRenderer,
+    DerailmentTemplateRenderer,
     FaithfulnessTemplateRenderer,
     KeywordMatchingTemplateRenderer,
     SemanticMatchingTemplateRenderer,
+    UnsafeTopicTemplateRenderer,
 )
 from wxo_agentic_evaluation.resource_map import ResourceMap
-from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.service_instance import tenant_setup
+from wxo_agentic_evaluation.service_provider import (
+    USE_GATEWAY_MODEL_PROVIDER,
+    get_provider,
+)
+from wxo_agentic_evaluation.service_provider.provider import Provider
 from wxo_agentic_evaluation.type import (
     ContentType,
     ConversationalSearch,
-    EvaluationData,
     EventTypes,
     ExtendedMessage,
+    MatchingStrategy,
     Message,
+    OrchestrateDataset,
 )
 root_dir = os.path.dirname(__file__)
@@ -49,6 +70,14 @@ RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS = os.getenv(
     "RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS", "<IGNORE>"
 )
+DERAILMENT_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "derailment_prompt.jinja2"
+)
+UNSAFE_TOPIC_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "unsafe_topic_prompt.jinja2"
+)
 """
 - hyphens are not allowed in python function names, so it is safe to use as a dummy function name
 - purpose behind `DUMMY_GRAPH_NODE_NAME` is to append
@@ -61,33 +90,91 @@ DUMMY_GRAPH_NODE_NAME = "dummy-goal"
 class EvaluationPackage:
     def __init__(
         self,
-        test_case_name,
-        ground_truth,
-        messages,
+        test_case_name: str,
+        ground_truth: OrchestrateDataset,
+        messages: list[Message],
         conversational_search_data: List[ConversationalSearch] = None,
         resource_map: ResourceMap = None,
         is_attack_evaluation: bool = False,
+        config=None,
+        custom_evals: Optional[list[Evaluation]] = None,
+        custom_llmaaj_client: Optional[Provider] = None,
+        extractors: Optional[list[Extractor]] = None,
+        similarity_threshold=0.8,
+        enable_fuzzy_matching=False,
+        strict_topological_matching=True,
     ):
-        self.tool_dictionary = {
-            goal_detail.name: goal_detail
-            for goal_detail in ground_truth.goal_details
-            if goal_detail.type == ContentType.tool_call
-        }
-        self.text_list = [
-            goal_detail
-            for goal_detail in ground_truth.goal_details
-            if goal_detail.type == ContentType.text
-        ]
-        self.messages = messages
+        self.tool_dictionary = (
+            {
+                goal_detail.name: goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.tool_call
+            }
+            if ground_truth.goal_details
+            else {}
+        )
+        self.text_list = (
+            [
+                goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.text
+            ]
+            if ground_truth.goal_details
+            else []
+        )
+        self.messages: List[Message] = messages
         self.conversational_search_data = conversational_search_data
         self.is_attack_evaluation = is_attack_evaluation
         self.ground_truth = ground_truth
         self.test_case_name = test_case_name
         self.resource_map = resource_map
+        self.custom_evals = custom_evals
+        self.custom_llmaaj_client = custom_llmaaj_client
+        self.extractors = extractors
+        self.enable_fuzzy_matching = enable_fuzzy_matching
+        self.strict_topological_matching = strict_topological_matching
         if not self.is_attack_evaluation:
             self.validate_ground_truth(self.ground_truth, self.test_case_name)
+        extra_kwargs = {}
+        if USE_GATEWAY_MODEL_PROVIDER:
+            if resource_map and hasattr(resource_map, "wxo_client"):
+                wxo_client = resource_map.wxo_client
+                if hasattr(wxo_client, "service_url"):
+                    extra_kwargs["instance_url"] = wxo_client.service_url
+                if hasattr(wxo_client, "api_key"):
+                    extra_kwargs["token"] = wxo_client.api_key
+            elif config:
+                auth = getattr(config, "auth_config", None)
+                if auth:
+                    instance_url = getattr(auth, "url", None)
+                    token = getattr(auth, "token", None)
+                if instance_url:
+                    extra_kwargs["instance_url"] = instance_url
+                if token:
+                    extra_kwargs["token"] = token
+            else:
+                token, instance_url, env = tenant_setup(
+                    service_url=None, tenant_name="local"
+                )
+                if instance_url:
+                    extra_kwargs["instance_url"] = instance_url
+                if token:
+                    extra_kwargs["token"] = token
+        # output response matching
         self.matcher = LLMMatcher(
             llm_client=get_provider(
                 model_id="meta-llama/llama-3-405b-instruct",
@@ -96,6 +183,8 @@ class EvaluationPackage:
                     "decoding_method": "greedy",
                     "max_new_tokens": 10,
                 },
+                embedding_model_id="sentence-transformers/all-minilm-l6-v2",
+                **extra_kwargs,
             ),
             keyword_template=KeywordMatchingTemplateRenderer(
                 KEYWORD_MATCHING_PROMPT_PATH
@@ -103,7 +192,10 @@ class EvaluationPackage:
             semantic_template=SemanticMatchingTemplateRenderer(
                 SEMANTIC_MATCHING_PROMPT_PATH
             ),
+            similarity_threshold=similarity_threshold,
+            enable_fuzzy_matching=enable_fuzzy_matching,
         )
+        # only used for RAG evaluation
         self.rag_llm_as_a_judge = LLMJudge(
             llm_client=get_provider(
                 model_id="meta-llama/llama-3-405b-instruct",
@@ -112,57 +204,102 @@ class EvaluationPackage:
                     "decoding_method": "greedy",
                     "max_new_tokens": 4096,
                 },
+                **extra_kwargs,
             ),
             faithfulness=FaithfulnessTemplateRenderer(FAITHFULNESS_PROMPT_PATH),
             answer_relevancy=AnswerRelevancyTemplateRenderer(
                 ANSWER_RELEVANCY_PROMPT_PATH
             ),
         )
+        self.safety_llm_as_a_judge = LLMSafetyJudge(
+            llm_client=get_provider(
+                model_id="meta-llama/llama-3-405b-instruct",
+                params={
+                    "min_new_tokens": 0,
+                    "decoding_method": "greedy",
+                    "max_new_tokens": 4096,
+                },
+                **extra_kwargs,
+            ),
+            answer_derailment=DerailmentTemplateRenderer(
+                DERAILMENT_PROMPT_PATH
+            ),
+            answer_unsafe_topic=UnsafeTopicTemplateRenderer(
+                UNSAFE_TOPIC_PROMPT_PATH
+            ),
+        )
     @staticmethod
-    def find_ground_node(graph, start_node):
-        """Simple implementation. Should be fixed in the future
+    def find_terminal_nodes(graph: dict[str, list[str]]) -> set[str]:
+        """Finds terminal nodes (nodes with no outgoing edges).
+        Args:
+            graph: the input graph
-        Assumes that there is a single graph node that does not have children
+        Returns:
+            a set of the terminal nodes
         """
-        stack = [start_node]
-        visited_set = set()
+        seen_nodes = set()  # track seen nodes
+        non_terminal_nodes = set()  # track nodes with children
-        while stack:
-            node = stack.pop()
-            if node not in visited_set:
-                visited_set.add(node)
+        for node in graph:
+            seen_nodes.add(node)
+            if graph[node]:
+                non_terminal_nodes.add(node)
+                for n in graph[node]:
+                    seen_nodes.add(n)
+        return seen_nodes - non_terminal_nodes
-                # check for children
-                # improvement for future: add the ground nodes here
-                # right now, just return the first one
-                if not graph.get(node):
-                    return node
+    @staticmethod
+    def is_topological_sort(
+        graph: dict[str, list[str]], ordering: list[str], is_strict: bool = True
+    ) -> bool:
+        """Graph traversal to check if every node in `graph` is visited in `ordering` only after all its dependencies are visited.
-                stack.extend(graph[node])
+        Args:
+            graph: the graph representing the ground truth, where keys represent nodes and values represent its dependent nodes
+            ordering: the nodes visited, in order
-        return None
+        Returns:
+            Boolean representing if `ordering` visits all nodes in a valid order based on the dependencies in graph.
+        """
+        # No keyword match or goal details were achieved
+        if not ordering:
+            return False
-    @staticmethod
-    def is_topological_sort(graph, ordering):
-        position = {node: i for i, node in enumerate(ordering)}
-        ground_node = EvaluationPackage.find_ground_node(
-            graph, list(graph.keys())[0]
+        if is_strict:
+            # strict matching: only consider most recent tool call
+            position = {node: [i] for i, node in enumerate(ordering)}
+        else:
+            # lenient matching: consider all tool calls (account for all indexes of the node)
+            position = defaultdict(list)
+            for i, node in enumerate(ordering):
+                position[node].append(i)
+        terminal_nodes = EvaluationPackage.find_terminal_nodes(graph)
+        # adds a dummy node for each terminal node
+        next_idx = (
+            max(val for values in position.values() for val in values) + 1
         )
-        if ground_node is not None:
-            graph[ground_node] = [DUMMY_GRAPH_NODE_NAME]
+        for n in terminal_nodes:
+            graph[n] = [DUMMY_GRAPH_NODE_NAME]
             graph[DUMMY_GRAPH_NODE_NAME] = []
+            position[DUMMY_GRAPH_NODE_NAME] = [next_idx]
+            next_idx += 1
-            next_idx = len(position)
-            position[DUMMY_GRAPH_NODE_NAME] = next_idx
-        for u in graph:
-            for v in graph[u]:
-                if u not in position or v not in position:
+        for node in graph:
+            for child_nodes in graph[node]:
+                # Current node/children doesn't show up in made calls
+                if node not in position or child_nodes not in position:
                     return False
-                if position[u] >= position[v]:
+                # Current node doesn't show up before any of its child
+                # all index in current nodes are larger than every child nodes' index
+                if all(
+                    curr >= max(position[child_nodes])
+                    for curr in position[node]
+                ):
                     return False
         return True
@@ -238,32 +375,151 @@ class EvaluationPackage:
                     f"[green][SUCCESS] Text message matched: Summary - {keyword_semantic_match.message}[/green]"
                 )
-    @staticmethod
-    def _check_if_args_match_with_ignore(
-        actual_args: dict[str, str], expected_args: dict[str, str]
+    def argument_matching(
+        self,
+        expected: dict[str, str],
+        actual: dict[str, str],
+        matching_strategy: dict[str, MatchingStrategy],
     ) -> bool:
-        """
-        This function checks if a registered tool call matches with the goal node when:
-            - the arg value marked as wrong is labelled with the "<IGNORE>" value in the corresponding ground truth
+        """Handles argument matching for expected and actual arguments and values.
         Args:
-            actual_args (dict): Made during inference.
-            expected_args (dict): Defined in the test case/ground truth.
+            expected: Expected ground truth arguments.
+            actual: Actual arguments in tool call
+            matching_strategy: Matching mode for each argument. Defaults to strict if not specified.
         Returns:
-            bool: True if match with keyword parameters ignored | False otherwise (improper tool call).
+            True if all arguments match according to their matching strategy.
         """
+        # ignore arg matching
+        if expected == {"IGNORE": None}:
+            return True
-        if set(actual_args.keys()) != set(expected_args.keys()):
-            return False
+        for field in actual:
+            if field not in expected:
+                return False
+        for field in expected:
+            strategy = matching_strategy.get(
+                field, MatchingStrategy.strict.value
+            )
+            norm_actual_val = EvaluationPackage.normalize_args(
+                actual.get(field)
+            )
+            norm_expected_val = EvaluationPackage.normalize_args(
+                expected.get(field)
+            )
-        for key in actual_args:
+            # field must exist if not using optional matching
             if (
-                actual_args[key] != expected_args[key]
-                and expected_args[key] != RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS
+                field not in actual
+                and strategy != MatchingStrategy.optional.value
             ):
                 return False
+            # continue to next if it's an ignored keyword
+            if norm_expected_val == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
+                continue
+            # optional matching
+            if strategy == MatchingStrategy.optional.value:
+                # continue to next it's not called
+                if field not in actual:
+                    continue
+                # must match if called
+                if actual[field] != expected[field]:
+                    return False
+            elif strategy == MatchingStrategy.fuzzy.value:
+                # check date/number conversion
+                conversion_succeeded, values_match = (
+                    EvaluationPackage._compare_as_date_or_number(
+                        norm_actual_val, norm_expected_val
+                    )
+                )
+                # If conversion succeeded and values match, continue to next parameter
+                if conversion_succeeded and values_match:
+                    continue
+                # If conversion succeeded but values don't match, return False
+                if conversion_succeeded and not values_match:
+                    return False
+                # try cosine matching
+                x = self.matcher.cosine_similarity_semantic_match(
+                    norm_actual_val, norm_expected_val
+                )
+                print(norm_actual_val, norm_expected_val, x)
+                if not x:
+                    return False
+            # TODO szhang 10/24/25: Decide if strict comparison must be exact or may allow normalized values.
+            elif strategy == MatchingStrategy.strict.value:
+                # must match
+                if norm_actual_val != norm_expected_val:
+                    return False
+            else:
+                print(f"Warning: undefined matching strategy found: {strategy}")
         return True
+    @staticmethod
+    def normalize_args(data):
+        if isinstance(data, dict):
+            # normalize keys (case-sensitive) and values
+            return {
+                str(k): EvaluationPackage.normalize_args(v)
+                for k, v in data.items()
+            }
+        elif isinstance(data, list):
+            normalized_list = [
+                EvaluationPackage.normalize_args(v) for v in data
+            ]
+            return sorted(
+                normalized_list, key=lambda v: json.dumps(v, sort_keys=True)
+            )
+        else:
+            # don’t lowercase reserved keyword
+            if str(data) == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
+                return str(data)
+            return str(data).lower()
+    @staticmethod
+    def _compare_as_date_or_number(normalized_actual, normalized_expected):
+        """
+        Attempts to compare two normalized values as dates or numbers.
+        Args:
+            normalized_actual: The actual value from tool call
+            normalized_expected: The expected value from ground truth
+        Returns:
+            tuple: (conversion_succeeded, values_match)
+                - conversion_succeeded: True if values could be converted to numbers or dates
+                - values_match: True if converted values match
+        """
+        # Try to convert to numbers
+        try:
+            num_actual = float(normalized_actual)
+            num_expected = float(normalized_expected)
+            # Conversion succeeded, check if values match
+            return (
+                True,
+                abs(num_actual - num_expected) <= 0.001,
+            )  # Small epsilon for float comparison
+        except (ValueError, TypeError):
+            pass
+        # Try to convert to dates
+        try:
+            date_actual = parser.parse(normalized_actual)
+            date_expected = parser.parse(normalized_expected)
+            # Conversion succeeded, check if values match
+            return True, date_actual == date_expected
+        except (ValueError, TypeError):
+            pass
+        # If we get here, neither number nor date conversion worked
+        return False, False
     def traverse(self):
         labelled_messages = []
         message_outcomes = []
@@ -325,12 +581,12 @@ class EvaluationPackage:
                     possible_ground_truth_for_analysis = []
                     for goal_detail in matching_goal_details:
                         # {"IGNORE": None} is set in red teaming attack ground truth to ignore parameter matching
-                        if goal_detail.args == {"IGNORE": None} or (
-                            msg_tool_call["args"] == goal_detail.args
-                            or self._check_if_args_match_with_ignore(
-                                msg_tool_call["args"], goal_detail.args
-                            )
+                        if self.argument_matching(
+                            expected=goal_detail.args,
+                            actual=msg_tool_call["args"],
+                            matching_strategy=goal_detail.arg_matching,
                         ):
                             labelled_messages.append(goal_detail.name)
                             labelled_messages_without_text_step.append(
                                 goal_detail.name
@@ -399,6 +655,7 @@ class EvaluationPackage:
             if message.event == EventTypes.message_created
             and message.role == "assistant"
         ]
         keyword_semantic_list = []
         for message in assistant_responses:
             for goal_detail in self.text_list:
@@ -407,7 +664,10 @@ class EvaluationPackage:
                         message.content, goal_detail.keywords
                     )
                     semantic_match: bool = self.matcher.semantic_match(
-                        message.content, goal_detail.response
+                        self.messages[0].content,
+                        prediction=message.content,
+                        ground_truth=goal_detail.response,
+                        enable_fuzzy_matching=self.enable_fuzzy_matching,
                     )
                     keyword_semantic_match = KeywordSemanticSearchMetric(
                         keyword_match=keyword_match,
@@ -442,6 +702,29 @@ class EvaluationPackage:
         else:
             return TextMatchType.text_mismatch.value
+    def generate_custom_metrics(
+        self, extracted_context: Dict[str, Any]
+    ) -> Optional[CustomEvalMetrics]:
+        if self.custom_evals is None:
+            return None
+        results: list[Metric] = []
+        for evaluation in self.custom_evals:
+            # TODO: cleanup. The compute method returns a Metric but pydantic thinks it is different.
+            # Probably because of some path issue when we auto-discover metrics
+            evaluate_result = evaluation.evaluate(
+                messages=self.messages,
+                ground_truth=self.ground_truth,
+                extracted_context=extracted_context,
+            )
+            if evaluate_result is not None:
+                results.append(Metric(**evaluate_result.model_dump()))
+        custom_eval_results = CustomEvalMetrics(
+            dataset_name=self.test_case_name, custom_metrics=results
+        )
+        return custom_eval_results
     def generate_summary(self):
         llm_steps = 0
         total_step = 0
@@ -454,8 +737,20 @@ class EvaluationPackage:
             message_with_reasons,
         ) = self.traverse()
+        extracted_context = {}
+        if self.extractors is not None and self.custom_evals is not None:
+            for extractor in self.extractors:
+                context = extractor.extract(
+                    messages=self.messages,
+                    ground_truth=self.ground_truth,
+                    matcher=self.matcher,
+                )
+                extracted_context[extractor.name] = context
         is_success = self.is_topological_sort(
-            self.ground_truth.goals, labelled_messages
+            graph=self.ground_truth.goals,
+            ordering=labelled_messages,
+            is_strict=self.strict_topological_matching,
         )
         match = self._is_text_match(matches)
@@ -474,6 +769,10 @@ class EvaluationPackage:
         knowledge_base_metric_summary = (
             self.generate_knowledge_base_metric_summary()
         )
+        custom_metric_summary = self.generate_custom_metrics(
+            extracted_context=extracted_context
+        )
         # TO-DO: the table is not printing properly anymore with the new columns introduced
         # we need to introduce a separate table for these.
@@ -487,6 +786,7 @@ class EvaluationPackage:
             knowledge_base_metric_summary,
             message_with_reasons,
             metrics,
+            custom_metric_summary,
         )
     def _get_messages_by_role_before_cs(
@@ -591,6 +891,51 @@ class EvaluationPackage:
         return metrics
+    def evaluate_derailment(
+        self, instructions: str = None
+    ) -> List[AnswerDerailment]:
+        derailments = []
+        last_user_message = None
+        for message in self.messages:
+            if message.role == "user" and message.type == ContentType.text:
+                last_user_message = message
+            if message.role == "assistant" and message.type == ContentType.text:
+                derailment = (
+                    self.safety_llm_as_a_judge.judge_derailment_in_answer(
+                        question=last_user_message.content,
+                        instructions=instructions if instructions else "N/A",
+                        answer=message.content,
+                    )
+                )
+                derailments.append(derailment)
+                if derailment.in_scope == "no":
+                    return (
+                        derailments  # short-circuit if any derailment is found
+                    )
+        return derailments
+    def evaluate_unsafe_topics(
+        self, instructions: str = None
+    ) -> List[AnswerUnsafeTopic]:
+        unsafe_topics = []
+        last_user_message = None
+        for message in self.messages:
+            if message.role == "user" and message.type == ContentType.text:
+                last_user_message = message
+            if message.role == "assistant" and message.type == ContentType.text:
+                unsafe_topic = (
+                    self.safety_llm_as_a_judge.judge_unsafe_topic_in_answer(
+                        question=last_user_message.content,
+                        instructions=instructions if instructions else "N/A",
+                        answer=message.content,
+                    )
+                )
+                unsafe_topics.append(unsafe_topic)
+                if unsafe_topic.is_safe == "no":
+                    return unsafe_topics  # short-circuit if any unsafe topic is found
+        return unsafe_topics
 if __name__ == "__main__":
@@ -616,7 +961,7 @@ if __name__ == "__main__":
             rich.print("[orange3]WXO:[/orange3]", message.content)
     with open("./benchmarks/workday_tools/data/data18.json", "r") as f:
-        ground_truth = EvaluationData.model_validate(json.load(f))
+        ground_truth = OrchestrateDataset.model_validate(json.load(f))
     evaluate_package = EvaluationPackage(
         test_case_name="data1.messages.json",

wxo_agentic_evaluation/external_agent/__init__.py CHANGED Viewed

@@ -7,7 +7,7 @@ from wxo_agentic_evaluation import prompt
 from wxo_agentic_evaluation.prompt.template_render import (
     StoryGenerationTemplateRenderer,
 )
-from wxo_agentic_evaluation.service_provider import ProviderConfig, get_provider
+from wxo_agentic_evaluation.service_provider import get_provider
 console = rich.console.Console()

ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

ibm-watsonx-orchestrate-evaluation-framework 1.1.3py3-none-any.whl → 1.1.8b0py3-none-any.whl