ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -1,41 +1,82 @@
- from typing import List
  import json
- import os
+ import os
+ from collections import defaultdict
+ from typing import Any, Dict, List, Optional
+
  import rich
+ from dateutil import parser

+ from wxo_agentic_evaluation import __file__
  from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
-
- from wxo_agentic_evaluation.type import (
- ContentType,
- Message,
- EvaluationData,
- EventTypes,
- ConversationalSearch,
- ExtendedMessage,
+ from wxo_agentic_evaluation.llm_matching import LLMMatcher
+ from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
+ from wxo_agentic_evaluation.llm_safety_eval import LLMSafetyJudge
+ from wxo_agentic_evaluation.metrics.evaluations import (
+ Evaluation,
+ Metric,
+ )
+ from wxo_agentic_evaluation.extractors.extractor_base import Extractor
+ from wxo_agentic_evaluation.metrics.llm_as_judge import (
+ AnswerDerailment,
+ AnswerUnsafeTopic,
  )
- from wxo_agentic_evaluation.resource_map import ResourceMap
- from wxo_agentic_evaluation.service_provider import get_provider
  from wxo_agentic_evaluation.metrics.metrics import (
- KnowledgeBaseMetrics,
+ CustomEvalMetrics,
  KeywordSemanticSearchMetric,
+ KnowledgeBaseMetrics,
+ TextMatchType,
  ToolCallAndRoutingMetrics,
- TextMatchType
  )
  from wxo_agentic_evaluation.prompt.template_render import (
+ AnswerRelevancyTemplateRenderer,
+ DerailmentTemplateRenderer,
+ FaithfulnessTemplateRenderer,
  KeywordMatchingTemplateRenderer,
  SemanticMatchingTemplateRenderer,
- FaithfulnessTemplateRenderer,
- AnswerRelevancyTemplateRenderer,
+ UnsafeTopicTemplateRenderer,
+ )
+ from wxo_agentic_evaluation.resource_map import ResourceMap
+ from wxo_agentic_evaluation.service_instance import tenant_setup
+ from wxo_agentic_evaluation.service_provider import (
+ USE_GATEWAY_MODEL_PROVIDER,
+ get_provider,
+ )
+ from wxo_agentic_evaluation.service_provider.provider import Provider
+ from wxo_agentic_evaluation.type import (
+ ContentType,
+ ConversationalSearch,
+ EventTypes,
+ ExtendedMessage,
+ MatchingStrategy,
+ Message,
+ OrchestrateDataset,
  )
- from wxo_agentic_evaluation.llm_matching import LLMMatcher
- from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
- from wxo_agentic_evaluation import __file__

  root_dir = os.path.dirname(__file__)
- KEYWORD_MATCHING_PROMPT_PATH = os.path.join(root_dir, "prompt", "keyword_matching_prompt.jinja2")
- SEMANTIC_MATCHING_PROMPT_PATH = os.path.join(root_dir, "prompt", "semantic_matching_prompt.jinja2")
- FAITHFULNESS_PROMPT_PATH = os.path.join(root_dir, "prompt", "faithfulness_prompt.jinja2")
- ANSWER_RELEVANCY_PROMPT_PATH = os.path.join(root_dir, "prompt", "answer_relevancy_prompt.jinja2")
+ KEYWORD_MATCHING_PROMPT_PATH = os.path.join(
+ root_dir, "prompt", "keyword_matching_prompt.jinja2"
+ )
+ SEMANTIC_MATCHING_PROMPT_PATH = os.path.join(
+ root_dir, "prompt", "semantic_matching_prompt.jinja2"
+ )
+ FAITHFULNESS_PROMPT_PATH = os.path.join(
+ root_dir, "prompt", "faithfulness_prompt.jinja2"
+ )
+ ANSWER_RELEVANCY_PROMPT_PATH = os.path.join(
+ root_dir, "prompt", "answer_relevancy_prompt.jinja2"
+ )
+
+ RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS = os.getenv(
+ "RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS", "<IGNORE>"
+ )
+
+ DERAILMENT_PROMPT_PATH = os.path.join(
+ root_dir, "prompt", "derailment_prompt.jinja2"
+ )
+
+ UNSAFE_TOPIC_PROMPT_PATH = os.path.join(
+ root_dir, "prompt", "unsafe_topic_prompt.jinja2"
+ )

  """
  - hyphens are not allowed in python function names, so it is safe to use as a dummy function name
@@ -45,37 +86,105 @@ single, summary step goals.
  """
  DUMMY_GRAPH_NODE_NAME = "dummy-goal"

+
  class EvaluationPackage:
  def __init__(
  self,
- test_case_name,
- ground_truth,
- messages,
+ test_case_name: str,
+ ground_truth: OrchestrateDataset,
+ messages: list[Message],
  conversational_search_data: List[ConversationalSearch] = None,
- is_analyze_run=False,
  resource_map: ResourceMap = None,
+ is_attack_evaluation: bool = False,
+ config=None,
+ custom_evals: Optional[list[Evaluation]] = None,
+ custom_llmaaj_client: Optional[Provider] = None,
+ extractors: Optional[list[Extractor]] = None,
+ similarity_threshold=0.8,
+ enable_fuzzy_matching=False,
+ strict_topological_matching=True,
  ):
- self.tool_dictionary = {
- goal_detail.name: goal_detail
- for goal_detail in ground_truth.goal_details
- if goal_detail.type == ContentType.tool_call
- }
- self.text_list = [
- goal_detail
- for goal_detail in ground_truth.goal_details
- if goal_detail.type == ContentType.text
- ]
- self.messages = messages
+ self.tool_dictionary = (
+ {
+ goal_detail.name: goal_detail
+ for goal_detail in ground_truth.goal_details
+ if goal_detail.type == ContentType.tool_call
+ }
+ if ground_truth.goal_details
+ else {}
+ )
+
+ self.text_list = (
+ [
+ goal_detail
+ for goal_detail in ground_truth.goal_details
+ if goal_detail.type == ContentType.text
+ ]
+ if ground_truth.goal_details
+ else []
+ )
+
+ self.messages: List[Message] = messages
  self.conversational_search_data = conversational_search_data
- self.validate_ground_truth(ground_truth, test_case_name)
+ self.is_attack_evaluation = is_attack_evaluation
  self.ground_truth = ground_truth
  self.test_case_name = test_case_name
- self.is_analyze_run = is_analyze_run
+ self.resource_map = resource_map
+ self.custom_evals = custom_evals
+ self.custom_llmaaj_client = custom_llmaaj_client
+ self.extractors = extractors
+ self.enable_fuzzy_matching = enable_fuzzy_matching
+ self.strict_topological_matching = strict_topological_matching
+
+ if not self.is_attack_evaluation:
+ self.validate_ground_truth(self.ground_truth, self.test_case_name)
+
+ extra_kwargs = {}
+
+ if USE_GATEWAY_MODEL_PROVIDER:
+
+ if resource_map and hasattr(resource_map, "wxo_client"):
+ wxo_client = resource_map.wxo_client
+
+ if hasattr(wxo_client, "service_url"):
+ extra_kwargs["instance_url"] = wxo_client.service_url
+
+ if hasattr(wxo_client, "api_key"):
+ extra_kwargs["token"] = wxo_client.api_key
+
+ elif config:
+ auth = getattr(config, "auth_config", None)
+
+ if auth:
+ instance_url = getattr(auth, "url", None)
+ token = getattr(auth, "token", None)

+ if instance_url:
+ extra_kwargs["instance_url"] = instance_url
+
+ if token:
+ extra_kwargs["token"] = token
+ else:
+ token, instance_url, env = tenant_setup(
+ service_url=None, tenant_name="local"
+ )
+ if instance_url:
+ extra_kwargs["instance_url"] = instance_url
+
+ if token:
+ extra_kwargs["token"] = token
+
+ # output response matching
  self.matcher = LLMMatcher(
  llm_client=get_provider(
  model_id="meta-llama/llama-3-405b-instruct",
- params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 10},
+ params={
+ "min_new_tokens": 0,
+ "decoding_method": "greedy",
+ "max_new_tokens": 10,
+ },
+ embedding_model_id="sentence-transformers/all-minilm-l6-v2",
+ **extra_kwargs,
  ),
  keyword_template=KeywordMatchingTemplateRenderer(
  KEYWORD_MATCHING_PROMPT_PATH
@@ -83,62 +192,114 @@ class EvaluationPackage:
  semantic_template=SemanticMatchingTemplateRenderer(
  SEMANTIC_MATCHING_PROMPT_PATH
  ),
+ similarity_threshold=similarity_threshold,
+ enable_fuzzy_matching=enable_fuzzy_matching,
  )
+ # only used for RAG evaluation
  self.rag_llm_as_a_judge = LLMJudge(
  llm_client=get_provider(
- model_id="meta-llama/llama-3-405b-instruct",
- params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 4096},
- ),
+ model_id="meta-llama/llama-3-405b-instruct",
+ params={
+ "min_new_tokens": 0,
+ "decoding_method": "greedy",
+ "max_new_tokens": 4096,
+ },
+ **extra_kwargs,
+ ),
  faithfulness=FaithfulnessTemplateRenderer(FAITHFULNESS_PROMPT_PATH),
  answer_relevancy=AnswerRelevancyTemplateRenderer(
  ANSWER_RELEVANCY_PROMPT_PATH
  ),
  )
+ self.safety_llm_as_a_judge = LLMSafetyJudge(
+ llm_client=get_provider(
+ model_id="meta-llama/llama-3-405b-instruct",
+ params={
+ "min_new_tokens": 0,
+ "decoding_method": "greedy",
+ "max_new_tokens": 4096,
+ },
+ **extra_kwargs,
+ ),
+ answer_derailment=DerailmentTemplateRenderer(
+ DERAILMENT_PROMPT_PATH
+ ),
+ answer_unsafe_topic=UnsafeTopicTemplateRenderer(
+ UNSAFE_TOPIC_PROMPT_PATH
+ ),
+ )

- self.resource_map = resource_map
-
  @staticmethod
- def find_ground_node(graph, start_node):
- """ Simple implementation. Should be fixed in the future
+ def find_terminal_nodes(graph: dict[str, list[str]]) -> set[str]:
+ """Finds terminal nodes (nodes with no outgoing edges).
+
+ Args:
+ graph: the input graph

- Assumes that there is a single graph node that does not have children
+ Returns:
+ a set of the terminal nodes
  """

- stack = [start_node]
- visited_set = set()
-
- while stack:
- node = stack.pop()
- if node not in visited_set:
- visited_set.add(node)
-
- # check for children
- # improvement for future: add the ground nodes here
- # right now, just return the first one
- if not graph.get(node):
- return node
-
- stack.extend(graph[node])
-
- return None
+ seen_nodes = set() # track seen nodes
+ non_terminal_nodes = set() # track nodes with children
+
+ for node in graph:
+ seen_nodes.add(node)
+ if graph[node]:
+ non_terminal_nodes.add(node)
+ for n in graph[node]:
+ seen_nodes.add(n)
+ return seen_nodes - non_terminal_nodes

  @staticmethod
- def is_topological_sort(graph, ordering):
- position = {node: i for i, node in enumerate(ordering)}
- ground_node = EvaluationPackage.find_ground_node(graph, list(graph.keys())[0])
+ def is_topological_sort(
+ graph: dict[str, list[str]], ordering: list[str], is_strict: bool = True
+ ) -> bool:
+ """Graph traversal to check if every node in `graph` is visited in `ordering` only after all its dependencies are visited.

- if ground_node is not None:
- graph[ground_node] = [DUMMY_GRAPH_NODE_NAME]
- graph[DUMMY_GRAPH_NODE_NAME] = []
+ Args:
+ graph: the graph representing the ground truth, where keys represent nodes and values represent its dependent nodes
+ ordering: the nodes visited, in order
+
+ Returns:
+ Boolean representing if `ordering` visits all nodes in a valid order based on the dependencies in graph.
+ """
+ # No keyword match or goal details were achieved
+ if not ordering:
+ return False

- next_idx = len(position)
- position[DUMMY_GRAPH_NODE_NAME] = next_idx
+ if is_strict:
+ # strict matching: only consider most recent tool call
+ position = {node: [i] for i, node in enumerate(ordering)}
+ else:
+ # lenient matching: consider all tool calls (account for all indexes of the node)
+ position = defaultdict(list)
+ for i, node in enumerate(ordering):
+ position[node].append(i)
+
+ terminal_nodes = EvaluationPackage.find_terminal_nodes(graph)
+ # adds a dummy node for each terminal node
+ next_idx = (
+ max(val for values in position.values() for val in values) + 1
+ )

- for u in graph:
- for v in graph[u]:
- if u not in position or v not in position:
+ for n in terminal_nodes:
+ graph[n] = [DUMMY_GRAPH_NODE_NAME]
+ graph[DUMMY_GRAPH_NODE_NAME] = []
+ position[DUMMY_GRAPH_NODE_NAME] = [next_idx]
+ next_idx += 1
+
+ for node in graph:
+ for child_nodes in graph[node]:
+ # Current node/children doesn't show up in made calls
+ if node not in position or child_nodes not in position:
  return False
- if position[u] >= position[v]:
+ # Current node doesn't show up before any of its child
+ # all index in current nodes are larger than every child nodes' index
+ if all(
+ curr >= max(position[child_nodes])
+ for curr in position[node]
+ ):
  return False
  return True

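Note: a minimal sketch (not part of this diff) of how the reworked ordering check behaves, assuming this hunk belongs to wxo_agentic_evaluation/evaluation_package.py (item 21 in the file list above). Strict matching credits only the most recent call of each tool; lenient matching credits any earlier call:

from wxo_agentic_evaluation.evaluation_package import EvaluationPackage

# Ground truth: tool_a must run before tool_b; the agent then re-calls tool_a.
# is_topological_sort mutates the graph it is given (it appends "dummy-goal"
# edges), so a fresh dict is passed to each call.
ordering = ["tool_a", "tool_b", "tool_a"]

strict = EvaluationPackage.is_topological_sort(
    {"tool_a": ["tool_b"], "tool_b": []}, ordering, is_strict=True
)
lenient = EvaluationPackage.is_topological_sort(
    {"tool_a": ["tool_b"], "tool_b": []}, ordering, is_strict=False
)
print(strict, lenient)  # False True: strict only looks at the last tool_a call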
@@ -181,7 +342,11 @@ class EvaluationPackage:
  f"Goal detail '{goal_detail.name}' does not match any goals: {goals}. test_case_name: {test_case_name}"
  )
  if goal_detail.name == "summarize":
- if (not goal_detail.keywords or len(goal_detail.keywords) == 0) and (not goal_detail.response or len(goal_detail.response) == 0):
+ if (
+ not goal_detail.keywords or len(goal_detail.keywords) == 0
+ ) and (
+ not goal_detail.response or len(goal_detail.response) == 0
+ ):
  rich.print(
  f"Summarize goal should have keywords or final response. test_case_name: {test_case_name}"
  )
@@ -210,23 +375,176 @@ class EvaluationPackage:
  f"[green][SUCCESS] Text message matched: Summary - {keyword_semantic_match.message}[/green]"
  )

+ def argument_matching(
+ self,
+ expected: dict[str, str],
+ actual: dict[str, str],
+ matching_strategy: dict[str, MatchingStrategy],
+ ) -> bool:
+ """Handles argument matching for expected and actual arguments and values.
+
+ Args:
+ expected: Expected ground truth arguments.
+ actual: Actual arguments in tool call
+ matching_strategy: Matching mode for each argument. Defaults to strict if not specified.
+
+ Returns:
+ True if all arguments match according to their matching strategy.
+ """
+ # ignore arg matching
+ if expected == {"IGNORE": None}:
+ return True
+
+ for field in actual:
+ if field not in expected:
+ return False
+
+ for field in expected:
+ strategy = matching_strategy.get(
+ field, MatchingStrategy.strict.value
+ )
+
+ norm_actual_val = EvaluationPackage.normalize_args(
+ actual.get(field)
+ )
+ norm_expected_val = EvaluationPackage.normalize_args(
+ expected.get(field)
+ )
+
+ # field must exist if not using optional matching
+ if (
+ field not in actual
+ and strategy != MatchingStrategy.optional.value
+ ):
+ return False
+ # continue to next if it's an ignored keyword
+ if norm_expected_val == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
+ continue
+ # optional matching
+ if strategy == MatchingStrategy.optional.value:
+ # continue to next it's not called
+ if field not in actual:
+ continue
+ # must match if called
+ if actual[field] != expected[field]:
+ return False
+ elif strategy == MatchingStrategy.fuzzy.value:
+ # check date/number conversion
+ conversion_succeeded, values_match = (
+ EvaluationPackage._compare_as_date_or_number(
+ norm_actual_val, norm_expected_val
+ )
+ )
+ # If conversion succeeded and values match, continue to next parameter
+ if conversion_succeeded and values_match:
+ continue
+ # If conversion succeeded but values don't match, return False
+ if conversion_succeeded and not values_match:
+ return False
+
+ # try cosine matching
+ x = self.matcher.cosine_similarity_semantic_match(
+ norm_actual_val, norm_expected_val
+ )
+ print(norm_actual_val, norm_expected_val, x)
+ if not x:
+ return False
+ # TODO szhang 10/24/25: Decide if strict comparison must be exact or may allow normalized values.
+ elif strategy == MatchingStrategy.strict.value:
+ # must match
+ if norm_actual_val != norm_expected_val:
+ return False
+ else:
+ print(f"Warning: undefined matching strategy found: {strategy}")
+
+ return True
+
+ @staticmethod
+ def normalize_args(data):
+ if isinstance(data, dict):
+ # normalize keys (case-sensitive) and values
+ return {
+ str(k): EvaluationPackage.normalize_args(v)
+ for k, v in data.items()
+ }
+
+ elif isinstance(data, list):
+ normalized_list = [
+ EvaluationPackage.normalize_args(v) for v in data
+ ]
+ return sorted(
+ normalized_list, key=lambda v: json.dumps(v, sort_keys=True)
+ )
+
+ else:
+ # don’t lowercase reserved keyword
+ if str(data) == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
+ return str(data)
+ return str(data).lower()
+
+ @staticmethod
+ def _compare_as_date_or_number(normalized_actual, normalized_expected):
+ """
+ Attempts to compare two normalized values as dates or numbers.
+
+ Args:
+ normalized_actual: The actual value from tool call
+ normalized_expected: The expected value from ground truth
+
+ Returns:
+ tuple: (conversion_succeeded, values_match)
+ - conversion_succeeded: True if values could be converted to numbers or dates
+ - values_match: True if converted values match
+ """
+ # Try to convert to numbers
+ try:
+ num_actual = float(normalized_actual)
+ num_expected = float(normalized_expected)
+ # Conversion succeeded, check if values match
+ return (
+ True,
+ abs(num_actual - num_expected) <= 0.001,
+ ) # Small epsilon for float comparison
+ except (ValueError, TypeError):
+ pass
+
+ # Try to convert to dates
+ try:
+ date_actual = parser.parse(normalized_actual)
+ date_expected = parser.parse(normalized_expected)
+ # Conversion succeeded, check if values match
+ return True, date_actual == date_expected
+ except (ValueError, TypeError):
+ pass
+
+ # If we get here, neither number nor date conversion worked
+ return False, False
+
  def traverse(self):
  labelled_messages = []
  message_outcomes = []
  labelled_messages_without_text_step = []
  # Counters for tool-calling related metrics
- tool_call_and_routing_metrics = ToolCallAndRoutingMetrics(
+ tool_call_and_routing_metrics = ToolCallAndRoutingMetrics()
+ tool_call_and_routing_metrics.expected_tool_calls = len(
+ self.tool_dictionary
  )
- tool_call_and_routing_metrics.expected_tool_calls = len(self.tool_dictionary)
-
+ correct_tool_calls = (
+ set()
+ ) # sometimes, tool with the same signature can be called more than once
  for message in self.messages:
  if message.type == ContentType.tool_call:

  msg_tool_call = json.loads(message.content)
- if self.resource_map and msg_tool_call["name"] in self.resource_map.agent2tools:
+ if (
+ self.resource_map
+ and msg_tool_call["name"] in self.resource_map.agent2tools
+ ):
  tool_call_and_routing_metrics.total_routing_calls += 1
  relevant = False
- for tool in self.resource_map.agent2tools[msg_tool_call["name"]]:
+ for tool in self.resource_map.agent2tools[
+ msg_tool_call["name"]
+ ]:
  for goal_detail in self.tool_dictionary.values():
  if goal_detail.tool_name == tool:
  relevant = True
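Note: the per-argument matching introduced in the hunk above (strict / fuzzy / optional, plus the reserved <IGNORE> keyword and the {"IGNORE": None} escape hatch) builds on two static helpers that can be exercised in isolation. A minimal sketch, assuming the same module path as above; the fuzzy strategy additionally falls back to LLMMatcher cosine similarity when neither a number nor a date interpretation applies:

from wxo_agentic_evaluation.evaluation_package import EvaluationPackage

# normalize_args lowercases scalar values and sorts lists, so casing and
# ordering differences alone do not fail a comparison.
print(EvaluationPackage.normalize_args(["Boston", "NYC"]))   # ['boston', 'nyc']
print(EvaluationPackage.normalize_args({"City": "Boston"}))  # {'City': 'boston'}

# The fuzzy strategy first tries a number or date interpretation.
print(EvaluationPackage._compare_as_date_or_number("5.00", "5"))
# (True, True): both parse as numbers and agree within the 0.001 epsilon
print(EvaluationPackage._compare_as_date_or_number("jan 5, 2024", "2024-01-05"))
# (True, True): both parse to the same date via dateutil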
@@ -235,7 +553,9 @@ class EvaluationPackage:
  break

  if relevant:
- tool_call_and_routing_metrics.relevant_routing_calls += 1
+ tool_call_and_routing_metrics.relevant_routing_calls += (
+ 1
+ )
  else:
  message_outcome = ExtendedMessage(message=message)
  message_outcome.reason = {
@@ -244,6 +564,7 @@ class EvaluationPackage:

  continue

+ # TO-DO: re-think how deduplication works in the context of precision & recall
  tool_call_and_routing_metrics.total_tool_calls += 1

  # evaluating more than once is fine
@@ -259,19 +580,32 @@ class EvaluationPackage:
  found = False
  possible_ground_truth_for_analysis = []
  for goal_detail in matching_goal_details:
- if msg_tool_call["args"] == goal_detail.args:
- labelled_messages.append(goal_detail.name)
- labelled_messages_without_text_step.append(goal_detail.name)
+ # {"IGNORE": None} is set in red teaming attack ground truth to ignore parameter matching
+ if self.argument_matching(
+ expected=goal_detail.args,
+ actual=msg_tool_call["args"],
+ matching_strategy=goal_detail.arg_matching,
+ ):

- tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
+ labelled_messages.append(goal_detail.name)
+ labelled_messages_without_text_step.append(
+ goal_detail.name
+ )
+ correct_tool_calls.add(goal_detail.name)
+ # tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
  found = True
  message_outcome = ExtendedMessage(message=message)
  message_outcomes.append(message_outcome)
  break
  else:
- possible_ground_truth_for_analysis.append(goal_detail.args)
+ possible_ground_truth_for_analysis.append(
+ goal_detail.args
+ )

  if not found:
+ tool_call_and_routing_metrics.tool_calls_with_incorrect_parameter += (
+ 1
+ )
  message_outcome = ExtendedMessage(message=message)
  message_outcome.reason = {
  "reason": "incorrect parameter",
@@ -279,15 +613,17 @@ class EvaluationPackage:
  "expected": possible_ground_truth_for_analysis,
  }
  message_outcomes.append(message_outcome)
- rich.print(
- f"[red][ERROR] Wrong parameters for function: {msg_tool_call['name']}. "
- f"Expected one of {[g.args for g in matching_goal_details]}, Received={msg_tool_call['args']}[/red]"
- )
+ if not self.is_attack_evaluation:
+ rich.print(
+ f"[red][ERROR] Wrong parameters for function: {msg_tool_call['name']}. "
+ f"Expected one of {[g.args for g in matching_goal_details]}, Received={msg_tool_call['args']}[/red]"
+ )
  else:

- rich.print(
- f"[yellow][WARNING] Unexpected function call: {msg_tool_call['name']}[/yellow]"
- )
+ if not self.is_attack_evaluation:
+ rich.print(
+ f"[yellow][WARNING] Unexpected function call: {msg_tool_call['name']}[/yellow]"
+ )
  # note: this is incorrect after the 1.6 change
  message_outcome = ExtendedMessage(message=message)
  message_outcome.reason = {"reason": "irrelevant tool call"}
@@ -308,12 +644,18 @@ class EvaluationPackage:
  else:
  message_outcome = ExtendedMessage(message=message)
  message_outcomes.append(message_outcome)
+
+ tool_call_and_routing_metrics.correct_tool_calls = len(
+ correct_tool_calls
+ )
+
  assistant_responses = [
  message
  for message in self.messages
  if message.event == EventTypes.message_created
  and message.role == "assistant"
  ]
+
  keyword_semantic_list = []
  for message in assistant_responses:
  for goal_detail in self.text_list:
@@ -322,7 +664,10 @@ class EvaluationPackage:
  message.content, goal_detail.keywords
  )
  semantic_match: bool = self.matcher.semantic_match(
- message.content, goal_detail.response
+ self.messages[0].content,
+ prediction=message.content,
+ ground_truth=goal_detail.response,
+ enable_fuzzy_matching=self.enable_fuzzy_matching,
  )
  keyword_semantic_match = KeywordSemanticSearchMetric(
  keyword_match=keyword_match,
@@ -357,6 +702,29 @@ class EvaluationPackage:
  else:
  return TextMatchType.text_mismatch.value

+ def generate_custom_metrics(
+ self, extracted_context: Dict[str, Any]
+ ) -> Optional[CustomEvalMetrics]:
+ if self.custom_evals is None:
+ return None
+
+ results: list[Metric] = []
+ for evaluation in self.custom_evals:
+ # TODO: cleanup. The compute method returns a Metric but pydantic thinks it is different.
+ # Probably because of some path issue when we auto-discover metrics
+ evaluate_result = evaluation.evaluate(
+ messages=self.messages,
+ ground_truth=self.ground_truth,
+ extracted_context=extracted_context,
+ )
+ if evaluate_result is not None:
+ results.append(Metric(**evaluate_result.model_dump()))
+
+ custom_eval_results = CustomEvalMetrics(
+ dataset_name=self.test_case_name, custom_metrics=results
+ )
+ return custom_eval_results
+
  def generate_summary(self):
  llm_steps = 0
  total_step = 0
@@ -368,11 +736,21 @@ class EvaluationPackage:
  metrics,
  message_with_reasons,
  ) = self.traverse()
- if self.is_analyze_run:
- print(labelled_messages)
+
+ extracted_context = {}
+ if self.extractors is not None and self.custom_evals is not None:
+ for extractor in self.extractors:
+ context = extractor.extract(
+ messages=self.messages,
+ ground_truth=self.ground_truth,
+ matcher=self.matcher,
+ )
+ extracted_context[extractor.name] = context

  is_success = self.is_topological_sort(
- self.ground_truth.goals, labelled_messages
+ graph=self.ground_truth.goals,
+ ordering=labelled_messages,
+ is_strict=self.strict_topological_matching,
  )
  match = self._is_text_match(matches)

@@ -388,7 +766,13 @@ class EvaluationPackage:
  llm_steps += 1
  total_step += 1

- knowledge_base_metric_summary = self.generate_knowledge_base_metric_summary()
+ knowledge_base_metric_summary = (
+ self.generate_knowledge_base_metric_summary()
+ )
+
+ custom_metric_summary = self.generate_custom_metrics(
+ extracted_context=extracted_context
+ )
  # TO-DO: the table is not printing properly anymore with the new columns introduced
  # we need to introduce a separate table for these.

@@ -402,6 +786,7 @@ class EvaluationPackage:
  knowledge_base_metric_summary,
  message_with_reasons,
  metrics,
+ custom_metric_summary,
  )

  def _get_messages_by_role_before_cs(
@@ -433,7 +818,11 @@ class EvaluationPackage:
  for message in self.messages:
  if message.type == ContentType.tool_call:
  content = json.loads(message.content)
- id = content.get("tool_call_id", "")
+ """
+ - In ADK 1.9, for tool call events, the "tool_call_id" is now "id"
+ - still parse out "tool_call_id" for backwards compatibility
+ """
+ id = content.get("tool_call_id") or content.get("id")
  if id == tool_call_id:
  return content.get("name")

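Note: a small illustration of the ADK 1.9 compatibility fallback above. The event payloads here are hypothetical; the lookup logic is exactly the one shown in the diff:

# Pre-1.9 tool-call events carry "tool_call_id"; newer ADK events carry "id".
legacy_event = {"tool_call_id": "call-123", "name": "get_weather"}
adk_19_event = {"id": "call-123", "name": "get_weather"}

for content in (legacy_event, adk_19_event):
    tool_call_id = content.get("tool_call_id") or content.get("id")
    print(tool_call_id)  # "call-123" in both cases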
@@ -478,7 +867,8 @@ class EvaluationPackage:
  ) # name of knowledge base

  search_results = [
- result.body for result in conversational_search_data.search_results
+ result.body
+ for result in conversational_search_data.search_results
  ]
  faithfulness = self.rag_llm_as_a_judge.faithfulness(
  conversational_search_data.text, search_results
@@ -501,6 +891,51 @@ class EvaluationPackage:

  return metrics

+ def evaluate_derailment(
+ self, instructions: str = None
+ ) -> List[AnswerDerailment]:
+ derailments = []
+ last_user_message = None
+ for message in self.messages:
+ if message.role == "user" and message.type == ContentType.text:
+ last_user_message = message
+ if message.role == "assistant" and message.type == ContentType.text:
+ derailment = (
+ self.safety_llm_as_a_judge.judge_derailment_in_answer(
+ question=last_user_message.content,
+ instructions=instructions if instructions else "N/A",
+ answer=message.content,
+ )
+ )
+ derailments.append(derailment)
+ if derailment.in_scope == "no":
+ return (
+ derailments # short-circuit if any derailment is found
+ )
+ return derailments
+
+ def evaluate_unsafe_topics(
+ self, instructions: str = None
+ ) -> List[AnswerUnsafeTopic]:
+ unsafe_topics = []
+ last_user_message = None
+ for message in self.messages:
+ if message.role == "user" and message.type == ContentType.text:
+ last_user_message = message
+ if message.role == "assistant" and message.type == ContentType.text:
+ unsafe_topic = (
+ self.safety_llm_as_a_judge.judge_unsafe_topic_in_answer(
+ question=last_user_message.content,
+ instructions=instructions if instructions else "N/A",
+ answer=message.content,
+ )
+ )
+ unsafe_topics.append(unsafe_topic)
+ if unsafe_topic.is_safe == "no":
+ return unsafe_topics # short-circuit if any unsafe topic is found
+
+ return unsafe_topics
+

  if __name__ == "__main__":

@@ -519,17 +954,19 @@ if __name__ == "__main__":

  for message in messages:
  if message.role == "user":
- rich.print("[yellow]GENERATED_USER_MESSAGE:[/yellow]", message.content)
+ rich.print(
+ "[yellow]GENERATED_USER_MESSAGE:[/yellow]", message.content
+ )
  else:
  rich.print("[orange3]WXO:[/orange3]", message.content)

  with open("./benchmarks/workday_tools/data/data18.json", "r") as f:
- ground_truth = EvaluationData.model_validate(json.load(f))
+ ground_truth = OrchestrateDataset.model_validate(json.load(f))

  evaluate_package = EvaluationPackage(
  test_case_name="data1.messages.json",
  ground_truth=ground_truth,
- messages=messages
+ messages=messages,
  )
  print(evaluate_package.generate_summary())
  # print(evaluate_package.traverse())