ibm-watsonx-orchestrate-evaluation-framework 1.1.6__py3-none-any.whl → 1.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework has been flagged as potentially problematic.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/METADATA +4 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/RECORD +42 -36
- wxo_agentic_evaluation/analyze_run.py +49 -32
- wxo_agentic_evaluation/arg_configs.py +30 -2
- wxo_agentic_evaluation/data_annotator.py +22 -4
- wxo_agentic_evaluation/description_quality_checker.py +20 -4
- wxo_agentic_evaluation/evaluation_package.py +189 -15
- wxo_agentic_evaluation/external_agent/external_validate.py +3 -1
- wxo_agentic_evaluation/external_agent/types.py +1 -1
- wxo_agentic_evaluation/inference_backend.py +64 -34
- wxo_agentic_evaluation/llm_matching.py +92 -2
- wxo_agentic_evaluation/llm_user.py +2 -2
- wxo_agentic_evaluation/main.py +147 -38
- wxo_agentic_evaluation/metrics/__init__.py +5 -1
- wxo_agentic_evaluation/metrics/evaluations.py +124 -0
- wxo_agentic_evaluation/metrics/metrics.py +24 -3
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/template_render.py +16 -0
- wxo_agentic_evaluation/quick_eval.py +17 -3
- wxo_agentic_evaluation/record_chat.py +17 -6
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +44 -14
- wxo_agentic_evaluation/red_teaming/attack_generator.py +31 -12
- wxo_agentic_evaluation/red_teaming/attack_list.py +23 -24
- wxo_agentic_evaluation/red_teaming/attack_runner.py +36 -19
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +42 -16
- wxo_agentic_evaluation/service_instance.py +5 -3
- wxo_agentic_evaluation/service_provider/__init__.py +129 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +415 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +480 -52
- wxo_agentic_evaluation/type.py +14 -4
- wxo_agentic_evaluation/utils/__init__.py +43 -5
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/utils.py +14 -9
- wxo_agentic_evaluation/wxo_client.py +2 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/evaluation_package.py

@@ -1,19 +1,27 @@
 import json
 import os
-from
+from gc import enable
+from typing import Any, Dict, List, Optional

 import rich
+from dateutil import parser

 from wxo_agentic_evaluation import __file__
 from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
 from wxo_agentic_evaluation.llm_matching import LLMMatcher
 from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
 from wxo_agentic_evaluation.llm_safety_eval import LLMSafetyJudge
+from wxo_agentic_evaluation.metrics.evaluations import (
+    Evaluation,
+    Extractor,
+    Metric,
+)
 from wxo_agentic_evaluation.metrics.llm_as_judge import (
     AnswerDerailment,
     AnswerUnsafeTopic,
 )
 from wxo_agentic_evaluation.metrics.metrics import (
+    CustomEvalMetrics,
     KeywordSemanticSearchMetric,
     KnowledgeBaseMetrics,
     TextMatchType,

@@ -28,7 +36,12 @@ from wxo_agentic_evaluation.prompt.template_render import (
     UnsafeTopicTemplateRenderer,
 )
 from wxo_agentic_evaluation.resource_map import ResourceMap
-from wxo_agentic_evaluation.
+from wxo_agentic_evaluation.service_instance import tenant_setup
+from wxo_agentic_evaluation.service_provider import (
+    USE_GATEWAY_MODEL_PROVIDER,
+    get_provider,
+)
+from wxo_agentic_evaluation.service_provider.provider import Provider
 from wxo_agentic_evaluation.type import (
     ContentType,
     ConversationalSearch,

@@ -76,12 +89,18 @@ DUMMY_GRAPH_NODE_NAME = "dummy-goal"
 class EvaluationPackage:
     def __init__(
         self,
-        test_case_name,
+        test_case_name: str,
         ground_truth: EvaluationData,
-        messages,
+        messages: list[Message],
         conversational_search_data: List[ConversationalSearch] = None,
         resource_map: ResourceMap = None,
         is_attack_evaluation: bool = False,
+        config=None,
+        custom_evals: Optional[list[Evaluation]] = None,
+        custom_llmaaj_client: Optional[Provider] = None,
+        extractors: Optional[list[Extractor]] = None,
+        similarity_threshold=0.8,
+        enable_fuzzy_matching=False,
     ):
         self.tool_dictionary = (
             {
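Note: a minimal sketch of how the new constructor options might be used, assuming a test case's EvaluationData and recorded Message list are already loaded; the variable names and values here are illustrative, not part of the package.

    from wxo_agentic_evaluation.evaluation_package import EvaluationPackage

    package = EvaluationPackage(
        test_case_name="refund_flow",      # hypothetical test case name
        ground_truth=ground_truth_data,    # EvaluationData for the test case
        messages=recorded_messages,        # list[Message] captured during inference
        similarity_threshold=0.9,          # tighten the 0.8 default similarity threshold
        enable_fuzzy_matching=True,        # enable the date/number and cosine fallbacks
    )
    summary = package.generate_summary()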
@@ -109,10 +128,49 @@ class EvaluationPackage:
         self.ground_truth = ground_truth
         self.test_case_name = test_case_name
         self.resource_map = resource_map
+        self.custom_evals = custom_evals
+        self.custom_llmaaj_client = custom_llmaaj_client
+        self.extractors = extractors
+        self.enable_fuzzy_matching = enable_fuzzy_matching

         if not self.is_attack_evaluation:
             self.validate_ground_truth(self.ground_truth, self.test_case_name)

+        extra_kwargs = {}
+
+        if USE_GATEWAY_MODEL_PROVIDER:
+
+            if resource_map and hasattr(resource_map, "wxo_client"):
+                wxo_client = resource_map.wxo_client
+
+                if hasattr(wxo_client, "service_url"):
+                    extra_kwargs["instance_url"] = wxo_client.service_url
+
+                if hasattr(wxo_client, "api_key"):
+                    extra_kwargs["token"] = wxo_client.api_key
+
+            elif config:
+                auth = getattr(config, "auth_config", None)
+
+                if auth:
+                    instance_url = getattr(auth, "url", None)
+                    token = getattr(auth, "token", None)
+
+                    if instance_url:
+                        extra_kwargs["instance_url"] = instance_url
+
+                    if token:
+                        extra_kwargs["token"] = token
+            else:
+                token, instance_url, env = tenant_setup(
+                    service_url=None, tenant_name="local"
+                )
+                if instance_url:
+                    extra_kwargs["instance_url"] = instance_url
+
+                if token:
+                    extra_kwargs["token"] = token
+
         # output response matching
         self.matcher = LLMMatcher(
             llm_client=get_provider(
@@ -122,6 +180,8 @@
                     "decoding_method": "greedy",
                     "max_new_tokens": 10,
                 },
+                embedding_model_id="sentence-transformers/all-minilm-l6-v2",
+                **extra_kwargs,
             ),
             keyword_template=KeywordMatchingTemplateRenderer(
                 KEYWORD_MATCHING_PROMPT_PATH

@@ -129,6 +189,8 @@
             semantic_template=SemanticMatchingTemplateRenderer(
                 SEMANTIC_MATCHING_PROMPT_PATH
             ),
+            similarity_threshold=similarity_threshold,
+            enable_fuzzy_matching=enable_fuzzy_matching,
         )
         # only used for RAG evaluation
         self.rag_llm_as_a_judge = LLMJudge(

@@ -139,6 +201,7 @@
                     "decoding_method": "greedy",
                     "max_new_tokens": 4096,
                 },
+                **extra_kwargs,
             ),
             faithfulness=FaithfulnessTemplateRenderer(FAITHFULNESS_PROMPT_PATH),
             answer_relevancy=AnswerRelevancyTemplateRenderer(

@@ -153,6 +216,7 @@
                     "decoding_method": "greedy",
                     "max_new_tokens": 4096,
                 },
+                **extra_kwargs,
             ),
             answer_derailment=DerailmentTemplateRenderer(
                 DERAILMENT_PROMPT_PATH
@@ -305,8 +369,48 @@
         return str(data).lower()

     @staticmethod
+    def _compare_as_date_or_number(normalized_actual, normalized_expected):
+        """
+        Attempts to compare two normalized values as dates or numbers.
+
+        Args:
+            normalized_actual: The actual value from tool call
+            normalized_expected: The expected value from ground truth
+
+        Returns:
+            tuple: (conversion_succeeded, values_match)
+                - conversion_succeeded: True if values could be converted to numbers or dates
+                - values_match: True if converted values match
+        """
+        # Try to convert to numbers
+        try:
+            num_actual = float(normalized_actual)
+            num_expected = float(normalized_expected)
+            # Conversion succeeded, check if values match
+            return (
+                True,
+                abs(num_actual - num_expected) <= 0.001,
+            )  # Small epsilon for float comparison
+        except (ValueError, TypeError):
+            pass
+
+        # Try to convert to dates
+        try:
+            date_actual = parser.parse(normalized_actual)
+            date_expected = parser.parse(normalized_expected)
+            # Conversion succeeded, check if values match
+            return True, date_actual == date_expected
+        except (ValueError, TypeError):
+            pass
+
+        # If we get here, neither number nor date conversion worked
+        return False, False
+
     def _check_if_args_match_with_ignore(
-
+        self,
+        actual_args: dict[str, str],
+        expected_args: dict[str, str],
+        enable_fuzzy_matching: bool = False,
     ) -> bool:
         """
         This function checks if a registered tool call matches with the goal node when:

@@ -315,21 +419,50 @@
             actual_args (dict): Made during inference.
             expected_args (dict): Defined in the test case/ground truth.
         Returns:
-            bool: True if match with keyword parameters ignored | False otherwise (
+            bool: True if match with keyword parameters ignored | False otherwise (arguments were not corrected).
         """
-
         if set(actual_args.keys()) != set(expected_args.keys()):
             return False

+        ## now we go through and check each parameter
         for key in actual_args:
+            normalized_actual = EvaluationPackage.normalize_args(
+                actual_args[key]
+            )
+            normalized_expected = EvaluationPackage.normalize_args(
+                expected_args[key]
+            )
+
+            # 1. If the args are an ignored keyword or exactly equal, continue to next parameter
             if (
-
-
-
-
-
-
+                normalized_expected == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS
+            ) or (normalized_actual == normalized_expected):
+                continue
+            else:
+                # if they're not equal, and fuzzy matching is enabled, do fuzzy.
+                if enable_fuzzy_matching:
+                    # 3. Check date/number conversion
+                    conversion_succeeded, values_match = (
+                        EvaluationPackage._compare_as_date_or_number(
+                            normalized_actual, normalized_expected
+                        )
+                    )
+                    # If conversion succeeded and values match, continue to next parameter
+                    if conversion_succeeded and values_match:
+                        continue
+                    # If conversion succeeded but values don't match, return False
+                    if conversion_succeeded and not values_match:
+                        return False
+                    # 4. If conversion failed, try cosine matching. If this fails, return false for the function
+                    if not self.matcher.cosine_similarity_semantic_match(
+                        normalized_actual, normalized_expected
+                    ):
+                        return False
+                else:
+                    # If they're not equal and fuzzy matching is not enabled, return false
+                    return False

+        # If we've made it through all parameters without returning False, return True
         return True

     def traverse(self):
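Note: a short sketch of how the new date/number comparison behaves on the fuzzy-matching path added above; the argument values are illustrative.

    matched, equal = EvaluationPackage._compare_as_date_or_number("10", "10.0005")
    # -> (True, True): both values parse as floats and differ by less than the 0.001 epsilon

    matched, equal = EvaluationPackage._compare_as_date_or_number("2024-01-05", "Jan 5, 2024")
    # -> (True, True): float() fails, but dateutil parses both strings to the same date

    matched, equal = EvaluationPackage._compare_as_date_or_number("chequing", "checking")
    # -> (False, False): neither conversion applies, so _check_if_args_match_with_ignore
    #    falls back to cosine similarity matching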
@@ -401,8 +534,10 @@
                         goal_detail.args
                     )
                     or self._check_if_args_match_with_ignore(
-                        msg_tool_call["args"],
-
+                        msg_tool_call["args"],
+                        goal_detail.args,
+                        enable_fuzzy_matching=self.enable_fuzzy_matching,
+                    )  # TODO arjun-gupta1 9/29/25: make this also return the method of matching (llm, fuzzy, cosine similarity) so we can write it out to analyze_run.py results
                 ):
                     labelled_messages.append(goal_detail.name)
                     labelled_messages_without_text_step.append(

@@ -484,6 +619,7 @@
                     self.messages[0].content,
                     prediction=message.content,
                     ground_truth=goal_detail.response,
+                    enable_fuzzy_matching=self.enable_fuzzy_matching,
                 )
                 keyword_semantic_match = KeywordSemanticSearchMetric(
                     keyword_match=keyword_match,
@@ -518,6 +654,29 @@
         else:
             return TextMatchType.text_mismatch.value

+    def generate_custom_metrics(
+        self, extracted_context: Dict[str, Any]
+    ) -> Optional[CustomEvalMetrics]:
+        if self.custom_evals is None:
+            return None
+
+        results: list[Metric] = []
+        for evaluation in self.custom_evals:
+            # TODO: cleanup. The compute method returns a Metric but pydantic thinks it is different.
+            # Probably because of some path issue when we auto-discover metrics
+            evaluate_result = evaluation.evaluate(
+                messages=self.messages,
+                ground_truth=self.ground_truth,
+                extracted_context=extracted_context,
+            )
+            if evaluate_result is not None:
+                results.append(Metric(**evaluate_result.model_dump()))
+
+        custom_eval_results = CustomEvalMetrics(
+            dataset_name=self.test_case_name, custom_metrics=results
+        )
+        return custom_eval_results
+
     def generate_summary(self):
         llm_steps = 0
         total_step = 0

@@ -530,6 +689,16 @@
             message_with_reasons,
         ) = self.traverse()

+        extracted_context = {}
+        if self.extractors is not None and self.custom_evals is not None:
+            for extractor in self.extractors:
+                context = extractor.extract(
+                    messages=self.messages,
+                    ground_truth=self.ground_truth,
+                    matcher=self.matcher,
+                )
+                extracted_context[extractor.name] = context
+
         is_success = self.is_topological_sort(
             self.ground_truth.goals, labelled_messages
         )
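Note: a rough sketch of what a custom extractor and evaluation could look like, inferred only from how generate_summary and generate_custom_metrics call them above; the real Evaluation, Extractor, and Metric base classes live in wxo_agentic_evaluation/metrics/evaluations.py and may define a different interface, and the Metric field names below are assumed.

    from wxo_agentic_evaluation.metrics.evaluations import Evaluation, Extractor, Metric

    class TurnCountExtractor(Extractor):        # hypothetical extractor
        name = "turn_count"

        def extract(self, messages, ground_truth, matcher):
            # value ends up under extracted_context["turn_count"]
            return sum(1 for m in messages if m.role == "user")

    class TurnBudgetEvaluation(Evaluation):     # hypothetical custom evaluation
        def evaluate(self, messages, ground_truth, extracted_context):
            turns = extracted_context.get("turn_count", 0)
            return Metric(name="within_turn_budget", value=float(turns <= 10))  # assumed Metric fields

Instances of these would then be passed to EvaluationPackage as extractors=[TurnCountExtractor()] and custom_evals=[TurnBudgetEvaluation()].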
@@ -550,6 +719,10 @@
         knowledge_base_metric_summary = (
             self.generate_knowledge_base_metric_summary()
         )
+
+        custom_metric_summary = self.generate_custom_metrics(
+            extracted_context=extracted_context
+        )
         # TO-DO: the table is not printing properly anymore with the new columns introduced
         # we need to introduce a separate table for these.

@@ -563,6 +736,7 @@
             knowledge_base_metric_summary,
             message_with_reasons,
             metrics,
+            custom_metric_summary,
         )

     def _get_messages_by_role_before_cs(
wxo_agentic_evaluation/external_agent/external_validate.py

@@ -74,7 +74,9 @@ class ExternalAgentValidation:
         payload = {"stream": True}
         payload["messages"] = messages
         resp = requests.post(
-            url=self.service_url,
+            url=self.service_url,
+            headers=self.header,
+            json=payload,
         )
         success, logged_events = self._validate_streaming_response(resp)

wxo_agentic_evaluation/inference_backend.py

@@ -71,6 +71,36 @@ def is_transfer_response(step_detail: Dict):
     return False


+def _generate_user_input(
+    user_turn: int,
+    story: str,
+    conversation_history: list[Message],
+    llm_user: LLMUser,
+    enable_manual_user_input: bool = False,
+    starting_user_input: str | None = None,
+    attack_instructions: str | None = None,
+) -> Message:
+    """Generates the user input for the current turn."""
+
+    if user_turn == 0 and starting_user_input is not None:
+        return Message(
+            role="user",
+            content=starting_user_input,
+            type=ContentType.text,
+        )
+
+    if enable_manual_user_input:
+        content = input("[medium_orchid1]Enter your input[/medium_orchid1] ✍️: ")
+        return Message(role="user", content=content, type=ContentType.text)
+
+    # llm generated user input
+    return llm_user.generate_user_input(
+        story,
+        conversation_history,
+        attack_instructions=attack_instructions,
+    )
+
+
 class CallTracker(BaseModel):
     tool_call: List = []
     tool_response: List = []
@@ -211,7 +241,6 @@ class WXOInferenceBackend:

         start_time = time.time()
         for chunk in self._stream_events(user_input, agent_name, thread_id):
-
             event = chunk.get("event", "")
             if _thread_id := chunk.get("data", {}).get("thread_id"):
                 thread_id = _thread_id

@@ -422,7 +451,6 @@ class WXOInferenceBackend:

         messages = []
         for entry in result:
-
             tool_call_id = None
             if step_history := entry.get("step_history"):
                 for step_message in step_history:

@@ -551,7 +579,6 @@


 class EvaluationController:
-
     MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
     MESSAGE_SIMILARITY_THRESHOLD = float(
         os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)

@@ -585,37 +612,32 @@
         task_n,
         story,
         agent_name: str,
-        starting_user_input: str = None,
-        attack_instructions: str = None,
+        starting_user_input: str | None = None,
+        attack_instructions: str | None = None,
+        max_user_turns: int | None = None,
     ) -> Tuple[List[Message], List[CallTracker], List[ConversationalSearch]]:
-        step = 0
         thread_id = None
         conversation_history: List[Message] = []
         conversational_search_history_data = []
         call_tracker = CallTracker()

-
-
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
-            user_input = self.llm_user.generate_user_input(
-                story,
-                conversation_history,
-                attack_instructions=attack_instructions,
-            )
+        max_turns = (
+            self.MAX_CONVERSATION_STEPS
+            if max_user_turns is None
+            else max_user_turns
+        )
+
+        for user_turn in range(max_turns):
+            user_input = _generate_user_input(
+                user_turn=user_turn,
+                story=story,
+                conversation_history=conversation_history,
+                llm_user=self.llm_user,
+                enable_manual_user_input=self.config.enable_manual_user_input,
+                starting_user_input=starting_user_input,
+                attack_instructions=attack_instructions,
+            )
+
             if self.config.enable_verbose_logging:
                 rich.print(
                     f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",

@@ -662,7 +684,7 @@
             # hook for subclasses
             if self._post_message_hook(
                 task_n=task_n,
-                step=
+                step=user_turn,
                 message=message,
                 conversation_history=conversation_history,
             ):

@@ -677,7 +699,6 @@
                     conversational_search_data
                 )

-            step += 1
         return (
             conversation_history,
             call_tracker,

@@ -742,15 +763,21 @@
         ):
             return True

-
+        # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
+        return False
+

 class AttackEvaluationController(EvaluationController):
-    def __init__(
+    def __init__(
+        self, *args, attack_data=None, attack_evaluator=None, **kwargs
+    ):
         super().__init__(*args, **kwargs)
         self.attack_data = attack_data
         self.attack_evaluator = attack_evaluator

-    def _post_message_hook(
+    def _post_message_hook(
+        self, task_n, step, message, conversation_history
+    ) -> bool:
         """Override hook to add live attack evaluation."""
         if self.attack_evaluator and self.attack_data:
             success = self.attack_evaluator.evaluate(

@@ -762,7 +789,9 @@ class AttackEvaluationController(EvaluationController):
             )
             # persist the live result so the aggregator can pick it up later
             try:
-                self.attack_evaluator.save_evaluation_result(
+                self.attack_evaluator.save_evaluation_result(
+                    self.attack_data, True
+                )
             except Exception:
                 pass
         conversation_history.append(message)

@@ -777,6 +806,7 @@ if __name__ == "__main__":
     )
     with open(auth_config_path, "r") as f:
         auth_config = yaml.safe_load(f)
+
     tenant_name = "local"
     token = auth_config["auth"][tenant_name]["wxo_mcsp_token"]

wxo_agentic_evaluation/llm_matching.py

@@ -1,10 +1,22 @@
+"""
+LLM Matching Module with Cosine Similarity Support
+
+This module provides functionality for matching text using:
+1. LLM-based matching (using a language model to determine semantic equivalence)
+2. Embedding-based matching (using cosine similarity between text embeddings)
+"""
+
+import math
 from typing import List

+from fuzzywuzzy import fuzz
+
 from wxo_agentic_evaluation.prompt.template_render import (
     KeywordMatchingTemplateRenderer,
     SemanticMatchingTemplateRenderer,
 )
 from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+from wxo_agentic_evaluation.utils.utils import safe_divide


 class LLMMatcher:

@@ -13,10 +25,18 @@
         llm_client: Provider,
         keyword_template: KeywordMatchingTemplateRenderer,
         semantic_template: SemanticMatchingTemplateRenderer,
+        use_llm_for_semantic: bool = True,
+        embedding_model_id: str = "sentence-transformers/all-minilm-l6-v2",
+        similarity_threshold: float = 0.8,
+        enable_fuzzy_matching: bool = False,
     ):
         self.llm_client = llm_client
         self.keyword_template = keyword_template
         self.semantic_template = semantic_template
+        self.embedding_model_id = embedding_model_id
+        self.use_llm_for_semantic = use_llm_for_semantic
+        self.similarity_threshold = similarity_threshold
+        self.enable_fuzzy_matching = enable_fuzzy_matching

     def keywords_match(self, response_text: str, keywords: List[str]) -> bool:
         if len(keywords) == 0:

@@ -31,8 +51,40 @@
         result = output.strip().lower()
         return result.startswith("true")

-    def
-        self,
+    def generate_embeddings(
+        self, prediction: str, ground_truth: str
+    ) -> List[List[float]]:
+
+        embeddings = self.llm_client.encode([prediction, ground_truth])
+
+        return embeddings
+
+    def compute_cosine_similarity(
+        self, vec1: List[float], vec2: List[float]
+    ) -> float:
+        """Calculate cosine similarity between two vectors using pure Python"""
+
+        # Manual dot product calculation
+        dot_product = sum(a * b for a, b in zip(vec1, vec2))
+
+        # Manual magnitude calculations
+        magnitude1 = math.sqrt(sum(a * a for a in vec1))
+        magnitude2 = math.sqrt(sum(b * b for b in vec2))
+
+        return safe_divide(dot_product, (magnitude1 * magnitude2))
+
+    def cosine_similarity_semantic_match(
+        self, prediction: str, ground_truth: str
+    ) -> bool:
+        embeddings = self.generate_embeddings(prediction, ground_truth)
+        cosine_similarity = self.compute_cosine_similarity(
+            embeddings[0], embeddings[1]
+        )
+
+        return cosine_similarity >= self.similarity_threshold
+
+    def llm_semantic_match(
+        self, context, prediction: str, ground_truth: str
     ) -> bool:
         """Performs semantic matching for the agent's final response and the expected response using the starting sentence of the conversation as the context

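Note: a quick worked example of the pure-Python cosine computation the new helper performs, using plain floats instead of provider embeddings:

    import math

    vec1, vec2 = [1.0, 2.0, 3.0], [2.0, 4.0, 6.0]
    dot_product = sum(a * b for a, b in zip(vec1, vec2))    # 28.0
    magnitude1 = math.sqrt(sum(a * a for a in vec1))        # ~3.742
    magnitude2 = math.sqrt(sum(b * b for b in vec2))        # ~7.483
    print(dot_product / (magnitude1 * magnitude2))          # 1.0, i.e. parallel vectors

cosine_similarity_semantic_match then compares this score against similarity_threshold (0.8 by default).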
@@ -44,9 +96,47 @@
         Returns:
             a boolean indicating if the sentences match.
         """
+
         prompt = self.semantic_template.render(
             context=context, expected_text=ground_truth, actual_text=prediction
         )
         output: str = self.llm_client.query(prompt)
         result = output.strip().lower()
+
         return result.startswith("true")
+
+    def fuzzywuzzy_semantic_match(
+        self, prediction: str, ground_truth: str
+    ) -> bool:
+
+        similarity_score = fuzz.WRatio(prediction, ground_truth)
+
+        return similarity_score > self.similarity_threshold
+
+    def semantic_match(
+        self,
+        context: str,
+        prediction: str,
+        ground_truth: str,
+        enable_fuzzy_matching: bool = False,
+    ) -> bool:
+        ## TODO arjun-gupta1 10/06/2025: revist retry with exponential backoff. Opted for direct fallback to cosine similarity to avoid latency for now.
+        try:
+            return self.llm_semantic_match(context, prediction, ground_truth)
+        except Exception as e:
+            print(f"LLM semantic match failed: {e}")
+
+        if enable_fuzzy_matching:
+            print("falling back to fuzzy matching")
+            # Fallback to cosine similarity if LLM matching is not used or failed
+            try:
+                return self.cosine_similarity_semantic_match(
+                    prediction, ground_truth
+                )
+            except Exception as e:
+                print(
+                    f"Cosine similarity failed: {e}. Falling back to fuzzywuzzy."
+                )
+
+                # Final fallback to fuzzywuzzy
+                return self.fuzzywuzzy_semantic_match(prediction, ground_truth)
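Note: a minimal usage sketch of the new fallback chain, assuming matcher is an LLMMatcher constructed as in evaluation_package.py above; the example strings are illustrative.

    is_match = matcher.semantic_match(
        context="I want to check my order status",        # first user utterance of the conversation
        prediction="Your order 1234 shipped yesterday.",  # agent's final response
        ground_truth="Order 1234 has shipped.",           # expected response from the test case
        enable_fuzzy_matching=True,
    )
    # semantic_match tries llm_semantic_match first; if that raises and fuzzy matching is enabled,
    # it falls back to cosine_similarity_semantic_match, and on a further failure to fuzz.WRatio
    # compared against similarity_threshold.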
wxo_agentic_evaluation/llm_user.py

@@ -21,8 +21,8 @@ class LLMUser:
         self,
         user_story,
         conversation_history: List[Message],
-        attack_instructions: str = None,
-    ) -> Message
+        attack_instructions: str | None = None,
+    ) -> Message:
         # the tool response is already summarized, we don't need that to take over the chat history context window
         prompt_input = self.prompt_template.render(
             conversation_history=[