ibm-watsonx-orchestrate-evaluation-framework 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/METADATA +4 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/RECORD +49 -39
- wxo_agentic_evaluation/analyze_run.py +822 -344
- wxo_agentic_evaluation/arg_configs.py +39 -2
- wxo_agentic_evaluation/data_annotator.py +22 -4
- wxo_agentic_evaluation/description_quality_checker.py +29 -4
- wxo_agentic_evaluation/evaluation_package.py +197 -18
- wxo_agentic_evaluation/external_agent/external_validate.py +3 -1
- wxo_agentic_evaluation/external_agent/types.py +1 -1
- wxo_agentic_evaluation/inference_backend.py +105 -108
- wxo_agentic_evaluation/llm_matching.py +104 -2
- wxo_agentic_evaluation/llm_user.py +2 -2
- wxo_agentic_evaluation/main.py +147 -38
- wxo_agentic_evaluation/metrics/__init__.py +5 -0
- wxo_agentic_evaluation/metrics/evaluations.py +124 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +4 -3
- wxo_agentic_evaluation/metrics/metrics.py +64 -1
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +20 -2
- wxo_agentic_evaluation/quick_eval.py +23 -11
- wxo_agentic_evaluation/record_chat.py +18 -10
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +169 -100
- wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
- wxo_agentic_evaluation/red_teaming/attack_list.py +78 -8
- wxo_agentic_evaluation/red_teaming/attack_runner.py +71 -14
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +103 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/service_instance.py +12 -3
- wxo_agentic_evaluation/service_provider/__init__.py +129 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +415 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +480 -52
- wxo_agentic_evaluation/type.py +15 -5
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +140 -20
- wxo_agentic_evaluation/wxo_client.py +81 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/arg_configs.py

@@ -1,5 +1,6 @@
 import os
 from dataclasses import dataclass, field
+from enum import StrEnum
 from typing import List, Optional, Union

 from wxo_agentic_evaluation import __file__
@@ -30,7 +31,27 @@ class LLMUserConfig:
 @dataclass
 class ProviderConfig:
     model_id: str = field(default="meta-llama/llama-3-405b-instruct")
-    provider: str = field(
+    provider: str = field(
+        default_factory=lambda: (
+            "gateway"
+            if os.getenv("USE_GATEWAY_MODEL_PROVIDER", "").lower() == "true"
+            else "watsonx"
+        )
+    )
+    embedding_model_id: str = field(
+        default="sentence-transformers/all-minilm-l6-v2"
+    )
+
+
+@dataclass
+class CustomMetricsConfig:
+    paths: Optional[list[str]] = field(default=None)
+    llmaaj_config: ProviderConfig = field(default_factory=ProviderConfig)
+
+
+@dataclass
+class ExtractorsConfig:
+    paths: Optional[list[str]] = field(default=None)


 @dataclass
@@ -41,12 +62,18 @@ class TestConfig:
     wxo_lite_version: str
     provider_config: ProviderConfig = field(default_factory=ProviderConfig)
     llm_user_config: LLMUserConfig = field(default_factory=LLMUserConfig)
+    custom_metrics_config: CustomMetricsConfig = field(
+        default_factory=CustomMetricsConfig
+    )
+    extrators_config: ExtractorsConfig = field(default_factory=ExtractorsConfig)
     enable_verbose_logging: bool = True
     enable_manual_user_input: bool = False
     skip_available_results: bool = False
     data_annotation_run: bool = False
     num_workers: int = 2
     n_runs: int = 1
+    similarity_threshold: float = 0.8
+    enable_fuzzy_matching: bool = False


 @dataclass
@@ -59,22 +86,32 @@ class AttackConfig:
     enable_verbose_logging: bool = True
     enable_manual_user_input: bool = False
     num_workers: int = 2
+    skip_available_results: bool = True


 @dataclass
 class AttackGeneratorConfig:
     attacks_list: Union[List[str], str]
     datasets_path: Union[List[str], str]
-
+    agents_list_or_path: Union[List[str], str]
     target_agent_name: str
+    auth_config: AuthConfig
     output_dir: str = None
     max_variants: int = None


+class AnalyzeMode(StrEnum):
+    default = "default"
+    enhanced = "enhanced"
+
+
 @dataclass
 class AnalyzeConfig:
     data_path: str
     tool_definition_path: Optional[str] = None
+    mode: str = AnalyzeMode.default
+    num_workers: int = 10
+    run: int = -1


 @dataclass
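The practical effect of the ProviderConfig change is that the default provider is now decided per instantiation from the USE_GATEWAY_MODEL_PROVIDER environment variable. A minimal, standalone sketch of that selection logic (standard library only; this ProviderConfig is a simplified stand-in for the package's dataclass, not the real class):

    import os
    from dataclasses import dataclass, field

    @dataclass
    class ProviderConfig:
        # Simplified stand-in for wxo_agentic_evaluation.arg_configs.ProviderConfig.
        model_id: str = "meta-llama/llama-3-405b-instruct"
        provider: str = field(
            default_factory=lambda: (
                "gateway"
                if os.getenv("USE_GATEWAY_MODEL_PROVIDER", "").lower() == "true"
                else "watsonx"
            )
        )

    os.environ["USE_GATEWAY_MODEL_PROVIDER"] = "true"
    print(ProviderConfig().provider)  # gateway
    os.environ["USE_GATEWAY_MODEL_PROVIDER"] = "false"
    print(ProviderConfig().provider)  # watsonx

Because the value comes from a default_factory rather than a module-level constant, the environment variable is re-read each time a config object is created.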
wxo_agentic_evaluation/data_annotator.py

@@ -3,7 +3,10 @@ import collections
 import json
 from typing import Dict, List, Optional

-from wxo_agentic_evaluation.arg_configs import
+from wxo_agentic_evaluation.arg_configs import (
+    ChatRecordingConfig,
+    KeywordsGenerationConfig,
+)
 from wxo_agentic_evaluation.prompt.template_render import (
     LlamaKeywordsGenerationTemplateRenderer,
 )
@@ -223,11 +226,23 @@ class DataAnnotator:
         return goals, goal_details, previous

     def _process_summarization(
-        self,
+        self,
+        previous: str,
+        goals: Dict,
+        goal_details: List,
+        config: ChatRecordingConfig = None,
     ) -> None:
         """Process summarization step"""
         summarize_step = None
         # we assume single summary step at the end
+        extra_kwargs = {}
+        instance_url = getattr(config, "service_url", None)
+        token = getattr(config, "token", None)
+        if instance_url:
+            extra_kwargs["instance_url"] = instance_url
+        if token:
+            extra_kwargs["token"] = token
+
         for message in self.messages[::-1]:
             if message.role == "assistant":
                 provider = get_provider(
@@ -237,6 +252,7 @@ class DataAnnotator:
                         "decoding_method": "greedy",
                         "max_new_tokens": 256,
                     },
+                    **extra_kwargs,
                 )
                 kw_generator = KeywordsGenerationLLM(
                     provider=provider,
@@ -261,10 +277,12 @@ class DataAnnotator:
         else:
             goals[previous] = ["summarize"]

-    def generate(self) -> Dict:
+    def generate(self, config: ChatRecordingConfig = None) -> Dict:
         """Generate the final dataset"""
         goals, goal_details, previous = self._process_tool_calls()
-        self._process_summarization(
+        self._process_summarization(
+            previous, goals, goal_details, config=config
+        )

         return {
             "agent": self.initial_data.agent,
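The data_annotator.py change threads an optional ChatRecordingConfig down to the provider factory by building an extra_kwargs dict from whatever attributes the config actually carries. A small sketch of that getattr-based pattern (the config class here is a simplified stand-in, not the package's real ChatRecordingConfig):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class FakeRecordingConfig:
        # Stand-in exposing the two attributes the annotator looks for.
        service_url: Optional[str] = None
        token: Optional[str] = None

    def build_extra_kwargs(config) -> dict:
        # Mirrors _process_summarization: only forward values that are
        # actually present on the (possibly None) config object.
        extra_kwargs = {}
        instance_url = getattr(config, "service_url", None)
        token = getattr(config, "token", None)
        if instance_url:
            extra_kwargs["instance_url"] = instance_url
        if token:
            extra_kwargs["token"] = token
        return extra_kwargs

    print(build_extra_kwargs(None))  # {}
    print(build_extra_kwargs(FakeRecordingConfig("https://wxo.local", "abc")))
    # {'instance_url': 'https://wxo.local', 'token': 'abc'}

The same kwargs are then splatted into get_provider(...), so a run without a recording config behaves exactly as before.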
wxo_agentic_evaluation/description_quality_checker.py

@@ -5,6 +5,7 @@ from typing import List

 import rich

+from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric
 from wxo_agentic_evaluation.prompt.template_render import (
     BadToolDescriptionRenderer,
 )
@@ -15,6 +16,9 @@ from wxo_agentic_evaluation.tool_planner import (
     parse_json_string,
 )
 from wxo_agentic_evaluation.type import ToolDefinition
+from wxo_agentic_evaluation.utils.gateway_provider_utils import (
+    get_provider_kwargs,
+)
 from wxo_agentic_evaluation.utils.utils import safe_divide


@@ -60,12 +64,23 @@ class DescriptionQualityInspector:
         root_dir, "prompt", "bad_tool_descriptions_prompt.jinja2"
     )

+    DEFAULT_PROVIDER_KWARGS = {
+        "model_id": LLM_MODEL_ID,
+        "params": LLM_PARAMS,
+    }
+
     def __init__(self, llm_client=None):
+
         if llm_client is None:
+
+            provider_kwargs = get_provider_kwargs(
+                **self.DEFAULT_PROVIDER_KWARGS,
+            )
+
             llm_client = get_provider(
-
-                params=self.LLM_PARAMS,
+                **provider_kwargs,
             )
+
         self.llm_client = llm_client
         self.template = BadToolDescriptionRenderer(
             self.BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH
@@ -106,7 +121,9 @@ class DescriptionQualityInspector:
         )
         return tool_definitions

-    def detect_bad_description(
+    def detect_bad_description(
+        self, tool_definition: ToolDefinition
+    ) -> DescriptionQualityMetric:
         """
         Detects if a tool description is 'bad' using an LLM judge.
         A 'bad' description is one that:
@@ -119,6 +136,10 @@ class DescriptionQualityInspector:
         Returns:
             bool: True if the description is 'bad', False otherwise.
         """
+
+        if tool_definition.tool_description is None:
+            return DescriptionQualityMetric(tool_name=tool_definition.tool_name)
+
         prompt = self.template.render(tool_definition=tool_definition)
         response = self.llm_client.query(prompt)

@@ -137,7 +158,11 @@ class DescriptionQualityInspector:
             response_data=response_data
         )

-        return
+        return DescriptionQualityMetric(
+            tool_name=tool_definition.tool_name,
+            description_score=final_description_score,
+            threshold=self.CLASSIFICATION_SCORE_THRESHOLD,
+        )

     def _calculate_score(self, response_data: dict) -> float:
         """
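With this change detect_bad_description returns a DescriptionQualityMetric (tool_name, description_score, threshold) instead of a bare value, and short-circuits when the tool has no description at all. A rough sketch of how a caller might consume that object; the field names come from the diff above, but the metric class below is a simplified stand-in and the thresholding direction is an assumption, not the package's documented behaviour:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class DescriptionQualityMetric:
        # Stand-in mirroring the fields used in the diff above.
        tool_name: str
        description_score: Optional[float] = None
        threshold: Optional[float] = None

    def looks_bad(metric: DescriptionQualityMetric) -> bool:
        # Assumption: a score at or above the threshold flags the description
        # as "bad"; tools without a score (no description) are left unflagged.
        if metric.description_score is None or metric.threshold is None:
            return False
        return metric.description_score >= metric.threshold

    print(looks_bad(DescriptionQualityMetric("get_weather", 0.9, 0.75)))  # True
    print(looks_bad(DescriptionQualityMetric("get_weather")))             # False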
wxo_agentic_evaluation/evaluation_package.py

@@ -1,19 +1,27 @@
 import json
 import os
-from
+from gc import enable
+from typing import Any, Dict, List, Optional

 import rich
+from dateutil import parser

 from wxo_agentic_evaluation import __file__
 from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
 from wxo_agentic_evaluation.llm_matching import LLMMatcher
 from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
 from wxo_agentic_evaluation.llm_safety_eval import LLMSafetyJudge
+from wxo_agentic_evaluation.metrics.evaluations import (
+    Evaluation,
+    Extractor,
+    Metric,
+)
 from wxo_agentic_evaluation.metrics.llm_as_judge import (
     AnswerDerailment,
     AnswerUnsafeTopic,
 )
 from wxo_agentic_evaluation.metrics.metrics import (
+    CustomEvalMetrics,
     KeywordSemanticSearchMetric,
     KnowledgeBaseMetrics,
     TextMatchType,
@@ -28,7 +36,12 @@ from wxo_agentic_evaluation.prompt.template_render import (
     UnsafeTopicTemplateRenderer,
 )
 from wxo_agentic_evaluation.resource_map import ResourceMap
-from wxo_agentic_evaluation.
+from wxo_agentic_evaluation.service_instance import tenant_setup
+from wxo_agentic_evaluation.service_provider import (
+    USE_GATEWAY_MODEL_PROVIDER,
+    get_provider,
+)
+from wxo_agentic_evaluation.service_provider.provider import Provider
 from wxo_agentic_evaluation.type import (
     ContentType,
     ConversationalSearch,
@@ -76,12 +89,18 @@ DUMMY_GRAPH_NODE_NAME = "dummy-goal"
 class EvaluationPackage:
     def __init__(
         self,
-        test_case_name,
-        ground_truth,
-        messages,
+        test_case_name: str,
+        ground_truth: EvaluationData,
+        messages: list[Message],
         conversational_search_data: List[ConversationalSearch] = None,
         resource_map: ResourceMap = None,
         is_attack_evaluation: bool = False,
+        config=None,
+        custom_evals: Optional[list[Evaluation]] = None,
+        custom_llmaaj_client: Optional[Provider] = None,
+        extractors: Optional[list[Extractor]] = None,
+        similarity_threshold=0.8,
+        enable_fuzzy_matching=False,
     ):
         self.tool_dictionary = (
             {
@@ -103,16 +122,56 @@ class EvaluationPackage:
             else []
         )

-        self.messages = messages
+        self.messages: List[Message] = messages
         self.conversational_search_data = conversational_search_data
         self.is_attack_evaluation = is_attack_evaluation
         self.ground_truth = ground_truth
         self.test_case_name = test_case_name
         self.resource_map = resource_map
+        self.custom_evals = custom_evals
+        self.custom_llmaaj_client = custom_llmaaj_client
+        self.extractors = extractors
+        self.enable_fuzzy_matching = enable_fuzzy_matching

         if not self.is_attack_evaluation:
             self.validate_ground_truth(self.ground_truth, self.test_case_name)

+        extra_kwargs = {}
+
+        if USE_GATEWAY_MODEL_PROVIDER:
+
+            if resource_map and hasattr(resource_map, "wxo_client"):
+                wxo_client = resource_map.wxo_client
+
+                if hasattr(wxo_client, "service_url"):
+                    extra_kwargs["instance_url"] = wxo_client.service_url
+
+                if hasattr(wxo_client, "api_key"):
+                    extra_kwargs["token"] = wxo_client.api_key
+
+            elif config:
+                auth = getattr(config, "auth_config", None)
+
+                if auth:
+                    instance_url = getattr(auth, "url", None)
+                    token = getattr(auth, "token", None)
+
+                    if instance_url:
+                        extra_kwargs["instance_url"] = instance_url
+
+                    if token:
+                        extra_kwargs["token"] = token
+            else:
+                token, instance_url, env = tenant_setup(
+                    service_url=None, tenant_name="local"
+                )
+                if instance_url:
+                    extra_kwargs["instance_url"] = instance_url
+
+                if token:
+                    extra_kwargs["token"] = token
+
+        # output response matching
         self.matcher = LLMMatcher(
             llm_client=get_provider(
                 model_id="meta-llama/llama-3-405b-instruct",
@@ -121,6 +180,8 @@ class EvaluationPackage:
                     "decoding_method": "greedy",
                     "max_new_tokens": 10,
                 },
+                embedding_model_id="sentence-transformers/all-minilm-l6-v2",
+                **extra_kwargs,
             ),
             keyword_template=KeywordMatchingTemplateRenderer(
                 KEYWORD_MATCHING_PROMPT_PATH
@@ -128,7 +189,10 @@ class EvaluationPackage:
             semantic_template=SemanticMatchingTemplateRenderer(
                 SEMANTIC_MATCHING_PROMPT_PATH
             ),
+            similarity_threshold=similarity_threshold,
+            enable_fuzzy_matching=enable_fuzzy_matching,
         )
+        # only used for RAG evaluation
         self.rag_llm_as_a_judge = LLMJudge(
             llm_client=get_provider(
                 model_id="meta-llama/llama-3-405b-instruct",
@@ -137,6 +201,7 @@ class EvaluationPackage:
                     "decoding_method": "greedy",
                     "max_new_tokens": 4096,
                 },
+                **extra_kwargs,
             ),
             faithfulness=FaithfulnessTemplateRenderer(FAITHFULNESS_PROMPT_PATH),
             answer_relevancy=AnswerRelevancyTemplateRenderer(
@@ -151,6 +216,7 @@ class EvaluationPackage:
                     "decoding_method": "greedy",
                     "max_new_tokens": 4096,
                 },
+                **extra_kwargs,
             ),
             answer_derailment=DerailmentTemplateRenderer(
                 DERAILMENT_PROMPT_PATH
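When USE_GATEWAY_MODEL_PROVIDER is set, the constructor above resolves the gateway credentials in a fixed order: the resource map's wxo_client first, then the run config's auth_config, then a local tenant_setup() fallback, and the resulting kwargs are passed into every get_provider call. A simplified sketch of that precedence (stand-in objects only, not the package's real clients or tenant_setup):

    def resolve_gateway_kwargs(resource_map=None, config=None, tenant_setup=None) -> dict:
        # Mirrors the precedence in EvaluationPackage.__init__:
        # resource_map.wxo_client -> config.auth_config -> tenant_setup fallback.
        extra_kwargs = {}
        if resource_map is not None and hasattr(resource_map, "wxo_client"):
            client = resource_map.wxo_client
            if getattr(client, "service_url", None):
                extra_kwargs["instance_url"] = client.service_url
            if getattr(client, "api_key", None):
                extra_kwargs["token"] = client.api_key
        elif config is not None and getattr(config, "auth_config", None):
            auth = config.auth_config
            if getattr(auth, "url", None):
                extra_kwargs["instance_url"] = auth.url
            if getattr(auth, "token", None):
                extra_kwargs["token"] = auth.token
        elif tenant_setup is not None:
            token, instance_url, _env = tenant_setup(service_url=None, tenant_name="local")
            if instance_url:
                extra_kwargs["instance_url"] = instance_url
            if token:
                extra_kwargs["token"] = token
        return extra_kwargs

    class _Client:
        service_url = "https://wxo.example"
        api_key = "abc123"

    class _ResourceMap:
        wxo_client = _Client()

    print(resolve_gateway_kwargs(resource_map=_ResourceMap()))
    # {'instance_url': 'https://wxo.example', 'token': 'abc123'}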
@@ -303,8 +369,48 @@ class EvaluationPackage:
         return str(data).lower()

     @staticmethod
+    def _compare_as_date_or_number(normalized_actual, normalized_expected):
+        """
+        Attempts to compare two normalized values as dates or numbers.
+
+        Args:
+            normalized_actual: The actual value from tool call
+            normalized_expected: The expected value from ground truth
+
+        Returns:
+            tuple: (conversion_succeeded, values_match)
+                - conversion_succeeded: True if values could be converted to numbers or dates
+                - values_match: True if converted values match
+        """
+        # Try to convert to numbers
+        try:
+            num_actual = float(normalized_actual)
+            num_expected = float(normalized_expected)
+            # Conversion succeeded, check if values match
+            return (
+                True,
+                abs(num_actual - num_expected) <= 0.001,
+            )  # Small epsilon for float comparison
+        except (ValueError, TypeError):
+            pass
+
+        # Try to convert to dates
+        try:
+            date_actual = parser.parse(normalized_actual)
+            date_expected = parser.parse(normalized_expected)
+            # Conversion succeeded, check if values match
+            return True, date_actual == date_expected
+        except (ValueError, TypeError):
+            pass
+
+        # If we get here, neither number nor date conversion worked
+        return False, False
+
     def _check_if_args_match_with_ignore(
-
+        self,
+        actual_args: dict[str, str],
+        expected_args: dict[str, str],
+        enable_fuzzy_matching: bool = False,
     ) -> bool:
         """
         This function checks if a registered tool call matches with the goal node when:
@@ -313,21 +419,50 @@ class EvaluationPackage:
             actual_args (dict): Made during inference.
             expected_args (dict): Defined in the test case/ground truth.
         Returns:
-            bool: True if match with keyword parameters ignored | False otherwise (
+            bool: True if match with keyword parameters ignored | False otherwise (arguments were not corrected).
         """
-
         if set(actual_args.keys()) != set(expected_args.keys()):
             return False

+        ## now we go through and check each parameter
         for key in actual_args:
+            normalized_actual = EvaluationPackage.normalize_args(
+                actual_args[key]
+            )
+            normalized_expected = EvaluationPackage.normalize_args(
+                expected_args[key]
+            )
+
+            # 1. If the args are an ignored keyword or exactly equal, continue to next parameter
             if (
-
-
-
-
-
-
+                normalized_expected == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS
+            ) or (normalized_actual == normalized_expected):
+                continue
+            else:
+                # if they're not equal, and fuzzy matching is enabled, do fuzzy.
+                if enable_fuzzy_matching:
+                    # 3. Check date/number conversion
+                    conversion_succeeded, values_match = (
+                        EvaluationPackage._compare_as_date_or_number(
+                            normalized_actual, normalized_expected
+                        )
+                    )
+                    # If conversion succeeded and values match, continue to next parameter
+                    if conversion_succeeded and values_match:
+                        continue
+                    # If conversion succeeded but values don't match, return False
+                    if conversion_succeeded and not values_match:
+                        return False
+                    # 4. If conversion failed, try cosine matching. If this fails, return false for the function
+                    if not self.matcher.cosine_similarity_semantic_match(
+                        normalized_actual, normalized_expected
+                    ):
+                        return False
+                else:
+                    # If they're not equal and fuzzy matching is not enabled, return false
+                    return False

+        # If we've made it through all parameters without returning False, return True
         return True

     def traverse(self):
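The new _compare_as_date_or_number helper is the heart of the enable_fuzzy_matching path: it first tries a numeric comparison with a small epsilon, then a dateutil parse, and only reports a failed conversion if both attempts raise. A standalone sketch of the same idea (requires the python-dateutil package, which the diff imports as "from dateutil import parser"):

    from dateutil import parser

    def compare_as_date_or_number(actual: str, expected: str):
        """Return (conversion_succeeded, values_match), mirroring the helper above."""
        # 1. Numeric comparison with a small epsilon to absorb float noise.
        try:
            return True, abs(float(actual) - float(expected)) <= 0.001
        except (ValueError, TypeError):
            pass
        # 2. Date comparison via dateutil's permissive parser.
        try:
            return True, parser.parse(actual) == parser.parse(expected)
        except (ValueError, TypeError):
            pass
        # 3. Neither interpretation applied.
        return False, False

    print(compare_as_date_or_number("42", "42.0"))                # (True, True)
    print(compare_as_date_or_number("2024-01-05", "Jan 5 2024"))  # (True, True)
    print(compare_as_date_or_number("blue", "azure"))             # (False, False)

If both conversions fail, the caller in _check_if_args_match_with_ignore falls back to cosine-similarity matching before giving up on the argument pair.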
@@ -399,8 +534,10 @@ class EvaluationPackage:
                         goal_detail.args
                     )
                     or self._check_if_args_match_with_ignore(
-                        msg_tool_call["args"],
-
+                        msg_tool_call["args"],
+                        goal_detail.args,
+                        enable_fuzzy_matching=self.enable_fuzzy_matching,
+                    )  # TODO arjun-gupta1 9/29/25: make this also return the method of matching (llm, fuzzy, cosine similarity) so we can write it out to analyze_run.py results
                 ):
                     labelled_messages.append(goal_detail.name)
                     labelled_messages_without_text_step.append(
@@ -470,6 +607,7 @@ class EvaluationPackage:
                 if message.event == EventTypes.message_created
                 and message.role == "assistant"
             ]
+
             keyword_semantic_list = []
             for message in assistant_responses:
                 for goal_detail in self.text_list:
@@ -478,7 +616,10 @@ class EvaluationPackage:
                         message.content, goal_detail.keywords
                     )
                     semantic_match: bool = self.matcher.semantic_match(
-
+                        self.messages[0].content,
+                        prediction=message.content,
+                        ground_truth=goal_detail.response,
+                        enable_fuzzy_matching=self.enable_fuzzy_matching,
                     )
                     keyword_semantic_match = KeywordSemanticSearchMetric(
                         keyword_match=keyword_match,
@@ -513,6 +654,29 @@ class EvaluationPackage:
         else:
             return TextMatchType.text_mismatch.value

+    def generate_custom_metrics(
+        self, extracted_context: Dict[str, Any]
+    ) -> Optional[CustomEvalMetrics]:
+        if self.custom_evals is None:
+            return None
+
+        results: list[Metric] = []
+        for evaluation in self.custom_evals:
+            # TODO: cleanup. The compute method returns a Metric but pydantic thinks it is different.
+            # Probably because of some path issue when we auto-discover metrics
+            evaluate_result = evaluation.evaluate(
+                messages=self.messages,
+                ground_truth=self.ground_truth,
+                extracted_context=extracted_context,
+            )
+            if evaluate_result is not None:
+                results.append(Metric(**evaluate_result.model_dump()))
+
+        custom_eval_results = CustomEvalMetrics(
+            dataset_name=self.test_case_name, custom_metrics=results
+        )
+        return custom_eval_results
+
     def generate_summary(self):
         llm_steps = 0
         total_step = 0
@@ -525,6 +689,16 @@ class EvaluationPackage:
             message_with_reasons,
         ) = self.traverse()

+        extracted_context = {}
+        if self.extractors is not None and self.custom_evals is not None:
+            for extractor in self.extractors:
+                context = extractor.extract(
+                    messages=self.messages,
+                    ground_truth=self.ground_truth,
+                    matcher=self.matcher,
+                )
+                extracted_context[extractor.name] = context
+
         is_success = self.is_topological_sort(
             self.ground_truth.goals, labelled_messages
         )
@@ -545,6 +719,10 @@ class EvaluationPackage:
         knowledge_base_metric_summary = (
             self.generate_knowledge_base_metric_summary()
         )
+
+        custom_metric_summary = self.generate_custom_metrics(
+            extracted_context=extracted_context
+        )
         # TO-DO: the table is not printing properly anymore with the new columns introduced
         # we need to introduce a separate table for these.

@@ -558,6 +736,7 @@ class EvaluationPackage:
             knowledge_base_metric_summary,
             message_with_reasons,
             metrics,
+            custom_metric_summary,
         )

     def _get_messages_by_role_before_cs(
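generate_summary() now runs any registered extractors over the transcript, feeds the collected context into generate_custom_metrics(), and appends the result to the summary tuple. The diff implies an Extractor with a name and an extract(messages, ground_truth, matcher) method, and an Evaluation with an evaluate(messages, ground_truth, extracted_context) method; the classes below are hypothetical stand-ins written against those call sites, not the real wxo_agentic_evaluation.metrics.evaluations API:

    from typing import Any, Dict, List, Optional

    class WordCountExtractor:
        # Hypothetical extractor: exposes .name and .extract(...), the two
        # things generate_summary() uses in the diff above.
        name = "word_count"

        def extract(self, messages: List[Any], ground_truth: Any, matcher: Any) -> int:
            return sum(len(str(getattr(m, "content", m)).split()) for m in messages)

    class ConcisenessEval:
        # Hypothetical evaluation: .evaluate(...) receives the extracted context
        # keyed by extractor name and returns a metric-like object (or None).
        def evaluate(self, messages, ground_truth, extracted_context: Dict[str, Any]) -> Optional[dict]:
            words = extracted_context.get("word_count", 0)
            return {"name": "conciseness", "value": 1.0 if words <= 200 else 0.0}

    # Minimal wiring mirroring generate_summary(): extract first, then evaluate.
    messages = ["Hello, how can I help?", "Your order #1234 has shipped."]
    extractors = [WordCountExtractor()]
    custom_evals = [ConcisenessEval()]

    extracted_context = {e.name: e.extract(messages, ground_truth=None, matcher=None) for e in extractors}
    results = [ev.evaluate(messages, ground_truth=None, extracted_context=extracted_context) for ev in custom_evals]
    print(extracted_context, results)

In the framework itself the custom evaluation and extractor paths are supplied through the new CustomMetricsConfig and ExtractorsConfig entries in TestConfig shown earlier.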
wxo_agentic_evaluation/external_agent/external_validate.py

@@ -74,7 +74,9 @@ class ExternalAgentValidation:
         payload = {"stream": True}
         payload["messages"] = messages
         resp = requests.post(
-            url=self.service_url,
+            url=self.service_url,
+            headers=self.header,
+            json=payload,
         )
         success, logged_events = self._validate_streaming_response(resp)

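The external_validate.py fix sends the authentication headers and the JSON body along with the streaming request; previously only the URL was passed to requests.post. A minimal sketch of the corrected call shape (the URL and header values below are placeholders, not the framework's real configuration):

    import requests

    # Placeholder values; the real class uses self.service_url and self.header.
    service_url = "https://example.com/chat/completions"
    headers = {"Authorization": "Bearer <token>"}
    payload = {"stream": True, "messages": [{"role": "user", "content": "hi"}]}

    resp = requests.post(url=service_url, headers=headers, json=payload, stream=True)
    for line in resp.iter_lines():
        if line:
            print(line.decode("utf-8"))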