ibm-watsonx-orchestrate-evaluation-framework 1.0.7__py3-none-any.whl → 1.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (63)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/METADATA +103 -109
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info/RECORD +96 -0
  3. wxo_agentic_evaluation/analytics/tools/main.py +1 -18
  4. wxo_agentic_evaluation/analyze_run.py +358 -97
  5. wxo_agentic_evaluation/arg_configs.py +28 -1
  6. wxo_agentic_evaluation/description_quality_checker.py +149 -0
  7. wxo_agentic_evaluation/evaluation_package.py +65 -20
  8. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  9. wxo_agentic_evaluation/external_agent/performance_test.py +2 -3
  10. wxo_agentic_evaluation/inference_backend.py +117 -14
  11. wxo_agentic_evaluation/llm_user.py +2 -1
  12. wxo_agentic_evaluation/main.py +5 -0
  13. wxo_agentic_evaluation/metrics/metrics.py +22 -1
  14. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  15. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +9 -1
  16. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  17. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  18. wxo_agentic_evaluation/prompt/template_render.py +34 -3
  19. wxo_agentic_evaluation/quick_eval.py +342 -0
  20. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +113 -0
  21. wxo_agentic_evaluation/red_teaming/attack_generator.py +286 -0
  22. wxo_agentic_evaluation/red_teaming/attack_list.py +96 -0
  23. wxo_agentic_evaluation/red_teaming/attack_runner.py +128 -0
  24. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  25. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  26. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  27. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  28. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +27 -0
  29. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  30. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +237 -0
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +101 -0
  38. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +263 -0
  39. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +455 -0
  40. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +156 -0
  41. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  42. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +547 -0
  43. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  44. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +258 -0
  45. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +333 -0
  46. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +188 -0
  47. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +409 -0
  48. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +42 -0
  49. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  50. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +145 -0
  51. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +116 -0
  52. wxo_agentic_evaluation/service_instance.py +2 -2
  53. wxo_agentic_evaluation/service_provider/watsonx_provider.py +118 -4
  54. wxo_agentic_evaluation/tool_planner.py +3 -1
  55. wxo_agentic_evaluation/type.py +33 -2
  56. wxo_agentic_evaluation/utils/__init__.py +0 -1
  57. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +157 -0
  58. wxo_agentic_evaluation/utils/rich_utils.py +174 -0
  59. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  60. wxo_agentic_evaluation/utils/utils.py +167 -5
  61. ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/RECORD +0 -56
  62. {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/WHEEL +0 -0
  63. {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/top_level.txt +0 -0
@@ -37,6 +37,11 @@ SEMANTIC_MATCHING_PROMPT_PATH = os.path.join(root_dir, "prompt", "semantic_match
 FAITHFULNESS_PROMPT_PATH = os.path.join(root_dir, "prompt", "faithfulness_prompt.jinja2")
 ANSWER_RELEVANCY_PROMPT_PATH = os.path.join(root_dir, "prompt", "answer_relevancy_prompt.jinja2")
 
+RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS = os.getenv(
+    "RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS",
+    "<IGNORE>"
+)
+
 """
 - hyphens are not allowed in python function names, so it is safe to use as a dummy function name
 - purpose behind `DUMMY_GRAPH_NODE_NAME` is to append
@@ -52,8 +57,8 @@ class EvaluationPackage:
         ground_truth,
         messages,
         conversational_search_data: List[ConversationalSearch] = None,
-        is_analyze_run=False,
         resource_map: ResourceMap = None,
+        is_attack_evaluation: bool = False,
     ):
         self.tool_dictionary = {
             goal_detail.name: goal_detail
@@ -67,10 +72,13 @@ class EvaluationPackage:
         ]
         self.messages = messages
         self.conversational_search_data = conversational_search_data
-        self.validate_ground_truth(ground_truth, test_case_name)
+        self.is_attack_evaluation = is_attack_evaluation
         self.ground_truth = ground_truth
         self.test_case_name = test_case_name
-        self.is_analyze_run = is_analyze_run
+        self.resource_map = resource_map
+
+        if not self.is_attack_evaluation:
+            self.validate_ground_truth(self.ground_truth, self.test_case_name)
 
         self.matcher = LLMMatcher(
             llm_client=get_provider(
@@ -94,8 +102,6 @@ class EvaluationPackage:
                 ANSWER_RELEVANCY_PROMPT_PATH
             ),
         )
-
-        self.resource_map = resource_map
 
     @staticmethod
     def find_ground_node(graph, start_node):
@@ -209,6 +215,33 @@ class EvaluationPackage:
             rich.print(
                 f"[green][SUCCESS] Text message matched: Summary - {keyword_semantic_match.message}[/green]"
             )
+
+    @staticmethod
+    def _check_if_args_match_with_ignore(
+        actual_args: dict[str, str],
+        expected_args: dict[str, str]
+    ) -> bool:
+        """
+        This function checks if a registered tool call matches with the goal node when:
+        - the arg value marked as wrong is labelled with the "<IGNORE>" value in the corresponding ground truth
+        Args:
+            actual_args (dict): Made during inference.
+            expected_args (dict): Defined in the test case/ground truth.
+        Returns:
+            bool: True if match with keyword parameters ignored | False otherwise (improper tool call).
+        """
+
+        if(
+            set(actual_args.keys()) != set(expected_args.keys())
+        ):
+            return False
+
+        for key in actual_args:
+            if actual_args[key] != expected_args[key] \
+                and expected_args[key] != RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
+                return False
+
+        return True
 
     def traverse(self):
         labelled_messages = []
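
For context, the new `_check_if_args_match_with_ignore` helper lets a ground-truth test case mark individual argument values with `<IGNORE>` so that only the key set and the remaining values are compared. Below is a minimal standalone sketch of that comparison rule; the argument dictionaries are hypothetical examples, not data from the package.

```python
# Standalone sketch of the "<IGNORE>" matching rule introduced in this release.
# The argument dictionaries below are hypothetical, not taken from package test data.
RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS = "<IGNORE>"

def args_match_with_ignore(actual_args: dict, expected_args: dict) -> bool:
    # Key sets must be identical; values labelled "<IGNORE>" are skipped during comparison.
    if set(actual_args) != set(expected_args):
        return False
    return all(
        expected_args[key] == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS
        or actual_args[key] == expected_args[key]
        for key in actual_args
    )

# The ticket id differs but is ignored; the employee id still has to match exactly.
expected = {"employee_id": "E123", "ticket_id": "<IGNORE>"}
print(args_match_with_ignore({"employee_id": "E123", "ticket_id": "T-42"}, expected))  # True
print(args_match_with_ignore({"employee_id": "E999", "ticket_id": "T-42"}, expected))  # False
print(args_match_with_ignore({"employee_id": "E123"}, expected))                       # False (missing key)
```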
@@ -218,7 +251,7 @@ class EvaluationPackage:
         tool_call_and_routing_metrics = ToolCallAndRoutingMetrics(
         )
         tool_call_and_routing_metrics.expected_tool_calls = len(self.tool_dictionary)
-
+        correct_tool_calls = set() # sometimes, tool with the same signature can be called more than once
         for message in self.messages:
             if message.type == ContentType.tool_call:
 
@@ -244,6 +277,7 @@ class EvaluationPackage:
 
                     continue
 
+                # TO-DO: re-think how deduplication works in the context of precision & recall
                 tool_call_and_routing_metrics.total_tool_calls += 1
 
                 # evaluating more than once is fine
@@ -259,11 +293,16 @@ class EvaluationPackage:
                     found = False
                     possible_ground_truth_for_analysis = []
                     for goal_detail in matching_goal_details:
-                        if msg_tool_call["args"] == goal_detail.args:
+                        # {"IGNORE": None} is set in red teaming attack ground truth to ignore parameter matching
+                        if goal_detail.args == {"IGNORE": None} or (msg_tool_call["args"] == goal_detail.args or
+                            self._check_if_args_match_with_ignore(
+                                msg_tool_call["args"],
+                                goal_detail.args
+                            )):
                             labelled_messages.append(goal_detail.name)
                             labelled_messages_without_text_step.append(goal_detail.name)
-
-                            tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
+                            correct_tool_calls.add(goal_detail.name)
+                            #tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
                             found = True
                             message_outcome = ExtendedMessage(message=message)
                             message_outcomes.append(message_outcome)
@@ -279,15 +318,17 @@ class EvaluationPackage:
                             "expected": possible_ground_truth_for_analysis,
                         }
                         message_outcomes.append(message_outcome)
+                        if not self.is_attack_evaluation:
+                            rich.print(
+                                f"[red][ERROR] Wrong parameters for function: {msg_tool_call['name']}. "
+                                f"Expected one of {[g.args for g in matching_goal_details]}, Received={msg_tool_call['args']}[/red]"
+                            )
+                else:
+
+                    if not self.is_attack_evaluation:
                         rich.print(
-                            f"[red][ERROR] Wrong parameters for function: {msg_tool_call['name']}. "
-                            f"Expected one of {[g.args for g in matching_goal_details]}, Received={msg_tool_call['args']}[/red]"
+                            f"[yellow][WARNING] Unexpected function call: {msg_tool_call['name']}[/yellow]"
                         )
-                else:
-
-                    rich.print(
-                        f"[yellow][WARNING] Unexpected function call: {msg_tool_call['name']}[/yellow]"
-                    )
                     # note: this is incorrect after the 1.6 change
                     message_outcome = ExtendedMessage(message=message)
                     message_outcome.reason = {"reason": "irrelevant tool call"}
@@ -308,6 +349,9 @@ class EvaluationPackage:
             else:
                 message_outcome = ExtendedMessage(message=message)
                 message_outcomes.append(message_outcome)
+
+        tool_call_and_routing_metrics.correct_tool_calls = len(correct_tool_calls)
+
         assistant_responses = [
             message
             for message in self.messages
@@ -368,8 +412,6 @@ class EvaluationPackage:
             metrics,
             message_with_reasons,
         ) = self.traverse()
-        if self.is_analyze_run:
-            print(labelled_messages)
 
         is_success = self.is_topological_sort(
             self.ground_truth.goals, labelled_messages
@@ -433,7 +475,11 @@ class EvaluationPackage:
         for message in self.messages:
             if message.type == ContentType.tool_call:
                 content = json.loads(message.content)
-                id = content.get("tool_call_id", "")
+                """
+                - In ADK 1.9, for tool call events, the "tool_call_id" is now "id"
+                - still parse out "tool_call_id" for backwards compatibility
+                """
+                id = content.get("tool_call_id") or content.get("id")
                 if id == tool_call_id:
                     return content.get("name")
 
@@ -501,7 +547,6 @@ class EvaluationPackage:
 
         return metrics
 
-
 if __name__ == "__main__":
 
     messages = []
@@ -23,7 +23,7 @@ def generate_starting_sentence(annotated_data: dict):
         "decoding_method": "greedy",
         "max_new_tokens": 4096,
     }
-    wai_client = get_provider(config=ProviderConfig(), params=llm_decode_parameter)
+    wai_client = get_provider(model_id="meta-llama/llama-3-405b-instruct", params=llm_decode_parameter)
     prompt = renderer.render(input_data=json.dumps(annotated_data, indent=4))
     res = wai_client.query(prompt)
     res = res.strip()
@@ -3,7 +3,7 @@ from rich.console import Console
 
 from wxo_agentic_evaluation.external_agent import generate_starting_sentence
 from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
-from wxo_agentic_evaluation.service_provider import get_provider, ProviderConfig
+from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.data_annotator import KeywordsGenerationLLM, LlamaKeywordsGenerationTemplateRenderer
 
 class ExternalAgentPerformanceTest:
@@ -19,13 +19,12 @@ class ExternalAgentPerformanceTest:
 
         kw_gen_config = KeywordsGenerationConfig()
 
-        provider_config = ProviderConfig(model_id=kw_gen_config.model_id)
         llm_decode_parameter = {
             "min_new_tokens": 0,
             "decoding_method": "greedy",
             "max_new_tokens": 256,
         }
-        wai_client = get_provider(config=provider_config, params=llm_decode_parameter)
+        wai_client = get_provider(model_id=kw_gen_config.model_id, params=llm_decode_parameter)
 
         self.kw_gen = KeywordsGenerationLLM(
             provider=wai_client,
@@ -6,6 +6,8 @@ import rich
 import time
 from pydantic import BaseModel
 from typing import List, Generator, Dict, Tuple, Mapping, Any
+from enum import Enum
+from collections import deque
 
 from wxo_agentic_evaluation.type import (
     ContentType,
@@ -21,14 +23,39 @@ from wxo_agentic_evaluation.llm_user import LLMUser
 from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
 from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.service_instance import tenant_setup
-from wxo_agentic_evaluation.utils.utils import is_saas_url
-
-
-def is_end(user_input: Message):
-    if "END" in user_input.content.strip():
-        return True
-    return False
+from wxo_agentic_evaluation.utils.utils import (
+    is_saas_url,
+    safe_divide,
+    Tokenizer
+)
 
+tokenizer = Tokenizer()
+
+class Roles(Enum):
+    ASSISTANT = "assistant"
+    USER = "user"
+
+def calculate_word_overlap_similarity_score(first_message_text: str, second_message_text: str) -> float:
+    """Calculate the word overlap similarity score between the .content field of two Message objects.
+    Args:
+        first_message_text (str): The .content field of the first message.
+        second_message_text (str): The .content field of the second message.
+    """
+
+    words_in_first_message = tokenizer(first_message_text)
+    words_in_second_message = tokenizer(second_message_text)
+
+    # Calculate the number of common words
+    common_words = set(words_in_first_message) & set(words_in_second_message)
+    unique_words = set(words_in_first_message + words_in_second_message)
+
+    unique_words_count = len(unique_words)
+    common_words_count = len(common_words)
+
+    return safe_divide(
+        common_words_count,
+        unique_words_count
+    )
 
 def is_transfer_response(step_detail: Dict):
     # this is not very reliable
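
The new `calculate_word_overlap_similarity_score` is essentially a Jaccard similarity over word sets (shared words divided by all distinct words), computed with the package's `Tokenizer` and `safe_divide` utilities. A rough standalone approximation is sketched below; it assumes a plain lowercase whitespace tokenizer, whereas the package's `Tokenizer` may normalize text differently.

```python
# Standalone approximation of the word-overlap score added above.
# Assumes a plain lowercase/whitespace tokenizer; the package's Tokenizer may differ.
def word_overlap_similarity(first_text: str, second_text: str) -> float:
    first_words = set(first_text.lower().split())
    second_words = set(second_text.lower().split())
    union = first_words | second_words
    if not union:  # mirrors safe_divide guarding against division by zero
        return 0.0
    return len(first_words & second_words) / len(union)

print(word_overlap_similarity("please reset my password", "please reset my password now"))  # 0.8
print(word_overlap_similarity("please reset my password", "book a meeting room"))           # 0.0
```

With the default `MESSAGE_SIMILARITY_THRESHOLD` of 0.98 introduced further down, only near-identical turns count toward the repetition check.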
@@ -394,6 +421,7 @@ class WXOInferenceBackend:
 
         messages = []
         for entry in result:
+
             tool_call_id = None
             if step_history := entry.get("step_history"):
                 for step_message in step_history:
@@ -406,6 +434,10 @@ class WXOInferenceBackend:
                     tool_json = {"type": "tool_call"}
                     tool_json.update(tool)
                     content = json.dumps(tool_json)
+                    # TO-DO: review do we even need the get messages for retry loop anymore?
+                    if msg_content := entry.get("content"):
+                        if msg_content[0].get("response_type") == "conversational_search":
+                            continue
                     messages.append(
                         Message(
                             role=role,
@@ -504,6 +536,11 @@ class WXOInferenceBackend:
 
 
 class EvaluationController:
+
+    MAX_CONVERSATION_STEPS = int(os.getenv("MAX_CONVERSATION_STEPS", 20))
+    MESSAGE_SIMILARITY_THRESHOLD = float(os.getenv("MESSAGE_SIMILARITY_THRESHOLD", 0.98)) # if any two consecutive messages are >98% similar, the inference loop will be terminated
+    MAX_REPEATING_MESSAGES = int(os.getenv("MAX_REPEATING_MESSAGES", 3)) # this is the maximum number of repeating messages by the user or assistant before terminating the inference loop
+
     def __init__(
         self,
         wxo_inference_backend: WXOInferenceBackend,
@@ -513,18 +550,24 @@ class EvaluationController:
         self.wxo_inference_backend = wxo_inference_backend
         self.llm_user = llm_user
         self.config = config
+        self.repeating_output_detection = self.MAX_REPEATING_MESSAGES >= 2
+
+        if self.repeating_output_detection:
+            # Use deque for efficient O(1) operations
+            self.recent_user_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
+            self.recent_assistant_messages = deque(maxlen=self.MAX_REPEATING_MESSAGES)
 
     def run(
-        self, task_n, story, agent_name: str, starting_user_input: str = None
+        self, task_n, story, agent_name: str, starting_user_input: str = None, attack_instructions: str = None
     ) -> Tuple[List[Message], List[CallTracker], List[ConversationalSearch]]:
         step = 0
         thread_id = None
         conversation_history: List[Message] = []
         conversational_search_history_data = []
         call_tracker = CallTracker()
-        # make this configurable
-        while step < 20:
 
+        # make this configurable
+        while step < self.MAX_CONVERSATION_STEPS:
             if step == 0 and starting_user_input:
                 user_input = Message(
                     role="user", content=starting_user_input, type=ContentType.text
@@ -539,16 +582,22 @@ class EvaluationController:
                 )
             else: # llm
                 user_input = self.llm_user.generate_user_input(
-                    story, conversation_history
+                    story, conversation_history, attack_instructions=attack_instructions
                 )
             if self.config.enable_verbose_logging:
                 rich.print(
                     f"[dark_khaki][Task-{task_n}][/dark_khaki] 👤[bold blue] User:[/bold blue]",
                     user_input.content,
                 )
-            if is_end(user_input):
+
+            if self._is_end(user_input):
                 break
+
+            if self.repeating_output_detection:
+                self.recent_user_messages.append(user_input.content)
+
             conversation_history.append(user_input)
+
             messages, thread_id, conversational_search_data = (
                 self.wxo_inference_backend.stream_messages(
                     user_input,
@@ -559,16 +608,70 @@ class EvaluationController:
                 )
             if not messages:
                 raise RuntimeError(f"[Task-{task_n}] No messages is produced. Exiting task.")
-            if self.config.enable_verbose_logging:
-                for message in messages:
+
+            for message in messages:
+                if self.repeating_output_detection:
+                    if message.role == Roles.ASSISTANT and message.type == ContentType.text:
+                        self.recent_assistant_messages.append(message.content)
+
+                if self.config.enable_verbose_logging:
                     rich.print(
                         f"[orange3][Task-{task_n}][/orange3] 🤖[bold cyan] WXO:[/bold cyan]",
                         message.content,
                     )
+
             conversation_history.extend(messages)
             conversational_search_history_data.extend(conversational_search_data)
+
             step += 1
         return conversation_history, call_tracker, conversational_search_history_data
+
+    def _is_looping(self, messages: deque) -> bool:
+        """Checks whether the user or assistant is stuck in a loop.
+        Args:
+            messages (deque): Defines the message cache to be assessed for similarity.
+        Returns:
+            bool: True if stuck in a loop, False otherwise.
+        """
+        sim_count = 0
+
+        if len(messages) >= self.MAX_REPEATING_MESSAGES:
+            oldest_cached_message = messages[0]
+            for i, old_message in enumerate(messages):
+                if i == 0:
+                    continue
+                if oldest_cached_message == old_message:
+                    sim_count += 1
+                elif calculate_word_overlap_similarity_score(oldest_cached_message, old_message) > self.MESSAGE_SIMILARITY_THRESHOLD:
+                    sim_count += 1
+
+        return sim_count >= self.MAX_REPEATING_MESSAGES - 1
+
+    def _is_end(self, current_user_input: Message) -> bool:
+        """
+        Check if the user input indicates the end of the conversation.
+
+        - This function checks if the user input contains 'END'.
+        - An END is also triggered when the message cache(s) is filled with messages that are too similar.
+        - Elaborate checking ONLY if EvaluationController.END_IF_MISBEHAVING=True
+        Args:
+            current_user_input (Message): The user message.
+        Returns:
+            bool: True if the user input indicates an END, False otherwise.
+        """
+        current_user_message_content = current_user_input.content.strip()
+
+        # Check if the user message contains 'END'
+        if "END" in current_user_message_content:
+            return True
+
+        if self.repeating_output_detection:
+            # Check for repeating user or assistant messages
+            if (self._is_looping(self.recent_user_messages) or
+                self._is_looping(self.recent_assistant_messages)):
+                return True
+
+        return False # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
 
 def get_wxo_client(
     service_url: str, tenant_name: str, token: str = None
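
Taken together, the controller now caches the last `MAX_REPEATING_MESSAGES` user and assistant turns in bounded deques and ends the run early when the oldest cached turn keeps reappearing, either verbatim or above the similarity threshold. The following is a hedged, self-contained sketch of that termination logic; the default values mirror the environment-variable fallbacks in the diff, the similarity function is a plain word-overlap stand-in, and the sample turns are invented.

```python
from collections import deque

# Hypothetical defaults mirroring the environment-variable fallbacks shown in the diff.
MAX_REPEATING_MESSAGES = 3
MESSAGE_SIMILARITY_THRESHOLD = 0.98

def similarity(a: str, b: str) -> float:
    # Plain word-overlap (Jaccard) stand-in for the package's tokenizer-based helper.
    first, second = set(a.lower().split()), set(b.lower().split())
    union = first | second
    return len(first & second) / len(union) if union else 0.0

def is_looping(messages: deque) -> bool:
    # True when the oldest cached turn keeps reappearing (exactly or near-identically).
    if len(messages) < MAX_REPEATING_MESSAGES:
        return False
    oldest = messages[0]
    repeats = sum(
        1 for later in list(messages)[1:]
        if later == oldest or similarity(oldest, later) > MESSAGE_SIMILARITY_THRESHOLD
    )
    return repeats >= MAX_REPEATING_MESSAGES - 1

recent_user_messages = deque(maxlen=MAX_REPEATING_MESSAGES)
for turn in ["what is my balance?", "what is my balance?", "what is my balance?"]:
    recent_user_messages.append(turn)
print(is_looping(recent_user_messages))  # True -> the run would end before MAX_CONVERSATION_STEPS
```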
@@ -17,7 +17,7 @@ class LLMUser:
         )
 
     def generate_user_input(
-        self, user_story, conversation_history: List[Message]
+        self, user_story, conversation_history: List[Message], attack_instructions: str = None
     ) -> Message | None:
         # the tool response is already summarized, we don't need that to take over the chat history context window
         prompt_input = self.prompt_template.render(
@@ -28,6 +28,7 @@ class LLMUser:
             ],
             user_story=user_story,
             user_response_style=self.user_response_style,
+            attack_instructions=attack_instructions,
         )
         user_input = self.wai_client.query(prompt_input)
         user_input = Message(
@@ -107,6 +107,11 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
 
 def main(config: TestConfig):
     executor = ThreadPoolExecutor(max_workers=config.num_workers)
+    if config.num_workers > 1 and config.enable_manual_user_input:
+        rich.print("[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]")
+        config.enable_manual_user_input = False # disable manual user input for parallel execution
+        # reason: threads continue to stream messages while waiting for user input, which is not desired
+        # and the manual input prompt is not labelled properly in the UI
     wxo_client = get_wxo_client(
         config.auth_config.url, config.auth_config.tenant_name, config.auth_config.token
     )
@@ -1,5 +1,5 @@
 import math
-from typing import List, Mapping, Any
+from typing import List, Mapping, Any, Tuple, Optional
 from enum import Enum
 
 from pydantic import BaseModel, computed_field
@@ -166,3 +166,24 @@ class ToolCallAndRoutingMetrics(BaseModel):
             ),
             2,
         )
+
+class FailedStaticTestCases(BaseModel):
+    metric_name: str
+    description: str
+    explanation: str
+
+class FailedSemanticTestCases(BaseModel):
+    metric_name: str
+    evidence: str
+    explanation: str
+    output: int
+    confidence: float
+
+class ReferenceLessEvalMetrics(BaseModel):
+    dataset_name: str
+    number_of_tool_calls: int
+    number_of_successful_tool_calls: int
+    number_of_static_failed_tool_calls: int
+    number_of_semantic_failed_tool_calls: int
+    failed_static_tool_calls: Optional[List[Tuple[int, List[FailedStaticTestCases]]]]
+    failed_semantic_tool_calls: Optional[List[Tuple[int, List[FailedSemanticTestCases]]]]
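
For readers of the new models: `failed_static_tool_calls` and `failed_semantic_tool_calls` appear to pair a tool-call index with the list of failed checks for that call. A hedged illustration follows, with the model definitions copied from the diff so it runs on its own; the dataset name, metric name, and numbers are made up.

```python
from typing import List, Optional, Tuple
from pydantic import BaseModel

# Model definitions copied from the diff above so this example is self-contained.
class FailedStaticTestCases(BaseModel):
    metric_name: str
    description: str
    explanation: str

class FailedSemanticTestCases(BaseModel):
    metric_name: str
    evidence: str
    explanation: str
    output: int
    confidence: float

class ReferenceLessEvalMetrics(BaseModel):
    dataset_name: str
    number_of_tool_calls: int
    number_of_successful_tool_calls: int
    number_of_static_failed_tool_calls: int
    number_of_semantic_failed_tool_calls: int
    failed_static_tool_calls: Optional[List[Tuple[int, List[FailedStaticTestCases]]]]
    failed_semantic_tool_calls: Optional[List[Tuple[int, List[FailedSemanticTestCases]]]]

# Made-up values: the second tool call (index 1) failed one static check.
metrics = ReferenceLessEvalMetrics(
    dataset_name="sample_dataset",
    number_of_tool_calls=3,
    number_of_successful_tool_calls=2,
    number_of_static_failed_tool_calls=1,
    number_of_semantic_failed_tool_calls=0,
    failed_static_tool_calls=[(1, [FailedStaticTestCases(
        metric_name="required_parameters_present",
        description="Checks that all required parameters are supplied",
        explanation="Parameter 'employee_id' was missing from the call",
    )])],
    failed_semantic_tool_calls=None,
)
print(metrics.model_dump_json(indent=2))
```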
@@ -0,0 +1,178 @@
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+You are an expert at evaluating tool descriptions for AI agents.
+Your task is to analyze how well a tool description serves as documentation for a specific function, helping an AI agent understand when and how to use that tool effectively.
+
+## EVALUATION APPROACH
+
+- You will evaluate the tool description across **five distinct criteria**.
+- Each criterion examines a different aspect of description quality.
+- Consider the function name and parameters as important context, they tell you what the tool actually does, and the description should work together with this information.
+
+## EVALUATION CRITERIA
+
+1. **Uses Vague Language**
+Evaluate whether the description contains unclear, ambiguous, or non-specific terms that make it difficult to understand what the tool does.
+
+Signs of vague language:
+- Generic words that could apply to anything ("tool", "data", "information" without specifics)
+- Unclear abbreviations without context ("comp" instead of "compensation")
+- Ambiguous pronouns ("it", "this", "that" without clear references)
+- Non-specific qualifiers ("some", "various", "different" without elaboration)
+
+**2. Contains Redundant Information**
+Evaluate whether the description repeats the same information multiple times within itself, making it unnecessarily verbose.
+
+Signs of redundancy:
+- Providing the same information in different ways within the description itself
+- Repeating key terms or concepts multiple times unnecessarily
+- Including duplicate/unnecessary details that don't add clarity
+
+**3. Provides No New Information**
+Evaluate whether the description adds meaningful details beyond what you can already understand from the function name and parameters alone.
+In order to assess if a description provides meaningful, new information - ask yourself: If someone only saw the function name and parameter names, would this description teach them anything new about what the tool does, when to use it, or how it works?
+
+**4. Does Not Convey Tool Purpose**
+Evaluate whether the description fails to clearly explain what the tool actually does or accomplishes.
+
+Signs of unclear purpose:
+- Reader would be confused about what happens when they call this function
+- Description doesn't explain the tool's core functionality
+- Unclear what problem this tool solves or what outcome it produces
+
+**5. Does Not Help in Identifying Tool Uniquely**
+Evaluate whether the description is so generic that it could apply to many different tools, making it difficult for an agent to choose the right tool for a specific task.
+
+Signs of non-unique descriptions:
+- Description could accurately describe dozens of different functions
+- Lacks specific details that distinguish this tool from similar ones
+- Doesn't highlight what makes this tool different from alternatives
+
+Here are some example evaluations which isolate the aforementioned criteria for your reference:
+
+## EXAMPLES
+
+**Example 1 - Uses Vague Language:**
+**Tool Name:** get_employee_compensation_details
+**Description:** "Retrieves relevant employee data from the system"
+**Parameters:** employee_id, include_historical
+
+Expected Response:
+```json
+{
+"uses_vague_language": "TRUE",
+"contains_redundant_information": "FALSE",
+"provides_no_new_information": "FALSE",
+"does_not_convey_tool_purpose": "FALSE",
+"does_not_help_in_identifying_tool_uniquely": "FALSE",
+"reason": "Uses vague term 'relevant employee data' which doesn't indicate this tool specifically handles compensation information"
+}
+```
+
+**Example 2 - Contains Redundant Information:**
+**Tool Name:** update_employee_phone
+**Description:** "Updates and modifies the employee's phone number by changing their phone contact information"
+**Parameters:** employee_id, new_phone_number
+
+Expected Response:
+```json
+{
+"uses_vague_language": "FALSE",
+"contains_redundant_information": "TRUE",
+"provides_no_new_information": "FALSE",
+"does_not_convey_tool_purpose": "FALSE",
+"does_not_help_in_identifying_tool_uniquely": "FALSE",
+"reason": "Unnecessarily repeats the concept of updating/changing/modifying phone information multiple times"
+}
+```
+
+**Example 3 - Provides No New Information:**
+**Tool Name:** get_holiday_calendar
+**Description:** "Gets holiday calendar"
+**Parameters:** country_code, year
+
+Expected Response:
+```json
+{
+"uses_vague_language": "FALSE",
+"contains_redundant_information": "FALSE",
+"provides_no_new_information": "TRUE",
+"does_not_convey_tool_purpose": "FALSE",
+"does_not_help_in_identifying_tool_uniquely": "FALSE",
+"reason": "Description exactly mirrors the function name without explaining what the calendar contains or how parameters are used"
+}
+```
+
+**Example 4 - Does Not Convey Tool Purpose:**
+**Tool Name:** initiate_promotion_workflow
+**Description:** "Creates a workflow entry in the system"
+**Parameters:** employee_id, new_position, effective_date, manager_approval
+
+Expected Response:
+```json
+{
+"uses_vague_language": "FALSE",
+"contains_redundant_information": "FALSE",
+"provides_no_new_information": "FALSE",
+"does_not_convey_tool_purpose": "TRUE",
+"does_not_help_in_identifying_tool_uniquely": "FALSE",
+"reason": "Describes the technical implementation (creating a workflow entry) rather than the business purpose or outcome of initiating a promotion process"
+}
+```
+
+**Example 5 - Does Not Help Identify Tool Uniquely:**
+**Tool Name:** get_employee_contact_details
+**Description:** "Retrieves employee information from the HR system"
+**Parameters:** employee_id
+
+Expected Response:
+```json
+{
+"uses_vague_language": "FALSE",
+"contains_redundant_information": "FALSE",
+"provides_no_new_information": "FALSE",
+"does_not_convey_tool_purpose": "FALSE",
+"does_not_help_in_identifying_tool_uniquely": "TRUE",
+"reason": "Generic description could apply to any employee data retrieval function - doesn't specify that it returns contact details"
+}
+```
+
+**Here are some instructions on how you should respond, whenever you are asked to evaluate a tool description:**
+
+## REQUIRED RESPONSE FORMAT
+
+Respond with a single JSON object containing your evaluation with the following keys:
+- uses_vague_language: (TRUE/FALSE) your assessment of whether the description uses vague language.
+- contains_redundant_information: (TRUE/FALSE) your assessment of whether the description contains redundant information.
+- provides_no_new_information: (TRUE/FALSE) your assessment of whether the description provides additional insight not observed in the function name and parameters.
+- does_not_convey_tool_purpose: (TRUE/FALSE) your assessment of whether the description clarifies tool purpose and usage.
+- does_not_help_in_identifying_tool_uniquely: (TRUE/FALSE) your assessment of whether the description will help identify it uniquely amongst other possible tools in an agent's toolkit.
+
+Here is a sample JSON object embedded inside a code block for reference:
+
+```json
+{
+"uses_vague_language": "TRUE",
+"contains_redundant_information": "FALSE",
+"provides_no_new_information": "TRUE",
+"does_not_convey_tool_purpose": "FALSE",
+"does_not_help_in_identifying_tool_uniquely": "TRUE",
+"reason": "Brief explanation of the main issues found"
+}
+```
+
+**IMPORTANT:**
+- Follow the same syntax for your JSON response as you saw in the sample provided within the code block.
+- Use only "TRUE" or "FALSE" for each criterion, except for the "reason" field in the JSON object.
+- Do NOT add any text, comments or explanations outside the JSON object.
+- Do NOT use any markdown formatting.
+- Provide a concise explanation only inside the "reason" field of the JSON object, and nowhere else.
+
+<|eot_id|>
+<|start_header_id|>user<|end_header_id|>
+**Evaluate this tool's description using the aforementioned criteria:**
+
+**Tool Name:** {{ tool_definition.tool_name }}
+**Description:** "{{ tool_definition.tool_description }}"
+**Parameters:** {{ tool_definition.tool_params | join(', ') }}
+<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>
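
The template is rendered against a `tool_definition` object exposing `tool_name`, `tool_description`, and `tool_params` (see the variables in the user turn above). A hedged sketch of filling it in with jinja2 follows; the file path and the sample tool data are assumptions for illustration, and the framework's own `template_render.py` may load and render the template differently.

```python
from types import SimpleNamespace
from jinja2 import Environment, FileSystemLoader

# Path and tool data are assumptions for illustration; the sample tool mirrors
# Example 5 from the prompt itself rather than real evaluation input.
env = Environment(loader=FileSystemLoader("wxo_agentic_evaluation/prompt"))
template = env.get_template("bad_tool_descriptions_prompt.jinja2")

tool_definition = SimpleNamespace(
    tool_name="get_employee_contact_details",
    tool_description="Retrieves employee information from the HR system",
    tool_params=["employee_id"],
)

prompt = template.render(tool_definition=tool_definition)
print(prompt[-400:])  # tail of the prompt: the rendered user turn with name, description, and parameters
```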