ibm-watsonx-orchestrate-evaluation-framework 1.0.2__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info}/METADATA +70 -7
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +56 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +3 -3
- wxo_agentic_evaluation/analytics/tools/ux.py +1 -1
- wxo_agentic_evaluation/analyze_run.py +10 -10
- wxo_agentic_evaluation/arg_configs.py +8 -1
- wxo_agentic_evaluation/batch_annotate.py +3 -9
- wxo_agentic_evaluation/data_annotator.py +50 -36
- wxo_agentic_evaluation/evaluation_package.py +102 -85
- wxo_agentic_evaluation/external_agent/__init__.py +37 -0
- wxo_agentic_evaluation/external_agent/external_validate.py +74 -29
- wxo_agentic_evaluation/external_agent/performance_test.py +66 -0
- wxo_agentic_evaluation/external_agent/types.py +8 -2
- wxo_agentic_evaluation/inference_backend.py +45 -50
- wxo_agentic_evaluation/llm_matching.py +6 -6
- wxo_agentic_evaluation/llm_rag_eval.py +4 -4
- wxo_agentic_evaluation/llm_user.py +3 -3
- wxo_agentic_evaluation/main.py +63 -23
- wxo_agentic_evaluation/metrics/metrics.py +59 -0
- wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2 +23 -0
- wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +2 -0
- wxo_agentic_evaluation/prompt/examples/data_simple.json +1 -2
- wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2 +195 -0
- wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2 +154 -0
- wxo_agentic_evaluation/prompt/template_render.py +17 -0
- wxo_agentic_evaluation/prompt/tool_planner.jinja2 +13 -7
- wxo_agentic_evaluation/record_chat.py +59 -18
- wxo_agentic_evaluation/resource_map.py +47 -0
- wxo_agentic_evaluation/service_provider/__init__.py +35 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +108 -0
- wxo_agentic_evaluation/service_provider/ollama_provider.py +40 -0
- wxo_agentic_evaluation/service_provider/provider.py +19 -0
- wxo_agentic_evaluation/{watsonx_provider.py → service_provider/watsonx_provider.py} +27 -18
- wxo_agentic_evaluation/test_prompt.py +94 -0
- wxo_agentic_evaluation/tool_planner.py +130 -17
- wxo_agentic_evaluation/type.py +0 -57
- wxo_agentic_evaluation/utils/utils.py +6 -54
- ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/RECORD +0 -46
- ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/licenses/LICENSE +0 -22
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/evaluation_package.py

```diff
@@ -9,15 +9,17 @@ from wxo_agentic_evaluation.type import (
     ContentType,
     Message,
     EvaluationData,
-    ToolCallAndRoutingMetrics,
     EventTypes,
     ConversationalSearch,
     ExtendedMessage,
 )
-from wxo_agentic_evaluation.
+from wxo_agentic_evaluation.resource_map import ResourceMap
+from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.metrics.metrics import (
     KnowledgeBaseMetrics,
     KeywordSemanticSearchMetric,
+    ToolCallAndRoutingMetrics,
+    TextMatchType
 )
 from wxo_agentic_evaluation.prompt.template_render import (
     KeywordMatchingTemplateRenderer,
```
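The import changes above replace a direct watsonx client import with the new `wxo_agentic_evaluation.service_provider` package and its `get_provider` factory. Based only on the call patterns visible in this diff (`get_provider(model_id=..., params=...)`, `get_provider(config=ProviderConfig(), params=...)`, and `client.query(prompt)`), a minimal usage sketch looks like the following; the prompt text is invented, and the model and decoding parameters simply mirror values that appear in the changed code.

```python
# Hedged sketch of the new provider factory, as implied by the call sites in this diff.
# The prompt string is illustrative; model_id and params mirror values shown below.
from wxo_agentic_evaluation.service_provider import get_provider

client = get_provider(
    model_id="meta-llama/llama-3-405b-instruct",
    params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 10},
)
answer = client.query("Answer YES or NO: does the response mention a refund?")
print(answer)
```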
wxo_agentic_evaluation/evaluation_package.py (continued)

```diff
@@ -35,6 +37,13 @@ SEMANTIC_MATCHING_PROMPT_PATH = os.path.join(root_dir, "prompt", "semantic_match
 FAITHFULNESS_PROMPT_PATH = os.path.join(root_dir, "prompt", "faithfulness_prompt.jinja2")
 ANSWER_RELEVANCY_PROMPT_PATH = os.path.join(root_dir, "prompt", "answer_relevancy_prompt.jinja2")

+"""
+- hyphens are not allowed in python function names, so it is safe to use as a dummy function name
+- purpose behind `DUMMY_GRAPH_NODE_NAME` is to append
+a dummy node to the ground truth and the labelled messages to take into account
+single, summary step goals.
+"""
+DUMMY_GRAPH_NODE_NAME = "dummy-goal"

 class EvaluationPackage:
     def __init__(
@@ -44,6 +53,7 @@ class EvaluationPackage:
         messages,
         conversational_search_data: List[ConversationalSearch] = None,
         is_analyze_run=False,
+        resource_map: ResourceMap = None,
     ):
         self.tool_dictionary = {
             goal_detail.name: goal_detail
@@ -63,13 +73,9 @@ class EvaluationPackage:
         self.is_analyze_run = is_analyze_run

         self.matcher = LLMMatcher(
-            llm_client=
+            llm_client=get_provider(
                 model_id="meta-llama/llama-3-405b-instruct",
-
-                    "min_new_tokens": 0,
-                    "decoding_method": "greedy",
-                    "max_new_tokens": 10,
-                },
+                params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 10},
             ),
             keyword_template=KeywordMatchingTemplateRenderer(
                 KEYWORD_MATCHING_PROMPT_PATH
@@ -79,23 +85,55 @@ class EvaluationPackage:
             ),
         )
         self.rag_llm_as_a_judge = LLMJudge(
-            llm_client=
-
-
-
-                "decoding_method": "greedy",
-                "max_new_tokens": 4096,
-            },
-            ),
+            llm_client=get_provider(
+                model_id="meta-llama/llama-3-405b-instruct",
+                params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 4096},
+            ),
             faithfulness=FaithfulnessTemplateRenderer(FAITHFULNESS_PROMPT_PATH),
             answer_relevancy=AnswerRelevancyTemplateRenderer(
                 ANSWER_RELEVANCY_PROMPT_PATH
             ),
         )

+        self.resource_map = resource_map
+
+    @staticmethod
+    def find_ground_node(graph, start_node):
+        """ Simple implementation. Should be fixed in the future
+
+        Assumes that there is a single graph node that does not have children
+        """
+
+        stack = [start_node]
+        visited_set = set()
+
+        while stack:
+            node = stack.pop()
+            if node not in visited_set:
+                visited_set.add(node)
+
+                # check for children
+                # improvement for future: add the ground nodes here
+                # right now, just return the first one
+                if not graph.get(node):
+                    return node
+
+                stack.extend(graph[node])
+
+        return None
+
     @staticmethod
     def is_topological_sort(graph, ordering):
         position = {node: i for i, node in enumerate(ordering)}
+        ground_node = EvaluationPackage.find_ground_node(graph, list(graph.keys())[0])
+
+        if ground_node is not None:
+            graph[ground_node] = [DUMMY_GRAPH_NODE_NAME]
+            graph[DUMMY_GRAPH_NODE_NAME] = []
+
+            next_idx = len(position)
+            position[DUMMY_GRAPH_NODE_NAME] = next_idx
+
         for u in graph:
             for v in graph[u]:
                 if u not in position or v not in position:
```
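The module docstring added above explains that `DUMMY_GRAPH_NODE_NAME` exists so that a test case with a single, summary-only goal still yields a checkable ordering. A small worked example (hypothetical goal name, not taken from the test data) shows the effect inside `is_topological_sort`:

```python
# Worked example of the dummy-node handling described above (goal name invented).
DUMMY_GRAPH_NODE_NAME = "dummy-goal"

graph = {"summarize": []}     # a single, summary-only goal: no outgoing edges
ordering = ["summarize"]      # the labelled messages observed for the run

position = {node: i for i, node in enumerate(ordering)}

# find_ground_node returns "summarize" (the only node without children), so the
# dummy node is appended to the graph and to the position map:
graph["summarize"] = [DUMMY_GRAPH_NODE_NAME]
graph[DUMMY_GRAPH_NODE_NAME] = []
position[DUMMY_GRAPH_NODE_NAME] = len(position)

# Every edge u -> v now satisfies position[u] < position[v], so the single-goal
# ordering passes the topological-sort check.
assert all(position[u] < position[v] for u in graph for v in graph[u])
```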
wxo_agentic_evaluation/evaluation_package.py (continued)

```diff
@@ -143,7 +181,7 @@ class EvaluationPackage:
                     f"Goal detail '{goal_detail.name}' does not match any goals: {goals}. test_case_name: {test_case_name}"
                 )
             if goal_detail.name == "summarize":
-                if len(goal_detail.keywords) == 0 and len(goal_detail.response) == 0:
+                if (not goal_detail.keywords or len(goal_detail.keywords) == 0) and (not goal_detail.response or len(goal_detail.response) == 0):
                     rich.print(
                         f"Summarize goal should have keywords or final response. test_case_name: {test_case_name}"
                     )
@@ -178,23 +216,35 @@ class EvaluationPackage:
         labelled_messages_without_text_step = []
         # Counters for tool-calling related metrics
         tool_call_and_routing_metrics = ToolCallAndRoutingMetrics(
-            total_tool_calls=0,
-            expected_tool_calls=0,
-            relevant_tool_calls=0,
-            correct_tool_calls=0,
-            total_routing_calls=0,
-            expected_routing_calls=0,
         )
         tool_call_and_routing_metrics.expected_tool_calls = len(self.tool_dictionary)

         for message in self.messages:
             if message.type == ContentType.tool_call:
-                tool_call_and_routing_metrics.total_tool_calls += 1
-                msg_tool_call = json.loads(message.content)

-
-                if msg_tool_call["name"].
+                msg_tool_call = json.loads(message.content)
+                if self.resource_map and msg_tool_call["name"] in self.resource_map.agent2tools:
                     tool_call_and_routing_metrics.total_routing_calls += 1
+                    relevant = False
+                    for tool in self.resource_map.agent2tools[msg_tool_call["name"]]:
+                        for goal_detail in self.tool_dictionary.values():
+                            if goal_detail.tool_name == tool:
+                                relevant = True
+                                break
+                        if relevant:
+                            break
+
+                    if relevant:
+                        tool_call_and_routing_metrics.relevant_routing_calls += 1
+                    else:
+                        message_outcome = ExtendedMessage(message=message)
+                        message_outcome.reason = {
+                            "reason": "irrelevant routing call",
+                        }
+
+                    continue
+
+                tool_call_and_routing_metrics.total_tool_calls += 1

                 # evaluating more than once is fine
                 # agent could make repeated calls with the same function signature
```
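The new branch above treats a tool call whose name appears in `ResourceMap.agent2tools` as a routing call, and counts it as relevant only when one of that agent's tools matches a `tool_name` in the ground-truth goal details. A self-contained sketch of that check, using an invented agent/tool layout and a `GoalDetail` stand-in reduced to the one field the check reads:

```python
# Hedged sketch of the routing-relevance check above; agent and tool names are invented.
from dataclasses import dataclass

@dataclass
class GoalDetail:           # stand-in for the real goal-detail type; only tool_name is used here
    tool_name: str

agent2tools = {"hr_agent": ["get_timeoff_schedule", "update_address"]}  # ResourceMap.agent2tools
tool_dictionary = {"step_1": GoalDetail(tool_name="get_timeoff_schedule")}

call_name = "hr_agent"                       # msg_tool_call["name"] routed to an agent
is_routing_call = call_name in agent2tools   # True -> total_routing_calls += 1

relevant = any(
    goal_detail.tool_name == tool
    for tool in agent2tools[call_name]
    for goal_detail in tool_dictionary.values()
)
print(is_routing_call, relevant)             # True True -> relevant_routing_calls += 1
```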
wxo_agentic_evaluation/evaluation_package.py (continued)

```diff
@@ -207,57 +257,41 @@ class EvaluationPackage:
                 if len(matching_goal_details) > 0:
                     tool_call_and_routing_metrics.relevant_tool_calls += 1  # tool name matches one of the expected tool names, as defined in the ground truth
                     found = False
-
+                    possible_ground_truth_for_analysis = []
                     for goal_detail in matching_goal_details:
-                        if
-                            is_transfer := msg_tool_call["name"].startswith(
-                                "transfer_to_"
-                            )
-                        ) or msg_tool_call["args"] == goal_detail.args:
+                        if msg_tool_call["args"] == goal_detail.args:
                             labelled_messages.append(goal_detail.name)
                             labelled_messages_without_text_step.append(goal_detail.name)
-
-
-                                1
-                            )
-                            else:
-                                tool_call_and_routing_metrics.correct_tool_calls += 1  # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
+
+                            tool_call_and_routing_metrics.correct_tool_calls += 1  # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
                             found = True
                             message_outcome = ExtendedMessage(message=message)
                             message_outcomes.append(message_outcome)
                             break
                         else:
-
+                            possible_ground_truth_for_analysis.append(goal_detail.args)

                     if not found:
                         message_outcome = ExtendedMessage(message=message)
                         message_outcome.reason = {
                             "reason": "incorrect parameter",
                             "actual": msg_tool_call["args"],
-                            "expected":
+                            "expected": possible_ground_truth_for_analysis,
                         }
                         message_outcomes.append(message_outcome)
                         rich.print(
                             f"[red][ERROR] Wrong parameters for function: {msg_tool_call['name']}. "
                             f"Expected one of {[g.args for g in matching_goal_details]}, Received={msg_tool_call['args']}[/red]"
                         )
-                        labelled_messages.append(
-                            msg_tool_call["name"] + "_WRONG_PARAMETERS"
-                        )
                 else:
-
-
-
-
-
-
-
-
-                        msg_tool_call["name"] + "_WRONG_FUNCTION_CALL"
-                    )
-                    message_outcome = ExtendedMessage(message=message)
-                    message_outcome.reason = {"reason": "irrelevant tool call"}
-                    message_outcomes.append(message_outcome)
+
+                    rich.print(
+                        f"[yellow][WARNING] Unexpected function call: {msg_tool_call['name']}[/yellow]"
+                    )
+                    # note: this is incorrect after the 1.6 change
+                    message_outcome = ExtendedMessage(message=message)
+                    message_outcome.reason = {"reason": "irrelevant tool call"}
+                    message_outcomes.append(message_outcome)

             elif message.type == ContentType.tool_response:
                 found = False
@@ -272,7 +306,6 @@ class EvaluationPackage:
                     message_outcome = ExtendedMessage(message=message)
                     message_outcomes.append(message_outcome)
             else:
-
                 message_outcome = ExtendedMessage(message=message)
                 message_outcomes.append(message_outcome)
         assistant_responses = [
@@ -318,15 +351,16 @@ class EvaluationPackage:
     ):

         if len(self.text_list) == 0:
-            return
+            return TextMatchType.na.value
         elif len(self.text_list) == len(keyword_semantic_match_list):
-            return
+            return TextMatchType.text_match.value
         else:
-            return
+            return TextMatchType.text_mismatch.value

     def generate_summary(self):
         llm_steps = 0
         total_step = 0
+        metrics: ToolCallAndRoutingMetrics
         (
             labelled_messages,
             labelled_messages_without_text_step,
@@ -336,9 +370,7 @@ class EvaluationPackage:
         ) = self.traverse()
         if self.is_analyze_run:
             print(labelled_messages)
-
-            1 for msg in labelled_messages if "_WRONG_FUNCTION_CALL" in msg
-        )
+
         is_success = self.is_topological_sort(
             self.ground_truth.goals, labelled_messages
         )
@@ -359,28 +391,13 @@ class EvaluationPackage:
         knowledge_base_metric_summary = self.generate_knowledge_base_metric_summary()
         # TO-DO: the table is not printing properly anymore with the new columns introduced
         # we need to introduce a separate table for these.
-
-
-
-
-
-            "Wrong Function Calls": wrong_call_count,
-            # "Bad Calls": 0,
-            "Wrong Parameters": sum(
-                1 for msg in labelled_messages if "_WRONG_PARAMETERS" in msg
-            ),
-            "Wrong Routing Calls": sum(
-                1 for msg in labelled_messages if "_WRONG_ROUTING_CALL" in msg
-            ),
-            "Text Match": match,
-            "Journey Success": is_success,
-            # "Tool Call Accuracy": metrics.tool_call_accuracy,
-            # "Tool Call Relevancy": metrics.tool_call_relevancy,
-            # "Agent Routing Accuracy": metrics.agent_routing_accuracy
-        }
+
+        metrics.total_steps = total_step
+        metrics.llm_step = llm_steps
+        metrics.text_match = match
+        metrics.is_success = is_success

         return (
-            data,
             matches,
             knowledge_base_metric_summary,
             message_with_reasons,
@@ -512,7 +529,7 @@ if __name__ == "__main__":
     evaluate_package = EvaluationPackage(
         test_case_name="data1.messages.json",
         ground_truth=ground_truth,
-        messages=messages
+        messages=messages
     )
     print(evaluate_package.generate_summary())
     # print(evaluate_package.traverse())
```
wxo_agentic_evaluation/external_agent/__init__.py (new file)

```diff
@@ -0,0 +1,37 @@
+import importlib.resources
+import json
+import rich
+
+from wxo_agentic_evaluation.prompt.template_render import StoryGenerationTemplateRenderer
+from wxo_agentic_evaluation.service_provider import get_provider, ProviderConfig
+from wxo_agentic_evaluation import prompt
+
+console = rich.console.Console()
+
+def starting_sentence_generation_prompt():
+    with importlib.resources.path(prompt, "starting_sentence_generation_prompt.jinja2") as fp:
+        # reuse the StoryGenerationTemplateRenderer class, even though we are generating a "starting_sentence" instead of a "story"
+        # the starting sentence generation prompts uses the same input variable
+        render = StoryGenerationTemplateRenderer(str(fp))
+
+    return render
+
+def generate_starting_sentence(annotated_data: dict):
+    renderer = starting_sentence_generation_prompt()
+    llm_decode_parameter = {
+        "min_new_tokens": 0,
+        "decoding_method": "greedy",
+        "max_new_tokens": 4096,
+    }
+    wai_client = get_provider(config=ProviderConfig(), params=llm_decode_parameter)
+    prompt = renderer.render(input_data=json.dumps(annotated_data, indent=4))
+    res = wai_client.query(prompt)
+    res = res.strip()
+
+    try:
+        # ideally the LLM outputted a dictionary like: {"starting_sentence": "lorem ipsum"}
+        res = json.loads(res)
+        return res["starting_sentence"]
+    except Exception:
+        console.log(f"The generated `starting_sentence` had incorrect format: '{res}'")
+        return res
```
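`generate_starting_sentence` renders the starting-sentence prompt from an annotated test case and expects the model to answer with a JSON object carrying a `starting_sentence` key, falling back to the raw model output otherwise. A hedged usage sketch follows; the annotated case below is invented and only mirrors the goal-template shape used elsewhere in this diff.

```python
# Hedged usage sketch for generate_starting_sentence; the annotated case is invented.
from wxo_agentic_evaluation.external_agent import generate_starting_sentence

annotated_case = {
    "agent": "hr_agent",
    "goals": {"summarize": []},
    "goal_details": [
        {"name": "summarize", "type": "text",
         "response": "You have 12 vacation days left.", "keywords": ["12", "vacation"]}
    ],
    "story": "An employee wants to check their remaining vacation days.",
}

sentence = generate_starting_sentence(annotated_case)
print(sentence)  # on a JSON parse failure the raw model output is returned instead
```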
wxo_agentic_evaluation/external_agent/external_validate.py

```diff
@@ -1,12 +1,16 @@
-from wxo_agentic_evaluation.external_agent.types import UniversalData
-import requests
 from typing import Generator
+import requests
 import json
+import rich
+
+from wxo_agentic_evaluation.external_agent.types import UniversalData, SchemaValidationResults


-MESSAGES = [
-
-
+MESSAGES = [
+    {"role": "user", "content": "what's the holiday is June 13th in us?"},
+    {"role": "assistant", "content": "tool_name: calendar_lookup, args {\"location\": \"USA\", \"data\": \"06-13-2025\"}}"},
+    {"role": "assistant", "content":"it's National Sewing Machine Day"}
+]


 class ExternalAgentValidation:
@@ -14,20 +18,20 @@ class ExternalAgentValidation:
         self.credential = credential
         self.auth_scheme = auth_scheme
         self.service_url = service_url
-
-
+
+    @property
+    def header(self):
+        header = {"Content-Type": "application/json"}
         if self.auth_scheme == "API_KEY":
-            header = {"
-
+            header = {"X-API-Key": self.credential}
         elif self.auth_scheme == "BEARER_TOKEN":
             header = {"Authorization": f"Bearer {self.credential}"}
-
         else:
             raise Exception(f"Auth scheme: {self.auth_scheme} is not supported")

         return header

-    def
+    def _parse_streaming_events(self, resp: Generator[bytes, None, None]):
         data = b''
         for chunk in resp:
             for line in chunk.splitlines(True):
@@ -37,31 +41,72 @@ class ExternalAgentValidation:
                     return
                 data += line
                 if data.endswith((b'\r\r', b'\n\n', b'\r\n\r\n')):
+                    # NOTE: edge case, "data" can be sent in two different chunks
+                    if data.startswith(b'data:'):
+                        data = data.replace(b'data:', b'')
                     yield data
                     data = b''
         if data:
             yield data
-
-    def
-
-
-
-        new_messages = []
-        new_messages.extend(MESSAGES)
-        new_messages.append({"role": "user", "content": input})
-
-        payload = {"messages": new_messages}
-
-        resp = requests.post(url=self.service_url, headers=header, json=payload, stream=True)
-        results = []
-        for json_str in self._parse_streaming_evenst(resp):
+
+    def _validate_streaming_response(self, resp):
+        success = True
+        logged_events = []
+        for json_str in self._parse_streaming_events(resp):
             json_dict = None
+            logged_events.append(json_str)
             try:
                 json_dict = json.loads(json_str)
                 UniversalData(**json_dict)
-                results.append(json_dict)
             except Exception as e:
-
-
+                success = False
+                break
+
+        return success, logged_events
+
+    def _validate_schema_compliance(self, messages):
+        payload = {"stream": True}
+        payload["messages"] = messages
+        resp = requests.post(url=self.service_url, headers=self.header, json=payload)
+        success, logged_events = self._validate_streaming_response(resp)
+
+        msg = ", ".join([msg["content"] for msg in payload["messages"]])
+
+        if success:
+            rich.print(f":white_check_mark: External Agent streaming response validation succeeded for '{msg}'.")
+        else:
+            rich.print(f":heavy_exclamation_mark:Schema validation failed for messages: '{msg}':heavy_exclamation_mark:\n The last logged event was {logged_events[-1]}.\n")
+
+        return success, logged_events
+
+    def call_validation(self, input_str: str, add_context: bool = False) -> SchemaValidationResults:
+        if add_context:
+            return self.block_validation(input_str)
+
+        msg = {
+            "role": "user",
+            "content": input_str
+        }
+
+        success, logged_events = self._validate_schema_compliance([msg])
+        results = SchemaValidationResults(success=success, logged_events=logged_events, messages=[msg])
+
+        return results.model_dump()
+
+    def block_validation(self, input_str: str) -> SchemaValidationResults:
+        """ Tests a block of messages
+        """
+        rich.print(
+            f"[gold3]The following prebuilt messages, '{MESSAGES}' is prepended to the input message, '{input_str}'"
+        )
+
+        msg = {
+            "role": "user",
+            "content": input_str
+        }
+
+        messages = MESSAGES + [msg]
+        success, logged_events = self._validate_schema_compliance(messages)
+        results = SchemaValidationResults(success=success, logged_events=logged_events, messages=messages)

-        return results
+        return results.model_dump()
```
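The validator can be exercised directly. Its constructor signature is not shown in this hunk, so the keyword arguments below are assumed from the attributes it sets (`credential`, `auth_scheme`, `service_url`), and the endpoint and key are invented.

```python
# Hedged usage sketch; argument names are assumed from the attributes set in __init__,
# and the credential/URL values are invented.
validator = ExternalAgentValidation(
    credential="my-api-key",
    auth_scheme="API_KEY",                          # or "BEARER_TOKEN"
    service_url="https://example.com/external-agent/chat",
)

# Validate a single user message against the UniversalData streaming schema...
result = validator.call_validation("What is my leave balance?")

# ...or prepend the prebuilt MESSAGES context before validating.
result_with_context = validator.call_validation("What is my leave balance?", add_context=True)

print(result["success"], len(result["logged_events"]))
```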
wxo_agentic_evaluation/external_agent/performance_test.py (new file)

```diff
@@ -0,0 +1,66 @@
+from typing import List, Mapping, Any
+from rich.console import Console
+
+from wxo_agentic_evaluation.external_agent import generate_starting_sentence
+from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
+from wxo_agentic_evaluation.service_provider import get_provider, ProviderConfig
+from wxo_agentic_evaluation.data_annotator import KeywordsGenerationLLM, LlamaKeywordsGenerationTemplateRenderer
+
+class ExternalAgentPerformanceTest:
+    def __init__(self, agent_name: str, test_data: List[str]):
+        self.test_data = test_data
+        self.goal_template = {
+            "agent": agent_name,
+            "goals": {"summarize": []},
+            "goal_details": [
+            ],
+            "story": "<placeholder>",
+        }
+
+        kw_gen_config = KeywordsGenerationConfig()
+
+        provider_config = ProviderConfig(model_id=kw_gen_config.model_id)
+        llm_decode_parameter = {
+            "min_new_tokens": 0,
+            "decoding_method": "greedy",
+            "max_new_tokens": 256,
+        }
+        wai_client = get_provider(config=provider_config, params=llm_decode_parameter)
+
+        self.kw_gen = KeywordsGenerationLLM(
+            provider=wai_client,
+            template=LlamaKeywordsGenerationTemplateRenderer(
+                kw_gen_config.prompt_config
+            ),
+        )
+
+    def generate_tests(self) -> List[Mapping[str, Any]]:
+        console = Console()
+        goal_templates = []
+
+        with console.status("[gold3]Creating starting sentence for user story from input file for performance testing") as status:
+            for sentence, response in self.test_data:
+                goal_temp = self.goal_template.copy()
+                goal_temp["story"] = sentence
+
+                keywords = self.kw_gen.genereate_keywords(response)
+                summarize_step = {
+                    "name": "summarize",
+                    "type": "text",
+                    "response": response,
+                    "keywords": keywords
+                }
+                goal_temp["goal_details"] = [summarize_step]
+                goal_temp["starting_sentence"] = generate_starting_sentence(goal_temp)
+
+                goal_templates.append(goal_temp)
+
+            status.stop()
+        console.print("[bold green]Done creating starting sentence from provided input data")
+
+        return goal_templates
+
+
+if __name__ == "__main__":
+    t = ExternalAgentPerformanceTest("test")
+    t.generate_tests()
```
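`generate_tests` iterates `test_data` as `(sentence, response)` pairs, so a pair list is assumed in the sketch below even though the annotation says `List[str]` and the `__main__` block above passes only an agent name; the data itself is invented.

```python
# Hedged usage sketch; test_data is iterated as (sentence, response) pairs in generate_tests,
# so tuples are assumed here. The agent name and conversation are invented.
test_data = [
    ("An employee asks about national holidays.", "June 13th is National Sewing Machine Day."),
]

perf_test = ExternalAgentPerformanceTest("external_hr_agent", test_data)
test_cases = perf_test.generate_tests()   # one goal template per (sentence, response) pair
print(test_cases[0]["starting_sentence"])
```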
wxo_agentic_evaluation/external_agent/types.py

```diff
@@ -1,5 +1,5 @@
 from pydantic import BaseModel
-from typing import List, Union, Literal
+from typing import List, Union, Literal, Mapping, Any


 class ThinkingStepDetails(BaseModel):
@@ -62,4 +62,10 @@ class UniversalData(BaseEventData):
     object: Union[Literal["thread.message.delta"], Literal["thread.run.step.delta"],
                   Literal["thread.run.step.created"], Literal["thread.run.step.completed"]]
     choices: List[ThreadMessageDeltaChoice]
-    choices: List[Union[ThreadMessageDeltaChoice, dict]]
+    choices: List[Union[ThreadMessageDeltaChoice, dict]]
+
+
+class SchemaValidationResults(BaseModel):
+    success: bool
+    logged_events: List[str]
+    messages: List[Mapping[Any, Any]]
```