ibm-watsonx-orchestrate-evaluation-framework 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info}/METADATA +70 -7
- ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info/RECORD +56 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +3 -3
- wxo_agentic_evaluation/analytics/tools/ux.py +1 -1
- wxo_agentic_evaluation/analyze_run.py +10 -10
- wxo_agentic_evaluation/arg_configs.py +8 -1
- wxo_agentic_evaluation/batch_annotate.py +3 -9
- wxo_agentic_evaluation/data_annotator.py +50 -36
- wxo_agentic_evaluation/evaluation_package.py +102 -85
- wxo_agentic_evaluation/external_agent/__init__.py +37 -0
- wxo_agentic_evaluation/external_agent/external_validate.py +74 -29
- wxo_agentic_evaluation/external_agent/performance_test.py +66 -0
- wxo_agentic_evaluation/external_agent/types.py +8 -2
- wxo_agentic_evaluation/inference_backend.py +45 -50
- wxo_agentic_evaluation/llm_matching.py +6 -6
- wxo_agentic_evaluation/llm_rag_eval.py +4 -4
- wxo_agentic_evaluation/llm_user.py +3 -3
- wxo_agentic_evaluation/main.py +63 -23
- wxo_agentic_evaluation/metrics/metrics.py +59 -0
- wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2 +23 -0
- wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +2 -0
- wxo_agentic_evaluation/prompt/examples/data_simple.json +1 -2
- wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2 +195 -0
- wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2 +154 -0
- wxo_agentic_evaluation/prompt/template_render.py +17 -0
- wxo_agentic_evaluation/prompt/tool_planner.jinja2 +13 -7
- wxo_agentic_evaluation/record_chat.py +74 -26
- wxo_agentic_evaluation/resource_map.py +47 -0
- wxo_agentic_evaluation/service_provider/__init__.py +35 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +108 -0
- wxo_agentic_evaluation/service_provider/ollama_provider.py +40 -0
- wxo_agentic_evaluation/service_provider/provider.py +19 -0
- wxo_agentic_evaluation/{watsonx_provider.py → service_provider/watsonx_provider.py} +27 -18
- wxo_agentic_evaluation/test_prompt.py +94 -0
- wxo_agentic_evaluation/tool_planner.py +130 -17
- wxo_agentic_evaluation/type.py +0 -57
- wxo_agentic_evaluation/utils/utils.py +6 -54
- ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/RECORD +0 -46
- ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/licenses/LICENSE +0 -22
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/inference_backend.py CHANGED

@@ -17,10 +17,8 @@ from wxo_agentic_evaluation.type import (
     ConversationalSearchResults,
     ConversationSearchMetadata,
 )
-
 from wxo_agentic_evaluation.llm_user import LLMUser
-from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
-from wxo_agentic_evaluation.prompt.template_render import LlamaUserTemplateRenderer
+from wxo_agentic_evaluation.service_provider.watsonx_provider import WatsonXProvider
 from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.service_instance import tenant_setup
 from wxo_agentic_evaluation.utils.utils import is_saas_url
@@ -33,8 +31,9 @@ def is_end(user_input: Message):
 
 
 def is_transfer_response(step_detail: Dict):
-
-
+    # this is not very reliable
+    if step_detail["type"] == "tool_response" and step_detail["name"].endswith(
+        "_agent"
     ):
         return True
     return False
@@ -80,9 +79,11 @@ class WXOInferenceBackend:
         payload["thread_id"] = thread_id
 
         if self.enable_saas_mode:
+            # TO-DO: this is not validated after the v1 prefix change
+            # need additional validation
             path = "/v1/orchestrate/runs"
         else:
-            path = "/orchestrate/runs"
+            path = "v1/orchestrate/runs"
 
         response: requests.Response = self.wxo_client.post(payload, path)
 
@@ -101,9 +102,11 @@ class WXOInferenceBackend:
         payload["thread_id"] = thread_id
 
         if self.enable_saas_mode:
+            # TO-DO: this is not validated after the v1 prefix change
+            # need additional validation
             path = "/v1/orchestrate/runs?stream=true"
         else:
-            path = "/orchestrate/runs?stream=true"
+            path = "v1/orchestrate/runs?stream=true"
 
         response: requests.Response = self.wxo_client.post(payload, path, stream=True)
         import json
@@ -381,7 +384,7 @@ class WXOInferenceBackend:
         if self.enable_saas_mode:
             path = f"v1/orchestrate/threads/{thread_id}/messages"
         else:
-            path = f"threads/{thread_id}/messages"
+            path = f"v1/threads/{thread_id}/messages"
         response = self.wxo_client.get(path)
         if response.status_code == 200:
             result = response.json()
@@ -462,7 +465,7 @@ class WXOInferenceBackend:
         if self.enable_saas_mode:
             path = "v1/orchestrate/agents"
         else:
-            path = "orchestrate/agents"
+            path = "v1/orchestrate/agents"
 
         response = self.wxo_client.get(path)
 
@@ -477,6 +480,28 @@
         else:
             response.raise_for_status()
 
+    def get_agent_name_from_thread_id(self, thread_id: str) -> str:
+        if self.enable_saas_mode:
+            thread_path = f"v1/orchestrate/threads/{thread_id}"
+            agents_path = "v1/orchestrate/agents"
+        else:
+            thread_path = f"v1/threads/{thread_id}"
+            agents_path = "v1/orchestrate/agents"
+
+        thread_response = self.wxo_client.get(thread_path)
+        thread_response.raise_for_status()
+        thread_data = thread_response.json()
+        agent_id = thread_data.get("agent_id", "")
+
+        agents_response = self.wxo_client.get(agents_path)
+        agents_response.raise_for_status()
+        agents_list = agents_response.json()
+        for agent in agents_list:
+            if agent.get("id", "") == agent_id:
+                return agent.get("name")
+
+        return None
+
 
 class EvaluationController:
     def __init__(
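
A minimal usage sketch for the new helper. It mirrors the client construction in this module's __main__ block; the service URL, API key, and thread id are placeholders, and the import assumes WXOClient and WXOInferenceBackend are reachable from this module as they are used there.

from wxo_agentic_evaluation.inference_backend import WXOClient, WXOInferenceBackend

# Placeholders, not values from the package.
wxo_client = WXOClient(service_url="http://localhost:4321", api_key="<token>")
backend = WXOInferenceBackend(wxo_client=wxo_client)

# Reads the thread to get its agent_id, then scans the agents listing for a
# matching id; returns the agent's name, or None when nothing matches.
agent_name = backend.get_agent_name_from_thread_id("<thread-id>")
print(agent_name)
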
@@ -532,6 +557,8 @@ class EvaluationController:
                 call_tracker=call_tracker,
             )
         )
+        if not messages:
+            raise RuntimeError(f"[Task-{task_n}] No messages is produced. Exiting task.")
         if self.config.enable_verbose_logging:
             for message in messages:
                 rich.print(
@@ -543,31 +570,17 @@
             step += 1
         return conversation_history, call_tracker, conversational_search_history_data
 
-
-def get_wxo_client(service_url: str, token: str):
-    wxo_client = WXOClient(service_url=service_url, api_key=token)
-    return wxo_client
-
-
-def get_wxo_inference_backend(
+def get_wxo_client(
     service_url: str, tenant_name: str, token: str = None
-) ->
+) -> WXOClient:
     if not token:
         token = tenant_setup(service_url, tenant_name)
-    wxo_client =
-
-    return inference_backend
+    wxo_client = WXOClient(service_url=service_url, api_key=token)
+    return wxo_client
 
 
 if __name__ == "__main__":
     wai_client = WatsonXProvider(model_id="meta-llama/llama-3-3-70b-instruct")
-    llm_user = LLMUser(
-        wai_client=wai_client,
-        template=LlamaUserTemplateRenderer(
-            "src/wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2"
-        ),
-        user_response_style=None,
-    )
     auth_config_path = f"{os.path.expanduser('~')}/.cache/orchestrate/credentials.yaml"
     with open(auth_config_path, "r") as f:
         auth_config = yaml.safe_load(f)
@@ -576,26 +589,8 @@
 
     wxo_client = WXOClient(service_url="http://localhost:4321", api_key=token)
     inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
-
-
-
-
-
-    )
-    evaluation_controller = EvaluationController(
-        wxo_inference_backend=inference_backend, llm_user=llm_user, config=config
-    )
-    history, _, _ = evaluation_controller.run(
-        0,
-        "Your username is nken and you want to find out the timeoff schedule of your reports from 20250101 o 202505t",
-        agent_name="hr_agent",
-    )
-    # starting_user_input="my username is nken, i want to know the timeoff schedule for my reports from 20250101 to 202505")
-
-    result = list()
-    for message in history:
-        result.append(message.model_dump())
-
-    os.makedirs("./wxo_agentic_evaluation/results", exist_ok=True)
-    with open("./wxo_agentic_evaluation/results/messages.json", "w") as f:
-        json.dump(result, f)
+    resp = wxo_client.get("orchestrate/agents")
+    resp = resp.json()
+    print(resp[0])
+    for agent in resp:
+        print(agent["name"], agent["display_name"])
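
The refactor above removes get_wxo_inference_backend and folds token acquisition into get_wxo_client, leaving backend construction to the caller (main.py adopts the same pattern further down). A short sketch of the new call sequence; the service URL and tenant name are placeholders:

from wxo_agentic_evaluation.inference_backend import (
    WXOInferenceBackend,
    get_wxo_client,
)

# When no token is passed, get_wxo_client resolves one via tenant_setup()
# and wraps it in a WXOClient.
wxo_client = get_wxo_client("http://localhost:4321", "my-tenant")
inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
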
wxo_agentic_evaluation/llm_matching.py CHANGED

@@ -1,4 +1,4 @@
-from wxo_agentic_evaluation.watsonx_provider import
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
 from wxo_agentic_evaluation.prompt.template_render import (
     KeywordMatchingTemplateRenderer,
     SemanticMatchingTemplateRenderer,
@@ -9,7 +9,7 @@ from typing import List
 class LLMMatcher:
     def __init__(
         self,
-        llm_client:
+        llm_client: Provider,
         keyword_template: KeywordMatchingTemplateRenderer,
         semantic_template: SemanticMatchingTemplateRenderer,
     ):
@@ -26,14 +26,14 @@
         prompt = self.keyword_template.render(
             keywords_text=keywords_text, response_text=response_text
         )
-        output = self.llm_client.query(prompt)
-        result = output
+        output:str = self.llm_client.query(prompt)
+        result = output.strip().lower()
         return result.startswith("true")
 
     def semantic_match(self, prediction: str, ground_truth: str) -> bool:
         prompt = self.semantic_template.render(
             expected_text=ground_truth, actual_text=prediction
         )
-        output = self.llm_client.query(prompt)
-        result = output
+        output: str = self.llm_client.query(prompt)
+        result = output.strip().lower()
         return result.startswith("true")
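
The added .strip().lower() normalization matters because both matchers decide with result.startswith("true"). A small illustration with a made-up model completion:

raw_output = "  True. Every keyword appears in the response.\n"

# Without normalization the check fails on case and leading whitespace.
print(raw_output.startswith("true"))                  # False

# With the normalization now applied in LLMMatcher it passes.
print(raw_output.strip().lower().startswith("true"))  # True
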
wxo_agentic_evaluation/llm_rag_eval.py CHANGED

@@ -1,7 +1,7 @@
 from typing import List
 import json
 
-from wxo_agentic_evaluation.watsonx_provider import
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
 from wxo_agentic_evaluation.prompt.template_render import (
     FaithfulnessTemplateRenderer,
     AnswerRelevancyTemplateRenderer,
@@ -12,7 +12,7 @@ from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness, AnswerRele
 class LLMJudge:
     def __init__(
         self,
-        llm_client:
+        llm_client: Provider,
         faithfulness: FaithfulnessTemplateRenderer,
         answer_relevancy: AnswerRelevancyTemplateRenderer,
     ):
@@ -27,7 +27,7 @@
             claim=claim, retrieval_context=retrieval_context
         )
         output = self.llm_client.query(prompt)
-        result = output
+        result = output.strip().lower()
 
         faithfulness = Faithfulness.model_validate(json.loads(result))
 
@@ -40,7 +40,7 @@
             question=question, context=context, answer=answer
         )
         output = self.llm_client.query(prompt)
-        result = output
+        result = output.strip().lower()
 
         answer_relevancy = AnswerRelevancy(answer_relevancy=json.loads(result))
 
wxo_agentic_evaluation/llm_user.py CHANGED

@@ -1,6 +1,6 @@
 from typing import List, TypeVar
 from wxo_agentic_evaluation.type import Message, ContentType
-from wxo_agentic_evaluation.watsonx_provider import
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
 from wxo_agentic_evaluation.prompt.template_render import JinjaTemplateRenderer
 
 T = TypeVar("T", bound=JinjaTemplateRenderer)
@@ -8,7 +8,7 @@ T = TypeVar("T", bound=JinjaTemplateRenderer)
 
 class LLMUser:
     def __init__(
-        self, wai_client:
+        self, wai_client: Provider, template: T, user_response_style: List[str]
     ):
         self.wai_client = wai_client
         self.prompt_template = template
@@ -32,7 +32,7 @@
         user_input = self.wai_client.query(prompt_input)
         user_input = Message(
             role="user",
-            content=user_input
+            content=user_input.strip(),
             type=ContentType.text,
         )
         return user_input
wxo_agentic_evaluation/main.py CHANGED

@@ -1,24 +1,27 @@
-from wxo_agentic_evaluation.
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.resource_map import ResourceMap
 from wxo_agentic_evaluation.llm_user import LLMUser
 from wxo_agentic_evaluation.prompt.template_render import LlamaUserTemplateRenderer
 from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
-
+    get_wxo_client,
+    WXOInferenceBackend
 )
+from typing import List
 from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
 from wxo_agentic_evaluation.type import EvaluationData
 
 from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.utils.utils import (
     create_table,
-    create_average_row,
     SummaryPanel,
+    safe_divide
 )
 from wxo_agentic_evaluation.utils import json_dump
-from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary
+from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary, ToolCallAndRoutingMetrics, TextMatchType
 import os
 import json
-
+import traceback
 import yaml
 import dataclasses
 import glob
@@ -30,7 +33,7 @@ from concurrent.futures import ThreadPoolExecutor
 from jsonargparse import CLI
 
 
-def process_test_case(task_n, test_case, config, inference_backend, llm_user):
+def process_test_case(task_n, test_case, config, inference_backend, resource_map, llm_user):
     summary_results_for_path = []
     tc_name = os.path.basename(test_case).replace(".json", "")
     with open(test_case, "r") as f:
@@ -70,9 +73,9 @@ def process_test_case(task_n, test_case, config, inference_backend, llm_user):
         messages=history,
         ground_truth=test_case,
         conversational_search_data=conversational_search_data,
+        resource_map=resource_map
     )
     (
-        tool_call_metrics,
         keyword_semantic_matches,
         knowledge_base_metrics,
         messages_with_reason,
@@ -91,27 +94,26 @@
         metrics.model_dump(),
     )
 
-
+    metrics.dataset_name = tc_name
+    metrics.avg_resp_time = (
         sum(call_tracker.generic) + sum(call_tracker.tool_call)
     ) / (len(call_tracker.generic) + len(call_tracker.tool_call))
-
-        tool_call_metrics["Avg Resp Time (Secs)"], 2
-    )
+    metrics.avg_resp_time = round(metrics.avg_resp_time, 2)
 
-    summary_results_for_path.append((
+    summary_results_for_path.append((metrics, knowledge_base_metrics))
 
     return summary_results_for_path
 
 
 def main(config: TestConfig):
     executor = ThreadPoolExecutor(max_workers=config.num_workers)
-
-    inference_backend = get_wxo_inference_backend(
+    wxo_client = get_wxo_client(
         config.auth_config.url, config.auth_config.tenant_name, config.auth_config.token
     )
-
+    resource_map = ResourceMap(wxo_client)
+    inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
     llm_user = LLMUser(
-        wai_client=
+        wai_client=get_provider(config=config.provider_config, model_id=config.llm_user_config.model_id),
         template=LlamaUserTemplateRenderer(config.llm_user_config.prompt_config),
         user_response_style=config.llm_user_config.user_response_style,
    )
@@ -163,6 +165,7 @@
                 test_case,
                 config,
                 inference_backend,
+                resource_map,
                 llm_user,
             )
 
@@ -179,6 +182,7 @@
                 results_list.extend(future.result())
             except Exception as e:
                 rich.print(f"test case {test_case} fails with {e}")
+                traceback.print_exc()
             finally:
                 progress.update(task1, advance=1)
 
@@ -199,17 +203,53 @@
     if len(tool_call_metrics) > 0:
         # remove the average row if exist
         tool_call_metrics = [
-            row for row in tool_call_metrics if row
+            row for row in tool_call_metrics if row.dataset_name != "Summary (Average)"
         ]
-
-
-
-
+
+    def filter_display_only_values(tool_call_metric: ToolCallAndRoutingMetrics):
+        row = {"Dataset": tool_call_metric.dataset_name, "Total Steps": tool_call_metric.total_steps,
+               "LLM Steps": tool_call_metric.llm_step, "Total Tool Calls":tool_call_metric.total_tool_calls, "Tool Call Precision": tool_call_metric.tool_call_precision, "Tool Call Recall": tool_call_metric.tool_call_recall,
+               "Agent Routing Accuracy": tool_call_metric.agent_routing_accuracy, "Text Match": tool_call_metric.text_match, "Journey Success": tool_call_metric.is_success, "Avg Resp Time (sec)": tool_call_metric.avg_resp_time}
+        return row
+
+    def create_avg_row(metrics: List[dict]):
+        avg_row = {"Dataset": "Summary (Average)", "Total Steps": 0,
+                   "LLM Steps": 0, "Total Tool Calls":0, "Tool Call Precision": 0, "Tool Call Recall": 0, "Agent Routing Accuracy": 0,
+                   "Text Match": 0, "Journey Success": 0, "Avg Resp Time (sec)": 0}
+        if metrics:
+            for row in metrics:
+                avg_row["Total Steps"] += row["Total Steps"]
+                avg_row["LLM Steps"] += row["LLM Steps"]
+                avg_row["Total Tool Calls"] += row["Total Tool Calls"]
+                avg_row["Tool Call Precision"] += row["Tool Call Precision"]
+                avg_row["Tool Call Recall"] += row["Tool Call Recall"]
+                avg_row["Agent Routing Accuracy"] += row["Agent Routing Accuracy"]
+                avg_row["Text Match"] += row["Text Match"] == TextMatchType.text_match.value
+                avg_row["Journey Success"] += row["Journey Success"]
+                avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]
+
+            avg_row["Total Steps"] = round(safe_divide(avg_row["Total Steps"], len(metrics)), 2)
+            avg_row["LLM Steps"] = round(safe_divide(avg_row["LLM Steps"], len(metrics)), 2)
+            avg_row["Total Tool Calls"] = round(safe_divide(avg_row["Total Tool Calls"], len(metrics)), 2)
+            avg_row["Tool Call Precision"] = round(safe_divide(avg_row["Tool Call Precision"], len(metrics)), 2)
+            avg_row["Tool Call Recall"] = round(safe_divide(avg_row["Tool Call Recall"], len(metrics)), 2)
+            avg_row["Agent Routing Accuracy"] = round(safe_divide(avg_row["Agent Routing Accuracy"], len(metrics)), 2)
+            avg_row["Text Match"] = round(safe_divide(avg_row["Text Match"], len([row for row in metrics if row["Text Match"] != TextMatchType.text_match.na])), 2)
+            avg_row["Journey Success"] = round(safe_divide(avg_row["Journey Success"], len(metrics)), 2)
+            avg_row["Avg Resp Time (sec)"] = round(safe_divide(avg_row["Avg Resp Time (sec)"], len(metrics)), 2)
+        return avg_row
+
+    tool_call_metrics_for_display = []
+    for row in tool_call_metrics:
+        tool_call_metrics_for_display.append(filter_display_only_values(row))
+    tool_call_metrics_for_display.append(create_avg_row(tool_call_metrics_for_display))
+    tool_call_table_for_display = create_table(tool_call_metrics_for_display)
 
-
-
+    if tool_call_table_for_display:
+        tool_call_table_for_display.print()
 
     if len(tool_call_metrics) > 0:
+        tool_call_metrics = [metric.model_dump() for metric in tool_call_metrics]
         output_file = os.path.join(config.output_dir, "summary_metrics.csv")
         header = list(tool_call_metrics[0].keys())
 
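
create_avg_row divides by row counts that can legitimately be zero (an empty metrics list, or no rows with a usable "Text Match" value), which is why it goes through safe_divide from utils. safe_divide's definition is not part of this diff; the sketch below only illustrates the assumed zero-denominator behaviour:

# Assumed behaviour of utils.safe_divide (not shown in this diff): ordinary
# division that falls back to 0 when the denominator is 0.
def safe_divide(numerator: float, denominator: float) -> float:
    return numerator / denominator if denominator else 0.0

print(round(safe_divide(3, 4), 2))  # 0.75
print(round(safe_divide(5, 0), 2))  # 0.0 instead of ZeroDivisionError
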
wxo_agentic_evaluation/metrics/metrics.py CHANGED

@@ -1,5 +1,6 @@
 import math
 from typing import List, Mapping, Any
+from enum import Enum
 
 from pydantic import BaseModel, computed_field
 
@@ -107,3 +108,61 @@ class KeywordSemanticSearchMetric(BaseModel):
     semantic_match: bool
     message: str
     goal_detail: str
+
+class TextMatchType(Enum):
+    text_match = "Summary Matched"
+    text_mismatch = "Summary MisMatched"
+    na = "NA"
+
+
+class ToolCallAndRoutingMetrics(BaseModel):
+    dataset_name: str = ""
+    total_steps: int=0
+    llm_step: int =0
+    total_tool_calls: int = 0
+    expected_tool_calls: int = 0
+    correct_tool_calls: int = 0
+    relevant_tool_calls: int = 0 # calls with the same function but different args
+    total_routing_calls: int = 0
+    relevant_routing_calls: int = 0
+    tool_calls_with_incorrect_parameter: int = 0
+    text_match: TextMatchType = TextMatchType.na
+    is_success: bool = False
+    avg_resp_time: float = -1
+
+    @computed_field
+    @property
+    def tool_call_recall(self) -> float:
+        return round(
+            (
+                self.correct_tool_calls/self.expected_tool_calls
+                if self.expected_tool_calls > 0
+                else 0.0
+            ),
+            2,
+        )
+
+    @computed_field
+    @property
+    def tool_call_precision(self) -> float:
+        return round(
+            (
+                (self.correct_tool_calls)
+                / self.total_tool_calls
+                if self.total_tool_calls > 0
+                else 0.0
+            ),
+            2,
+        )
+
+    @computed_field
+    @property
+    def agent_routing_accuracy(self) -> float:
+        return round(
+            (
+                self.relevant_routing_calls / self.total_routing_calls
+                if self.total_routing_calls > 0
+                else 0.0
+            ),
+            2,
+        )
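
Because precision, recall, and routing accuracy are pydantic computed fields, callers only populate the raw counters; the ratios are derived on access and included in model_dump(), which is how main.py serializes rows for summary_metrics.csv. A quick sketch of the model as defined above (the counter values and their comments are illustrative):

from wxo_agentic_evaluation.metrics.metrics import ToolCallAndRoutingMetrics

m = ToolCallAndRoutingMetrics(
    dataset_name="sample_case",
    total_tool_calls=5,     # calls the agent actually made
    expected_tool_calls=4,  # calls the ground truth expects
    correct_tool_calls=3,   # made calls that match the ground truth
    total_routing_calls=2,
    relevant_routing_calls=2,
)

print(m.tool_call_precision)     # 0.6  (correct / total)
print(m.tool_call_recall)        # 0.75 (correct / expected)
print(m.agent_routing_accuracy)  # 1.0  (relevant / total routing calls)
print("tool_call_recall" in m.model_dump())  # True: computed fields serialize too
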
wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2 ADDED

@@ -0,0 +1,23 @@
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+You are trying to make tool calls. Given a raw input and tool output. Try to extract the information to make the tool call
+
+Example:
+Tool description:
+def get_payslips(user_id: str) -> PayslipsResponse:
+    Gets a user's payslips from Workday.
+    :param user_id: The user's id uniquely identifying them within the Workday API.
+    :return: The user's payslips.
+
+Raw inputs: {"tool_name": "get_payslips", "inputs": {"user_id": '$get_user_workday_ids'}}
+Tool output: {'user_id': UserWorkdayIDs(person_id='', user_id='6dcb8106e8b74b5aabb1fc3ab8ef2b92')}
+<|start_header_id|>ipython<|end_header_id|>
+{"tool_name": "get_payslips", "inputs": {"user_id": "6dcb8106e8b74b5aabb1fc3ab8ef2b92"}}
+<|eot_id|>
+
+<|start_header_id|>assistant<|end_header_id|>
+Tool description:
+{{ tool_signature }}
+
+Raw inputs: {{ step }}
+Tool output: {{ inputs }}
+<|start_header_id|>ipython<|end_header_id|>
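
The new template expects three variables: tool_signature, step, and inputs. A minimal rendering sketch using plain jinja2 (the framework presumably loads it through its own template_render helpers, which are not shown here; the values below are illustrative only):

from jinja2 import Template

with open("wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2") as f:
    template = Template(f.read())

prompt = template.render(
    tool_signature="def get_payslips(user_id: str) -> PayslipsResponse: ...",
    step='{"tool_name": "get_payslips", "inputs": {"user_id": "$get_user_workday_ids"}}',
    inputs="{'user_id': {'person_id': '', 'user_id': 'abc123'}}",
)
print(prompt)
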
wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 CHANGED

@@ -40,6 +40,8 @@ Please use the following format for your response:
 ]
 {% endraw %}
 
+NO EXTRA TEXT OR COMMENTS. Just return the JSON array of test cases as specified above.
+
 The final summarize step must use actual values from tool outputs (no placeholders).
 
 Here is one complete example to follow: