ibm-watsonx-orchestrate-evaluation-framework 1.0.7__py3-none-any.whl → 1.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (63)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/METADATA +103 -109
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info/RECORD +96 -0
  3. wxo_agentic_evaluation/analytics/tools/main.py +1 -18
  4. wxo_agentic_evaluation/analyze_run.py +358 -97
  5. wxo_agentic_evaluation/arg_configs.py +28 -1
  6. wxo_agentic_evaluation/description_quality_checker.py +149 -0
  7. wxo_agentic_evaluation/evaluation_package.py +65 -20
  8. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  9. wxo_agentic_evaluation/external_agent/performance_test.py +2 -3
  10. wxo_agentic_evaluation/inference_backend.py +117 -14
  11. wxo_agentic_evaluation/llm_user.py +2 -1
  12. wxo_agentic_evaluation/main.py +5 -0
  13. wxo_agentic_evaluation/metrics/metrics.py +22 -1
  14. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  15. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +9 -1
  16. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  17. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  18. wxo_agentic_evaluation/prompt/template_render.py +34 -3
  19. wxo_agentic_evaluation/quick_eval.py +342 -0
  20. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +113 -0
  21. wxo_agentic_evaluation/red_teaming/attack_generator.py +286 -0
  22. wxo_agentic_evaluation/red_teaming/attack_list.py +96 -0
  23. wxo_agentic_evaluation/red_teaming/attack_runner.py +128 -0
  24. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  25. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  26. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  27. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  28. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +27 -0
  29. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  30. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +237 -0
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +101 -0
  38. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +263 -0
  39. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +455 -0
  40. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +156 -0
  41. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  42. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +547 -0
  43. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  44. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +258 -0
  45. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +333 -0
  46. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +188 -0
  47. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +409 -0
  48. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +42 -0
  49. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  50. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +145 -0
  51. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +116 -0
  52. wxo_agentic_evaluation/service_instance.py +2 -2
  53. wxo_agentic_evaluation/service_provider/watsonx_provider.py +118 -4
  54. wxo_agentic_evaluation/tool_planner.py +3 -1
  55. wxo_agentic_evaluation/type.py +33 -2
  56. wxo_agentic_evaluation/utils/__init__.py +0 -1
  57. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +157 -0
  58. wxo_agentic_evaluation/utils/rich_utils.py +174 -0
  59. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  60. wxo_agentic_evaluation/utils/utils.py +167 -5
  61. ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info/RECORD +0 -56
  62. {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/WHEEL +0 -0
  63. {ibm_watsonx_orchestrate_evaluation_framework-1.0.7.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ibm-watsonx-orchestrate-evaluation-framework
3
- Version: 1.0.7
3
+ Version: 1.0.9
4
4
  Summary: The WxO evaluation framework
5
5
  Author-email: Haode Qi <Haode.Qi@ibm.com>
6
6
  License: MIT
@@ -10,7 +10,7 @@ Requires-Dist: rich~=13.9.4
10
10
  Requires-Dist: pydantic<3.0.0,>=2.10.3
11
11
  Requires-Dist: pyyaml~=6.0.2
12
12
  Requires-Dist: jinja2~=3.1.5
13
- Requires-Dist: python-dotenv~=1.0.1
13
+ Requires-Dist: python-dotenv
14
14
  Requires-Dist: dataclasses-json~=0.6.7
15
15
  Requires-Dist: jsonargparse~=4.37.0
16
16
  Provides-Extra: dev
@@ -32,29 +32,46 @@ Requires-Dist: ipywidgets~=8.1.6; extra == "rag-eval"
32
32
  Requires-Dist: jupyter_contrib_nbextensions; extra == "rag-eval"
33
33
  Requires-Dist: jupyter~=1.1.1; extra == "rag-eval"
34
34
 
35
- # WXO-agent evaluation framework
36
-
37
- - This framework is designed to test a tool-calling agent's ability to make real API calls against a `wxo-dev` testing tenant on your local wxo-lite server instance.
38
-
35
+ # WXO Agent Evaluation Framework
36
+
37
+ ## Table of Contents
38
+ - [Overview](#overview)
39
+ - [ADK Setup Guide](#adk-setup-guide)
40
+ - [Setup](#setup-for-evaluation-framework)
41
+ - [Quick Experiment](#quick-experiment-against-the-default-wxo-dev-env)
42
+ - [Run Against a Deployed Local Env](#run-against-a-deployed-local-env)
43
+ - [Run Against a SaaS Tenant](#run-against-a-saas-tenant)
44
+ - [Analyze Results](#analyze-results)
45
+ - [Record Chat Sessions](#record-chat-sessions)
46
+ - [Batch Test Case Generation](#batch-test-case-generation)
47
+ - [Using Model Proxy Provider](#using-model-proxy-provider)
48
+ - [Using Ollama](#using-ollama)
49
+ - [Workflow Diagram](#workflow-diagram)
50
+ - [Results](#results)
51
+ - [Metrics](#metrics)
52
+
53
+ ## Overview
54
+
55
+ - This framework is designed to test a tool-calling agent's ability to make real API calls against a `wxo-dev` testing tenant on your local wxo-lite server instance. To run evaluation against a remote tenant on SaaS, follow [Run Against a SaaS Tenant](#run-against-a-saas-tenant).
39
56
  - As an LLM-as-agent evaluation framework, we aim to test the agent's ability to do the following:
40
57
  - We evaluate the conversation against a ground truth after inference. Inference is carried out through a simulated conversation between an LLM user and the agent. Set `enable_verbose_logging: True` in your configuration.
41
- - Make real API calls correctly and efficiently. We provide metrics which measure the number of bad tool calls made by the agent, normalized against the number of ground truth calls made.
42
-
58
+ - Make real API calls correctly and efficiently. We provide metrics such as tool call precision, recall, and routing accuracy to measure the agent's performance against the ground truth.
43
59
  - The `benchmarks/` folder contains test-cases for the different agents we have evaluated so far. They are segmented by release versions of the `wxo-domains` repository.
44
60
  - The agent calls the `runs/` endpoint of the wxo-lite server instance, and the actual tool code is executed on the server side. The server database is not visible to our framework.
45
61
 
46
- ## prerequisite
47
- Follow the [SDK setup guide](https://github.ibm.com/WatsonOrchestrate/wxo-clients/tree/main) to install the SDK.
48
- The current framework is compatible with ADK version >= 1.20, <= 1.6.0
62
+ ## ADK Setup Guide
63
+ Follow the [ADK setup guide](https://github.ibm.com/WatsonOrchestrate/wxo-clients/tree/main) to install the ADK.
49
64
 
50
- ## setup for evaluation framework
65
+ The current framework is compatible with ADK version >= 1.20, <= 1.7.0
66
+
67
+ ## Setup for Evaluation Framework
51
68
  Run the following command to install the evaluation framework in the same env:
52
69
  ```
53
70
  pip install -e .
54
71
  ```
55
72
 
56
- ## contribution guide
57
- ### secret resolution
73
+ ## Contribution Guide
74
+ ### Secret Resolution
58
75
  Install the detect-secrets utility:
59
76
  ```
60
77
  pip install --upgrade git+https://github.com/ibm/detect-secrets.git@master#egg=detect-secrets
@@ -65,7 +82,7 @@ detect-secrets scan --exclude-files "benchmark|results" --update .secrets.baseli
65
82
  ```
66
83
 
67
84
 
68
- ## quick experiment against the default wxo-dev env
85
+ ## Quick Experiment Against the Default wxo-dev Env
69
86
  ```bash
70
87
  orchestrate server start
71
88
  export WATSONX_SPACE_ID=""
@@ -88,8 +105,7 @@ Note:
88
105
  1. This approach uses the default `wxo-dev` tenant already available in your orchestrate env if you have used wxo-lite before.
89
106
  2. The ADK also reads the environment variables. If you have an env conflict, start the wxo-lite server before exporting the envs.
90
107
 
91
-
92
- ## run against a deployed local env
108
+ ## Run Against a Deployed Local Env
93
109
 
94
110
  1. start the orchestrate server: `orchestrate server start`
95
111
  2. create a simple test case like the following and save it in a folder such as `benchmarks/TEST_CASE_NAME`:
@@ -116,7 +132,6 @@ Note:
116
132
  - The target agent name can be found with `orchestrate agents list`
117
133
  - the example shown only evaluates the final response from the agent. For more sophisticated examples, follow `benchmarks/hr_sample/data_simple.json` or `benchmarks/hr_sample/data_complex.json`.
118
134
 
119
-
120
135
  3. create a test config yaml like the following:
121
136
  ```YAML
122
137
  test_paths:
@@ -129,7 +144,6 @@ auth_config:
129
144
  output_dir: "results/TEST_CASE_NAME/MODEL_NAME"
130
145
  ```
131
146
 
132
-
133
147
  NOTE: run `orchestrate env list` to find the name of the active tenant. For the default `local` tenant, the name should be `wxo-dev`.
134
148
 
135
149
  4. Run the test:
@@ -141,27 +155,17 @@ python -m wxo_agentic_evaluation.main --config benchmarks/hr_sample/config.yaml
141
155
 
142
156
  NOTE: if your run fails for any reason and doesn't cover all the test cases, you can re-run the main script with `--skip_available_results=True` to skip the test cases that are already completed.
143
157
 
144
- ## analyze error
145
- ```bash
146
- python -m wxo_agentic_evaluation.analyze_run --input_data results/hr_sample/llama_3_2_90b/messages/data_simple.messages.json --ground_truth benchmarks/hr_sample/data_simple.json --enable_verbose_logging False
147
- ```
148
- You can also run the analyze script on a batch of test cases in a folder
149
- ```bash
150
- python -m wxo_agentic_evaluation.analyze_run --input_data results/hr_sample/llama_3_2_90b/messages/ --ground_truth benchmarks/hr_sample/ --enable_verbose_logging False --enable_verbose_logging False
151
- ```
152
-
158
+ ## Run Against a SaaS Tenant
153
159
 
154
- ## Run Against a SaaS Tenant (Orchestrate SDK ≥ 1.2)
160
+ Orchestrate ADK 1.2 or later is required for this section.
155
161
 
156
162
  This section describes how to run benchmark tests using a **SaaS-based Orchestrate tenant**. The rest of the setup (test case creation, config structure, etc.) is similar to the [local setup](#run-against-a-deployed-local-env) and can be referred to as needed.
157
163
 
158
164
  ### Prerequisites
159
165
 
160
- - **Orchestrate SDK version ≥ 1.2** is required.
166
+ - **Orchestrate ADK version ≥ 1.2** is required.
161
167
  - Access to the **production SaaS Orchestrate instance** or **staging SaaS Orchestrate instance**.
162
168
 
163
- ---
164
-
165
169
  ### 1. Get Authentication Details
166
170
 
167
171
  1. Visit the Orchestrate UI (prod or staging):
@@ -178,8 +182,6 @@ For other locations, please use the designated url for your data center.
178
182
  4. For more detailed instructions, refer to this guide:
179
183
  https://developer.ibm.com/apis/catalog/watsonorchestrate--custom-assistants/Getting+the+API+endpoint
180
184
 
181
- ---
182
-
183
185
  ### 2. Add the SaaS Tenant
184
186
 
185
187
  Run the following command:
@@ -209,16 +211,12 @@ orchestrate env add -n saas \
209
211
 
210
212
  > When prompted, paste the API key generated above.
211
213
 
212
- ---
213
-
214
- ### 3. Set the IAM API Key Environment Variable
214
+ ### 3. Set `WO_API_KEY` Environment Variable
215
215
 
216
216
  ```bash
217
- export WATSONX_IAM_SAAS_APIKEY=[your_generated_api_key]
217
+ export WO_API_KEY=[your_generated_api_key]
218
218
  ```
219
219
 
220
- ---
221
-
222
220
  ### 4. Update Your Test Config YAML
223
221
 
224
222
  Make sure your YAML config includes the correct SaaS tenant name:
@@ -234,17 +232,55 @@ auth_config:
234
232
  output_dir: "results/TEST_CASE_NAME/MODEL_NAME"
235
233
  ```
236
234
  - Use the staging URL if you are using the staging setup.
237
- ---
238
-
239
235
  ### 5. Run the Simulation in SaaS Mode
240
236
 
241
237
  ```bash
242
238
  python -m wxo_agentic_evaluation.main --config benchmarks/hr_sample/config.yaml
243
239
  ```
244
240
 
245
- ---
241
+ ## Analyze Results
242
+
243
+ The `analyze_run.py` script summarizes agent evaluation results, showing successes, failures, and reasons for errors to help improve agent performance. After running an evaluation, analyze the results with:
244
+
245
+ ```bash
246
+ python -m wxo_agentic_evaluation.analyze_run --data_path path/to/results
247
+ ```
248
+
249
+ Additionally, the script can analyze the quality of tool descriptions for failing tools when the failure is caused by incorrect parameter usage by the agent.
246
250
 
247
- ### Batch Test case Generation
251
+ To analyze the descriptions of your failing tools, pass the optional flag `--tool_definition_path` like so:
252
+
253
+ ```bash
254
+ python -m wxo_agentic_evaluation.analyze_run --data_path path/to/results --tool_definition_path path/to/.py/source/file/containing/tool/definitions
255
+ ```
256
+
257
+ **Note:** If the flag `--tool_definition_path` is not provided, description quality analysis is simply skipped.
258
+
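For illustration only, the kind of Python source file that `--tool_definition_path` can point to is sketched below. The function name, parameters, and docstring are hypothetical; the point is simply that the file contains tool definitions whose descriptions the checker can inspect.

```python
# tools.py -- hypothetical tool definition file (names and docstring are illustrative only).
# Real tool definitions may additionally use the ADK's tool decorator.

def get_time_off_balance(employee_id: str, leave_type: str) -> dict:
    """Return the remaining time-off balance for an employee.

    Args:
        employee_id: Unique identifier of the employee.
        leave_type: Category of leave, e.g. "vacation" or "sick".
    """
    raise NotImplementedError
```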
259
+ ## Record Chat Sessions
260
+
261
+ The `record_chat.py` script lets you capture your chat sessions in the chat UI and automatically generate ground truth data for evaluating your agents. This is valuable for benchmarking and experimenting with agent behavior under different configurations.
262
+
263
+ Start the chat interface:
264
+
265
+ ```bash
266
+ orchestrate chat start
267
+ ```
268
+
269
+ Then open your browser to [http://localhost:3000/chat-lite](http://localhost:3000/chat-lite) and select the agent you wish to interact with.
270
+
271
+ To begin recording, run:
272
+
273
+ ```bash
274
+ python -m wxo_agentic_evaluation.record_chat --output_dir dir/to/save/recordings
275
+ ```
276
+
277
+ While this process is running, for every chat session, annotated ground truth data is generated in your output directory: `<THREAD_ID>_annotated_data.json`
278
+
279
+ Review the generated annotated data for accuracy before using it for evaluation.
280
+
281
+ Press `Ctrl+C` in the terminal to stop recording when your session is complete.
282
+
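As a quick, illustrative way to carry out that review (the directory and file name pattern come from the recorder output described above; the structure of the JSON contents is not assumed here), you could preview each recording before using it for evaluation:

```python
# Preview recorded ground truth files before using them for evaluation.
# The directory and file name pattern follow the recorder output described above.
import json
from pathlib import Path

recordings_dir = Path("dir/to/save/recordings")  # same directory passed to --output_dir
for path in sorted(recordings_dir.glob("*_annotated_data.json")):
    with path.open() as f:
        data = json.load(f)
    # Print a short preview of each recording for a manual sanity check.
    print(f"{path.name}:")
    print(json.dumps(data, indent=2)[:500])
```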
283
+ ## Batch Test Case Generation
248
284
 
249
285
  For full instructions on setting up tools, writing stories, configuring the pipeline, and generating batch test cases, see the [Batch Test case Generation Guide](./benchmarks/batch_sample/README.MD).
250
286
 
@@ -311,9 +347,9 @@ To use model from Ollama (local LLM deployment), follow these steps:
311
347
  python -m wxo_agentic_evaluation.main --config path/to/your/config.yaml
312
348
  ```
313
349
 
314
- ## Workflow diagram
350
+ ## Workflow Diagram
315
351
 
316
- To help better understand the workflow, this is a diagram of how this repo works together with wxO lite python SDK and a wxO runtime.
352
+ To better understand the workflow, the diagram below shows how this repo works together with the wxO ADK and a wxO runtime.
317
353
 
318
354
  ![Alt text](./doc/assets/workflow.png "Workflow")
319
355
 
@@ -322,70 +358,28 @@ Inputs:
322
358
  - a json file containing test cases, see [example 1](benchmarks/hr_sample/data_complex.json) or [example 2](benchmarks/hr_sample/data_simple.json) as a reference
323
359
  - optionally, a `tools.py` file with tool definitions and one or more agent definitions, e.g. `benchmarks/hr_sample/hr_agent.json`. These files are not needed if you already have a tenant set up with such tools and agents
324
360
 
325
- Steps:
326
- 1. (optional) this repo will call wxO Lite python SDK (which calls several endpoints exposed by the wxO runtime) to set up the environment needed for the evaluation run, tools and agents will be imported
327
- 2. Create test cases by following the sample instructions at [benchmarks/sap_successfactors_sample/annotation/README.md](benchmarks/sap_successfactors_sample/annotation/README.md)
328
- 3. Start the evaluation run by calling the `wxo_agentic_evaluation.main` script of this repo, which will invoke the `/runs` endpoint of the wxO runtime to simulate conversations with the agent
329
- 4. Reports and metrics will be generated by this repo
330
- 5. (optional) this repo will call wxO Lite python SDK (which calls several endpoints exposed by the wxO runtime) to clean up the environment, to avoid the tools and agents affecting subsequent runs with the same tenant
331
- 6. (optional) You can generate further error analysis by using the `wxo_agentic_evaluation.analyze_run` script from this repo
332
-
333
-
334
- ## results
335
- ### workday
336
-
337
- | Model | User Setting | Total Step | Agent Step | Journey Success | Wrong Function Calls | Bad Calls | Wrong Parameters | Wrong Routing Calls | Text Match | Test Cases | WXO Avg. Response Time |
338
- |-------------------------------|------------- |-------------|------------|------------------|-----------------------|-----------|-------------------|----------------------|------------|------------|-------------------------|
339
- | llama-3-2-90b-vision-instruct | normal | 8.13 | 4.21 | 0.87 | 0.01 | 0.0 | 0.20 | 0.00 | 0.95 | 38 | 15.09 |
340
- | llama-3-2-90b-vision-instruct | verbose | 11.76 | 6.11 | 0.79 | 0.02 | 0.0 | 0.19 | 0.00 | 0.86 | 38 | 14.32 |
341
- | llama-3-405b-instruct | normal | 9.66 | 5.03 | 0.82 | 0.02 | 0.0 | 0.47 | 0.04 | 0.89 | 38 | 13.36 |
342
- | llama-3-405b-instruct | verbose | 11.76 | 6.11 | 0.84 | 0.05 | 0.0 | 0.70 | 0.04 | 0.92 | 38 | 12.21 |
343
-
344
- You can find the detailed results under [results/workday](results/workday)
345
-
346
- ### sap successfactor (rel-1.7)
347
-
348
- | Model | User Setting | Total Step | Agent Step | Journey Success | Wrong Function Calls | Bad Calls | Wrong Parameters | Wrong Routing Calls | Text Match | Test Cases | WXO Avg. Response Time |
349
- |------------------------------ |--------------|-------------|------------|------------------|-----------------------|-----------|-------------------|----------------------|------------|------------|-------------------------|
350
- | llama-3-2-90b-vision-instruct | normal | 10.32 | 5.84 | 0.73 | 0.04 | 0.0 | 0.06 | 0.08 | 0.84 | 38 | - |
351
- | llama-3-2-90b-vision-instruct | verbose | 11.19 | 6.35 | 0.68 | 0.04 | 0.0 | 0.08 | 0.16 | 0.81 | 38 | - |
352
- | llama-3-405b-instruct | normal | 11.41 | 6.24 | 0.46 | 0.01 | 0.0 | 0.23 | 0.02 | 0.62 | 38 | - |
353
- | llama-3-405b-instruct | verbose | 15.32 | 8.38 | 0.46 | 0.04 | 0.0 | 0.40 | 0.06 | 0.62 | 38 | - |
354
-
355
- You can find the detailed results under [results/sap_successfactor_4](results/sap_successfactor_4)
356
-
357
-
358
- ## METRICS KEY
359
-
360
- | Metric | Description | Calculation | Range/Type |
361
- |--------|-------------|-------------|------------|
362
- | **Total Step** | Total number of messages/steps in the conversation | Count of all messages in the conversation | Integer ≥ 0 |
363
- | **Agent Step** | Number of assistant responses (text or tool calls) | Count of messages where `role == "assistant"` and `type` is text or tool_call | Integer ≥ 0 |
364
- | **Ground Truth Calls** | Expected number of tool calls based on ground truth | Count of goal_details with `type == ContentType.tool_call` | Integer ≥ 0 |
365
- | **Journey Success** | Whether the agent completed tasks in the correct order | `is_topological_sort(ground_truth.goals, labelled_messages)` | Boolean |
366
- | **Wrong Function Calls** | Number of calls to non-existent or unexpected functions | Count of labelled_messages containing "_WRONG_FUNCTION_CALL" | Integer ≥ 0 |
367
- | **Bad Calls** | Reserved metric for future use | Currently hardcoded to 0 | Integer (0) |
368
- | **Wrong Parameters** | Number of tool calls with incorrect parameters | Count of labelled_messages containing "_WRONG_PARAMETERS" | Integer ≥ 0 |
369
- | **Wrong Routing Calls** | Number of incorrect agent routing calls | Count of labelled_messages containing "_WRONG_ROUTING_CALL" | Integer ≥ 0 |
370
- | **Text Match** | Quality of final text summary | "Keyword Mismatch" \| "Semantic Mismatch" \| "Summary Matched" | Categorical |
371
- | **Tool Call Accuracy** | Percentage of non-routing tool calls that were executed correctly | `correct_tool_calls / non_transfer_tool_calls` | Float 0.0-1.0 |
372
- | **Tool Call Relevancy** | Percentage of non-routing tool calls that were relevant to the task | `(relevant_tool_calls - expected_routing_calls) / non_transfer_tool_calls` | Float 0.0-1.0 |
373
- | **Agent Routing Accuracy** | Percentage of routing calls that were executed correctly | `expected_routing_calls / total_routing_calls` | Float 0.0-1.0 |
374
- | **WXO Average Response Time (Secs)** | Average response time for agent responses | Mean response time across all agent interactions | Float ≥ 0.0 |
361
+ ## Results
362
+ You can find benchmark results [here](benchmarks/domain_1.8/README.md)
375
363
 
376
- ### Key Definitions
364
+ ## Metrics
377
365
 
378
- - **Relevant Tool Call**: A tool call whose name matches one of the expected tool names defined in the ground truth
379
- - **Correct Tool Call**: A relevant tool call that also has the correct parameters/arguments
380
- - **Routing Call**: A tool call whose name starts with "transfer_" (used for agent-to-agent routing)
381
- - **Non-Transfer Tool Call**: Regular tool calls excluding routing calls (`total_tool_calls - total_routing_calls`)
382
- - **Expected Routing Call**: A routing call that was both expected and executed correctly
366
+ | Metric | Description | Calculation | Range/Type |
367
+ |----------------------------|--------------------------------------------------------------------|------------------------------------------------------------------------------|--------------------|
368
+ | **Total Steps** | Total number of messages/steps in the conversation | Count of all messages in the conversation | Integer ≥ 0 |
369
+ | **LLM Steps** | Number of assistant (LLM) responses (text or tool calls) | Count of messages where `role == "assistant"` | Integer ≥ 0 |
370
+ | **Total Tool Calls** | Number of tool calls made by the agent | Count of all tool calls | Integer ≥ 0 |
371
+ | **Tool Call Precision** | Fraction of correct tool calls out of all tool calls | `correct_tool_calls / total_tool_calls` | Float 0.0–1.0 |
372
+ | **Tool Call Recall** | Fraction of correct tool calls out of expected tool calls | `correct_tool_calls / expected_tool_calls` | Float 0.0–1.0 |
373
+ | **Agent Routing Accuracy** | Fraction of correct agents visited (relevant_routing_calls) out of total number of agents visited (total_routing_calls) | `relevant_routing_calls / total_routing_calls` | Float 0.0–1.0 |
374
+ | **Text Match** | Whether the final summary text matches the ground truth | `Summary Matched` \| `Summary MisMatched` | Categorical |
375
+ | **Journey Success** | Whether the agent completed tasks in the correct order | Boolean (`True`/`False`) | Boolean |
376
+ | **Avg Resp Time (sec)** | Average response time for agent responses | Mean response time across all agent interactions | Float ≥ 0.0 |
383
377
 
384
- ### Averaging Behavior
385
-
386
- - **Per Test Case Average**: Total Step, Agent Step, Tool Call Accuracy, Tool Call Relevancy, Agent Routing Accuracy, WXO Average Response Time
387
- - **Per Ground Truth Calls Average**: Wrong Function Calls, Bad Calls, Wrong Parameters, Wrong Routing Calls
388
- - **Special Calculations**:
389
- - Journey Success: Proportion of test cases that succeeded (0.0-1.0)
390
- - Text Match: Proportion of test cases with "Summary Matched" (0.0-1.0)
378
+ ### Key Definitions
391
379
 
380
+ - **Correct Tool Call**: A tool call that matches both the expected function and arguments.
381
+ - **Expected Tool Call**: A tool call that is required by the ground truth.
382
+ - **Routing Call**: When an agent routes to another agent.
383
+ - **Relevant Routing Call**: An agent is relevant when it is either the entry-point agent or it includes a tool that is present in the ground truth.
384
+ - **Text Match**: Indicates if the agent's final summary matches the expected summary ("Summary Matched") or does not match ("Summary MisMatched").
385
+ - **Journey Success**: Indicates if the agent completed all required tasks in the correct order.
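
The calculation column above boils down to simple ratios. The following sketch is illustrative only, mirroring the formulas in the table rather than the framework's actual code, with zero-division guarded for empty runs:

```python
# Illustrative implementations of the ratio metrics from the table above.
# Variable names mirror the table; this is not the framework's internal API.

def tool_call_precision(correct_tool_calls: int, total_tool_calls: int) -> float:
    """Fraction of the agent's tool calls that match the expected function and arguments."""
    return correct_tool_calls / total_tool_calls if total_tool_calls else 0.0

def tool_call_recall(correct_tool_calls: int, expected_tool_calls: int) -> float:
    """Fraction of the ground-truth tool calls that the agent made correctly."""
    return correct_tool_calls / expected_tool_calls if expected_tool_calls else 0.0

def agent_routing_accuracy(relevant_routing_calls: int, total_routing_calls: int) -> float:
    """Fraction of visited agents that were relevant (entry point or owning a ground-truth tool)."""
    return relevant_routing_calls / total_routing_calls if total_routing_calls else 0.0
```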
@@ -0,0 +1,96 @@
1
+ wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ wxo_agentic_evaluation/analyze_run.py,sha256=4QLlo_NQjCh5M52ztFHoMvk_jtwptKpVXDmdTxj2ikQ,13054
3
+ wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
4
+ wxo_agentic_evaluation/arg_configs.py,sha256=a3Lo3RurTOLysxmsliMKIqvld7T3ZTb4Kw_FPEeBC78,2997
5
+ wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
6
+ wxo_agentic_evaluation/data_annotator.py,sha256=6cUUpCTFSs36VF3wICLXWrWbEUJz6v-PzPeuzO9S1k8,8310
7
+ wxo_agentic_evaluation/description_quality_checker.py,sha256=7vvGpPwa8J8ArTWAXRp865e_cHzSTMFLxkpI-rfj2ZQ,6097
8
+ wxo_agentic_evaluation/evaluation_package.py,sha256=9NrpKaGOUnAkslP7t3vU3Uv4lFUs-XLu0IUO7q0Muik,23575
9
+ wxo_agentic_evaluation/inference_backend.py,sha256=ItnwjhEJHX28sBS7CIVe7hmcy9FLd1HQEpzhdsJ1jDk,30341
10
+ wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
11
+ wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
12
+ wxo_agentic_evaluation/llm_user.py,sha256=LhS7Ti9v3TLMrEv0og9N6yUF4y8lLMcMycEqVhwtGAE,1493
13
+ wxo_agentic_evaluation/main.py,sha256=JYcOaSPM8EQdgsPFdYmelouH-3_o-OtLQ0oh5cjADOU,11933
14
+ wxo_agentic_evaluation/quick_eval.py,sha256=nROa-xZ265-k8JJ1M4t1LZe4ucdJi8GuRNVuCWPiZTU,12525
15
+ wxo_agentic_evaluation/record_chat.py,sha256=uFdbLt4HaMREN3q4HHAA1ZvtjoLdiBEyxPd9Eoc6svc,8103
16
+ wxo_agentic_evaluation/resource_map.py,sha256=11qF1oJDwGNWOLYFVsIPsR66JK4eD0cqVOBKreK2mPQ,1644
17
+ wxo_agentic_evaluation/service_instance.py,sha256=6Y7byxdQakB3NMP288Rhne3ygOumSSgJjBT5Q-YY1OA,6468
18
+ wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
19
+ wxo_agentic_evaluation/tool_planner.py,sha256=00e_d2Ju5J61priEaKWLkSK2yW0donK8KJCq0PfKUuw,13013
20
+ wxo_agentic_evaluation/type.py,sha256=R_s2kFn3VydHI4y5aWSBEaYPpDODHF5yPb7MKbysxwk,4014
21
+ wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohXTxTmpqRhfzQygCWDYHBHg,18125
22
+ wxo_agentic_evaluation/analytics/tools/main.py,sha256=dxjjIlVQY-ZJ3NC6knW8r-kmTo8WWEhwlwZfP38uj8Q,6105
23
+ wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
24
+ wxo_agentic_evaluation/analytics/tools/ux.py,sha256=EaWNvsq68X_i2H4pQ2fABtXEEmk3ZXqaMrTs42_7MwE,18347
25
+ wxo_agentic_evaluation/external_agent/__init__.py,sha256=9NomrFEZQPrh91nto_hEGwoSks77nerAbWqS0L70qnY,1511
26
+ wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xW8tqPcm8JYvveSxf-oFCajvF5J8ORaK23YXu-LuFmc,4142
27
+ wxo_agentic_evaluation/external_agent/performance_test.py,sha256=vaaAMBhJoQ0hQ4xq4Zp7E39Xtba05inWaKzkAtWlhlY,2426
28
+ wxo_agentic_evaluation/external_agent/types.py,sha256=4kfWD_ZyGZmpbib33gCxEuKS4HLb7CEtferlQgQe7uk,1624
29
+ wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
+ wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
31
+ wxo_agentic_evaluation/metrics/metrics.py,sha256=V9tcGHuwG1_m0Aa8ztmduBR8gufr6rpvZjlzPtPnDZQ,6236
32
+ wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
+ wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
34
+ wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
35
+ wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2,sha256=_Ty6QDcQcbde2ZP2HVvFtOCm_2mFu_1cUM6qj11MvcU,8085
36
+ wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=QXuk2ecnEPPRCPoWZJyrtb1gAVuIPljB91YoqPBp2Dk,1896
37
+ wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngRqTAVD4w0va_HtA2FR4G1POIIamM,2524
38
+ wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
39
+ wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
40
+ wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=GAHtEJvFNtgWBQma1I9KJdhXdhmqbEQf7JY66Z1JLMU,1113
41
+ wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2,sha256=yrLtXfmVIJ_C3XIaTvpqlQGlg9kKIibrVR3UzpzBEmo,1288
42
+ wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=90KF7fXW5PPwY8IkPqA6ZflDMkr_KFDpO9H_mVGdGf8,2212
43
+ wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=MltPfEXYyOwEC2xNLl7UsFTxNbr8CwHaEcPqtvKE2r8,2749
44
+ wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
45
+ wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
46
+ wxo_agentic_evaluation/prompt/template_render.py,sha256=BVRT-BKyBJn5cM6Dze4GhFmMLyvGlyilFKQsfUhrklQ,4722
47
+ wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
48
+ wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
49
+ wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
50
+ wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
51
+ wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=rlkSAb7QDHUoXg-LLK_wOyaTtYNrhV2SXbpnJxSUrD0,4714
52
+ wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=YQi9xoaFATBNGe_NebndH6o1eQalcSKvWKSjbZ8dzP4,11526
53
+ wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=edphWARWqDtXFtcHTVbRXngvO0YfG5SgrfPtrBRXuFw,4734
54
+ wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=qBZY4GK1352NUMyED5LVjjbcvpdCcxG6mDIN1HvxKIc,4340
55
+ wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
56
+ wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=82jutvZ21GO76rRcjGWux5kAanDtzE728BijC7trSxY,4297
57
+ wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
+ wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py,sha256=UidTaT9g5IxbcakfQqP_9c5civ1wDqY-PpPUf0uOXJo,915
59
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py,sha256=th36x0RMpGx1MAzqOUxjuhAcroUgjT2CJkT6tlMUbPg,843
61
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py,sha256=5ZOWW82V0VFgpiaXpQ3hZIVKO7JAsoYRhwwb2ZDGxxk,7481
62
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py,sha256=P1ytcARCy696ZCLq9tcaQWgaolmu0ON_kySmmTeyBtc,1549
64
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json,sha256=bVSyudTk2Nim1DcgQZf8ilOTszY2Kgm4YU6beHWvEhQ,40475
65
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py,sha256=UwPMCn7T2BEut7Mbj6U5UJvb2AAZw03BiB9wEjSCheg,1017
67
+ wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json,sha256=o4oRur1MiXO2RYzmzj07QOBzX75DyU7T7yd-gFsgFdo,30563
68
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
69
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py,sha256=kMMFq4ABX5q6cPnDdublLMVqXu4Ij-x4OlxZyePWIjc,3599
70
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py,sha256=rKKBtL47gvN_fFy3FVbQTaa0U1bhv8bls0HPZi66EZ8,9279
71
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py,sha256=z_k-qdFoUJqstkPYn9Zmhlp2YTVQKJtoDZCIdKow664,17306
72
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py,sha256=_Er2KfCkc3HFmOmxZT6eb-e7qF7ukqsf6Si5CJTqPPg,6016
73
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py,sha256=tW1wc87WIm8BZh2lhdj1RDP6VdRLqZBWSMmemSttbGs,22034
74
+ wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py,sha256=QHIEHmr3GnCoQPIPyLAMiT2IdYJKUUhqSPJDLefVY2U,16983
75
+ wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py,sha256=uJc7ZwK6pJpsMIuBSBXUtxdvd-aiRnOXGX3aeyvw2ik,151
76
+ wxo_agentic_evaluation/referenceless_eval/metrics/field.py,sha256=z4S5QJJi1acshC0YFzblppgtm1oxNEgMKYjaJdfzkn4,8324
77
+ wxo_agentic_evaluation/referenceless_eval/metrics/metric.py,sha256=mSoJAjYRSEpq8zBm-EP0UwF0zmZ4gDRjoUe4jT9nJt0,12212
78
+ wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py,sha256=JHZhoSfGJYYp3sFx3XP9cTsDQgpgajzZ7TV5c4hmKCs,5980
79
+ wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=CGQ5LvhQrmxAyZDHBHds47rjJYWsx670c56yOHCrEAI,15074
80
+ wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=jurmc4KFFKH4hwnvor2xg97H91b-xJc3cUKYaU2I8uM,1370
81
+ wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
82
+ wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=FFmcSWXQnLmylpYyj8LZuPwb6nqwQp-jj6Mv9g8zby0,5052
83
+ wxo_agentic_evaluation/service_provider/__init__.py,sha256=EaY4jjKp58M3W8N3b3a8PNC2S81xA7YV2_QkTIy9DfI,1600
84
+ wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=Y36Ryv4nPG8RdVP_zsQsRlEWv8F_hGi7-wOppWPQTwc,4026
85
+ wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=HMHQVUGFbLSQI1dhysAn70ozJl90yRg-CbNd4vsz-Dc,1116
86
+ wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId7kn6-CetQuNZl00EP-Nl28,417
87
+ wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=C_uezsx4FHqkvtCIQGIii82nu9_vSOKP70uAVXWj5fM,10619
88
+ wxo_agentic_evaluation/utils/__init__.py,sha256=ItryTgc1jVc32rB3XktTFaYGA_A6bRIDZ1Pts_JGmv8,144
89
+ wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=Vyji_edgou2xMLbsGwFG-QI7xRBNvO3-1nbeOc8ZuFo,5646
90
+ wxo_agentic_evaluation/utils/rich_utils.py,sha256=J9lzL4ETQeiAJcXKsUzXh82XdKvlDY7jmcgTQlwmL9s,6252
91
+ wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
92
+ wxo_agentic_evaluation/utils/utils.py,sha256=qQR_2W5p0Rk6KSE3-llRyZrWXkO5zG9JW7H1692L4PI,11428
93
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info/METADATA,sha256=OgVKCRnQdk7cYnzrexY8VPPgQW73uHdX2jfI9a7o5IE,16105
94
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
95
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
96
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info/RECORD,,
@@ -5,8 +5,8 @@ import rich
5
5
  from type import ContentType
6
6
  from analytics.tools.analyzer import ToolErrorAnalyzer
7
7
  from analytics.tools.ux import ToolErrorDisplayManager
8
- from type import Message
9
8
  from shutil import get_terminal_size
9
+ from utils.utils import load_messages
10
10
 
11
11
  if __name__ == "__main__":
12
12
  parser = argparse.ArgumentParser(description="tool-analytics-resources")
@@ -47,23 +47,6 @@ if __name__ == "__main__":
47
47
  """Count total tool calls in the conversation."""
48
48
  return sum(1 for msg in messages if msg.type == ContentType.tool_call)
49
49
 
50
- # Function to load messages from JSON file
51
- def load_messages(file_path):
52
- with open(file_path, "r") as f:
53
-
54
- try:
55
- message_data = json.load(f)
56
- messages = []
57
- for msg in message_data:
58
- messages.append(Message.model_validate(msg))
59
-
60
- return messages
61
-
62
- except Exception as e:
63
- print(file_path)
64
- print(e)
65
- return None
66
-
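For reference, the inline `load_messages` helper removed above is now imported from `utils.utils` (see the added import near the top of this file). A cleaned-up sketch of what that helper does, inferred from the removed code rather than from the new module's actual source, would be:

```python
# Sketch of the relocated load_messages helper, inferred from the removed inline version above.
# The import path for Message is assumed; in this package the type lives in wxo_agentic_evaluation/type.py.
import json

from wxo_agentic_evaluation.type import Message

def load_messages(file_path):
    """Load a list of Message objects from a JSON file; return None if parsing fails."""
    with open(file_path, "r") as f:
        try:
            message_data = json.load(f)
            return [Message.model_validate(msg) for msg in message_data]
        except Exception as e:
            print(file_path)
            print(e)
            return None
```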
67
50
  # Function to load ground truth from JSON file
68
51
  def load_ground_truth(file_path):
69
52
  with open(file_path, "r") as f: