ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -1,380 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: ibm-watsonx-orchestrate-evaluation-framework
3
- Version: 1.0.3
4
- Summary: The WxO evaluation framework
5
- Author-email: Haode Qi <Haode.Qi@ibm.com>
6
- License: MIT
7
- Requires-Python: <3.14,>=3.11
8
- Description-Content-Type: text/markdown
9
- Requires-Dist: rich~=13.9.4
10
- Requires-Dist: pydantic~=2.10.6
11
- Requires-Dist: pyyaml~=6.0.2
12
- Requires-Dist: jinja2~=3.1.5
13
- Requires-Dist: python-dotenv~=1.0.1
14
- Requires-Dist: dataclasses-json~=0.6.7
15
- Requires-Dist: jsonargparse~=4.37.0
16
- Provides-Extra: dev
17
- Requires-Dist: setuptools~=70.3.0; extra == "dev"
18
- Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
19
- Requires-Dist: pytest-cov==6.0.0; extra == "dev"
20
- Requires-Dist: pytest-mock==3.14.0; extra == "dev"
21
- Requires-Dist: pytest-asyncio==0.25.1; extra == "dev"
22
- Requires-Dist: coverage[toml]>=6.5; extra == "dev"
23
- Requires-Dist: black~=22.3.0; extra == "dev"
24
- Requires-Dist: pylint~=2.16.4; extra == "dev"
25
- Provides-Extra: rag-eval
26
- Requires-Dist: tqdm~=4.67.1; extra == "rag-eval"
27
- Requires-Dist: sentence-transformers~=3.3.1; extra == "rag-eval"
28
- Requires-Dist: scikit-learn~=1.6.1; extra == "rag-eval"
29
- Requires-Dist: pandas~=2.1.4; extra == "rag-eval"
30
- Requires-Dist: notebook~=7.4.1; extra == "rag-eval"
31
- Requires-Dist: ipywidgets~=8.1.6; extra == "rag-eval"
32
- Requires-Dist: jupyter_contrib_nbextensions; extra == "rag-eval"
33
- Requires-Dist: jupyter~=1.1.1; extra == "rag-eval"
34
-
35
- # WXO-agent evaluation framework
36
-
37
- - This framework is designed to test a tool-calling agent's ability to make real API calls against a `wxo-dev` testing tenant on your local wxo-lite server instance.
38
-
39
- - As an LLM-as-agent evaluation framework, we aim to test the agent's ability to do the following:
40
- We use a ground truth to evaluate the conversation after inference; inference is carried out as a simulated conversation between a user LLM and the agent.
41
- Make real API calls correctly and efficiently. We provide metrics that measure the number of bad tool calls made by the agent, normalized against the number of ground-truth calls (a rough illustration follows this list).
42
-
43
- - The `benchmarks/` folder contains test-cases for the different agents we have evaluated so far. They are segmented by release versions of the `wxo-domains` repository.
44
- - The agent calls the `runs/` endpoint of the wxo-lite server instance, and the actual tool code is executed on the server side. The server database is not visible to our framework.
45
-
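As a rough illustration of how these error counts relate to the ground truth, the sketch below normalizes labelled tool-call errors against the number of expected calls. This is not the framework's actual code; the label strings mirror the METRICS KEY section later in this README, and the inputs are hypothetical.

```python
# Hypothetical sketch: normalize labelled tool-call errors against the number of
# ground-truth calls. The "_WRONG_*" label suffixes mirror the METRICS KEY table;
# the inputs here are illustrative, not the framework's actual data structures.
def error_rates(labelled_messages: list[str], ground_truth_calls: int) -> dict[str, float]:
    counts = {
        "wrong_function_calls": sum("_WRONG_FUNCTION_CALL" in m for m in labelled_messages),
        "wrong_parameters": sum("_WRONG_PARAMETERS" in m for m in labelled_messages),
        "wrong_routing_calls": sum("_WRONG_ROUTING_CALL" in m for m in labelled_messages),
    }
    denominator = max(ground_truth_calls, 1)  # avoid division by zero
    return {name: count / denominator for name, count in counts.items()}
```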
46
- ## prerequisite
47
- Follow the [SDK setup guide](https://github.ibm.com/WatsonOrchestrate/wxo-clients/tree/main) to install the SDK.
48
- The current framework is compatible with ADK versions >= 1.2.0 and <= 1.6.0.
49
-
50
- ## setup for evaluation framework
51
- Run the following command to install the evaluation framework in the same env:
52
- ```
53
- pip install -e .
54
- ```
55
-
56
-
57
- ## quick experiment against the default wxo-dev env
58
- ```bash
59
- orchestrate server start
60
- export WATSONX_SPACE_ID=""
61
- export WATSONX_APIKEY=""
62
- ```
63
-
64
- NOTE: If you want to use `WO_INSTANCE` and `WO_API_KEY` instead, follow the [model proxy section](#using-model-proxy-provider).
65
-
66
- Import the sample HR tools and agent into your default `wxo-dev` env:
67
- ```bash
68
- orchestrate tools import -f benchmarks/hr_sample/tools.py -k python
69
- orchestrate agents import -f benchmarks/hr_sample/hr_agent.json
70
- ```
71
-
72
- Run the main script:
73
- ```bash
74
- python -m wxo_agentic_evaluation.main --config benchmarks/hr_sample/config.yaml --output_dir=results/test --num_workers=2
75
- ```
76
- Note:
77
- 1. This approach uses the default `wxo-dev` tenant already available in your orchestrate env if you have used wxo-lite before.
78
- 2. ADK also reads these environment variables. If you have an environment variable conflict, start the wxo-lite server before exporting them.
79
-
80
-
81
- ## run against a deployed local env
82
-
83
- 1. start the orchestrate server: `orchestrate server start`
84
- 2. create a simple test case like the following and save it in a folder such as `benchmarks/TEST_CASE_NAME`:
85
- ```JSON
86
- {
87
- "agent": "NAME_OF_THE_AGENT",
88
- "goals": {
89
- "summarize": []
90
- },
91
- "goal_details": [
92
- {
93
- "type": "text",
94
- "name": "summarize",
95
- "response": "Your timeoff schedule for 20250101 to 20250303 is: 20250105",
96
- "keywords": [
97
- "20250105"
98
- ]
99
- }
100
- ],
101
- "story": "Your username is nwaters and you want to find out timeoff schedule from 20250101 to 20250303."
102
- }
103
- ```
104
- Note:
105
- - The target agent name can be found with `orchestrate agents list`.
106
- - The example shown only evaluates the agent's final response. For more sophisticated examples, follow `benchmarks/hr_sample/data_simple.json` or `benchmarks/hr_sample/data_complex.json`. A minimal sanity-check sketch appears at the end of this section.
107
-
108
-
109
- 3. create a test config yaml like the following:
110
- ```YAML
111
- test_paths:
112
- - benchmarks/TEST_CASE_NAME
113
-
114
- auth_config:
115
- url: http://localhost:4321
116
- tenant_name: wxo-dev
117
-
118
- output_dir: "results/TEST_CASE_NAME/MODEL_NAME"
119
- ```
120
-
121
-
122
- NOTE: run `orchestrate env list` to find the name of the active tenant. For the default `local` tenant, the name should be `wxo-dev`.
123
-
124
- 4. Run the test:
125
- ```bash
126
- export WATSONX_SPACE_ID=""
127
- export WATSONX_APIKEY=""
128
- python -m wxo_agentic_evaluation.main --config benchmarks/hr_sample/config.yaml
129
- ```
130
-
131
- NOTE: If your run fails for any reason and doesn't cover all the test cases, you can re-run the main script with `--skip_available_results=True` to skip the test cases that are already completed.
132
-
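If you want to sanity-check the test case JSON from step 2 before running it, a minimal sketch such as the following can help. This check and the file name used here are illustrative assumptions, not part of the framework.

```python
# Illustrative sanity check for a test-case JSON file; the field names follow the
# example in step 2 above. The file path is a placeholder.
import json
from pathlib import Path

def check_test_case(path: str) -> None:
    case = json.loads(Path(path).read_text())
    assert case.get("agent"), "missing target agent name"
    assert case.get("story"), "missing user story"
    for detail in case.get("goal_details", []):
        if detail.get("type") == "text":
            # every keyword should appear in the expected response text
            missing = [kw for kw in detail.get("keywords", []) if kw not in detail.get("response", "")]
            assert not missing, f"keywords not found in response: {missing}"

check_test_case("benchmarks/TEST_CASE_NAME/test_case.json")
```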
133
- ## analyze error
134
- ```bash
135
- python -m wxo_agentic_evaluation.analyze_run --input_data results/hr_sample/llama_3_2_90b/messages/data_simple.messages.json --ground_truth benchmarks/hr_sample/data_simple.json --enable_verbose_logging False
136
- ```
137
- You can also run the analyze script on a batch of test cases in a folder:
138
- ```bash
139
- python -m wxo_agentic_evaluation.analyze_run --input_data results/hr_sample/llama_3_2_90b/messages/ --ground_truth benchmarks/hr_sample/ --enable_verbose_logging False
140
- ```
141
-
142
-
143
- ## Run Against a SaaS Tenant (Orchestrate SDK ≥ 1.2)
144
-
145
- This section describes how to run benchmark tests using a **SaaS-based Orchestrate tenant**. The rest of the setup (test case creation, config structure, etc.) is similar to the [local setup](#run-against-a-deployed-local-env) and can be referred to as needed.
146
-
147
- ### Prerequisites
148
-
149
- - **Orchestrate SDK version ≥ 1.2** is required.
150
- - Access to the **production SaaS Orchestrate instance** or **staging SaaS Orchestrate instance**.
151
-
152
- ---
153
-
154
- ### 1. Get Authentication Details
155
-
156
- 1. Visit the Orchestrate UI (Prod/Staging):
157
-
158
- - **AWS Production us-east-1:** [https://dl.watson-orchestrate.ibm.com](https://dl.watson-orchestrate.ibm.com)
159
- For other locations, please use the designated URL for your data center.
160
- - **AWS Staging:** [https://staging-wa.watson-orchestrate.ibm.com](https://staging-wa.watson-orchestrate.ibm.com)
161
- - **IBM Cloud Production us-south:** [https://us-south.watson-orchestrate.cloud.ibm.com](https://us-south.watson-orchestrate.cloud.ibm.com)
162
-
163
- 2. Log in and click the **Settings** button (top-right corner).
164
-
165
- 3. Open the **API details** tab, then copy the **Instance URL** and generate an **API Key**.
166
-
167
- 4. For more detailed instructions, refer to this guide:
168
- https://developer.ibm.com/apis/catalog/watsonorchestrate--custom-assistants/Getting+the+API+endpoint
169
-
170
- ---
171
-
172
- ### 2. Add the SaaS Tenant
173
-
174
- Run the following command:
175
-
176
- ```bash
177
- orchestrate env add -n saas \
178
- -u [INSTANCE_URL] \
179
- -t mcsp \
180
- -a
181
- ```
182
- If you are using the staging setup, pass the `--iam-url` argument as follows:
183
- - For AWS:
184
- ```bash
185
- orchestrate env add -n saas \
186
- -u [INSTANCE_URL] \
187
- --iam-url https://iam.platform.test.saas.ibm.com \
188
- -a
189
- ```
190
-
191
- - For IBM Cloud:
192
- ```bash
193
- orchestrate env add -n saas \
194
- -u [INSTANCE_URL] \
195
- --iam-url https://iam.test.cloud.ibm.com \
196
- -a
197
- ```
198
-
199
- > When prompted, paste the API key generated above.
200
-
201
- ---
202
-
203
- ### 3. Set the IAM API Key Environment Variable
204
-
205
- ```bash
206
- export WATSONX_IAM_SAAS_APIKEY=[your_generated_api_key]
207
- ```
208
-
209
- ---
210
-
211
- ### 4. Update Your Test Config YAML
212
-
213
- Make sure your YAML config includes the correct SaaS tenant name:
214
-
215
- ```yaml
216
- test_paths:
217
- - benchmarks/TEST_CASE_NAME
218
-
219
- auth_config:
220
- url: [INSTANCE_URL]
221
- tenant_name: saas
222
-
223
- output_dir: "results/TEST_CASE_NAME/MODEL_NAME"
224
- ```
225
- - Use the staging URL if you are using the staging setup.
226
- ---
227
-
228
- ### 5. Run the Simulation in SaaS Mode
229
-
230
- ```bash
231
- python -m wxo_agentic_evaluation.main --config benchmarks/hr_sample/config.yaml
232
- ```
233
-
234
- ---
235
-
236
- ### Batch Test case Generation
237
-
238
- For full instructions on setting up tools, writing stories, configuring the pipeline, and generating batch test cases, see the [Batch Test case Generation Guide](./benchmarks/batch_sample/README.MD).
239
-
240
- ## Using Model Proxy Provider
241
-
242
- To use the model proxy provider (which allows direct access to LLM models), follow these steps:
243
-
244
- 1. Set up environment variables:
245
- ```sh
246
- export WO_INSTANCE=<your-instance-url>
247
- export WO_API_KEY=<your-api-key>
248
- ```
249
-
250
- 2. Create a configuration file similar to [benchmarks/hr_sample/config_model_proxy.yaml](benchmarks/hr_sample/config_model_proxy.yaml):
251
- ```yaml
252
- test_paths:
253
- - <your-test-path>
254
-
255
- auth_config:
256
- url: http://localhost:4321
257
- tenant_name: wxo-dev
258
-
259
- provider_config:
260
- provider: "model_proxy"
261
- model_id: "<model-id>"
262
-
263
- output_dir: "<output-dir>"
264
- ```
265
-
266
- 3. Run the evaluation:
267
- ```sh
268
- python -m wxo_agentic_evaluation.main --config path/to/your/config.yaml
269
- ```
270
-
271
- ## Using Ollama
272
-
273
- To use a model from Ollama (local LLM deployment), follow these steps:
274
-
275
- 1. Make sure you have [Ollama](https://ollama.com) installed and running on your system.
276
-
277
- 2. Pull your desired model using Ollama (e.g. llama3.1:8b):
278
- ```sh
279
- ollama pull <model-id>
280
- ```
281
-
282
- 3. Create a configuration file similar to [benchmarks/hr_sample/config_ollama.yaml](benchmarks/hr_sample/config_ollama.yaml):
283
- ```yaml
284
- test_paths:
285
- - <your-test-path>
286
-
287
- auth_config:
288
- url: http://localhost:4321
289
- tenant_name: wxo-dev
290
-
291
- provider_config:
292
- provider: "ollama"
293
- model_id: "<model-id>"
294
-
295
- output_dir: "results/ollama/<model-name>"
296
- ```
297
-
298
- 4. Run the evaluation:
299
- ```sh
300
- python -m wxo_agentic_evaluation.main --config path/to/your/config.yaml
301
- ```
302
-
303
- ## Workflow diagram
304
-
305
- To help you better understand the workflow, the diagram below shows how this repo works together with the wxO Lite Python SDK and a wxO runtime.
306
-
307
- ![Alt text](./doc/assets/workflow.png "Workflow")
308
-
309
- Inputs:
310
- - [a test config yaml](benchmarks/hr_sample/config.yaml)
311
- - a json file containing test cases, see [example 1](benchmarks/hr_sample/data_complex.json) or [example 2](benchmarks/hr_sample/data_simple.json) as a reference
312
- - optionally, a `tools.py` file containing tool definitions and one or more agent definitions, e.g. `benchmarks/hr_sample/hr_agent.json`. These files are not needed if you already have a tenant set up with such tools and agents.
313
-
314
- Steps:
315
- 1. (optional) This repo calls the wxO Lite Python SDK (which calls several endpoints exposed by the wxO runtime) to set up the environment needed for the evaluation run; tools and agents are imported.
316
- 2. Create test cases by following the sample instructions at [benchmarks/sap_successfactors_sample/annotation/README.md](benchmarks/sap_successfactors_sample/annotation/README.md)
317
- 3. Start the evaluation run by calling the `wxo_agentic_evaluation.main` script of this repo, which will invoke the `/runs` endpoint of the wxO runtime to simulate conversations with the agent
318
- 4. Reports and metrics will be generated by this repo
319
- 5. (optional) This repo calls the wxO Lite Python SDK (which calls several endpoints exposed by the wxO runtime) to clean up the environment, so that the tools and agents do not affect subsequent runs with the same tenant.
320
- 6. (optional) You can generate further error analysis by using the `wxo_agentic_evaluation.analyze_run` script from this repo (a small driver sketch chaining steps 3 and 6 follows below).
321
-
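As referenced in step 6, a small driver script can chain the evaluation run (step 3) and the error analysis (step 6) by shelling out to the documented CLI entry points. This is only a sketch; the paths are placeholders and should be adjusted to your benchmark and output locations.

```python
# Minimal driver sketch chaining the evaluation run and the error analysis via
# the documented CLI modules. Paths are placeholders.
import subprocess

CONFIG = "benchmarks/hr_sample/config.yaml"
OUTPUT_DIR = "results/test"

subprocess.run(
    ["python", "-m", "wxo_agentic_evaluation.main",
     "--config", CONFIG, f"--output_dir={OUTPUT_DIR}"],
    check=True,
)
subprocess.run(
    ["python", "-m", "wxo_agentic_evaluation.analyze_run",
     "--input_data", f"{OUTPUT_DIR}/messages/",  # messages/ subfolder, as in the analyze example above
     "--ground_truth", "benchmarks/hr_sample/"],
    check=True,
)
```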
322
-
323
- ## results
324
- ### workday
325
-
326
- | Model | User Setting | Total Step | Agent Step | Journey Success | Wrong Function Calls | Bad Calls | Wrong Parameters | Wrong Routing Calls | Text Match | Test Cases | WXO Avg. Response Time |
327
- |-------------------------------|------------- |-------------|------------|------------------|-----------------------|-----------|-------------------|----------------------|------------|------------|-------------------------|
328
- | llama-3-2-90b-vision-instruct | normal | 8.13 | 4.21 | 0.87 | 0.01 | 0.0 | 0.20 | 0.00 | 0.95 | 38 | 15.09 |
329
- | llama-3-2-90b-vision-instruct | verbose | 11.76 | 6.11 | 0.79 | 0.02 | 0.0 | 0.19 | 0.00 | 0.86 | 38 | 14.32 |
330
- | llama-3-405b-instruct | normal | 9.66 | 5.03 | 0.82 | 0.02 | 0.0 | 0.47 | 0.04 | 0.89 | 38 | 13.36 |
331
- | llama-3-405b-instruct | verbose | 11.76 | 6.11 | 0.84 | 0.05 | 0.0 | 0.70 | 0.04 | 0.92 | 38 | 12.21 |
332
-
333
- You can find the detailed results under [results/workday](results/workday)
334
-
335
- ### sap successfactor (rel-1.7)
336
-
337
- | Model | User Setting | Total Step | Agent Step | Journey Success | Wrong Function Calls | Bad Calls | Wrong Parameters | Wrong Routing Calls | Text Match | Test Cases | WXO Avg. Response Time |
338
- |------------------------------ |--------------|-------------|------------|------------------|-----------------------|-----------|-------------------|----------------------|------------|------------|-------------------------|
339
- | llama-3-2-90b-vision-instruct | normal | 10.32 | 5.84 | 0.73 | 0.04 | 0.0 | 0.06 | 0.08 | 0.84 | 38 | - |
340
- | llama-3-2-90b-vision-instruct | verbose | 11.19 | 6.35 | 0.68 | 0.04 | 0.0 | 0.08 | 0.16 | 0.81 | 38 | - |
341
- | llama-3-405b-instruct | normal | 11.41 | 6.24 | 0.46 | 0.01 | 0.0 | 0.23 | 0.02 | 0.62 | 38 | - |
342
- | llama-3-405b-instruct | verbose | 15.32 | 8.38 | 0.46 | 0.04 | 0.0 | 0.40 | 0.06 | 0.62 | 38 | - |
343
-
344
- You can find the detailed results under [results/sap_successfactor_4](results/sap_successfactor_4)
345
-
346
-
347
- ## METRICS KEY
348
-
349
- | Metric | Description | Calculation | Range/Type |
350
- |--------|-------------|-------------|------------|
351
- | **Total Step** | Total number of messages/steps in the conversation | Count of all messages in the conversation | Integer ≥ 0 |
352
- | **Agent Step** | Number of assistant responses (text or tool calls) | Count of messages where `role == "assistant"` and `type` is text or tool_call | Integer ≥ 0 |
353
- | **Ground Truth Calls** | Expected number of tool calls based on ground truth | Count of goal_details with `type == ContentType.tool_call` | Integer ≥ 0 |
354
- | **Journey Success** | Whether the agent completed tasks in the correct order | `is_topological_sort(ground_truth.goals, labelled_messages)` | Boolean |
355
- | **Wrong Function Calls** | Number of calls to non-existent or unexpected functions | Count of labelled_messages containing "_WRONG_FUNCTION_CALL" | Integer ≥ 0 |
356
- | **Bad Calls** | Reserved metric for future use | Currently hardcoded to 0 | Integer (0) |
357
- | **Wrong Parameters** | Number of tool calls with incorrect parameters | Count of labelled_messages containing "_WRONG_PARAMETERS" | Integer ≥ 0 |
358
- | **Wrong Routing Calls** | Number of incorrect agent routing calls | Count of labelled_messages containing "_WRONG_ROUTING_CALL" | Integer ≥ 0 |
359
- | **Text Match** | Quality of final text summary | "Keyword Mismatch" \| "Semantic Mismatch" \| "Summary Matched" | Categorical |
360
- | **Tool Call Accuracy** | Percentage of non-routing tool calls that were executed correctly | `correct_tool_calls / non_transfer_tool_calls` | Float 0.0-1.0 |
361
- | **Tool Call Relevancy** | Percentage of non-routing tool calls that were relevant to the task | `(relevant_tool_calls - expected_routing_calls) / non_transfer_tool_calls` | Float 0.0-1.0 |
362
- | **Agent Routing Accuracy** | Percentage of routing calls that were executed correctly | `expected_routing_calls / total_routing_calls` | Float 0.0-1.0 |
363
- | **WXO Average Response Time (Secs)** | Average response time for agent responses | Mean response time across all agent interactions | Float ≥ 0.0 |
364
-
365
- ### Key Definitions
366
-
367
- - **Relevant Tool Call**: A tool call whose name matches one of the expected tool names defined in the ground truth
368
- - **Correct Tool Call**: A relevant tool call that also has the correct parameters/arguments
369
- - **Routing Call**: A tool call whose name starts with "transfer_" (used for agent-to-agent routing)
370
- - **Non-Transfer Tool Call**: Regular tool calls excluding routing calls (`total_tool_calls - total_routing_calls`)
371
- - **Expected Routing Call**: A routing call that was both expected and executed correctly
372
-
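Using the counts defined above, the three ratio metrics from the table can be computed as in this minimal sketch. It is illustrative only, not the framework's actual implementation.

```python
# Illustrative computation of the ratio metrics from the table above, using the
# count definitions in this section. Not the framework's actual implementation.
def ratio_metrics(total_tool_calls: int,
                  total_routing_calls: int,
                  relevant_tool_calls: int,
                  correct_tool_calls: int,
                  expected_routing_calls: int) -> dict[str, float]:
    def safe(num: float, den: float) -> float:
        return num / den if den else 0.0  # guard against conversations with no such calls

    non_transfer_tool_calls = total_tool_calls - total_routing_calls
    return {
        "tool_call_accuracy": safe(correct_tool_calls, non_transfer_tool_calls),
        "tool_call_relevancy": safe(relevant_tool_calls - expected_routing_calls, non_transfer_tool_calls),
        "agent_routing_accuracy": safe(expected_routing_calls, total_routing_calls),
    }
```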
373
- ### Averaging Behavior
374
-
375
- - **Per Test Case Average**: Total Step, Agent Step, Tool Call Accuracy, Tool Call Relevancy, Agent Routing Accuracy, WXO Average Response Time
376
- - **Per Ground Truth Calls Average**: Wrong Function Calls, Bad Calls, Wrong Parameters, Wrong Routing Calls (illustrated in the aggregation sketch below)
377
- - **Special Calculations**:
378
- - Journey Success: Proportion of test cases that succeeded (0.0-1.0)
379
- - Text Match: Proportion of test cases with "Summary Matched" (0.0-1.0)
380
-
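The sketch below illustrates the difference between per-test-case and per-ground-truth-calls averaging. The per-test-case result dictionaries are hypothetical and do not reflect the framework's output format.

```python
# Illustrative aggregation: step counts are averaged per test case, while error
# counts are averaged over the total number of ground-truth calls.
def aggregate(results: list[dict]) -> dict[str, float]:
    n_cases = len(results) or 1
    total_gt_calls = sum(r["ground_truth_calls"] for r in results) or 1
    return {
        # per test case average
        "total_step": sum(r["total_step"] for r in results) / n_cases,
        # per ground-truth-calls average
        "wrong_function_calls": sum(r["wrong_function_calls"] for r in results) / total_gt_calls,
        # special calculation: proportion of test cases with a successful journey
        "journey_success": sum(bool(r["journey_success"]) for r in results) / n_cases,
    }
```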
@@ -1,56 +0,0 @@
1
- wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- wxo_agentic_evaluation/analyze_run.py,sha256=C4HowEukNMM-H8FkRcHRqkiNYIQVCoTKbBLiqr1cFRM,4332
3
- wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
4
- wxo_agentic_evaluation/arg_configs.py,sha256=UCrGcakFaAM3reFquMn03qNtKe7Pg8ScbOF0K7o8VDU,2240
5
- wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
6
- wxo_agentic_evaluation/data_annotator.py,sha256=DJVG2CdhJRAJ3X1ARbrsn9bPjTuytCDGIBM4PEexfnk,8214
7
- wxo_agentic_evaluation/evaluation_package.py,sha256=jOSe-TCJdAWCk1sWpRYfi_EMkZERrVf5swm-bxfozzc,21333
8
- wxo_agentic_evaluation/inference_backend.py,sha256=fhEB1kaNN-A08RtJglBiv3QL_8nq8m-g7xbF4WbHAvU,25691
9
- wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
10
- wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
11
- wxo_agentic_evaluation/llm_user.py,sha256=0zSsyEM7pYQtLcfbnu0gEIkosHDwntOZY84Ito6__SM,1407
12
- wxo_agentic_evaluation/main.py,sha256=tRXVle2o1JhwJZOTpqdsOzBOpxPYxAH5ziZkbCmzfyU,11470
13
- wxo_agentic_evaluation/record_chat.py,sha256=9l99n4TRdwDLAOKct0ZJKKXE5Y7qE7X5WLWUpWUHfLI,7739
14
- wxo_agentic_evaluation/resource_map.py,sha256=-dIWQdpEpPeSCbDeYfRupG9KV1Q4NlHGb5KXywjkulM,1645
15
- wxo_agentic_evaluation/service_instance.py,sha256=yt7XpwheaRRG8Ri4TFIS5G2p5mnCwvNgj6T7bDF5uTU,6494
16
- wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
17
- wxo_agentic_evaluation/tool_planner.py,sha256=e-lBb4w1klT1HOL9BTwae3lkGv5VBuYC397mSJgOhus,12622
18
- wxo_agentic_evaluation/type.py,sha256=uVKim70XgPW-3L7Z0yRO07wAH9xa-NcjfaiIyPhYMR0,3413
19
- wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohXTxTmpqRhfzQygCWDYHBHg,18125
20
- wxo_agentic_evaluation/analytics/tools/main.py,sha256=ocwPUlEjyK7PMdXBg5OM2DVDQBcaHT4UjR4ZmEhR0C4,6567
21
- wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
22
- wxo_agentic_evaluation/analytics/tools/ux.py,sha256=EaWNvsq68X_i2H4pQ2fABtXEEmk3ZXqaMrTs42_7MwE,18347
23
- wxo_agentic_evaluation/external_agent/__init__.py,sha256=LY3gMNzfIEwjpQkx5_2iZFHGQiUL4ymEkKL1dc2uKq4,1491
24
- wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xW8tqPcm8JYvveSxf-oFCajvF5J8ORaK23YXu-LuFmc,4142
25
- wxo_agentic_evaluation/external_agent/performance_test.py,sha256=bCXUsW0OeUzwfSSYObgfAmEU5vARkD-PblYU-mU9aPY,2507
26
- wxo_agentic_evaluation/external_agent/types.py,sha256=4kfWD_ZyGZmpbib33gCxEuKS4HLb7CEtferlQgQe7uk,1624
27
- wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
- wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
29
- wxo_agentic_evaluation/metrics/metrics.py,sha256=9O2m6T2iW-PMjGrTdMbOHP2Pr4RN0NwbEp6YgFpTi3I,5572
30
- wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
- wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
32
- wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
33
- wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=QXuk2ecnEPPRCPoWZJyrtb1gAVuIPljB91YoqPBp2Dk,1896
34
- wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngRqTAVD4w0va_HtA2FR4G1POIIamM,2524
35
- wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
36
- wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
37
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=nDfCD0o9cRYmsgIjzD-RZNQxotlvuqrzdsZIY-vT794,684
38
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=MltPfEXYyOwEC2xNLl7UsFTxNbr8CwHaEcPqtvKE2r8,2749
39
- wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
40
- wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
41
- wxo_agentic_evaluation/prompt/template_render.py,sha256=FVH5ew2TofC5LGqQzqNj90unrxooUZv_5XxJzVdz8uM,3563
42
- wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
43
- wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
44
- wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
- wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
46
- wxo_agentic_evaluation/service_provider/__init__.py,sha256=EaY4jjKp58M3W8N3b3a8PNC2S81xA7YV2_QkTIy9DfI,1600
47
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=X5tiE0IKCR2CqhwEGm91LOdzFZQWSXzXQgLOtzi6ng0,4002
48
- wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=HMHQVUGFbLSQI1dhysAn70ozJl90yRg-CbNd4vsz-Dc,1116
49
- wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId7kn6-CetQuNZl00EP-Nl28,417
50
- wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=iKVkWs4PRTM_S0TIdPgQ9NFQWPlDvcEvuHpQlIPzO10,6216
51
- wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
52
- wxo_agentic_evaluation/utils/utils.py,sha256=JYZQZ-OBy43gAWg9S7duJi9StRApGJATs2JUsW1l30M,6057
53
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA,sha256=L6Hq_FbQ4AY3g3Aho2wC6Io9rcLpnwNDm49BPTHbVCQ,17667
54
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
55
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
56
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD,,