ibm-watsonx-orchestrate-evaluation-framework 1.0.8__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info}/METADATA +103 -109
- ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/RECORD +97 -0
- wxo_agentic_evaluation/analytics/tools/main.py +1 -18
- wxo_agentic_evaluation/analyze_run.py +358 -97
- wxo_agentic_evaluation/arg_configs.py +28 -1
- wxo_agentic_evaluation/description_quality_checker.py +149 -0
- wxo_agentic_evaluation/evaluation_package.py +58 -17
- wxo_agentic_evaluation/inference_backend.py +32 -17
- wxo_agentic_evaluation/llm_user.py +2 -1
- wxo_agentic_evaluation/metrics/metrics.py +22 -1
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +9 -1
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/template_render.py +34 -3
- wxo_agentic_evaluation/quick_eval.py +342 -0
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +113 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +286 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +96 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +128 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +27 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +237 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +101 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +263 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +455 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +156 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +547 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +258 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +333 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +188 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +409 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +42 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +145 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +114 -0
- wxo_agentic_evaluation/service_instance.py +2 -2
- wxo_agentic_evaluation/service_provider/__init__.py +15 -6
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +4 -3
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +138 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +11 -4
- wxo_agentic_evaluation/tool_planner.py +3 -1
- wxo_agentic_evaluation/type.py +33 -2
- wxo_agentic_evaluation/utils/__init__.py +0 -1
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +157 -0
- wxo_agentic_evaluation/utils/rich_utils.py +174 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +167 -5
- ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info}/top_level.txt +0 -0
--- ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/METADATA
+++ ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ibm-watsonx-orchestrate-evaluation-framework
-Version: 1.0.8
+Version: 1.1.0
 Summary: The WxO evaluation framework
 Author-email: Haode Qi <Haode.Qi@ibm.com>
 License: MIT
@@ -10,7 +10,7 @@ Requires-Dist: rich~=13.9.4
 Requires-Dist: pydantic<3.0.0,>=2.10.3
 Requires-Dist: pyyaml~=6.0.2
 Requires-Dist: jinja2~=3.1.5
-Requires-Dist: python-dotenv
+Requires-Dist: python-dotenv
 Requires-Dist: dataclasses-json~=0.6.7
 Requires-Dist: jsonargparse~=4.37.0
 Provides-Extra: dev
@@ -32,29 +32,46 @@ Requires-Dist: ipywidgets~=8.1.6; extra == "rag-eval"
 Requires-Dist: jupyter_contrib_nbextensions; extra == "rag-eval"
 Requires-Dist: jupyter~=1.1.1; extra == "rag-eval"
 
-# WXO
-
-
-
+# WXO Agent Evaluation Framework
+
+## Table of Contents
+- [Overview](#overview)
+- [ADK Setup Guide](#adk-setup-guide)
+- [Setup](#setup-for-evaluation-framework)
+- [Quick Experiment](#quick-experiment-against-the-default-wxo-dev-env)
+- [Run Against a Deployed Local Env](#run-against-a-deployed-local-env)
+- [Run Against a SaaS Tenant](#run-against-a-saas-tenant)
+- [Analyze Results](#analyze-results)
+- [Record Chat Sessions](#record-chat-sessions)
+- [Batch Test Case Generation](#batch-test-case-generation)
+- [Using Model Proxy Provider](#using-model-proxy-provider)
+- [Using Ollama](#using-ollama)
+- [Workflow Diagram](#workflow-diagram)
+- [Results](#results)
+- [Metrics](#metrics)
+
+## Overview
+
+- This framework is designed to test a tool-calling agent's ability to make real API calls against a `wxo-dev` testing tenant on your local wxo-lite server instance. To run evaluation against a remote tenant on SaaS, follow [Run Against a SaaS Tenant](#run-against-a-saas-tenant).
 - As an LLM-as-agent evaluation framework, we aim to test the agent's ability to do the following:
 - We use a ground truth to evaluate our conversation against after inference. The process of inference is manifested through a user-LLM and agent simulation. Please set `enable_verbose_logging: True` in your configuration.
-- Make real API calls correctly and efficiently. We provide metrics
-
+- Make real API calls correctly and efficiently. We provide metrics such as tool call precision, recall, and routing accuracy to measure the agent's performance against the ground truth.
 - The `benchmarks/` folder contains test-cases for the different agents we have evaluated so far. They are segmented by release versions of the `wxo-domains` repository.
 - The agent calls the `runs/` endpoint of the wxo-lite server instance, and the actual tool code is executed on the server side. The server database is not visible to our framework.
 
-##
-Follow the [
-The current framework is compatible with ADK version >= 1.20, <= 1.6.0
+## ADK Setup Guide
+Follow the [ADK setup guide](https://github.ibm.com/WatsonOrchestrate/wxo-clients/tree/main) to install the ADK.
 
-
+The current framework is compatible with ADK version >= 1.20, <= 1.7.0
+
+## Setup for Evaluation Framework
 Run the following command to install evaluation framework in the same env:
 ```
 pip install -e .
 ```
 
-##
-###
+## Contribution Guide
+### Secret Resolution
 install detect secret utilities:
 ```
 pip install --upgrade git+https://github.com/ibm/detect-secrets.git@master#egg=detect-secrets
@@ -65,7 +82,7 @@ detect-secrets scan --exclude-files "benchmark|results" --update .secrets.baseli
 ```
 
 
-##
+## Quick Experiment Against the Default wxo-dev Env
 ```bash
 orchestrate server start
 export WATSONX_SPACE_ID=""
@@ -88,8 +105,7 @@ Note:
 1. This approach uses the default `wxo-dev` tenant already available in your orchestrate env if you have used wxo-lite before.
 2. ADK also reads the environment variables. If you have an env conflict, start the wxo-lite server before exporting the envs.
 
-
-## run against a deployed local env
+## Run Against a Deployed Local Env
 
 1. start the orchestrate server: `orchestrate server start`
 2. create a simple test case like the following and save it in a folder like `benchmarks/TEST_CASE_NAME`:
@@ -116,7 +132,6 @@ Note:
 - The target agent name can be found with `orchestrate agents list`
 - the example shown only evaluates the final response for the agent. For more sophisticated examples, follow `benchmarks/hr_sample/data_simple.json` or `benchmarks/hr_sample/data_complex.json`.
 
-
 3. create a test config yaml like the following:
 ```YAML
 test_paths:
@@ -129,7 +144,6 @@ auth_config:
 output_dir: "results/TEST_CASE_NAME/MODEL_NAME"
 ```
 
-
 NOTE: run `orchestrate env list` to find the name of the active tenant. For the default `local` tenant, the name should be `wxo-dev`.
 
 4. Run the test:
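For orientation, the full shape of that config might look like the sketch below. Only `test_paths`, `auth_config`, and `output_dir` are visible in the hunks above, so the nested keys and the URL are assumptions; copy the real shape from `benchmarks/hr_sample/config.yaml`.

```YAML
# Hypothetical sketch only — the auth_config keys and the port are assumptions,
# not confirmed by this diff; see benchmarks/hr_sample/config.yaml for the real file.
test_paths:
  - "benchmarks/TEST_CASE_NAME/data.json"   # the test case saved in step 2
auth_config:
  url: "http://localhost:4321"              # assumed local wxo-lite endpoint
  tenant_name: "wxo-dev"                    # active tenant from `orchestrate env list`
output_dir: "results/TEST_CASE_NAME/MODEL_NAME"
```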
@@ -141,27 +155,17 @@ python -m wxo_agentic_evaluation.main --config benchmarks/hr_sample/config.yaml
 
 NOTE: if your run fails for any reason and doesn't cover all the test cases, you can re-run the main script with `--skip_available_results=True` to skip the test cases that are already completed.
 
-##
-```bash
-python -m wxo_agentic_evaluation.analyze_run --input_data results/hr_sample/llama_3_2_90b/messages/data_simple.messages.json --ground_truth benchmarks/hr_sample/data_simple.json --enable_verbose_logging False
-```
-You can also run the analyze script on a batch of test cases in a folder
-```bash
-python -m wxo_agentic_evaluation.analyze_run --input_data results/hr_sample/llama_3_2_90b/messages/ --ground_truth benchmarks/hr_sample/ --enable_verbose_logging False --enable_verbose_logging False
-```
-
+## Run Against a SaaS Tenant
 
-
+Orchestrate ADK ≥ 1.2 is required for this section.
 
 This section describes how to run benchmark tests using a **SaaS-based Orchestrate tenant**. The rest of the setup (test case creation, config structure, etc.) is similar to the [local setup](#run-against-a-deployed-local-env) and can be referred to as needed.
 
 ### Prerequisites
 
-- **Orchestrate
+- **Orchestrate ADK version ≥ 1.2** is required.
 - Access to the **production SaaS Orchestrate instance** or **staging SaaS Orchestrate instance**.
 
----
-
 ### 1. Get Authentication Details
 
 1. Visit the Orchestrate UI [Prod/Staging]:
@@ -178,8 +182,6 @@ For other locations, please use the designated url for your data center.
 4. For more detailed instructions, refer to this guide:
 https://developer.ibm.com/apis/catalog/watsonorchestrate--custom-assistants/Getting+the+API+endpoint
 
----
-
 ### 2. Add the SaaS Tenant
 
 Run the following command:
@@ -209,16 +211,12 @@ orchestrate env add -n saas \
 
 > When prompted, paste the API key generated above.
 
-
-
-### 3. Set the IAM API Key Environment Variable
+### 3. Set `WO_API_KEY` Environment Variable
 
 ```bash
-export
+export WO_API_KEY=[your_generated_api_key]
 ```
 
----
-
 ### 4. Update Your Test Config YAML
 
 Make sure your YAML config includes the correct SaaS tenant name:
@@ -234,17 +232,55 @@ auth_config:
 output_dir: "results/TEST_CASE_NAME/MODEL_NAME"
 ```
 - Use staging url if using the staging set-up.
----
-
 ### 5. Run the Simulation in SaaS Mode
 
 ```bash
 python -m wxo_agentic_evaluation.main --config benchmarks/hr_sample/config.yaml
 ```
 
-
+## Analyze Results
+
+The `analyze_run.py` script summarizes agent evaluation results, showing successes, failures, and reasons for errors to help improve agent performance. After running an evaluation, analyze the results with:
+
+```bash
+python -m wxo_agentic_evaluation.analyze_run --data_path path/to/results
+```
+
+Additionally, the script comes with a feature to analyze the quality of tool descriptions for failing tools where the reason for failure is incorrect parameter usage by the agent.
 
-
+In order to analyze the description(s) of your failing tools, consider passing the optional flag `--tool_definition_path` like so:
+
+```bash
+python -m wxo_agentic_evaluation.analyze_run --data_path path/to/results --tool_definition_path path/to/.py/source/file/containing/tool/definitions
+```
+
+**Note:** If the flag `tool_definition_path` is not provided, description quality analysis is simply skipped.
+
+## Record Chat Sessions
+
+The `record_chat.py` script lets you capture your chat sessions in the chat UI and automatically generate ground truth data for evaluating your agents. This is valuable for benchmarking and experimenting with agent behavior under different configurations.
+
+Start the chat interface:
+
+```bash
+orchestrate chat start
+```
+
+Then open your browser to [http://localhost:3000/chat-lite](http://localhost:3000/chat-lite) and select the agent you wish to interact with.
+
+To begin recording, run:
+
+```bash
+python -m wxo_agentic_evaluation.record_chat --output_dir dir/to/save/recordings
+```
+
+While this process is running, for every chat session, annotated ground truth data is generated in your output directory: `<THREAD_ID>_annotated_data.json`
+
+Review the generated annotated data for accuracy before using it for evaluation.
+
+Press `Ctrl+C` in the terminal to stop recording when your session is complete.
+
+## Batch Test Case Generation
 
 For full instructions on setting up tools, writing stories, configuring the pipeline, and generating batch test cases, see the [Batch Test case Generation Guide](./benchmarks/batch_sample/README.MD).
 
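On the new `--tool_definition_path` flag above: the checker reads a Python source file containing tool definitions. Below is a minimal sketch of such a file, assuming descriptions are drawn from function signatures and docstrings; the function name and parameters are invented for illustration.

```python
# Hypothetical tools source file to pass via --tool_definition_path.
# Assumption: the description quality checker judges docstrings like this one.
def get_time_off_balance(employee_id: str, leave_type: str) -> float:
    """Return the remaining leave balance, in days, for one employee.

    :param employee_id: the worker's unique identifier, e.g. "E12345"
    :param leave_type: one of "vacation", "sick", or "personal"
    """
    raise NotImplementedError  # tools execute server-side; no body needed here
```

A vague docstring (for example, "gets data for a user") is exactly the kind of description this analysis would flag when the agent keeps passing wrong parameters.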
@@ -311,9 +347,9 @@ To use model from Ollama (local LLM deployment), follow these steps:
 python -m wxo_agentic_evaluation.main --config path/to/your/config.yaml
 ```
 
-## Workflow
+## Workflow Diagram
 
-To help better understand the workflow, this is a diagram of how this repo works together with wxO
+To help better understand the workflow, this is a diagram of how this repo works together with wxO ADK and a wxO runtime.
 
 ![test](docs/wxo.drawio.png)
 
@@ -322,70 +358,28 @@ Inputs:
 - a json file containing test cases, see [example 1](benchmarks/hr_sample/data_complex.json) or [example 2](benchmarks/hr_sample/data_simple.json) as a reference
 - optionally, a `tools.py` file for tools definition and one or more agent definitions e.g. `benchmarks/hr_sample/hr_agent.json`. Alternatively, these files are not needed if you have a tenant already set up with such tools and agents
 
-
-
-2. Create test cases by following the sample instructions at [benchmarks/sap_successfactors_sample/annotation/README.md](benchmarks/sap_successfactors_sample/annotation/README.md)
-3. Start the evaluation run by calling the `wxo_agentic_evaluation.main` script of this repo, which will invoke the `/runs` endpoint of the wxO runtime to simulate conversations with the agent
-4. Reports and metrics will be generated by this repo
-5. (optional) this repo will call wxO Lite python SDK (which calls several endpoints exposed by the wxO runtime) to clean up the environment, to avoid the tools and agents affecting subsequent runs with the same tenant
-6. (optional) You can generate further error analysis by using the `wxo_agentic_evaluation.analyze_run` script from this repo
-
-
-## results
-### workday
-
-| Model | User Setting | Total Step | Agent Step | Journey Success | Wrong Function Calls | Bad Calls | Wrong Parameters | Wrong Routing Calls | Text Match | Test Cases | WXO Avg. Response Time |
-|-------------------------------|--------------|------------|------------|------------------|-----------------------|-----------|-------------------|----------------------|------------|------------|-------------------------|
-| llama-3-2-90b-vision-instruct | normal | 8.13 | 4.21 | 0.87 | 0.01 | 0.0 | 0.20 | 0.00 | 0.95 | 38 | 15.09 |
-| llama-3-2-90b-vision-instruct | verbose | 11.76 | 6.11 | 0.79 | 0.02 | 0.0 | 0.19 | 0.00 | 0.86 | 38 | 14.32 |
-| llama-3-405b-instruct | normal | 9.66 | 5.03 | 0.82 | 0.02 | 0.0 | 0.47 | 0.04 | 0.89 | 38 | 13.36 |
-| llama-3-405b-instruct | verbose | 11.76 | 6.11 | 0.84 | 0.05 | 0.0 | 0.70 | 0.04 | 0.92 | 38 | 12.21 |
-
-You can find the detailed results under [results/workday](results/workday)
-
-### sap successfactor (rel-1.7)
-
-| Model | User Setting | Total Step | Agent Step | Journey Success | Wrong Function Calls | Bad Calls | Wrong Parameters | Wrong Routing Calls | Text Match | Test Cases | WXO Avg. Response Time |
-|-------------------------------|--------------|------------|------------|------------------|-----------------------|-----------|-------------------|----------------------|------------|------------|-------------------------|
-| llama-3-2-90b-vision-instruct | normal | 10.32 | 5.84 | 0.73 | 0.04 | 0.0 | 0.06 | 0.08 | 0.84 | 38 | - |
-| llama-3-2-90b-vision-instruct | verbose | 11.19 | 6.35 | 0.68 | 0.04 | 0.0 | 0.08 | 0.16 | 0.81 | 38 | - |
-| llama-3-405b-instruct | normal | 11.41 | 6.24 | 0.46 | 0.01 | 0.0 | 0.23 | 0.02 | 0.62 | 38 | - |
-| llama-3-405b-instruct | verbose | 15.32 | 8.38 | 0.46 | 0.04 | 0.0 | 0.40 | 0.06 | 0.62 | 38 | - |
-
-You can find the detailed results under [results/sap_successfactor_4](results/sap_successfactor_4)
-
-
-## METRICS KEY
-
-| Metric | Description | Calculation | Range/Type |
-|--------|-------------|-------------|------------|
-| **Total Step** | Total number of messages/steps in the conversation | Count of all messages in the conversation | Integer ≥ 0 |
-| **Agent Step** | Number of assistant responses (text or tool calls) | Count of messages where `role == "assistant"` and `type` is text or tool_call | Integer ≥ 0 |
-| **Ground Truth Calls** | Expected number of tool calls based on ground truth | Count of goal_details with `type == ContentType.tool_call` | Integer ≥ 0 |
-| **Journey Success** | Whether the agent completed tasks in the correct order | `is_topological_sort(ground_truth.goals, labelled_messages)` | Boolean |
-| **Wrong Function Calls** | Number of calls to non-existent or unexpected functions | Count of labelled_messages containing "_WRONG_FUNCTION_CALL" | Integer ≥ 0 |
-| **Bad Calls** | Reserved metric for future use | Currently hardcoded to 0 | Integer (0) |
-| **Wrong Parameters** | Number of tool calls with incorrect parameters | Count of labelled_messages containing "_WRONG_PARAMETERS" | Integer ≥ 0 |
-| **Wrong Routing Calls** | Number of incorrect agent routing calls | Count of labelled_messages containing "_WRONG_ROUTING_CALL" | Integer ≥ 0 |
-| **Text Match** | Quality of final text summary | "Keyword Mismatch" \| "Semantic Mismatch" \| "Summary Matched" | Categorical |
-| **Tool Call Accuracy** | Percentage of non-routing tool calls that were executed correctly | `correct_tool_calls / non_transfer_tool_calls` | Float 0.0-1.0 |
-| **Tool Call Relevancy** | Percentage of non-routing tool calls that were relevant to the task | `(relevant_tool_calls - expected_routing_calls) / non_transfer_tool_calls` | Float 0.0-1.0 |
-| **Agent Routing Accuracy** | Percentage of routing calls that were executed correctly | `expected_routing_calls / total_routing_calls` | Float 0.0-1.0 |
-| **WXO Average Response Time (Secs)** | Average response time for agent responses | Mean response time across all agent interactions | Float ≥ 0.0 |
+## Results
+You can find benchmark results [here](benchmarks/domain_1.8/README.md)
 
-
+## Metrics
 
-
-
-
-
-
+| Metric | Description | Calculation | Range/Type |
+|----------------------------|--------------------------------------------------------------------|------------------------------------------------------------------------------|--------------------|
+| **Total Steps** | Total number of messages/steps in the conversation | Count of all messages in the conversation | Integer ≥ 0 |
+| **LLM Steps** | Number of assistant (LLM) responses (text or tool calls) | Count of messages where `role == "assistant"` | Integer ≥ 0 |
+| **Total Tool Calls** | Number of tool calls made by the agent | Count of all tool calls | Integer ≥ 0 |
+| **Tool Call Precision** | Fraction of correct tool calls out of all tool calls | `correct_tool_calls / total_tool_calls` | Float 0.0–1.0 |
+| **Tool Call Recall** | Fraction of correct tool calls out of expected tool calls | `correct_tool_calls / expected_tool_calls` | Float 0.0–1.0 |
+| **Agent Routing Accuracy** | Fraction of correct agents visited (relevant_routing_calls) out of total number of agents visited (total_routing_calls) | `relevant_routing_calls / total_routing_calls` | Float 0.0–1.0 |
+| **Text Match** | Whether the final summary text matches the ground truth | `Summary Matched` \| `Summary MisMatched` | Categorical |
+| **Journey Success** | Whether the agent completed tasks in the correct order | Boolean (`True`/`False`) | Boolean |
+| **Avg Resp Time (sec)** | Average response time for agent responses | Mean response time across all agent interactions | Float ≥ 0.0 |
 
-###
-
-- **Per Test Case Average**: Total Step, Agent Step, Tool Call Accuracy, Tool Call Relevancy, Agent Routing Accuracy, WXO Average Response Time
-- **Per Ground Truth Calls Average**: Wrong Function Calls, Bad Calls, Wrong Parameters, Wrong Routing Calls
-- **Special Calculations**:
-  - Journey Success: Proportion of test cases that succeeded (0.0-1.0)
-  - Text Match: Proportion of test cases with "Summary Matched" (0.0-1.0)
+### Key Definitions
 
+- **Correct Tool Call**: A tool call that matches both the expected function and arguments.
+- **Expected Tool Call**: A tool call that is required by the ground truth.
+- **Routing Call**: When an agent routes to another agent.
+- **Relevant Routing Call**: An agent is relevant when it's either the entry point agent or it includes a tool that is presented in the ground-truth.
+- **Text Match**: Indicates if the agent's final summary matches the expected summary ("Summary Matched") or does not match ("Summary MisMatched").
+- **Journey Success**: Indicates if the agent completed all required tasks in the correct order.
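The new table and key definitions pin down the arithmetic exactly, so a minimal sketch of the three ratio metrics follows. The dataclass and function names here are assumptions for illustration, not the actual code in `wxo_agentic_evaluation/metrics/metrics.py`; only the formulas come from the table above.

```python
# Illustrative recomputation of the ratio metrics defined in the table above.
# Only the formulas come from the README; every name here is hypothetical.
from dataclasses import dataclass


@dataclass
class RunCounts:
    correct_tool_calls: int      # matched the expected function AND arguments
    total_tool_calls: int        # every tool call the agent made
    expected_tool_calls: int     # tool calls required by the ground truth
    relevant_routing_calls: int  # entry agent, or agents owning a ground-truth tool
    total_routing_calls: int     # every agent-to-agent route


def summarize(c: RunCounts) -> dict:
    def ratio(n: int, d: int) -> float:
        return n / d if d else 0.0

    return {
        "tool_call_precision": ratio(c.correct_tool_calls, c.total_tool_calls),
        "tool_call_recall": ratio(c.correct_tool_calls, c.expected_tool_calls),
        "agent_routing_accuracy": ratio(c.relevant_routing_calls, c.total_routing_calls),
    }


print(summarize(RunCounts(3, 4, 3, 2, 2)))
# {'tool_call_precision': 0.75, 'tool_call_recall': 1.0, 'agent_routing_accuracy': 1.0}
```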
--- /dev/null
+++ ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/RECORD
@@ -0,0 +1,97 @@
+wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/analyze_run.py,sha256=4QLlo_NQjCh5M52ztFHoMvk_jtwptKpVXDmdTxj2ikQ,13054
+wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
+wxo_agentic_evaluation/arg_configs.py,sha256=a3Lo3RurTOLysxmsliMKIqvld7T3ZTb4Kw_FPEeBC78,2997
+wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
+wxo_agentic_evaluation/data_annotator.py,sha256=6cUUpCTFSs36VF3wICLXWrWbEUJz6v-PzPeuzO9S1k8,8310
+wxo_agentic_evaluation/description_quality_checker.py,sha256=7vvGpPwa8J8ArTWAXRp865e_cHzSTMFLxkpI-rfj2ZQ,6097
+wxo_agentic_evaluation/evaluation_package.py,sha256=9NrpKaGOUnAkslP7t3vU3Uv4lFUs-XLu0IUO7q0Muik,23575
+wxo_agentic_evaluation/inference_backend.py,sha256=ItnwjhEJHX28sBS7CIVe7hmcy9FLd1HQEpzhdsJ1jDk,30341
+wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
+wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
+wxo_agentic_evaluation/llm_user.py,sha256=LhS7Ti9v3TLMrEv0og9N6yUF4y8lLMcMycEqVhwtGAE,1493
+wxo_agentic_evaluation/main.py,sha256=JYcOaSPM8EQdgsPFdYmelouH-3_o-OtLQ0oh5cjADOU,11933
+wxo_agentic_evaluation/quick_eval.py,sha256=nROa-xZ265-k8JJ1M4t1LZe4ucdJi8GuRNVuCWPiZTU,12525
+wxo_agentic_evaluation/record_chat.py,sha256=uFdbLt4HaMREN3q4HHAA1ZvtjoLdiBEyxPd9Eoc6svc,8103
+wxo_agentic_evaluation/resource_map.py,sha256=11qF1oJDwGNWOLYFVsIPsR66JK4eD0cqVOBKreK2mPQ,1644
+wxo_agentic_evaluation/service_instance.py,sha256=6Y7byxdQakB3NMP288Rhne3ygOumSSgJjBT5Q-YY1OA,6468
+wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
+wxo_agentic_evaluation/tool_planner.py,sha256=00e_d2Ju5J61priEaKWLkSK2yW0donK8KJCq0PfKUuw,13013
+wxo_agentic_evaluation/type.py,sha256=R_s2kFn3VydHI4y5aWSBEaYPpDODHF5yPb7MKbysxwk,4014
+wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohXTxTmpqRhfzQygCWDYHBHg,18125
+wxo_agentic_evaluation/analytics/tools/main.py,sha256=dxjjIlVQY-ZJ3NC6knW8r-kmTo8WWEhwlwZfP38uj8Q,6105
+wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
+wxo_agentic_evaluation/analytics/tools/ux.py,sha256=EaWNvsq68X_i2H4pQ2fABtXEEmk3ZXqaMrTs42_7MwE,18347
+wxo_agentic_evaluation/external_agent/__init__.py,sha256=9NomrFEZQPrh91nto_hEGwoSks77nerAbWqS0L70qnY,1511
+wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xW8tqPcm8JYvveSxf-oFCajvF5J8ORaK23YXu-LuFmc,4142
+wxo_agentic_evaluation/external_agent/performance_test.py,sha256=vaaAMBhJoQ0hQ4xq4Zp7E39Xtba05inWaKzkAtWlhlY,2426
+wxo_agentic_evaluation/external_agent/types.py,sha256=4kfWD_ZyGZmpbib33gCxEuKS4HLb7CEtferlQgQe7uk,1624
+wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
+wxo_agentic_evaluation/metrics/metrics.py,sha256=V9tcGHuwG1_m0Aa8ztmduBR8gufr6rpvZjlzPtPnDZQ,6236
+wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
+wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
+wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2,sha256=_Ty6QDcQcbde2ZP2HVvFtOCm_2mFu_1cUM6qj11MvcU,8085
+wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=QXuk2ecnEPPRCPoWZJyrtb1gAVuIPljB91YoqPBp2Dk,1896
+wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngRqTAVD4w0va_HtA2FR4G1POIIamM,2524
+wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
+wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
+wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=GAHtEJvFNtgWBQma1I9KJdhXdhmqbEQf7JY66Z1JLMU,1113
+wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2,sha256=yrLtXfmVIJ_C3XIaTvpqlQGlg9kKIibrVR3UzpzBEmo,1288
+wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2,sha256=90KF7fXW5PPwY8IkPqA6ZflDMkr_KFDpO9H_mVGdGf8,2212
+wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=MltPfEXYyOwEC2xNLl7UsFTxNbr8CwHaEcPqtvKE2r8,2749
+wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
+wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
+wxo_agentic_evaluation/prompt/template_render.py,sha256=BVRT-BKyBJn5cM6Dze4GhFmMLyvGlyilFKQsfUhrklQ,4722
+wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
+wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
+wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
+wxo_agentic_evaluation/red_teaming/attack_evaluator.py,sha256=rlkSAb7QDHUoXg-LLK_wOyaTtYNrhV2SXbpnJxSUrD0,4714
+wxo_agentic_evaluation/red_teaming/attack_generator.py,sha256=YQi9xoaFATBNGe_NebndH6o1eQalcSKvWKSjbZ8dzP4,11526
+wxo_agentic_evaluation/red_teaming/attack_list.py,sha256=edphWARWqDtXFtcHTVbRXngvO0YfG5SgrfPtrBRXuFw,4734
+wxo_agentic_evaluation/red_teaming/attack_runner.py,sha256=qBZY4GK1352NUMyED5LVjjbcvpdCcxG6mDIN1HvxKIc,4340
+wxo_agentic_evaluation/referenceless_eval/__init__.py,sha256=lijXMgQ8nQe-9eIfade2jLfHMlXfYafMZIwXtC9KDZo,106
+wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py,sha256=ypEMOeAwaztGkOuDr_2JArSQWwos7XcBTwo8lFs2N5w,4262
+wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py,sha256=UidTaT9g5IxbcakfQqP_9c5civ1wDqY-PpPUf0uOXJo,915
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py,sha256=th36x0RMpGx1MAzqOUxjuhAcroUgjT2CJkT6tlMUbPg,843
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py,sha256=5ZOWW82V0VFgpiaXpQ3hZIVKO7JAsoYRhwwb2ZDGxxk,7481
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py,sha256=P1ytcARCy696ZCLq9tcaQWgaolmu0ON_kySmmTeyBtc,1549
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json,sha256=bVSyudTk2Nim1DcgQZf8ilOTszY2Kgm4YU6beHWvEhQ,40475
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py,sha256=UwPMCn7T2BEut7Mbj6U5UJvb2AAZw03BiB9wEjSCheg,1017
+wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json,sha256=o4oRur1MiXO2RYzmzj07QOBzX75DyU7T7yd-gFsgFdo,30563
+wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py,sha256=kMMFq4ABX5q6cPnDdublLMVqXu4Ij-x4OlxZyePWIjc,3599
+wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py,sha256=44HNEoIt3_jKZczs1qB8WGltCG-vn3ZI5aNhucxSDeM,9272
+wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py,sha256=z_k-qdFoUJqstkPYn9Zmhlp2YTVQKJtoDZCIdKow664,17306
+wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py,sha256=_Er2KfCkc3HFmOmxZT6eb-e7qF7ukqsf6Si5CJTqPPg,6016
+wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py,sha256=tW1wc87WIm8BZh2lhdj1RDP6VdRLqZBWSMmemSttbGs,22034
+wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py,sha256=QHIEHmr3GnCoQPIPyLAMiT2IdYJKUUhqSPJDLefVY2U,16983
+wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py,sha256=uJc7ZwK6pJpsMIuBSBXUtxdvd-aiRnOXGX3aeyvw2ik,151
+wxo_agentic_evaluation/referenceless_eval/metrics/field.py,sha256=z4S5QJJi1acshC0YFzblppgtm1oxNEgMKYjaJdfzkn4,8324
+wxo_agentic_evaluation/referenceless_eval/metrics/metric.py,sha256=mSoJAjYRSEpq8zBm-EP0UwF0zmZ4gDRjoUe4jT9nJt0,12212
+wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py,sha256=JHZhoSfGJYYp3sFx3XP9cTsDQgpgajzZ7TV5c4hmKCs,5980
+wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py,sha256=CGQ5LvhQrmxAyZDHBHds47rjJYWsx670c56yOHCrEAI,15074
+wxo_agentic_evaluation/referenceless_eval/metrics/utils.py,sha256=jurmc4KFFKH4hwnvor2xg97H91b-xJc3cUKYaU2I8uM,1370
+wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/referenceless_eval/prompt/runner.py,sha256=FFmcSWXQnLmylpYyj8LZuPwb6nqwQp-jj6Mv9g8zby0,5052
+wxo_agentic_evaluation/service_provider/__init__.py,sha256=yNQ-urOIdjANbpCzVAhkPHNcpBY6hndDJgPZM1C2qeo,2107
+wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=EW1JIiIWoKaTTC-fqKURSsbdyo-dbVWYVrXY8-gEmvc,4081
+wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=HMHQVUGFbLSQI1dhysAn70ozJl90yRg-CbNd4vsz-Dc,1116
+wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId7kn6-CetQuNZl00EP-Nl28,417
+wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py,sha256=aJrCz8uco6HOQwNCSjEKviwnhlyLTNAGpLtsOAegQ70,5200
+wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=ugXCXwrfi_XC2d9FPa96ccMKGQbTd1ElDw8RNR8TDB8,6544
+wxo_agentic_evaluation/utils/__init__.py,sha256=ItryTgc1jVc32rB3XktTFaYGA_A6bRIDZ1Pts_JGmv8,144
+wxo_agentic_evaluation/utils/open_ai_tool_extractor.py,sha256=Vyji_edgou2xMLbsGwFG-QI7xRBNvO3-1nbeOc8ZuFo,5646
+wxo_agentic_evaluation/utils/rich_utils.py,sha256=J9lzL4ETQeiAJcXKsUzXh82XdKvlDY7jmcgTQlwmL9s,6252
+wxo_agentic_evaluation/utils/rouge_score.py,sha256=WvcGh6mwF4rWH599J9_lAt3BfaHbAZKtKEJBsC61iKo,692
+wxo_agentic_evaluation/utils/utils.py,sha256=qQR_2W5p0Rk6KSE3-llRyZrWXkO5zG9JW7H1692L4PI,11428
+ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/METADATA,sha256=4yfSRfaNQUwauYPqvTFAoaVSn_c3i5YbIC7SFK4SnDU,16105
+ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
+ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/RECORD,,
--- wxo_agentic_evaluation/analytics/tools/main.py
+++ wxo_agentic_evaluation/analytics/tools/main.py
@@ -5,8 +5,8 @@ import rich
 from type import ContentType
 from analytics.tools.analyzer import ToolErrorAnalyzer
 from analytics.tools.ux import ToolErrorDisplayManager
-from type import Message
 from shutil import get_terminal_size
+from utils.utils import load_messages
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="tool-analytics-resources")
@@ -47,23 +47,6 @@ if __name__ == "__main__":
         """Count total tool calls in the conversation."""
         return sum(1 for msg in messages if msg.type == ContentType.tool_call)
 
-    # Function to load messages from JSON file
-    def load_messages(file_path):
-        with open(file_path, "r") as f:
-
-            try:
-                message_data = json.load(f)
-                messages = []
-                for msg in message_data:
-                    messages.append(Message.model_validate(msg))
-
-                return messages
-
-            except Exception as e:
-                print(file_path)
-                print(e)
-                return None
-
     # Function to load ground truth from JSON file
     def load_ground_truth(file_path):
         with open(file_path, "r") as f: