ibm-watsonx-orchestrate-evaluation-framework 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info}/METADATA +70 -7
- ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info/RECORD +56 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +3 -3
- wxo_agentic_evaluation/analytics/tools/ux.py +1 -1
- wxo_agentic_evaluation/analyze_run.py +10 -10
- wxo_agentic_evaluation/arg_configs.py +8 -1
- wxo_agentic_evaluation/batch_annotate.py +3 -9
- wxo_agentic_evaluation/data_annotator.py +50 -36
- wxo_agentic_evaluation/evaluation_package.py +102 -85
- wxo_agentic_evaluation/external_agent/__init__.py +37 -0
- wxo_agentic_evaluation/external_agent/external_validate.py +74 -29
- wxo_agentic_evaluation/external_agent/performance_test.py +66 -0
- wxo_agentic_evaluation/external_agent/types.py +8 -2
- wxo_agentic_evaluation/inference_backend.py +45 -50
- wxo_agentic_evaluation/llm_matching.py +6 -6
- wxo_agentic_evaluation/llm_rag_eval.py +4 -4
- wxo_agentic_evaluation/llm_user.py +3 -3
- wxo_agentic_evaluation/main.py +63 -23
- wxo_agentic_evaluation/metrics/metrics.py +59 -0
- wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2 +23 -0
- wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +2 -0
- wxo_agentic_evaluation/prompt/examples/data_simple.json +1 -2
- wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2 +195 -0
- wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2 +154 -0
- wxo_agentic_evaluation/prompt/template_render.py +17 -0
- wxo_agentic_evaluation/prompt/tool_planner.jinja2 +13 -7
- wxo_agentic_evaluation/record_chat.py +74 -26
- wxo_agentic_evaluation/resource_map.py +47 -0
- wxo_agentic_evaluation/service_provider/__init__.py +35 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +108 -0
- wxo_agentic_evaluation/service_provider/ollama_provider.py +40 -0
- wxo_agentic_evaluation/service_provider/provider.py +19 -0
- wxo_agentic_evaluation/{watsonx_provider.py → service_provider/watsonx_provider.py} +27 -18
- wxo_agentic_evaluation/test_prompt.py +94 -0
- wxo_agentic_evaluation/tool_planner.py +130 -17
- wxo_agentic_evaluation/type.py +0 -57
- wxo_agentic_evaluation/utils/utils.py +6 -54
- ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/RECORD +0 -46
- ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info/licenses/LICENSE +0 -22
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.2.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info}/top_level.txt +0 -0
|
@@ -1,12 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ibm-watsonx-orchestrate-evaluation-framework
|
|
3
|
-
Version: 1.0.2
|
|
3
|
+
Version: 1.0.4
|
|
4
4
|
Summary: The WxO evaluation framework
|
|
5
5
|
Author-email: Haode Qi <Haode.Qi@ibm.com>
|
|
6
6
|
License: MIT
|
|
7
7
|
Requires-Python: <3.14,>=3.11
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
|
-
License-File: LICENSE
|
|
10
9
|
Requires-Dist: rich~=13.9.4
|
|
11
10
|
Requires-Dist: pydantic~=2.10.6
|
|
12
11
|
Requires-Dist: pyyaml~=6.0.2
|
|
@@ -32,7 +31,6 @@ Requires-Dist: notebook~=7.4.1; extra == "rag-eval"
|
|
|
32
31
|
Requires-Dist: ipywidgets~=8.1.6; extra == "rag-eval"
|
|
33
32
|
Requires-Dist: jupyter_contrib_nbextensions; extra == "rag-eval"
|
|
34
33
|
Requires-Dist: jupyter~=1.1.1; extra == "rag-eval"
|
|
35
|
-
Dynamic: license-file
|
|
36
34
|
|
|
37
35
|
# WXO-agent evaluation framework
|
|
38
36
|
|
|
@@ -46,7 +44,8 @@ Dynamic: license-file
|
|
|
46
44
|
- The agent calls the `runs/` endpoint of the wxo-lite server instance, and the actual tool code is executed on the server side. The server database is not visible to our framework.
|
|
47
45
|
|
|
48
46
|
## prerequisite
|
|
49
|
-
Follow the [SDK setup guide](https://github.ibm.com/WatsonOrchestrate/wxo-clients/tree/main) to install the SDK.
|
|
47
|
+
Follow the [SDK setup guide](https://github.ibm.com/WatsonOrchestrate/wxo-clients/tree/main) to install the SDK.
|
|
48
|
+
The current framework is compatible with ADK version >= 1.2.0, <= 1.6.0
|
|
50
49
|
|
|
51
50
|
## setup for evaluation framework
|
|
52
51
|
Run the following command to install the evaluation framework in the same environment:
|
|
@@ -59,9 +58,11 @@ pip install -e .
|
|
|
59
58
|
```bash
|
|
60
59
|
orchestrate server start
|
|
61
60
|
export WATSONX_SPACE_ID=""
|
|
62
|
-
export
|
|
61
|
+
export WATSONX_APIKEY=""
|
|
63
62
|
```
|
|
64
63
|
|
|
64
|
+
NOTE: If you want to use `WO_INSTANCE` and `WO_API_KEY` instead, follow the [model proxy section](#using-model-proxy-provider).
|
|
65
|
+
|
|
65
66
|
Import sample hr tools and agent into your default `wxo-dev` env:
|
|
66
67
|
```bash
|
|
67
68
|
orchestrate tools import -f benchmarks/hr_sample/tools.py -k python
|
|
@@ -97,7 +98,6 @@ Note:
|
|
|
97
98
|
]
|
|
98
99
|
}
|
|
99
100
|
],
|
|
100
|
-
"mine_fields": [],
|
|
101
101
|
"story": "Your username is nwaters and you want to find out timeoff schedule from 20250101 to 20250303."
|
|
102
102
|
}
|
|
103
103
|
```
|
|
@@ -124,7 +124,7 @@ NOTE: run `orchestrate env list` to find the name of the active tenant. for defa
|
|
|
124
124
|
4. Run the test:
|
|
125
125
|
```bash
|
|
126
126
|
export WATSONX_SPACE_ID=""
|
|
127
|
-
export
|
|
127
|
+
export WATSONX_APIKEY=""
|
|
128
128
|
python -m wxo_agentic_evaluation.main --config benchmarks/hr_sample/config.yaml
|
|
129
129
|
```
|
|
130
130
|
|
|
@@ -237,6 +237,69 @@ python -m wxo_agentic_evaluation.main --config benchmarks/hr_sample/config.yaml
|
|
|
237
237
|
|
|
238
238
|
For full instructions on setting up tools, writing stories, configuring the pipeline, and generating batch test cases, see the [Batch Test case Generation Guide](./benchmarks/batch_sample/README.MD).
|
|
239
239
|
|
|
240
|
+
## Using Model Proxy Provider
|
|
241
|
+
|
|
242
|
+
To use the model proxy provider (which allows direct access to LLM models), follow these steps:
|
|
243
|
+
|
|
244
|
+
1. Set up environment variables:
|
|
245
|
+
```sh
|
|
246
|
+
export WO_INSTANCE=<your-instance-url>
|
|
247
|
+
export WO_API_KEY=<your-api-key>
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
2. Create a configuration file similar to [benchmarks/hr_sample/config_model_proxy.yaml](benchmarks/hr_sample/config_model_proxy.yaml):
|
|
251
|
+
```yaml
|
|
252
|
+
test_paths:
|
|
253
|
+
- <your-test-path>
|
|
254
|
+
|
|
255
|
+
auth_config:
|
|
256
|
+
url: http://localhost:4321
|
|
257
|
+
tenant_name: wxo-dev
|
|
258
|
+
|
|
259
|
+
provider_config:
|
|
260
|
+
provider: "model_proxy"
|
|
261
|
+
model_id: "<model-id>"
|
|
262
|
+
|
|
263
|
+
output_dir: "<output-dir>"
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
3. Run the evaluation:
|
|
267
|
+
```sh
|
|
268
|
+
python -m wxo_agentic_evaluation.main --config path/to/your/config.yaml
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
## Using Ollama
|
|
272
|
+
|
|
273
|
+
To use a model from Ollama (local LLM deployment), follow these steps:
|
|
274
|
+
|
|
275
|
+
1. Make sure you have [Ollama](https://ollama.com) installed and running on your system.
|
|
276
|
+
|
|
277
|
+
2. Pull your desired model using Ollama (e.g. llama3.1:8b):
|
|
278
|
+
```sh
|
|
279
|
+
ollama pull <model-id>
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
3. Create a configuration file similar to [benchmarks/hr_sample/config_ollama.yaml](benchmarks/hr_sample/config_ollama.yaml):
|
|
283
|
+
```yaml
|
|
284
|
+
test_paths:
|
|
285
|
+
- <your-test-path>
|
|
286
|
+
|
|
287
|
+
auth_config:
|
|
288
|
+
url: http://localhost:4321
|
|
289
|
+
tenant_name: wxo-dev
|
|
290
|
+
|
|
291
|
+
provider_config:
|
|
292
|
+
provider: "ollama"
|
|
293
|
+
model_id: "<model-id>"
|
|
294
|
+
|
|
295
|
+
output_dir: "results/ollama/<model-name>"
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
4. Run the evaluation:
|
|
299
|
+
```sh
|
|
300
|
+
python -m wxo_agentic_evaluation.main --config path/to/your/config.yaml
|
|
301
|
+
```
|
|
302
|
+
|
|
240
303
|
## Workflow diagram
|
|
241
304
|
|
|
242
305
|
To help better understand the workflow, this is a diagram of how this repo works together with the wxO lite Python SDK and a wxO runtime.
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
wxo_agentic_evaluation/analyze_run.py,sha256=C4HowEukNMM-H8FkRcHRqkiNYIQVCoTKbBLiqr1cFRM,4332
|
|
3
|
+
wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
|
|
4
|
+
wxo_agentic_evaluation/arg_configs.py,sha256=UCrGcakFaAM3reFquMn03qNtKe7Pg8ScbOF0K7o8VDU,2240
|
|
5
|
+
wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
|
|
6
|
+
wxo_agentic_evaluation/data_annotator.py,sha256=DJVG2CdhJRAJ3X1ARbrsn9bPjTuytCDGIBM4PEexfnk,8214
|
|
7
|
+
wxo_agentic_evaluation/evaluation_package.py,sha256=jOSe-TCJdAWCk1sWpRYfi_EMkZERrVf5swm-bxfozzc,21333
|
|
8
|
+
wxo_agentic_evaluation/inference_backend.py,sha256=fhEB1kaNN-A08RtJglBiv3QL_8nq8m-g7xbF4WbHAvU,25691
|
|
9
|
+
wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
|
|
10
|
+
wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
|
|
11
|
+
wxo_agentic_evaluation/llm_user.py,sha256=0zSsyEM7pYQtLcfbnu0gEIkosHDwntOZY84Ito6__SM,1407
|
|
12
|
+
wxo_agentic_evaluation/main.py,sha256=tRXVle2o1JhwJZOTpqdsOzBOpxPYxAH5ziZkbCmzfyU,11470
|
|
13
|
+
wxo_agentic_evaluation/record_chat.py,sha256=ZaOxIabDcE_CzZjKJESgh8LY7pK9UT4OvqQMFVdTG7A,8102
|
|
14
|
+
wxo_agentic_evaluation/resource_map.py,sha256=-dIWQdpEpPeSCbDeYfRupG9KV1Q4NlHGb5KXywjkulM,1645
|
|
15
|
+
wxo_agentic_evaluation/service_instance.py,sha256=yt7XpwheaRRG8Ri4TFIS5G2p5mnCwvNgj6T7bDF5uTU,6494
|
|
16
|
+
wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
|
|
17
|
+
wxo_agentic_evaluation/tool_planner.py,sha256=e-lBb4w1klT1HOL9BTwae3lkGv5VBuYC397mSJgOhus,12622
|
|
18
|
+
wxo_agentic_evaluation/type.py,sha256=uVKim70XgPW-3L7Z0yRO07wAH9xa-NcjfaiIyPhYMR0,3413
|
|
19
|
+
wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohXTxTmpqRhfzQygCWDYHBHg,18125
|
|
20
|
+
wxo_agentic_evaluation/analytics/tools/main.py,sha256=ocwPUlEjyK7PMdXBg5OM2DVDQBcaHT4UjR4ZmEhR0C4,6567
|
|
21
|
+
wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
|
|
22
|
+
wxo_agentic_evaluation/analytics/tools/ux.py,sha256=EaWNvsq68X_i2H4pQ2fABtXEEmk3ZXqaMrTs42_7MwE,18347
|
|
23
|
+
wxo_agentic_evaluation/external_agent/__init__.py,sha256=LY3gMNzfIEwjpQkx5_2iZFHGQiUL4ymEkKL1dc2uKq4,1491
|
|
24
|
+
wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xW8tqPcm8JYvveSxf-oFCajvF5J8ORaK23YXu-LuFmc,4142
|
|
25
|
+
wxo_agentic_evaluation/external_agent/performance_test.py,sha256=bCXUsW0OeUzwfSSYObgfAmEU5vARkD-PblYU-mU9aPY,2507
|
|
26
|
+
wxo_agentic_evaluation/external_agent/types.py,sha256=4kfWD_ZyGZmpbib33gCxEuKS4HLb7CEtferlQgQe7uk,1624
|
|
27
|
+
wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
28
|
+
wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
|
|
29
|
+
wxo_agentic_evaluation/metrics/metrics.py,sha256=9O2m6T2iW-PMjGrTdMbOHP2Pr4RN0NwbEp6YgFpTi3I,5572
|
|
30
|
+
wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
31
|
+
wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
|
|
32
|
+
wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
|
|
33
|
+
wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=QXuk2ecnEPPRCPoWZJyrtb1gAVuIPljB91YoqPBp2Dk,1896
|
|
34
|
+
wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngRqTAVD4w0va_HtA2FR4G1POIIamM,2524
|
|
35
|
+
wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
|
|
36
|
+
wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
|
|
37
|
+
wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=nDfCD0o9cRYmsgIjzD-RZNQxotlvuqrzdsZIY-vT794,684
|
|
38
|
+
wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=MltPfEXYyOwEC2xNLl7UsFTxNbr8CwHaEcPqtvKE2r8,2749
|
|
39
|
+
wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
|
|
40
|
+
wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
|
|
41
|
+
wxo_agentic_evaluation/prompt/template_render.py,sha256=FVH5ew2TofC5LGqQzqNj90unrxooUZv_5XxJzVdz8uM,3563
|
|
42
|
+
wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
|
|
43
|
+
wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
|
|
44
|
+
wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
45
|
+
wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
|
|
46
|
+
wxo_agentic_evaluation/service_provider/__init__.py,sha256=EaY4jjKp58M3W8N3b3a8PNC2S81xA7YV2_QkTIy9DfI,1600
|
|
47
|
+
wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=X5tiE0IKCR2CqhwEGm91LOdzFZQWSXzXQgLOtzi6ng0,4002
|
|
48
|
+
wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=HMHQVUGFbLSQI1dhysAn70ozJl90yRg-CbNd4vsz-Dc,1116
|
|
49
|
+
wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId7kn6-CetQuNZl00EP-Nl28,417
|
|
50
|
+
wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=iKVkWs4PRTM_S0TIdPgQ9NFQWPlDvcEvuHpQlIPzO10,6216
|
|
51
|
+
wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
|
|
52
|
+
wxo_agentic_evaluation/utils/utils.py,sha256=JYZQZ-OBy43gAWg9S7duJi9StRApGJATs2JUsW1l30M,6057
|
|
53
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info/METADATA,sha256=uhmuzKUbgWgKDNayG2dAc-YYvZ_ypeVY4onrcomv0Co,17667
|
|
54
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
55
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
|
|
56
|
+
ibm_watsonx_orchestrate_evaluation_framework-1.0.4.dist-info/RECORD,,
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
from type import Message, ContentType, EvaluationData
|
|
1
|
+
from wxo_agentic_evaluation.type import Message, ContentType, EvaluationData
|
|
2
2
|
from typing import List, Optional
|
|
3
3
|
import json
|
|
4
4
|
import rich
|
|
5
5
|
from collections import defaultdict
|
|
6
|
-
from analytics.tools.types import (
|
|
6
|
+
from wxo_agentic_evaluation.analytics.tools.types import (
|
|
7
7
|
ErrorPatterns,
|
|
8
8
|
ToolFailure,
|
|
9
9
|
HallucinatedParameter,
|
|
@@ -15,7 +15,7 @@ from analytics.tools.types import (
|
|
|
15
15
|
AnalysisResults,
|
|
16
16
|
ErrorType,
|
|
17
17
|
)
|
|
18
|
-
from data_annotator import ERROR_KEYWORDS
|
|
18
|
+
from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
|
|
19
19
|
from http import HTTPStatus
|
|
20
20
|
|
|
21
21
|
|
|
@@ -5,7 +5,7 @@ from rich.table import Table
|
|
|
5
5
|
from rich.panel import Panel
|
|
6
6
|
from rich.align import Align
|
|
7
7
|
from rich.console import Group
|
|
8
|
-
from type import Message, ContentType
|
|
8
|
+
from wxo_agentic_evaluation.type import Message, ContentType
|
|
9
9
|
from typing import List, Dict, Optional
|
|
10
10
|
from analytics.tools.types import (
|
|
11
11
|
ToolDefinitionRecommendation,
|
|
@@ -9,9 +9,9 @@ from rich.table import Table
|
|
|
9
9
|
from typing import List
|
|
10
10
|
from wxo_agentic_evaluation.type import (
|
|
11
11
|
ExtendedMessage,
|
|
12
|
-
ContentType
|
|
13
|
-
ToolCallAndRoutingMetrics,
|
|
12
|
+
ContentType
|
|
14
13
|
)
|
|
14
|
+
from wxo_agentic_evaluation.metrics.metrics import ToolCallAndRoutingMetrics
|
|
15
15
|
from wxo_agentic_evaluation.arg_configs import AnalyzeConfig
|
|
16
16
|
from jsonargparse import CLI
|
|
17
17
|
|
|
@@ -71,10 +71,11 @@ def analyze(config: AnalyzeConfig):
|
|
|
71
71
|
|
|
72
72
|
test_case_with_failed_tools = []
|
|
73
73
|
for entry in summary:
|
|
74
|
-
test_case_name = entry["
|
|
74
|
+
test_case_name = entry["dataset_name"]
|
|
75
75
|
if test_case_name.lower().strip() == "summary (average)":
|
|
76
76
|
continue
|
|
77
|
-
if
|
|
77
|
+
if not entry["is_success"] or float(entry["tool_calls_with_incorrect_parameter"]) > 0 or float(entry["tool_call_precision"]) < 1.0\
|
|
78
|
+
or float(entry["tool_call_recall"]) < 1.0:
|
|
78
79
|
test_case_with_failed_tools.append(entry)
|
|
79
80
|
if len(test_case_with_failed_tools) == 0:
|
|
80
81
|
header_table = Table(show_header=False, box=None)
|
|
@@ -85,7 +86,7 @@ def analyze(config: AnalyzeConfig):
|
|
|
85
86
|
rich.print(header_panel)
|
|
86
87
|
|
|
87
88
|
for test_case_entry in test_case_with_failed_tools:
|
|
88
|
-
test_case_name = test_case_entry["
|
|
89
|
+
test_case_name = test_case_entry["dataset_name"]
|
|
89
90
|
|
|
90
91
|
test_case_path = os.path.join(
|
|
91
92
|
config.data_path, "messages", f"{test_case_name}.messages.analyze.json"
|
|
@@ -94,7 +95,8 @@ def analyze(config: AnalyzeConfig):
|
|
|
94
95
|
with open(test_case_path, "r", encoding="utf-8") as f:
|
|
95
96
|
temp = json.load(f)
|
|
96
97
|
for entry in temp:
|
|
97
|
-
|
|
98
|
+
msg = ExtendedMessage(**entry)
|
|
99
|
+
test_messages.append(msg)
|
|
98
100
|
|
|
99
101
|
test_metrics_path = os.path.join(
|
|
100
102
|
config.data_path, "messages", f"{test_case_name}.metrics.json"
|
|
@@ -105,11 +107,9 @@ def analyze(config: AnalyzeConfig):
|
|
|
105
107
|
header_table.add_row(f"Test Case Name: {test_case_name}")
|
|
106
108
|
header_table.add_row((f"Expected Tool Calls: {metrics.expected_tool_calls}"))
|
|
107
109
|
header_table.add_row(f"Correct Tool Calls: {metrics.correct_tool_calls}")
|
|
108
|
-
|
|
109
|
-
header_table.add_row(f"Irrelevant Tool Call: {irrelevant_tool_calls}")
|
|
110
|
-
tool_call_with_incorrect_parameters = test_case_entry["Wrong Parameters"]
|
|
110
|
+
header_table.add_row(f"Text Match: {metrics.text_match.value}")
|
|
111
111
|
header_table.add_row(
|
|
112
|
-
f"
|
|
112
|
+
f"Journey Success: {metrics.is_success}"
|
|
113
113
|
)
|
|
114
114
|
header_panel = Panel(
|
|
115
115
|
header_table, title="[bold green]📋 Analysis Summary[/bold green]"
|
|
@@ -22,12 +22,19 @@ class LLMUserConfig:
|
|
|
22
22
|
user_response_style: List[str] = field(default_factory=list)
|
|
23
23
|
|
|
24
24
|
|
|
25
|
+
@dataclass
|
|
26
|
+
class ProviderConfig:
|
|
27
|
+
model_id: str = field(default="meta-llama/llama-3-405b-instruct")
|
|
28
|
+
provider: str = field(default="watsonx")
|
|
29
|
+
|
|
30
|
+
|
|
25
31
|
@dataclass
|
|
26
32
|
class TestConfig:
|
|
27
33
|
test_paths: List[str]
|
|
28
34
|
output_dir: str
|
|
29
35
|
auth_config: AuthConfig
|
|
30
36
|
wxo_lite_version: str
|
|
37
|
+
provider_config: ProviderConfig = field(default_factory=ProviderConfig)
|
|
31
38
|
llm_user_config: LLMUserConfig = field(default_factory=LLMUserConfig)
|
|
32
39
|
enable_verbose_logging: bool = True
|
|
33
40
|
enable_manual_user_input: bool = False
|
|
@@ -65,7 +72,7 @@ class ChatRecordingConfig:
|
|
|
65
72
|
default_factory=KeywordsGenerationConfig
|
|
66
73
|
)
|
|
67
74
|
service_url: str = "http://localhost:4321"
|
|
68
|
-
tenant_name: str = "
|
|
75
|
+
tenant_name: str = "local"
|
|
69
76
|
token: str = None
|
|
70
77
|
|
|
71
78
|
|
|
@@ -5,7 +5,7 @@ import os
|
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
from jsonargparse import CLI
|
|
7
7
|
|
|
8
|
-
from wxo_agentic_evaluation.
|
|
8
|
+
from wxo_agentic_evaluation.service_provider import get_provider
|
|
9
9
|
from wxo_agentic_evaluation.prompt.template_render import BatchTestCaseGeneratorTemplateRenderer
|
|
10
10
|
from wxo_agentic_evaluation.arg_configs import BatchAnnotateConfig
|
|
11
11
|
from wxo_agentic_evaluation import __file__
|
|
@@ -71,7 +71,6 @@ def extract_inputs_from_snapshot(snapshot_path: Path) -> dict:
|
|
|
71
71
|
def load_example(example_path: Path):
|
|
72
72
|
with example_path.open("r", encoding="utf-8") as f:
|
|
73
73
|
data = json.load(f)
|
|
74
|
-
data.pop("mine_fields", None)
|
|
75
74
|
return data
|
|
76
75
|
|
|
77
76
|
|
|
@@ -98,13 +97,9 @@ def build_prompt_for_story(agent, tools, tool_inputs, example_case: dict, story:
|
|
|
98
97
|
def generate_multiple_in_one(prompt, output_dir, starting_index, model_id="meta-llama/llama-3-405b-instruct", ):
|
|
99
98
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
100
99
|
|
|
101
|
-
provider =
|
|
100
|
+
provider = get_provider(
|
|
102
101
|
model_id=model_id,
|
|
103
|
-
|
|
104
|
-
"min_new_tokens": 50,
|
|
105
|
-
"decoding_method": "greedy",
|
|
106
|
-
"max_new_tokens": 3000
|
|
107
|
-
}
|
|
102
|
+
params={"min_new_tokens": 50, "decoding_method": "greedy", "max_new_tokens": 3000},
|
|
108
103
|
)
|
|
109
104
|
|
|
110
105
|
response = provider.query(prompt)
|
|
@@ -119,7 +114,6 @@ def generate_multiple_in_one(prompt, output_dir, starting_index, model_id="meta-
|
|
|
119
114
|
assert isinstance(test_cases, list), "Expected list of test cases"
|
|
120
115
|
|
|
121
116
|
for i, case in enumerate(test_cases, start=starting_index):
|
|
122
|
-
case["mine_fields"] = [] # ✅ Add the field here
|
|
123
117
|
out_file = output_dir / f"synthetic_test_case_{i}.json"
|
|
124
118
|
with out_file.open("w", encoding="utf-8") as f:
|
|
125
119
|
json.dump(case, f, indent=2)
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from wxo_agentic_evaluation.type import Message, EvaluationData
|
|
2
|
-
from wxo_agentic_evaluation.watsonx_provider import
|
|
2
|
+
from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
|
|
3
|
+
from wxo_agentic_evaluation.service_provider import get_provider
|
|
3
4
|
from wxo_agentic_evaluation.prompt.template_render import (
|
|
4
5
|
LlamaKeywordsGenerationTemplateRenderer,
|
|
5
6
|
)
|
|
@@ -94,16 +95,16 @@ ERROR_KEYWORDS = [
|
|
|
94
95
|
class KeywordsGenerationLLM:
|
|
95
96
|
def __init__(
|
|
96
97
|
self,
|
|
97
|
-
|
|
98
|
+
provider: Provider,
|
|
98
99
|
template: LlamaKeywordsGenerationTemplateRenderer,
|
|
99
100
|
):
|
|
100
|
-
self.
|
|
101
|
+
self.provider = provider
|
|
101
102
|
self.prompt_template = template
|
|
102
103
|
|
|
103
104
|
def genereate_keywords(self, response) -> Message | None:
|
|
104
105
|
prompt = self.prompt_template.render(response=response)
|
|
105
|
-
res = self.
|
|
106
|
-
keywords = ast.literal_eval(res
|
|
106
|
+
res: str = self.provider.query(prompt)
|
|
107
|
+
keywords = ast.literal_eval(res.strip())
|
|
107
108
|
return keywords
|
|
108
109
|
|
|
109
110
|
|
|
@@ -120,7 +121,6 @@ class DataAnnotator:
|
|
|
120
121
|
agent="",
|
|
121
122
|
story="",
|
|
122
123
|
starting_sentence=messages[0].content if messages else "",
|
|
123
|
-
mine_fields=[],
|
|
124
124
|
goals={},
|
|
125
125
|
goal_details=[],
|
|
126
126
|
)
|
|
@@ -145,29 +145,48 @@ class DataAnnotator:
|
|
|
145
145
|
|
|
146
146
|
def _process_tool_call_order(self, wrong_tool_response_id: list[str]) -> list[str]:
|
|
147
147
|
"""Process and order tool calls, skipping failed ones"""
|
|
148
|
+
# gather all call ids that actually got a response
|
|
149
|
+
valid_call_ids = {
|
|
150
|
+
json.loads(m.content)["tool_call_id"]
|
|
151
|
+
for m in self.messages
|
|
152
|
+
if m.type == "tool_response"
|
|
153
|
+
}
|
|
154
|
+
|
|
148
155
|
order = []
|
|
149
|
-
for message in self.messages:
|
|
150
|
-
if message.type
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
156
|
+
for idx, message in enumerate(self.messages):
|
|
157
|
+
if message.type != "tool_call":
|
|
158
|
+
continue
|
|
159
|
+
|
|
160
|
+
content = json.loads(message.content)
|
|
161
|
+
call_id = content.get("tool_call_id") or content.get("id")
|
|
162
|
+
|
|
163
|
+
# skip any calls that errored
|
|
164
|
+
if call_id in wrong_tool_response_id:
|
|
165
|
+
continue
|
|
166
|
+
|
|
167
|
+
# skip calls that never produced a tool_response
|
|
168
|
+
if call_id not in valid_call_ids:
|
|
169
|
+
continue
|
|
170
|
+
|
|
171
|
+
# skip the "reflection" copy that the LLM emits right after a response
|
|
172
|
+
prev = self.messages[idx - 1] if idx > 0 else None
|
|
173
|
+
if (
|
|
174
|
+
prev is not None
|
|
175
|
+
and prev.type == "tool_response"
|
|
176
|
+
and json.loads(prev.content).get("tool_call_id") == call_id
|
|
177
|
+
):
|
|
178
|
+
continue
|
|
179
|
+
|
|
180
|
+
# normalize ids so json dumps only reflects name-args
|
|
181
|
+
content.pop("tool_call_id", None)
|
|
182
|
+
content.pop("id", None)
|
|
183
|
+
|
|
184
|
+
signature = json.dumps(content, sort_keys=True)
|
|
185
|
+
# if we’ve seen that exact (name-args) before, drop the old one
|
|
186
|
+
if signature in order:
|
|
187
|
+
order.remove(signature)
|
|
188
|
+
order.append(signature)
|
|
189
|
+
|
|
171
190
|
return order
|
|
172
191
|
|
|
173
192
|
def _process_tool_calls(self) -> tuple[Dict, List, str]:
|
|
@@ -209,16 +228,12 @@ class DataAnnotator:
|
|
|
209
228
|
# we assume single summary step at the end
|
|
210
229
|
for message in self.messages[::-1]:
|
|
211
230
|
if message.role == "assistant":
|
|
212
|
-
|
|
231
|
+
provider = get_provider(
|
|
213
232
|
model_id=self.keywords_generation_config.model_id,
|
|
214
|
-
|
|
215
|
-
"min_new_tokens": 0,
|
|
216
|
-
"decoding_method": "greedy",
|
|
217
|
-
"max_new_tokens": 256,
|
|
218
|
-
},
|
|
233
|
+
params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 256},
|
|
219
234
|
)
|
|
220
235
|
kw_generator = KeywordsGenerationLLM(
|
|
221
|
-
|
|
236
|
+
provider=provider,
|
|
222
237
|
template=LlamaKeywordsGenerationTemplateRenderer(
|
|
223
238
|
self.keywords_generation_config.prompt_config
|
|
224
239
|
),
|
|
@@ -247,7 +262,6 @@ class DataAnnotator:
|
|
|
247
262
|
"agent": self.initial_data.agent,
|
|
248
263
|
"goals": goals,
|
|
249
264
|
"goal_details": goal_details,
|
|
250
|
-
"mine_fields": [],
|
|
251
265
|
"story": self.initial_data.story,
|
|
252
266
|
"starting_sentence": self.initial_data.starting_sentence,
|
|
253
267
|
}
|