ibm-watsonx-orchestrate-evaluation-framework 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.0.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info}/METADATA +70 -12
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +56 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +3 -3
  4. wxo_agentic_evaluation/analytics/tools/ux.py +1 -1
  5. wxo_agentic_evaluation/analyze_run.py +10 -10
  6. wxo_agentic_evaluation/arg_configs.py +8 -1
  7. wxo_agentic_evaluation/batch_annotate.py +4 -10
  8. wxo_agentic_evaluation/data_annotator.py +50 -36
  9. wxo_agentic_evaluation/evaluation_package.py +102 -85
  10. wxo_agentic_evaluation/external_agent/__init__.py +37 -0
  11. wxo_agentic_evaluation/external_agent/external_validate.py +74 -31
  12. wxo_agentic_evaluation/external_agent/performance_test.py +66 -0
  13. wxo_agentic_evaluation/external_agent/types.py +8 -2
  14. wxo_agentic_evaluation/inference_backend.py +45 -50
  15. wxo_agentic_evaluation/llm_matching.py +6 -6
  16. wxo_agentic_evaluation/llm_rag_eval.py +4 -4
  17. wxo_agentic_evaluation/llm_user.py +3 -3
  18. wxo_agentic_evaluation/main.py +63 -23
  19. wxo_agentic_evaluation/metrics/metrics.py +72 -5
  20. wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2 +23 -0
  21. wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +2 -0
  22. wxo_agentic_evaluation/prompt/examples/data_simple.json +1 -2
  23. wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2 +195 -0
  24. wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2 +154 -0
  25. wxo_agentic_evaluation/prompt/template_render.py +17 -0
  26. wxo_agentic_evaluation/prompt/tool_planner.jinja2 +13 -7
  27. wxo_agentic_evaluation/record_chat.py +59 -18
  28. wxo_agentic_evaluation/resource_map.py +47 -0
  29. wxo_agentic_evaluation/service_provider/__init__.py +35 -0
  30. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +108 -0
  31. wxo_agentic_evaluation/service_provider/ollama_provider.py +40 -0
  32. wxo_agentic_evaluation/service_provider/provider.py +19 -0
  33. wxo_agentic_evaluation/{watsonx_provider.py → service_provider/watsonx_provider.py} +54 -57
  34. wxo_agentic_evaluation/test_prompt.py +94 -0
  35. wxo_agentic_evaluation/tool_planner.py +130 -17
  36. wxo_agentic_evaluation/type.py +0 -57
  37. wxo_agentic_evaluation/utils/utils.py +6 -54
  38. ibm_watsonx_orchestrate_evaluation_framework-1.0.1.dist-info/RECORD +0 -46
  39. ibm_watsonx_orchestrate_evaluation_framework-1.0.1.dist-info/licenses/LICENSE +0 -22
  40. {ibm_watsonx_orchestrate_evaluation_framework-1.0.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info}/WHEEL +0 -0
  41. {ibm_watsonx_orchestrate_evaluation_framework-1.0.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info}/top_level.txt +0 -0
@@ -1,24 +1,18 @@
 Metadata-Version: 2.4
 Name: ibm-watsonx-orchestrate-evaluation-framework
-Version: 1.0.1
+Version: 1.0.3
 Summary: The WxO evaluation framework
 Author-email: Haode Qi <Haode.Qi@ibm.com>
 License: MIT
 Requires-Python: <3.14,>=3.11
 Description-Content-Type: text/markdown
-License-File: LICENSE
 Requires-Dist: rich~=13.9.4
-Requires-Dist: ibm-watsonx-ai~=1.3.6
 Requires-Dist: pydantic~=2.10.6
 Requires-Dist: pyyaml~=6.0.2
 Requires-Dist: jinja2~=3.1.5
 Requires-Dist: python-dotenv~=1.0.1
 Requires-Dist: dataclasses-json~=0.6.7
 Requires-Dist: jsonargparse~=4.37.0
-Requires-Dist: networkx~=3.4.2
-Requires-Dist: matplotlib~=3.10.1
-Requires-Dist: numpy~=1.26.4
-Requires-Dist: langchain-openai~=0.3.5
 Provides-Extra: dev
 Requires-Dist: setuptools~=70.3.0; extra == "dev"
 Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
@@ -37,7 +31,6 @@ Requires-Dist: notebook~=7.4.1; extra == "rag-eval"
 Requires-Dist: ipywidgets~=8.1.6; extra == "rag-eval"
 Requires-Dist: jupyter_contrib_nbextensions; extra == "rag-eval"
 Requires-Dist: jupyter~=1.1.1; extra == "rag-eval"
-Dynamic: license-file
 
 # WXO-agent evaluation framework
 
@@ -51,7 +44,8 @@ Dynamic: license-file
 - The agent calls the `runs/` endpoint of the wxo-lite server instance, and the actual tool code is executed on the server side. The server database is not visible to our framework.
 
 ## prerequisite
-Follow the [SDK setup guide](https://github.ibm.com/WatsonOrchestrate/wxo-clients/tree/main) to install the SDK. Make sure you are using version 1.2.0 of the SDK as this is the version this framework requires.
+Follow the [SDK setup guide](https://github.ibm.com/WatsonOrchestrate/wxo-clients/tree/main) to install the SDK.
+The current framework is compatible with ADK versions >= 1.2.0 and <= 1.6.0.
 
 ## setup for evaluation framework
 Run the following command to install evaluation framework in the same env:
@@ -64,9 +58,11 @@ pip install -e .
 ```bash
 orchestrate server start
 export WATSONX_SPACE_ID=""
-export WATSONX_API_KEY=""
+export WATSONX_APIKEY=""
 ```
 
+NOTE: If you want to use `WO_INSTANCE` and `WO_API_KEY` instead, follow the [model proxy section](#using-model-proxy-provider).
+
 Import sample hr tools and agent into your default `wxo-dev` env:
 ```bash
 orchestrate tools import -f benchmarks/hr_sample/tools.py -k python
@@ -102,7 +98,6 @@ Note:
             ]
         }
     ],
-    "mine_fields": [],
     "story": "Your username is nwaters and you want to find out timeoff schedule from 20250101 to 20250303."
 }
 ```
@@ -129,7 +124,7 @@ NOTE: run `orchestrate env list` to find the name of the active tenant. for defa
 4. Run the test:
 ```bash
 export WATSONX_SPACE_ID=""
-export WATSONX_API_KEY=""
+export WATSONX_APIKEY=""
 python -m wxo_agentic_evaluation.main --config benchmarks/hr_sample/config.yaml
 ```
 
@@ -242,6 +237,69 @@ python -m wxo_agentic_evaluation.main --config benchmarks/hr_sample/config.yaml
 
 For full instructions on setting up tools, writing stories, configuring the pipeline, and generating batch test cases, see the [Batch Test case Generation Guide](./benchmarks/batch_sample/README.MD).
 
+## Using Model Proxy Provider
+
+To use the model proxy provider (which allows direct access to LLM models), follow these steps:
+
+1. Set up environment variables:
+```sh
+export WO_INSTANCE=<your-instance-url>
+export WO_API_KEY=<your-api-key>
+```
+
+2. Create a configuration file similar to [benchmarks/hr_sample/config_model_proxy.yaml](benchmarks/hr_sample/config_model_proxy.yaml):
+```yaml
+test_paths:
+  - <your-test-path>
+
+auth_config:
+  url: http://localhost:4321
+  tenant_name: wxo-dev
+
+provider_config:
+  provider: "model_proxy"
+  model_id: "<model-id>"
+
+output_dir: "<output-dir>"
+```
+
+3. Run the evaluation:
+```sh
+python -m wxo_agentic_evaluation.main --config path/to/your/config.yaml
+```
+
+## Using Ollama
+
+To use a model from Ollama (a local LLM deployment), follow these steps:
+
+1. Make sure you have [Ollama](https://ollama.com) installed and running on your system.
+
+2. Pull your desired model using Ollama (e.g., llama3.1:8b):
+```sh
+ollama pull <model-id>
+```
+
+3. Create a configuration file similar to [benchmarks/hr_sample/config_ollama.yaml](benchmarks/hr_sample/config_ollama.yaml):
+```yaml
+test_paths:
+  - <your-test-path>
+
+auth_config:
+  url: http://localhost:4321
+  tenant_name: wxo-dev
+
+provider_config:
+  provider: "ollama"
+  model_id: "<model-id>"
+
+output_dir: "results/ollama/<model-name>"
+```
+
+4. Run the evaluation:
+```sh
+python -m wxo_agentic_evaluation.main --config path/to/your/config.yaml
+```
+
 ## Workflow diagram
 
 To help better understand the workflow, this is a diagram of how this repo works together with wxO lite python SDK and a wxO runtime.
@@ -0,0 +1,56 @@
+wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/analyze_run.py,sha256=C4HowEukNMM-H8FkRcHRqkiNYIQVCoTKbBLiqr1cFRM,4332
+wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
+wxo_agentic_evaluation/arg_configs.py,sha256=UCrGcakFaAM3reFquMn03qNtKe7Pg8ScbOF0K7o8VDU,2240
+wxo_agentic_evaluation/batch_annotate.py,sha256=44K4DUI498uaLIWUn3nz82AKcU6VnCjrExoG6GpPHoM,6323
+wxo_agentic_evaluation/data_annotator.py,sha256=DJVG2CdhJRAJ3X1ARbrsn9bPjTuytCDGIBM4PEexfnk,8214
+wxo_agentic_evaluation/evaluation_package.py,sha256=jOSe-TCJdAWCk1sWpRYfi_EMkZERrVf5swm-bxfozzc,21333
+wxo_agentic_evaluation/inference_backend.py,sha256=fhEB1kaNN-A08RtJglBiv3QL_8nq8m-g7xbF4WbHAvU,25691
+wxo_agentic_evaluation/llm_matching.py,sha256=l010exoMmsvTIAVHCm-Ok0diyeQogjCmemUb9rJLe6A,1477
+wxo_agentic_evaluation/llm_rag_eval.py,sha256=vsNGz1cFE5QGdhnfrx-iJq1r6q8tSI9Ef1mzuhoHElg,1642
+wxo_agentic_evaluation/llm_user.py,sha256=0zSsyEM7pYQtLcfbnu0gEIkosHDwntOZY84Ito6__SM,1407
+wxo_agentic_evaluation/main.py,sha256=tRXVle2o1JhwJZOTpqdsOzBOpxPYxAH5ziZkbCmzfyU,11470
+wxo_agentic_evaluation/record_chat.py,sha256=9l99n4TRdwDLAOKct0ZJKKXE5Y7qE7X5WLWUpWUHfLI,7739
+wxo_agentic_evaluation/resource_map.py,sha256=-dIWQdpEpPeSCbDeYfRupG9KV1Q4NlHGb5KXywjkulM,1645
+wxo_agentic_evaluation/service_instance.py,sha256=yt7XpwheaRRG8Ri4TFIS5G2p5mnCwvNgj6T7bDF5uTU,6494
+wxo_agentic_evaluation/test_prompt.py,sha256=ksteXCs9iDQPMETc4Hb7JAXHhxz2r678U6-sgZJAO28,3924
+wxo_agentic_evaluation/tool_planner.py,sha256=e-lBb4w1klT1HOL9BTwae3lkGv5VBuYC397mSJgOhus,12622
+wxo_agentic_evaluation/type.py,sha256=uVKim70XgPW-3L7Z0yRO07wAH9xa-NcjfaiIyPhYMR0,3413
+wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=IPX_lAFujjPVI9fhXTNohXTxTmpqRhfzQygCWDYHBHg,18125
+wxo_agentic_evaluation/analytics/tools/main.py,sha256=ocwPUlEjyK7PMdXBg5OM2DVDQBcaHT4UjR4ZmEhR0C4,6567
+wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
+wxo_agentic_evaluation/analytics/tools/ux.py,sha256=EaWNvsq68X_i2H4pQ2fABtXEEmk3ZXqaMrTs42_7MwE,18347
+wxo_agentic_evaluation/external_agent/__init__.py,sha256=LY3gMNzfIEwjpQkx5_2iZFHGQiUL4ymEkKL1dc2uKq4,1491
+wxo_agentic_evaluation/external_agent/external_validate.py,sha256=xW8tqPcm8JYvveSxf-oFCajvF5J8ORaK23YXu-LuFmc,4142
+wxo_agentic_evaluation/external_agent/performance_test.py,sha256=bCXUsW0OeUzwfSSYObgfAmEU5vARkD-PblYU-mU9aPY,2507
+wxo_agentic_evaluation/external_agent/types.py,sha256=4kfWD_ZyGZmpbib33gCxEuKS4HLb7CEtferlQgQe7uk,1624
+wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
+wxo_agentic_evaluation/metrics/metrics.py,sha256=9O2m6T2iW-PMjGrTdMbOHP2Pr4RN0NwbEp6YgFpTi3I,5572
+wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
+wxo_agentic_evaluation/prompt/args_extractor_prompt.jinja2,sha256=0qBicXFcc6AA3mQNLPVRmFsnuYaCABJXgZkIH9fO0Js,952
+wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=QXuk2ecnEPPRCPoWZJyrtb1gAVuIPljB91YoqPBp2Dk,1896
+wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngRqTAVD4w0va_HtA2FR4G1POIIamM,2524
+wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
+wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
+wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=nDfCD0o9cRYmsgIjzD-RZNQxotlvuqrzdsZIY-vT794,684
+wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=MltPfEXYyOwEC2xNLl7UsFTxNbr8CwHaEcPqtvKE2r8,2749
+wxo_agentic_evaluation/prompt/starting_sentence_generation_prompt.jinja2,sha256=m_l6f7acfnWJmGQ0mXAy85oLGLgzhVhoz7UL1FVYq8A,4908
+wxo_agentic_evaluation/prompt/story_generation_prompt.jinja2,sha256=_DxjkFoHpNTmdVSUzUrUdwn4Cng7nAGqkMnm0ScOH1w,4191
+wxo_agentic_evaluation/prompt/template_render.py,sha256=FVH5ew2TofC5LGqQzqNj90unrxooUZv_5XxJzVdz8uM,3563
+wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
+wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=Ln43kwfSX50B1VBsT-MY1TCE0o8pGFh8aQJAzZfGkpI,3239
+wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=XXF-Pn-mosklC9Ch7coyaJxosFNnl3OkHSW3YPuiKMM,2333
+wxo_agentic_evaluation/service_provider/__init__.py,sha256=EaY4jjKp58M3W8N3b3a8PNC2S81xA7YV2_QkTIy9DfI,1600
+wxo_agentic_evaluation/service_provider/model_proxy_provider.py,sha256=X5tiE0IKCR2CqhwEGm91LOdzFZQWSXzXQgLOtzi6ng0,4002
+wxo_agentic_evaluation/service_provider/ollama_provider.py,sha256=HMHQVUGFbLSQI1dhysAn70ozJl90yRg-CbNd4vsz-Dc,1116
+wxo_agentic_evaluation/service_provider/provider.py,sha256=MsnRzLYAaQiU6y6xf6eId7kn6-CetQuNZl00EP-Nl28,417
+wxo_agentic_evaluation/service_provider/watsonx_provider.py,sha256=iKVkWs4PRTM_S0TIdPgQ9NFQWPlDvcEvuHpQlIPzO10,6216
+wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
+wxo_agentic_evaluation/utils/utils.py,sha256=JYZQZ-OBy43gAWg9S7duJi9StRApGJATs2JUsW1l30M,6057
+ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA,sha256=L6Hq_FbQ4AY3g3Aho2wC6Io9rcLpnwNDm49BPTHbVCQ,17667
+ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
+ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD,,
@@ -1,9 +1,9 @@
-from type import Message, ContentType, EvaluationData
+from wxo_agentic_evaluation.type import Message, ContentType, EvaluationData
 from typing import List, Optional
 import json
 import rich
 from collections import defaultdict
-from analytics.tools.types import (
+from wxo_agentic_evaluation.analytics.tools.types import (
     ErrorPatterns,
     ToolFailure,
     HallucinatedParameter,
@@ -15,7 +15,7 @@ from analytics.tools.types import (
     AnalysisResults,
     ErrorType,
 )
-from data_annotator import ERROR_KEYWORDS
+from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
 from http import HTTPStatus
 
 
@@ -5,7 +5,7 @@ from rich.table import Table
 from rich.panel import Panel
 from rich.align import Align
 from rich.console import Group
-from type import Message, ContentType
+from wxo_agentic_evaluation.type import Message, ContentType
 from typing import List, Dict, Optional
 from analytics.tools.types import (
     ToolDefinitionRecommendation,
@@ -9,9 +9,9 @@ from rich.table import Table
 from typing import List
 from wxo_agentic_evaluation.type import (
     ExtendedMessage,
-    ContentType,
-    ToolCallAndRoutingMetrics,
+    ContentType
 )
+from wxo_agentic_evaluation.metrics.metrics import ToolCallAndRoutingMetrics
 from wxo_agentic_evaluation.arg_configs import AnalyzeConfig
 from jsonargparse import CLI
 
@@ -71,10 +71,11 @@ def analyze(config: AnalyzeConfig):
 
     test_case_with_failed_tools = []
     for entry in summary:
-        test_case_name = entry["test_case"]
+        test_case_name = entry["dataset_name"]
         if test_case_name.lower().strip() == "summary (average)":
             continue
-        if int(entry["Wrong Function Calls"]) > 0 or int(entry["Wrong Parameters"]) > 0:
+        if not entry["is_success"] or float(entry["tool_calls_with_incorrect_parameter"]) > 0 or float(entry["tool_call_precision"]) < 1.0\
+            or float(entry["tool_call_recall"]) < 1.0:
             test_case_with_failed_tools.append(entry)
     if len(test_case_with_failed_tools) == 0:
         header_table = Table(show_header=False, box=None)
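The rewritten filter keys off the new summary schema: a test case is surfaced when the journey failed outright (`is_success`), any tool call carried incorrect parameters, or tool-call precision/recall fell below 1.0. Restated as a standalone predicate (a hypothetical helper for illustration; the shipped code inlines the condition):

```python
def has_failed_tools(entry: dict) -> bool:
    """Flag a summary entry whose journey failed or whose tool calls were imperfect."""
    return (
        not entry["is_success"]
        or float(entry["tool_calls_with_incorrect_parameter"]) > 0
        or float(entry["tool_call_precision"]) < 1.0
        or float(entry["tool_call_recall"]) < 1.0
    )
```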
@@ -85,7 +86,7 @@ def analyze(config: AnalyzeConfig):
         rich.print(header_panel)
 
     for test_case_entry in test_case_with_failed_tools:
-        test_case_name = test_case_entry["test_case"]
+        test_case_name = test_case_entry["dataset_name"]
 
         test_case_path = os.path.join(
             config.data_path, "messages", f"{test_case_name}.messages.analyze.json"
@@ -94,7 +95,8 @@ def analyze(config: AnalyzeConfig):
         with open(test_case_path, "r", encoding="utf-8") as f:
             temp = json.load(f)
             for entry in temp:
-                test_messages.append(ExtendedMessage(**entry))
+                msg = ExtendedMessage(**entry)
+                test_messages.append(msg)
 
         test_metrics_path = os.path.join(
             config.data_path, "messages", f"{test_case_name}.metrics.json"
@@ -105,11 +107,9 @@ def analyze(config: AnalyzeConfig):
         header_table.add_row(f"Test Case Name: {test_case_name}")
         header_table.add_row((f"Expected Tool Calls: {metrics.expected_tool_calls}"))
         header_table.add_row(f"Correct Tool Calls: {metrics.correct_tool_calls}")
-        irrelevant_tool_calls = test_case_entry["Wrong Function Calls"]
-        header_table.add_row(f"Irrelevant Tool Call: {irrelevant_tool_calls}")
-        tool_call_with_incorrect_parameters = test_case_entry["Wrong Parameters"]
+        header_table.add_row(f"Text Match: {metrics.text_match.value}")
         header_table.add_row(
-            f"Tool Call with incorrect parameters: {tool_call_with_incorrect_parameters}"
+            f"Journey Success: {metrics.is_success}"
         )
         header_panel = Panel(
             header_table, title="[bold green]📋 Analysis Summary[/bold green]"
@@ -22,12 +22,19 @@ class LLMUserConfig:
     user_response_style: List[str] = field(default_factory=list)
 
 
+@dataclass
+class ProviderConfig:
+    model_id: str = field(default="meta-llama/llama-3-405b-instruct")
+    provider: str = field(default="watsonx")
+
+
 @dataclass
 class TestConfig:
     test_paths: List[str]
     output_dir: str
     auth_config: AuthConfig
     wxo_lite_version: str
+    provider_config: ProviderConfig = field(default_factory=ProviderConfig)
     llm_user_config: LLMUserConfig = field(default_factory=LLMUserConfig)
     enable_verbose_logging: bool = True
     enable_manual_user_input: bool = False
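The new `ProviderConfig` dataclass is what the `provider_config:` blocks in the README's YAML examples bind to, with watsonx as the default backend. A minimal, self-contained sketch of that binding, assuming jsonargparse's `CLI` entry point as used elsewhere in the package (the `demo` function is hypothetical):

```python
from dataclasses import dataclass, field

from jsonargparse import CLI


@dataclass
class ProviderConfig:
    model_id: str = field(default="meta-llama/llama-3-405b-instruct")
    provider: str = field(default="watsonx")


def demo(provider_config: ProviderConfig = ProviderConfig()):
    # Invoked as `python demo.py --config config.yaml`, a top-level
    # `provider_config:` mapping in the YAML fills this dataclass, e.g.
    # provider: "ollama", model_id: "llama3.1:8b".
    print(provider_config.provider, provider_config.model_id)


if __name__ == "__main__":
    CLI(demo)
```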
@@ -65,7 +72,7 @@ class ChatRecordingConfig:
         default_factory=KeywordsGenerationConfig
     )
     service_url: str = "http://localhost:4321"
-    tenant_name: str = "wxo-dev"
+    tenant_name: str = "local"
     token: str = None
 
 
@@ -5,7 +5,7 @@ import os
 from pathlib import Path
 from jsonargparse import CLI
 
-from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
+from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.prompt.template_render import BatchTestCaseGeneratorTemplateRenderer
 from wxo_agentic_evaluation.arg_configs import BatchAnnotateConfig
 from wxo_agentic_evaluation import __file__
@@ -71,7 +71,6 @@ def extract_inputs_from_snapshot(snapshot_path: Path) -> dict:
 def load_example(example_path: Path):
     with example_path.open("r", encoding="utf-8") as f:
         data = json.load(f)
-    data.pop("mine_fields", None)
     return data
 
 
@@ -98,19 +97,15 @@ def build_prompt_for_story(agent, tools, tool_inputs, example_case: dict, story:
 def generate_multiple_in_one(prompt, output_dir, starting_index, model_id="meta-llama/llama-3-405b-instruct", ):
     output_dir.mkdir(parents=True, exist_ok=True)
 
-    provider = WatsonXProvider(
+    provider = get_provider(
         model_id=model_id,
-        llm_decode_parameter={
-            "min_new_tokens": 50,
-            "decoding_method": "greedy",
-            "max_new_tokens": 3000
-        }
+        params={"min_new_tokens": 50, "decoding_method": "greedy", "max_new_tokens": 3000},
     )
 
     response = provider.query(prompt)
 
     try:
-        raw_text = response.get("generated_text", "")
+        raw_text = response
         json_start = raw_text.find("[")
         json_end = raw_text.rfind("]") + 1
         json_block = raw_text[json_start:json_end].strip()
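Across these hunks, callers stop constructing `WatsonXProvider` directly and instead ask `get_provider` for a backend whose `query` returns the generated text as a plain string. A minimal sketch of the factory shape this implies; the `EchoProvider` stub and the registry contents are assumptions so the example runs standalone (the real dispatch lives in `wxo_agentic_evaluation/service_provider/__init__.py`):

```python
from typing import Protocol


class Provider(Protocol):
    """Interface implied by the diff: a provider answers a prompt with a string."""

    def query(self, prompt: str) -> str: ...


class EchoProvider:
    """Stand-in backend so the sketch runs without watsonx or Ollama credentials."""

    def __init__(self, model_id: str, params: dict | None = None):
        self.model_id = model_id
        self.params = params or {}

    def query(self, prompt: str) -> str:
        return f"[{self.model_id}] {prompt}"


_REGISTRY: dict[str, type] = {"echo": EchoProvider}


def get_provider(model_id: str, provider: str = "echo", params: dict | None = None) -> Provider:
    # Look up the backend by name and pass the shared kwargs through,
    # mirroring the get_provider(model_id=..., params=...) calls above.
    return _REGISTRY[provider](model_id=model_id, params=params)


print(get_provider(model_id="demo-model", params={"max_new_tokens": 3000}).query("hello"))
```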
@@ -119,7 +114,6 @@ def generate_multiple_in_one(prompt, output_dir, starting_index, model_id="meta-
     assert isinstance(test_cases, list), "Expected list of test cases"
 
     for i, case in enumerate(test_cases, start=starting_index):
-        case["mine_fields"] = []  # ✅ Add the field here
         out_file = output_dir / f"synthetic_test_case_{i}.json"
         with out_file.open("w", encoding="utf-8") as f:
             json.dump(case, f, indent=2)
@@ -1,5 +1,6 @@
 from wxo_agentic_evaluation.type import Message, EvaluationData
-from wxo_agentic_evaluation.watsonx_provider import WatsonXProvider
+from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.prompt.template_render import (
     LlamaKeywordsGenerationTemplateRenderer,
 )
@@ -94,16 +95,16 @@ ERROR_KEYWORDS = [
 class KeywordsGenerationLLM:
     def __init__(
         self,
-        wai_client: WatsonXProvider,
+        provider: Provider,
         template: LlamaKeywordsGenerationTemplateRenderer,
     ):
-        self.wai_client = wai_client
+        self.provider = provider
         self.prompt_template = template
 
     def genereate_keywords(self, response) -> Message | None:
         prompt = self.prompt_template.render(response=response)
-        res = self.wai_client.query(prompt)
-        keywords = ast.literal_eval(res["generated_text"].strip())
+        res: str = self.provider.query(prompt)
+        keywords = ast.literal_eval(res.strip())
         return keywords
 
 
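Because `query` now hands back the generated text itself rather than a `{"generated_text": ...}` dict, the keyword parser applies `ast.literal_eval` directly to the string. A small worked example of that parse step, with a made-up model response:

```python
import ast

# Hypothetical model output: a Python-literal list of keywords.
res = "['time off', 'nwaters', '20250101', '20250303']"
keywords = ast.literal_eval(res.strip())
assert keywords == ["time off", "nwaters", "20250101", "20250303"]
```

`ast.literal_eval` only accepts Python literals, so malformed or non-list model output raises instead of executing arbitrary code.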
@@ -120,7 +121,6 @@ class DataAnnotator:
             agent="",
             story="",
             starting_sentence=messages[0].content if messages else "",
-            mine_fields=[],
             goals={},
             goal_details=[],
         )
@@ -145,29 +145,48 @@ class DataAnnotator:
 
     def _process_tool_call_order(self, wrong_tool_response_id: list[str]) -> list[str]:
         """Process and order tool calls, skipping failed ones"""
+        # gather all call ids that actually got a response
+        valid_call_ids = {
+            json.loads(m.content)["tool_call_id"]
+            for m in self.messages
+            if m.type == "tool_response"
+        }
+
         order = []
-        for message in self.messages:
-            if message.type == "tool_call":
-                content = json.loads(message.content)
-                # skip all the tool calls that fail
-                if (
-                    content.get("tool_call_id", "") in wrong_tool_response_id
-                    or content.get("id", "") in wrong_tool_response_id
-                ):
-                    continue
-
-                if "tool_call_id" in content:
-                    del content["tool_call_id"]
-                if "id" in content:
-                    del content["id"]
-
-                content = json.dumps(content, sort_keys=True)
-                # for a given tool call signature - function name + args only keep the most recent one
-                if content in order:
-                    idx = order.index(content)
-                    order = order[:idx] + order[idx + 1 :] + [content]
-                else:
-                    order.append(content)
+        for idx, message in enumerate(self.messages):
+            if message.type != "tool_call":
+                continue
+
+            content = json.loads(message.content)
+            call_id = content.get("tool_call_id") or content.get("id")
+
+            # skip any calls that errored
+            if call_id in wrong_tool_response_id:
+                continue
+
+            # skip calls that never produced a tool_response
+            if call_id not in valid_call_ids:
+                continue
+
+            # skip the "reflection" copy that the LLM emits right after a response
+            prev = self.messages[idx - 1] if idx > 0 else None
+            if (
+                prev is not None
+                and prev.type == "tool_response"
+                and json.loads(prev.content).get("tool_call_id") == call_id
+            ):
+                continue
+
+            # normalize ids so json dumps only reflects name-args
+            content.pop("tool_call_id", None)
+            content.pop("id", None)
+
+            signature = json.dumps(content, sort_keys=True)
+            # if we’ve seen that exact (name-args) before, drop the old one
+            if signature in order:
+                order.remove(signature)
+            order.append(signature)
+
         return order
 
     def _process_tool_calls(self) -> tuple[Dict, List, str]:
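The rewrite changes the de-duplication bookkeeping but preserves its rule: for a repeated (name + args) signature, only the most recent occurrence survives, and the remaining entries keep their call order. A tiny standalone demonstration of that rule:

```python
# Each string stands in for a json.dumps(content, sort_keys=True) signature.
order: list[str] = []
for signature in ["get_user", "get_timeoff", "get_user", "get_balance"]:
    if signature in order:
        order.remove(signature)  # drop the older duplicate...
    order.append(signature)      # ...so the latest occurrence wins

assert order == ["get_timeoff", "get_user", "get_balance"]
```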
@@ -209,16 +228,12 @@ class DataAnnotator:
         # we assume single summary step at the end
         for message in self.messages[::-1]:
             if message.role == "assistant":
-                wai_client = WatsonXProvider(
+                provider = get_provider(
                     model_id=self.keywords_generation_config.model_id,
-                    llm_decode_parameter={
-                        "min_new_tokens": 0,
-                        "decoding_method": "greedy",
-                        "max_new_tokens": 256,
-                    },
+                    params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 256},
                 )
                 kw_generator = KeywordsGenerationLLM(
-                    wai_client=wai_client,
+                    provider=provider,
                     template=LlamaKeywordsGenerationTemplateRenderer(
                         self.keywords_generation_config.prompt_config
                     ),
@@ -247,7 +262,6 @@ class DataAnnotator:
             "agent": self.initial_data.agent,
             "goals": goals,
             "goal_details": goal_details,
-            "mine_fields": [],
             "story": self.initial_data.story,
             "starting_sentence": self.initial_data.starting_sentence,
         }