ibm-watsonx-orchestrate-evaluation-framework 1.1.1__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (61)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +18 -7
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +69 -48
  8. wxo_agentic_evaluation/annotate.py +6 -4
  9. wxo_agentic_evaluation/arg_configs.py +8 -2
  10. wxo_agentic_evaluation/batch_annotate.py +78 -25
  11. wxo_agentic_evaluation/data_annotator.py +18 -13
  12. wxo_agentic_evaluation/description_quality_checker.py +20 -14
  13. wxo_agentic_evaluation/evaluation_package.py +114 -70
  14. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  15. wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
  16. wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
  17. wxo_agentic_evaluation/external_agent/types.py +12 -5
  18. wxo_agentic_evaluation/inference_backend.py +158 -73
  19. wxo_agentic_evaluation/llm_matching.py +4 -3
  20. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  21. wxo_agentic_evaluation/llm_user.py +7 -3
  22. wxo_agentic_evaluation/main.py +175 -67
  23. wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
  24. wxo_agentic_evaluation/metrics/metrics.py +26 -12
  25. wxo_agentic_evaluation/prompt/template_render.py +32 -11
  26. wxo_agentic_evaluation/quick_eval.py +49 -23
  27. wxo_agentic_evaluation/record_chat.py +70 -33
  28. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
  29. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
  30. wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
  38. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
  39. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
  40. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
  41. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
  42. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
  43. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
  44. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
  45. wxo_agentic_evaluation/resource_map.py +2 -1
  46. wxo_agentic_evaluation/service_instance.py +24 -11
  47. wxo_agentic_evaluation/service_provider/__init__.py +33 -13
  48. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
  49. wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
  50. wxo_agentic_evaluation/service_provider/provider.py +0 -1
  51. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
  52. wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
  53. wxo_agentic_evaluation/tool_planner.py +128 -44
  54. wxo_agentic_evaluation/type.py +12 -9
  55. wxo_agentic_evaluation/utils/__init__.py +1 -0
  56. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
  57. wxo_agentic_evaluation/utils/rich_utils.py +23 -9
  58. wxo_agentic_evaluation/utils/utils.py +83 -52
  59. ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info/METADATA +0 -386
  60. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
  61. {ibm_watsonx_orchestrate_evaluation_framework-1.1.1.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
@@ -1,25 +1,31 @@
+ import glob
+ import json
+ import os
+ import re
+ from typing import List, Optional, Union
  from urllib.parse import urlparse
+
+ import yaml
+ from rich import box, print
  from rich.console import Console, Group
- from rich.table import Table
  from rich.panel import Panel
  from rich.rule import Rule
- from rich import box
- from rich import print
- import re
  from rich.style import Style
-
- from typing import List, Optional, Union
- import json
- import yaml
- import glob
- import os
+ from rich.table import Table

  from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness
- from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary, ReferenceLessEvalMetrics
- from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore, Message
+ from wxo_agentic_evaluation.metrics.metrics import (
+     KnowledgeBaseMetricSummary,
+     ReferenceLessEvalMetrics,
+ )
+ from wxo_agentic_evaluation.type import (
+     ConversationalConfidenceThresholdScore,
+     Message,
+ )

  console = Console()

+
  class AttackResultsTable:
      def __init__(self, attack_results: dict):
          self.table = Table(
@@ -35,11 +41,21 @@ class AttackResultsTable:
          n_on_policy = attack_results.get("n_on_policy_attacks", 0)
          n_off_policy = attack_results.get("n_off_policy_attacks", 0)
          n_on_policy_successful = attack_results.get("n_on_policy_successful", 0)
-         n_off_policy_successful = attack_results.get("n_off_policy_successful", 0)
+         n_off_policy_successful = attack_results.get(
+             "n_off_policy_successful", 0
+         )

          # Calculate success rates
-         on_policy_rate = f"{round(100 * safe_divide(n_on_policy_successful, n_on_policy))}%" if n_on_policy else "0%"
-         off_policy_rate = f"{round(100 * safe_divide(n_off_policy_successful, n_off_policy))}%" if n_off_policy else "0%"
+         on_policy_rate = (
+             f"{round(100 * safe_divide(n_on_policy_successful, n_on_policy))}%"
+             if n_on_policy
+             else "0%"
+         )
+         off_policy_rate = (
+             f"{round(100 * safe_divide(n_off_policy_successful, n_off_policy))}%"
+             if n_off_policy
+             else "0%"
+         )

          self.table.add_row("On Policy", str(n_on_policy), on_policy_rate)
          self.table.add_row("Off Policy", str(n_off_policy), off_policy_rate)
@@ -47,6 +63,7 @@ class AttackResultsTable:
      def print(self):
          console.print(self.table)

+
  class AgentMetricsTable:
      def __init__(self, data):
          self.table = Table(
@@ -90,7 +107,8 @@ def safe_divide(nom, denom):
      if denom == 0:
          return 0
      else:
-         return nom/denom
+         return nom / denom
+

  def is_saas_url(service_url: str) -> bool:
      hostname = urlparse(service_url).hostname
@@ -103,19 +121,17 @@ def is_ibm_cloud_url(service_url: str) -> bool:


  def add_line_seperator(
-     style_config: Optional[
-         Union[str,Style]
-     ]=None,
+     style_config: Optional[Union[str, Style]] = None,
  ):
-
+
      if not style_config:
-         style="grey42"
+         style = "grey42"
      else:
-         style=style_config
-
+         style = style_config
+
      console.print(
          Rule(
-             style=style,
+             style=style,
          )
      )

@@ -124,14 +140,18 @@ class FaithfulnessTable:
      def __init__(
          self, faithfulness_metrics: List[Faithfulness], tool_call_ids: List[str]
      ):
-         self.table = Table(title="Faithfulness", box=box.ROUNDED, show_lines=True)
+         self.table = Table(
+             title="Faithfulness", box=box.ROUNDED, show_lines=True
+         )

          self.table.add_column("Tool Call Id", style="blue")
          self.table.add_column("Faithfulness Score", style="blue3")
          self.table.add_column("Evidence", style="cyan")
          self.table.add_column("Reasoning", style="yellow3")

-         for tool_call_id, faithfulness in zip(tool_call_ids, faithfulness_metrics):
+         for tool_call_id, faithfulness in zip(
+             tool_call_ids, faithfulness_metrics
+         ):
              faithfulness = faithfulness.table()
              self.table.add_row(
                  tool_call_id,
@@ -185,7 +205,9 @@ class KnowledgePanel:
          self.confidence_scores = ConversationalSearchTable(
              confidence_scores, tool_call_id
          )
-         self.group = Group(self.faithfulness.table, self.confidence_scores.table)
+         self.group = Group(
+             self.faithfulness.table, self.confidence_scores.table
+         )

          # Panel acts as a section
          self.section = Panel(
@@ -240,35 +262,32 @@ class Tokenizer:
          \w+| # Regular words (letters, numbers, underscores)
          [^\w\s] # Punctuation marks (anything that's not word chars or whitespace)
      """
-
+
      def __init__(self):
          self.compiled_pattern = re.compile(
-             self.PATTERN,
-             re.VERBOSE | re.IGNORECASE
+             self.PATTERN, re.VERBOSE | re.IGNORECASE
          )
-
+
      def __call__(self, text: str) -> List[str]:
          """
          Tokenizes text by splitting on punctuation and handling contractions.

          Args:
              text: Input text to tokenize.
-
+
          Returns:
              List of tokenized words (lowercase, no punctuation).
-
+
          Examples:
              - "I'm fine" -> ['i', 'm', 'fine']
-             - "don't go" -> ['do', "n't", 'go']
+             - "don't go" -> ['do', "n't", 'go']
              - "Hello, world!" -> ['hello', 'world']
          """
-
-         tokens = self.compiled_pattern.findall(
-             text
-         )
-
+
+         tokens = self.compiled_pattern.findall(text)
+
          return self._clean_tokens(tokens)
-
+
      def _clean_tokens(self, raw_tokens: List[str]) -> List[str]:
          """
          Applies some basic post-processing to tokenized messages.
@@ -276,12 +295,11 @@ class Tokenizer:
          Args:
              raw_tokens: list of tokens extracted from a message.
          """
-
+
          filtered_tokens = [
-             token.lower() \
-             for token in raw_tokens \
-             if token.strip() \
-             and not (len(token) == 1 and not token.isalnum())
+             token.lower()
+             for token in raw_tokens
+             if token.strip() and not (len(token) == 1 and not token.isalnum())
          ]

          return filtered_tokens
@@ -296,10 +314,22 @@ class ReferencelessEvalPanel:
          )

          self.table.add_column("Dataset", style="yellow", justify="center")
-         self.table.add_column("Tool Calls", style="deep_sky_blue1", justify="center")
-         self.table.add_column("Successful Tool Calls", style="magenta", justify="center")
-         self.table.add_column("Tool Calls Failed due to Schema Mismatch", style="deep_sky_blue1", justify="center")
-         self.table.add_column("Tool Calls Failed due to Hallucination", style="magenta", justify="center")
+         self.table.add_column(
+             "Tool Calls", style="deep_sky_blue1", justify="center"
+         )
+         self.table.add_column(
+             "Successful Tool Calls", style="magenta", justify="center"
+         )
+         self.table.add_column(
+             "Tool Calls Failed due to Schema Mismatch",
+             style="deep_sky_blue1",
+             justify="center",
+         )
+         self.table.add_column(
+             "Tool Calls Failed due to Hallucination",
+             style="magenta",
+             justify="center",
+         )

          for metric in referenceless_metrics:
              self.table.add_row(
@@ -307,12 +337,13 @@ class ReferencelessEvalPanel:
                  str(metric.number_of_tool_calls),
                  str(metric.number_of_successful_tool_calls),
                  str(metric.number_of_static_failed_tool_calls),
-                 str(metric.number_of_semantic_failed_tool_calls)
+                 str(metric.number_of_semantic_failed_tool_calls),
              )

      def print(self):
          console.print(self.table)

+
  # Function to load messages from JSON file
  def load_messages(file_path):
      with open(file_path, "r") as f:
@@ -339,9 +370,9 @@ def load_agents(agents_path: str):
      for agent_path in agents_json:
          with open(agent_path, "r") as f:
              agents.append(json.load(f))
-
+
      for agent_path in agents_yaml:
          with open(agent_path, "r") as f:
              agents.append(yaml.safe_load(f))
-
+
      return agents
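
For context on the `Tokenizer` changes above, here is a minimal, runnable sketch of the same tokenize-then-filter flow. It is illustrative only: `SIMPLIFIED_PATTERN` and `tokenize` are hypothetical names, and the contraction-handling branch of the real `PATTERN` (which produces `"don't go" -> ['do', "n't", 'go']`) is omitted.

```python
import re
from typing import List

# Simplified stand-in for Tokenizer.PATTERN; the real pattern also handles contractions.
SIMPLIFIED_PATTERN = r"""
    \w+|     # regular words (letters, numbers, underscores)
    [^\w\s]  # punctuation marks
"""
compiled_pattern = re.compile(SIMPLIFIED_PATTERN, re.VERBOSE | re.IGNORECASE)


def tokenize(text: str) -> List[str]:
    tokens = compiled_pattern.findall(text)
    # Drop whitespace-only tokens and single punctuation characters, lowercase the rest.
    return [
        token.lower()
        for token in tokens
        if token.strip() and not (len(token) == 1 and not token.isalnum())
    ]


print(tokenize("Hello, world!"))  # ['hello', 'world']
print(tokenize("I'm fine"))       # ['i', 'm', 'fine']
```
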
@@ -1,386 +0,0 @@
- Metadata-Version: 2.4
- Name: ibm-watsonx-orchestrate-evaluation-framework
- Version: 1.1.1
- Summary: The WxO evaluation framework
- Author-email: Haode Qi <Haode.Qi@ibm.com>
- License: MIT
- Requires-Python: <3.14,>=3.11
- Description-Content-Type: text/markdown
- Requires-Dist: rich~=13.9.4
- Requires-Dist: pydantic<3.0.0,>=2.10.3
- Requires-Dist: pyyaml~=6.0.2
- Requires-Dist: jinja2~=3.1.5
- Requires-Dist: python-dotenv
- Requires-Dist: dataclasses-json~=0.6.7
- Requires-Dist: jsonargparse~=4.37.0
- Requires-Dist: jsonschema~=4.23.0
- Provides-Extra: dev
- Requires-Dist: setuptools~=70.3.0; extra == "dev"
- Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
- Requires-Dist: pytest-cov==6.0.0; extra == "dev"
- Requires-Dist: pytest-mock==3.14.0; extra == "dev"
- Requires-Dist: pytest-asyncio==0.25.1; extra == "dev"
- Requires-Dist: coverage[toml]>=6.5; extra == "dev"
- Requires-Dist: black~=22.3.0; extra == "dev"
- Requires-Dist: pylint~=2.16.4; extra == "dev"
- Provides-Extra: rag-eval
- Requires-Dist: tqdm~=4.67.1; extra == "rag-eval"
- Requires-Dist: sentence-transformers~=3.3.1; extra == "rag-eval"
- Requires-Dist: scikit-learn~=1.6.1; extra == "rag-eval"
- Requires-Dist: pandas~=2.1.4; extra == "rag-eval"
- Requires-Dist: notebook~=7.4.1; extra == "rag-eval"
- Requires-Dist: ipywidgets~=8.1.6; extra == "rag-eval"
- Requires-Dist: jupyter_contrib_nbextensions; extra == "rag-eval"
- Requires-Dist: jupyter~=1.1.1; extra == "rag-eval"
-
- # WXO Agent Evaluation Framework
-
- ## Table of Contents
- - [Overview](#overview)
- - [ADK Setup Guide](#adk-setup-guide)
- - [Setup](#setup-for-evaluation-framework)
- - [Quick Experiment](#quick-experiment-against-the-default-wxo-dev-env)
- - [Run Against a Deployed Local Env](#run-against-a-deployed-local-env)
- - [Run Against a SaaS Tenant](#run-against-a-saas-tenant)
- - [Analyze Results](#analyze-results)
- - [Record Chat Sessions](#record-chat-sessions)
- - [Batch Test Case Generation](#batch-test-case-generation)
- - [Using Model Proxy Provider](#using-model-proxy-provider)
- - [Using Ollama](#using-ollama)
- - [Workflow Diagram](#workflow-diagram)
- - [Results](#results)
- - [Metrics](#metrics)
-
- ## Overview
-
- - This framework is designed to test a tool-calling agent's ability to make real API calls against a `wxo-dev` testing tenant on your local wxo-lite server instance. To run evaluation against a remote tenant on SaaS, follow [Run Against a SaaS Tenant](#run-against-a-saas-tenant).
- - As an LLM-as-agent evaluation framework, we aim to test the agent's ability to do the following:
-   - We use a ground truth to evaluate the conversation against after inference. Inference is carried out through a user-LLM and agent simulation. Please set `enable_verbose_logging: True` in your configuration.
-   - Make real API calls correctly and efficiently. We provide metrics such as tool call precision, recall, and routing accuracy to measure the agent's performance against the ground truth.
- - The `benchmarks/` folder contains test cases for the different agents we have evaluated so far. They are segmented by release versions of the `wxo-domains` repository.
- - The agent calls the `runs/` endpoint of the wxo-lite server instance, and the actual tool code is executed on the server side. The server database is not visible to our framework.
-
- ## ADK Setup Guide
- Follow the [ADK setup guide](https://github.ibm.com/WatsonOrchestrate/wxo-clients/tree/main) to install the ADK.
-
- The current framework is compatible with ADK versions >= 1.2.0 and <= 1.7.0.
-
- ## Setup for Evaluation Framework
- Run the following command to install the evaluation framework in the same env:
- ```
- pip install -e .
- ```
-
- ## Contribution Guide
- ### Secret Resolution
- Install the detect-secrets utilities:
- ```
- pip install --upgrade git+https://github.com/ibm/detect-secrets.git@master#egg=detect-secrets
- ```
- Run the scan and resolve detections:
- ```
- detect-secrets scan --exclude-files "benchmark|results" --update .secrets.baseline && detect-secrets audit .secrets.baseline && git add .secrets.baseline
- ```
-
-
- ## Quick Experiment Against the Default wxo-dev Env
- ```bash
- orchestrate server start
- export WATSONX_SPACE_ID=""
- export WATSONX_APIKEY=""
- ```
-
- NOTE: If you want to use `WO_INSTANCE` and `WO_API_KEY` instead, follow the [model proxy section](#using-model-proxy-provider).
-
- Import the sample HR tools and agent into your default `wxo-dev` env:
- ```bash
- orchestrate tools import -f benchmarks/hr_sample/tools.py -k python
- orchestrate agents import -f benchmarks/hr_sample/hr_agent.json
- ```
-
- Run the main script:
- ```bash
- python -m wxo_agentic_evaluation.main --config benchmarks/hr_sample/config.yaml --output_dir=results/test --num_workers=2
- ```
- Note:
- 1. This approach uses the default `wxo-dev` tenant already available in your orchestrate env if you have used wxo-lite before.
- 2. The ADK also reads these environment variables. If you have an env conflict, start the wxo-lite server before exporting them.
-
- ## Run Against a Deployed Local Env
-
- 1. Start the orchestrate server: `orchestrate server start`
- 2. Create a simple test case like the following and save it in a folder such as `benchmarks/TEST_CASE_NAME`:
- ```JSON
- {
-     "agent": "NAME_OF_THE_AGENT",
-     "goals": {
-         "summarize": []
-     },
-     "goal_details": [
-         {
-             "type": "text",
-             "name": "summarize",
-             "response": "Your timeoff schedule for 20250101 to 20250303 is: 20250105",
-             "keywords": [
-                 "20250105"
-             ]
-         }
-     ],
-     "story": "Your username is nwaters and you want to find out timeoff schedule from 20250101 to 20250303."
- }
- ```
- Note:
- - The target agent name can be found with `orchestrate agents list`.
- - The example shown only evaluates the agent's final response. For more sophisticated examples, follow `benchmarks/hr_sample/data_simple.json` or `benchmarks/hr_sample/data_complex.json`.
-
- 3. Create a test config YAML like the following:
- ```YAML
- test_paths:
-   - benchmarks/TEST_CASE_NAME
-
- auth_config:
-   url: http://localhost:4321
-   tenant_name: wxo-dev
-
- output_dir: "results/TEST_CASE_NAME/MODEL_NAME"
- ```
-
- NOTE: Run `orchestrate env list` to find the name of the active tenant. For the default `local` tenant, the name should be `wxo-dev`.
-
- 4. Run the test:
- ```bash
- export WATSONX_SPACE_ID=""
- export WATSONX_APIKEY=""
- python -m wxo_agentic_evaluation.main --config benchmarks/hr_sample/config.yaml
- ```
-
- NOTE: If your run fails for any reason and doesn't cover all the test cases, you can re-run the main script with `--skip_available_results=True` to skip the test cases that are already completed.
-
- ## Run Against a SaaS Tenant
-
- Orchestrate ADK ≥ 1.2 is required for this section.
-
- This section describes how to run benchmark tests using a **SaaS-based Orchestrate tenant**. The rest of the setup (test case creation, config structure, etc.) is similar to the [local setup](#run-against-a-deployed-local-env) and can be referred to as needed.
-
- ### Prerequisites
-
- - **Orchestrate ADK version ≥ 1.2** is required.
- - Access to the **production SaaS Orchestrate instance** or **staging SaaS Orchestrate instance**.
-
- ### 1. Get Authentication Details
-
- 1. Visit the Orchestrate UI (Prod/Staging):
-
-    - **AWS Production us-east-1:** [https://dl.watson-orchestrate.ibm.com](https://dl.watson-orchestrate.ibm.com)
-      For other locations, please use the designated URL for your data center.
-    - **AWS Staging:** [https://staging-wa.watson-orchestrate.ibm.com](https://staging-wa.watson-orchestrate.ibm.com)
-    - **IBM Cloud Production us-south:** [https://us-south.watson-orchestrate.cloud.ibm.com](https://us-south.watson-orchestrate.cloud.ibm.com)
-
- 2. Log in and click the **Settings** button (top-right corner).
-
- 3. Open the **API details** tab, then copy the **Instance URL** and generate an **API Key**.
-
- 4. For more detailed instructions, refer to this guide:
-    https://developer.ibm.com/apis/catalog/watsonorchestrate--custom-assistants/Getting+the+API+endpoint
-
- ### 2. Add the SaaS Tenant
-
- Run the following command:
-
- ```bash
- orchestrate env add -n saas \
-   -u [INSTANCE_URL] \
-   -t mcsp \
-   -a
- ```
- If using the staging setup, pass the `--iam-url` argument as follows:
- - For AWS:
- ```bash
- orchestrate env add -n saas \
-   -u [INSTANCE_URL] \
-   --iam-url https://iam.platform.test.saas.ibm.com \
-   -a
- ```
-
- - For IBM Cloud:
- ```bash
- orchestrate env add -n saas \
-   -u [INSTANCE_URL] \
-   --iam-url https://iam.test.cloud.ibm.com \
-   -a
- ```
-
- > When prompted, paste the API key generated above.
-
- ### 3. Set `WO_API_KEY` Environment Variable
-
- ```bash
- export WO_API_KEY=[your_generated_api_key]
- ```
-
- ### 4. Update Your Test Config YAML
-
- Make sure your YAML config includes the correct SaaS tenant name:
-
- ```yaml
- test_paths:
-   - benchmarks/TEST_CASE_NAME
-
- auth_config:
-   url: [INSTANCE_URL]
-   tenant_name: saas
-
- output_dir: "results/TEST_CASE_NAME/MODEL_NAME"
- ```
- - Use the staging URL if using the staging setup.
- ### 5. Run the Simulation in SaaS Mode
-
- ```bash
- python -m wxo_agentic_evaluation.main --config benchmarks/hr_sample/config.yaml
- ```
-
- ## Analyze Results
-
- The `analyze_run.py` script summarizes agent evaluation results, showing successes, failures, and reasons for errors to help improve agent performance. After running an evaluation, analyze the results with:
-
- ```bash
- python -m wxo_agentic_evaluation.analyze_run --data_path path/to/results
- ```
-
- Additionally, the script comes with a feature to analyze the quality of tool descriptions for failing tools where the reason for failure is incorrect parameter usage by the agent.
-
- In order to analyze the description(s) of your failing tools, consider passing the optional flag `--tool_definition_path` like so:
-
- ```bash
- python -m wxo_agentic_evaluation.analyze_run --data_path path/to/results --tool_definition_path path/to/.py/source/file/containing/tool/definitions
- ```
-
- **Note:** If the flag `--tool_definition_path` is not provided, description quality analysis is simply skipped.
-
- ## Record Chat Sessions
-
- The `record_chat.py` script lets you capture your chat sessions in the chat UI and automatically generate ground truth data for evaluating your agents. This is valuable for benchmarking and experimenting with agent behavior under different configurations.
-
- Start the chat interface:
-
- ```bash
- orchestrate chat start
- ```
-
- Then open your browser to [http://localhost:3000/chat-lite](http://localhost:3000/chat-lite) and select the agent you wish to interact with.
-
- To begin recording, run:
-
- ```bash
- python -m wxo_agentic_evaluation.record_chat --output_dir dir/to/save/recordings
- ```
-
- While this process is running, annotated ground truth data is generated for every chat session in your output directory: `<THREAD_ID>_annotated_data.json`
-
- Review the generated annotated data for accuracy before using it for evaluation.
-
- Press `Ctrl+C` in the terminal to stop recording when your session is complete.
-
- ## Batch Test Case Generation
-
- For full instructions on setting up tools, writing stories, configuring the pipeline, and generating batch test cases, see the [Batch Test Case Generation Guide](./benchmarks/batch_sample/README.MD).
-
- ## Using Model Proxy Provider
-
- To use the model proxy provider (which allows direct access to LLM models), follow these steps:
-
- 1. Set up environment variables:
- ```sh
- export WO_INSTANCE=<your-instance-url>
- export WO_API_KEY=<your-api-key>
- ```
-
- 2. Create a configuration file similar to [benchmarks/hr_sample/config_model_proxy.yaml](benchmarks/hr_sample/config_model_proxy.yaml):
- ```yaml
- test_paths:
-   - <your-test-path>
-
- auth_config:
-   url: http://localhost:4321
-   tenant_name: wxo-dev
-
- provider_config:
-   provider: "model_proxy"
-   model_id: "<model-id>"
-
- output_dir: "<output-dir>"
- ```
-
- 3. Run the evaluation:
- ```sh
- python -m wxo_agentic_evaluation.main --config path/to/your/config.yaml
- ```
-
- ## Using Ollama
-
- To use a model from Ollama (local LLM deployment), follow these steps:
-
- 1. Make sure you have [Ollama](https://ollama.com) installed and running on your system.
-
- 2. Pull your desired model using Ollama (e.g. llama3.1:8b):
- ```sh
- ollama pull <model-id>
- ```
-
- 3. Create a configuration file similar to [benchmarks/hr_sample/config_ollama.yaml](benchmarks/hr_sample/config_ollama.yaml):
- ```yaml
- test_paths:
-   - <your-test-path>
-
- auth_config:
-   url: http://localhost:4321
-   tenant_name: wxo-dev
-
- provider_config:
-   provider: "ollama"
-   model_id: "<model-id>"
-
- output_dir: "results/ollama/<model-name>"
- ```
-
- 4. Run the evaluation:
- ```sh
- python -m wxo_agentic_evaluation.main --config path/to/your/config.yaml
- ```
-
- ## Workflow Diagram
-
- To help you better understand the workflow, the diagram below shows how this repo works together with the wxO ADK and a wxO runtime.
-
- ![Alt text](./doc/assets/workflow.png "Workflow")
-
- Inputs:
- - [a test config YAML](benchmarks/hr_sample/config.yaml)
- - a JSON file containing test cases; see [example 1](benchmarks/hr_sample/data_complex.json) or [example 2](benchmarks/hr_sample/data_simple.json) as a reference
- - optionally, a `tools.py` file with tool definitions and one or more agent definitions, e.g. `benchmarks/hr_sample/hr_agent.json`. These files are not needed if you already have a tenant set up with such tools and agents.
-
- ## Results
- You can find benchmark results [here](benchmarks/domain_1.8/README.md).
-
- ## Metrics
-
- | Metric | Description | Calculation | Range/Type |
- |----------------------------|------------------------------------------------------------|------------------------------------------------------|---------------|
- | **Total Steps** | Total number of messages/steps in the conversation | Count of all messages in the conversation | Integer ≥ 0 |
- | **LLM Steps** | Number of assistant (LLM) responses (text or tool calls) | Count of messages where `role == "assistant"` | Integer ≥ 0 |
- | **Total Tool Calls** | Number of tool calls made by the agent | Count of all tool calls | Integer ≥ 0 |
- | **Tool Call Precision** | Fraction of correct tool calls out of all tool calls | `correct_tool_calls / total_tool_calls` | Float 0.0–1.0 |
- | **Tool Call Recall** | Fraction of correct tool calls out of expected tool calls | `correct_tool_calls / expected_tool_calls` | Float 0.0–1.0 |
- | **Agent Routing Accuracy** | Fraction of relevant agents visited (relevant_routing_calls) out of all agents visited (total_routing_calls) | `relevant_routing_calls / total_routing_calls` | Float 0.0–1.0 |
- | **Text Match** | Whether the final summary text matches the ground truth | `Summary Matched` \| `Summary MisMatched` | Categorical |
- | **Journey Success** | Whether the agent completed tasks in the correct order | Boolean (`True`/`False`) | Boolean |
- | **Avg Resp Time (sec)** | Average response time for agent responses | Mean response time across all agent interactions | Float ≥ 0.0 |
-
- ### Key Definitions
-
- - **Correct Tool Call**: A tool call that matches both the expected function and arguments.
- - **Expected Tool Call**: A tool call that is required by the ground truth.
- - **Routing Call**: When an agent routes to another agent.
- - **Relevant Routing Call**: An agent is relevant when it is either the entry-point agent or it includes a tool that is present in the ground truth.
- - **Text Match**: Indicates if the agent's final summary matches the expected summary ("Summary Matched") or does not match ("Summary MisMatched").
- - **Journey Success**: Indicates if the agent completed all required tasks in the correct order.
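
To make the ratio metrics in the table above concrete, here is a small worked example with hypothetical counts; the variable names are illustrative and are not part of the framework's API.

```python
# Hypothetical counts for one evaluated conversation (illustrative only).
correct_tool_calls = 4        # tool calls matching the expected function and arguments
total_tool_calls = 5          # tool calls the agent actually made
expected_tool_calls = 6       # tool calls required by the ground truth
relevant_routing_calls = 2    # routed-to agents that were relevant
total_routing_calls = 3       # all agents visited

tool_call_precision = correct_tool_calls / total_tool_calls            # 0.8
tool_call_recall = correct_tool_calls / expected_tool_calls            # ~0.67
agent_routing_accuracy = relevant_routing_calls / total_routing_calls  # ~0.67

print(tool_call_precision, tool_call_recall, agent_routing_accuracy)
```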