ibm-watsonx-orchestrate-evaluation-framework 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic. Click here for more details.

Files changed (46) hide show
  1. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/METADATA +322 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/RECORD +46 -0
  3. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/WHEEL +5 -0
  4. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/licenses/LICENSE +22 -0
  5. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/top_level.txt +1 -0
  6. wxo_agentic_evaluation/__init__.py +0 -0
  7. wxo_agentic_evaluation/analytics/tools/analyzer.py +405 -0
  8. wxo_agentic_evaluation/analytics/tools/main.py +163 -0
  9. wxo_agentic_evaluation/analytics/tools/types.py +130 -0
  10. wxo_agentic_evaluation/analytics/tools/ux.py +428 -0
  11. wxo_agentic_evaluation/analyze_run.py +123 -0
  12. wxo_agentic_evaluation/annotate.py +40 -0
  13. wxo_agentic_evaluation/arg_configs.py +78 -0
  14. wxo_agentic_evaluation/batch_annotate.py +181 -0
  15. wxo_agentic_evaluation/data_annotator.py +253 -0
  16. wxo_agentic_evaluation/evaluation_package.py +518 -0
  17. wxo_agentic_evaluation/external_agent/external_validate.py +69 -0
  18. wxo_agentic_evaluation/external_agent/types.py +65 -0
  19. wxo_agentic_evaluation/inference_backend.py +601 -0
  20. wxo_agentic_evaluation/llm_matching.py +39 -0
  21. wxo_agentic_evaluation/llm_rag_eval.py +47 -0
  22. wxo_agentic_evaluation/llm_user.py +38 -0
  23. wxo_agentic_evaluation/main.py +231 -0
  24. wxo_agentic_evaluation/metrics/__init__.py +0 -0
  25. wxo_agentic_evaluation/metrics/llm_as_judge.py +46 -0
  26. wxo_agentic_evaluation/metrics/metrics.py +101 -0
  27. wxo_agentic_evaluation/prompt/__init__.py +0 -0
  28. wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2 +120 -0
  29. wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +51 -0
  30. wxo_agentic_evaluation/prompt/examples/__init__.py +0 -0
  31. wxo_agentic_evaluation/prompt/examples/data_simple.json +93 -0
  32. wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2 +59 -0
  33. wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2 +75 -0
  34. wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2 +20 -0
  35. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +22 -0
  36. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +114 -0
  37. wxo_agentic_evaluation/prompt/template_render.py +90 -0
  38. wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2 +11 -0
  39. wxo_agentic_evaluation/prompt/tool_planner.jinja2 +40 -0
  40. wxo_agentic_evaluation/record_chat.py +165 -0
  41. wxo_agentic_evaluation/service_instance.py +179 -0
  42. wxo_agentic_evaluation/tool_planner.py +228 -0
  43. wxo_agentic_evaluation/type.py +176 -0
  44. wxo_agentic_evaluation/utils/__init__.py +6 -0
  45. wxo_agentic_evaluation/utils/utils.py +233 -0
  46. wxo_agentic_evaluation/watsonx_provider.py +175 -0
@@ -0,0 +1,322 @@
1
+ Metadata-Version: 2.4
2
+ Name: ibm-watsonx-orchestrate-evaluation-framework
3
+ Version: 1.0.0
4
+ Summary: The WxO evaluation framework
5
+ Author-email: Haode Qi <Haode.Qi@ibm.com>
6
+ License: MIT
7
+ Requires-Python: <3.14,>=3.11
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: rich~=13.9.4
11
+ Requires-Dist: ibm-watsonx-ai~=1.3.6
12
+ Requires-Dist: pydantic~=2.10.6
13
+ Requires-Dist: pyyaml~=6.0.2
14
+ Requires-Dist: jinja2~=3.1.5
15
+ Requires-Dist: python-dotenv~=1.0.1
16
+ Requires-Dist: dataclasses-json~=0.6.7
17
+ Requires-Dist: jsonargparse~=4.37.0
18
+ Requires-Dist: networkx~=3.4.2
19
+ Requires-Dist: matplotlib~=3.10.1
20
+ Requires-Dist: numpy~=1.26.4
21
+ Requires-Dist: langchain-openai~=0.3.23
22
+ Provides-Extra: dev
23
+ Requires-Dist: setuptools~=70.3.0; extra == "dev"
24
+ Requires-Dist: pytest<9.0.0,>=8.3.4; extra == "dev"
25
+ Requires-Dist: pytest-cov==6.0.0; extra == "dev"
26
+ Requires-Dist: pytest-mock==3.14.0; extra == "dev"
27
+ Requires-Dist: pytest-asyncio==0.25.1; extra == "dev"
28
+ Requires-Dist: coverage[toml]>=6.5; extra == "dev"
29
+ Requires-Dist: black~=22.3.0; extra == "dev"
30
+ Requires-Dist: pylint~=2.16.4; extra == "dev"
31
+ Provides-Extra: rag-eval
32
+ Requires-Dist: tqdm~=4.67.1; extra == "rag-eval"
33
+ Requires-Dist: sentence-transformers~=3.3.1; extra == "rag-eval"
34
+ Requires-Dist: scikit-learn~=1.6.1; extra == "rag-eval"
35
+ Requires-Dist: pandas~=2.1.4; extra == "rag-eval"
36
+ Requires-Dist: notebook~=7.4.1; extra == "rag-eval"
37
+ Requires-Dist: ipywidgets~=8.1.6; extra == "rag-eval"
38
+ Requires-Dist: jupyter_contrib_nbextensions; extra == "rag-eval"
39
+ Requires-Dist: jupyter~=1.1.1; extra == "rag-eval"
40
+ Dynamic: license-file
41
+
42
+ # WXO-agent evaluation framework
43
+
44
+ - This framework is designed to test a tool-calling agent's ability to make real API calls against a `wxo-dev` testing tenant on your local wxo-lite server instance.
45
+
46
+ - As an LLM-as-agent evaluation framework, we aim to test the agent's ability to do the following:
47
+ - After inference, we evaluate the conversation against a ground truth. Inference is carried out through a simulated conversation between a user-LLM and the agent. Please set `enable_verbose_logging: True` in your configuration.
48
+ - Make real API calls correctly and efficiently. We provide metrics which measure the number of bad tool calls made by the agent, normalized against the number of ground truth calls made.
49
+
50
+ - The `benchmarks/` folder contains test-cases for the different agents we have evaluated so far. They are segmented by release versions of the `wxo-domains` repository.
51
+ - The agent calls the `runs/` endpoint of the wxo-lite server instance, and the actual tool code is executed on the server side. The server database is not visible to our framework.
52
+
53
+ ## prerequisite
54
+ Follow the [SDK setup guide](https://github.ibm.com/WatsonOrchestrate/wxo-clients/tree/main) to install the SDK. Make sure you are using version 1.2.0 of the SDK as this is the version this framework requires.
55
+
56
+ ## setup for evaluation framework
57
+ Run the following command to install the evaluation framework in the same environment:
58
+ ```
59
+ pip install -e .
60
+ ```
61
+
62
+
63
+ ## quick experiment against the default wxo-dev env
64
+ ```bash
65
+ orchestrate server start
66
+ export WATSONX_SPACE_ID=""
67
+ export WATSONX_API_KEY=""
68
+ ```
69
+
70
+ Import sample hr tools and agent into your default `wxo-dev` env:
71
+ ```bash
72
+ orchestrate tools import -f benchmarks/hr_sample/tools.py -k python
73
+ orchestrate agents import -f benchmarks/hr_sample/hr_agent.json
74
+ ```
75
+
76
+ Run the main script:
77
+ ```bash
78
+ python -m wxo_agentic_evaluation.main --config benchmarks/hr_sample/config.yaml --output_dir=results/test --num_workers=2
79
+ ```
80
+ Note:
81
+ 1. This approach uses the default `wxo-dev` tenant already available in your orchestrate env if you have used wxo-lite before.
82
+ 2. The ADK also reads these environment variables. If you have an environment variable conflict, start the wxo-lite server before exporting the variables.
83
+
84
+
85
+ ## run against a deployed local env
86
+
87
+ 1. Start the Orchestrate server: `orchestrate server start`
88
+ 2. Create a simple test case like the following and save it in a folder such as `benchmarks/TEST_CASE_NAME`:
89
+ ```JSON
90
+ {
91
+ "agent": "NAME_OF_THE_AGENT",
92
+ "goals": {
93
+ "summarize": []
94
+ },
95
+ "goal_details": [
96
+ {
97
+ "type": "text",
98
+ "name": "summarize",
99
+ "response": "Your timeoff schedule for 20250101 to 20250303 is: 20250105",
100
+ "keywords": [
101
+ "20250105"
102
+ ]
103
+ }
104
+ ],
105
+ "mine_fields": [],
106
+ "story": "Your username is nwaters and you want to find out timeoff schedule from 20250101 to 20250303."
107
+ }
108
+ ```
109
+ Note:
110
+ - The target agent name can be found by running `orchestrate agents list`.
111
+ - The example shown only evaluates the final response of the agent. For more sophisticated examples, follow `benchmarks/hr_sample/data_simple.json` or `benchmarks/hr_sample/data_complex.json`.
112
+
113
+
114
+ 3. create a test config yaml like the following:
115
+ ```YAML
116
+ test_paths:
117
+ - benchmarks/TEST_CASE_NAME
118
+
119
+ auth_config:
120
+ url: http://localhost:4321
121
+ tenant_name: wxo-dev
122
+
123
+ output_dir: "results/TEST_CASE_NAME/MODEL_NAME"
124
+ ```
125
+
126
+
127
+ NOTE: Run `orchestrate env list` to find the name of the active tenant. For the default `local` tenant, the name should be `wxo-dev`.
128
+
129
+ 4. Run the test:
130
+ ```bash
131
+ export WATSONX_SPACE_ID=""
132
+ export WATSONX_API_KEY=""
133
+ python -m wxo_agentic_evaluation.main --config benchmarks/hr_sample/config.yaml
134
+ ```
135
+
136
+ NOTE: if your run fails for any reason and doesn't cover all the test cases, you can re-run the main script with `--skip_available_results=True` to skip the test cases that are already completed.
137
+
138
+ ## analyze error
139
+ ```bash
140
+ python -m wxo_agentic_evaluation.analyze_run --input_data results/hr_sample/llama_3_2_90b/messages/data_simple.messages.json --ground_truth benchmarks/hr_sample/data_simple.json --enable_verbose_logging False
141
+ ```
142
+ You can also run the analyze script on a batch of test cases in a folder
143
+ ```bash
144
+ python -m wxo_agentic_evaluation.analyze_run --input_data results/hr_sample/llama_3_2_90b/messages/ --ground_truth benchmarks/hr_sample/ --enable_verbose_logging False
145
+ ```
146
+
147
+
148
+ ## Run Against a SaaS Tenant (Orchestrate SDK ≥ 1.2)
149
+
150
+ This section describes how to run benchmark tests using a **SaaS-based Orchestrate tenant**. The rest of the setup (test case creation, config structure, etc.) is similar to the [local setup](#run-against-a-deployed-local-env) and can be referred to as needed.
151
+
152
+ ### Prerequisites
153
+
154
+ - **Orchestrate SDK version ≥ 1.2** is required.
155
+ - Access to the **production SaaS Orchestrate instance** or **staging SaaS Orchestrate instance**.
156
+
157
+ ---
158
+
159
+ ### 1. Get Authentication Details
160
+
161
+ 1. Visit the Orchestrate UI (production or staging):
162
+
163
+ - **AWS Production us-east-1:** [https://dl.watson-orchestrate.ibm.com](https://dl.watson-orchestrate.ibm.com)
164
+ For other locations, please use the designated url for your data center.
165
+ - **AWS Staging:** [https://staging-wa.watson-orchestrate.ibm.com](https://staging-wa.watson-orchestrate.ibm.com)
166
+ - **IBM Cloud Production us-south:** [https://us-south.watson-orchestrate.cloud.ibm.com](https://us-south.watson-orchestrate.cloud.ibm.com)
167
+
168
+ 2. Log in and click the **Settings** button (top-right corner).
169
+
170
+ 3. Open the **API details** tab, then copy the **Instance URL** and generate an **API Key**.
171
+
172
+ 4. For more detailed instructions, refer to this guide:
173
+ https://developer.ibm.com/apis/catalog/watsonorchestrate--custom-assistants/Getting+the+API+endpoint
174
+
175
+ ---
176
+
177
+ ### 2. Add the SaaS Tenant
178
+
179
+ Run the following command:
180
+
181
+ ```bash
182
+ orchestrate env add -n saas \
183
+ -u [INSTANCE_URL] \
184
+ -t mcsp \
185
+ -a
186
+ ```
187
+ If you are using a staging setup, pass the `--iam-url` argument as follows:
188
+ - For AWS:
189
+ ```bash
190
+ orchestrate env add -n saas \
191
+ -u [INSTANCE_URL] \
192
+ --iam-url https://iam.platform.test.saas.ibm.com \
193
+ -a
194
+ ```
195
+
196
+ - For IBM Cloud:
197
+ ```bash
198
+ orchestrate env add -n saas \
199
+ -u [INSTANCE_URL] \
200
+ --iam-url https://iam.test.cloud.ibm.com \
201
+ -a
202
+ ```
203
+
204
+ > When prompted, paste the API key generated above.
205
+
206
+ ---
207
+
208
+ ### 3. Set the IAM API Key Environment Variable
209
+
210
+ ```bash
211
+ export WATSONX_IAM_SAAS_APIKEY=[your_generated_api_key]
212
+ ```
213
+
214
+ ---
215
+
216
+ ### 4. Update Your Test Config YAML
217
+
218
+ Make sure your YAML config includes the correct SaaS tenant name:
219
+
220
+ ```yaml
221
+ test_paths:
222
+ - benchmarks/TEST_CASE_NAME
223
+
224
+ auth_config:
225
+ url: [INSTANCE_URL]
226
+ tenant_name: saas
227
+
228
+ output_dir: "results/TEST_CASE_NAME/MODEL_NAME"
229
+ ```
230
+ - Use the staging URL if you are using the staging setup.
231
+ ---
232
+
233
+ ### 5. Run the Simulation in SaaS Mode
234
+
235
+ ```bash
236
+ python -m wxo_agentic_evaluation.main --config benchmarks/hr_sample/config.yaml
237
+ ```
238
+
239
+ ---
240
+
241
+ ### Batch Test case Generation
242
+
243
+ For full instructions on setting up tools, writing stories, configuring the pipeline, and generating batch test cases, see the [Batch Test case Generation Guide](./benchmarks/batch_sample/README.MD).
244
+
245
+ ## Workflow diagram
246
+
247
+ To help better understand the workflow, this is a diagram of how this repo works together with wxO lite python SDK and a wxO runtime.
248
+
249
+ ![Alt text](./doc/assets/workflow.png "Workflow")
250
+
251
+ Inputs:
252
+ - [a test config yaml](benchmarks/hr_sample/config.yaml)
253
+ - a json file containing test cases, see [example 1](benchmarks/hr_sample/data_complex.json) or [example 2](benchmarks/hr_sample/data_simple.json) as a reference
254
+ - optionally, a `tools.py` file for tools definition and one or more agent definitions e.g. `benchmarks/hr_sample/hr_agent.json`. Alternatively, these files are not needed if you have a tenant already set up with such tools and agents
255
+
256
+ Steps:
257
+ 1. (optional) this repo will call wxO Lite python SDK (which calls several endpoints exposed by the wxO runtime) to set up the environment needed for the evaluation run; tools and agents will be imported
258
+ 2. Create test cases by following the sample instructions at [benchmarks/sap_successfactors_sample/annotation/README.md](benchmarks/sap_successfactors_sample/annotation/README.md)
259
+ 3. Start the evaluation run by calling the `wxo_agentic_evaluation.main` script of this repo, which will invoke the `/runs` endpoint of the wxO runtime to simulate conversations with the agent
260
+ 4. Reports and metrics will be generated by this repo
261
+ 5. (optional) this repo will call wxO Lite python SDK (which calls several endpoints exposed by the wxO runtime) to clean up the environment, to avoid the tools and agents affecting subsequent runs with the same tenant
262
+ 6. (optional) You can generate further error analysis by using the `wxo_agentic_evaluation.analyze_run` script from this repo
263
+
264
+
265
+ ## results
266
+ ### workday
267
+
268
+ | Model | User Setting | Total Step | Agent Step | Journey Success | Wrong Function Calls | Bad Calls | Wrong Parameters | Wrong Routing Calls | Text Match | Test Cases | WXO Avg. Response Time |
269
+ |-------------------------------|------------- |-------------|------------|------------------|-----------------------|-----------|-------------------|----------------------|------------|------------|-------------------------|
270
+ | llama-3-2-90b-vision-instruct | normal | 8.13 | 4.21 | 0.87 | 0.01 | 0.0 | 0.20 | 0.00 | 0.95 | 38 | 15.09 |
271
+ | llama-3-2-90b-vision-instruct | verbose | 11.76 | 6.11 | 0.79 | 0.02 | 0.0 | 0.19 | 0.00 | 0.86 | 38 | 14.32 |
272
+ | llama-3-405b-instruct | normal | 9.66 | 5.03 | 0.82 | 0.02 | 0.0 | 0.47 | 0.04 | 0.89 | 38 | 13.36 |
273
+ | llama-3-405b-instruct | verbose | 11.76 | 6.11 | 0.84 | 0.05 | 0.0 | 0.70 | 0.04 | 0.92 | 38 | 12.21 |
274
+
275
+ You can find the detailed results under [results/workday](results/workday)
276
+
277
+ ### sap successfactor (rel-1.7)
278
+
279
+ | Model | User Setting | Total Step | Agent Step | Journey Success | Wrong Function Calls | Bad Calls | Wrong Parameters | Wrong Routing Calls | Text Match | Test Cases | WXO Avg. Response Time |
280
+ |------------------------------ |--------------|-------------|------------|------------------|-----------------------|-----------|-------------------|----------------------|------------|------------|-------------------------|
281
+ | llama-3-2-90b-vision-instruct | normal | 10.32 | 5.84 | 0.73 | 0.04 | 0.0 | 0.06 | 0.08 | 0.84 | 38 | - |
282
+ | llama-3-2-90b-vision-instruct | verbose | 11.19 | 6.35 | 0.68 | 0.04 | 0.0 | 0.08 | 0.16 | 0.81 | 38 | - |
283
+ | llama-3-405b-instruct | normal | 11.41 | 6.24 | 0.46 | 0.01 | 0.0 | 0.23 | 0.02 | 0.62 | 38 | - |
284
+ | llama-3-405b-instruct | verbose | 15.32 | 8.38 | 0.46 | 0.04 | 0.0 | 0.40 | 0.06 | 0.62 | 38 | - |
285
+
286
+ You can find the detailed results under [results/sap_successfactor_4](results/sap_successfactor_4)
287
+
288
+
289
+ ## METRICS KEY
290
+
291
+ | Metric | Description | Calculation | Range/Type |
292
+ |--------|-------------|-------------|------------|
293
+ | **Total Step** | Total number of messages/steps in the conversation | Count of all messages in the conversation | Integer ≥ 0 |
294
+ | **Agent Step** | Number of assistant responses (text or tool calls) | Count of messages where `role == "assistant"` and `type` is text or tool_call | Integer ≥ 0 |
295
+ | **Ground Truth Calls** | Expected number of tool calls based on ground truth | Count of goal_details with `type == ContentType.tool_call` | Integer ≥ 0 |
296
+ | **Journey Success** | Whether the agent completed tasks in the correct order | `is_topological_sort(ground_truth.goals, labelled_messages)` | Boolean |
297
+ | **Wrong Function Calls** | Number of calls to non-existent or unexpected functions | Count of labelled_messages containing "_WRONG_FUNCTION_CALL" | Integer ≥ 0 |
298
+ | **Bad Calls** | Reserved metric for future use | Currently hardcoded to 0 | Integer (0) |
299
+ | **Wrong Parameters** | Number of tool calls with incorrect parameters | Count of labelled_messages containing "_WRONG_PARAMETERS" | Integer ≥ 0 |
300
+ | **Wrong Routing Calls** | Number of incorrect agent routing calls | Count of labelled_messages containing "_WRONG_ROUTING_CALL" | Integer ≥ 0 |
301
+ | **Text Match** | Quality of final text summary | "Keyword Mismatch" \| "Semantic Mismatch" \| "Summary Matched" | Categorical |
302
+ | **Tool Call Accuracy** | Percentage of non-routing tool calls that were executed correctly | `correct_tool_calls / non_transfer_tool_calls` | Float 0.0-1.0 |
303
+ | **Tool Call Relevancy** | Percentage of non-routing tool calls that were relevant to the task | `(relevant_tool_calls - expected_routing_calls) / non_transfer_tool_calls` | Float 0.0-1.0 |
304
+ | **Agent Routing Accuracy** | Percentage of routing calls that were executed correctly | `expected_routing_calls / total_routing_calls` | Float 0.0-1.0 |
305
+ | **WXO Average Response Time (Secs)** | Average response time for agent responses | Mean response time across all agent interactions | Float ≥ 0.0 |
306
+
307
+ ### Key Definitions
308
+
309
+ - **Relevant Tool Call**: A tool call whose name matches one of the expected tool names defined in the ground truth
310
+ - **Correct Tool Call**: A relevant tool call that also has the correct parameters/arguments
311
+ - **Routing Call**: A tool call whose name starts with "transfer_" (used for agent-to-agent routing)
312
+ - **Non-Transfer Tool Call**: Regular tool calls excluding routing calls (`total_tool_calls - total_routing_calls`)
313
+ - **Expected Routing Call**: A routing call that was both expected and executed correctly
314
+
315
+ ### Averaging Behavior
316
+
317
+ - **Per Test Case Average**: Total Step, Agent Step, Tool Call Accuracy, Tool Call Relevancy, Agent Routing Accuracy, WXO Average Response Time
318
+ - **Per Ground Truth Calls Average**: Wrong Function Calls, Bad Calls, Wrong Parameters, Wrong Routing Calls
319
+ - **Special Calculations**:
320
+ - Journey Success: Proportion of test cases that succeeded (0.0-1.0)
321
+ - Text Match: Proportion of test cases with "Summary Matched" (0.0-1.0)
322
+
@@ -0,0 +1,46 @@
1
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/licenses/LICENSE,sha256=Shgxx7hTdCOkiVRmfGgp_1ISISrwQD7m2f0y8Hsapl4,1083
2
+ wxo_agentic_evaluation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ wxo_agentic_evaluation/analyze_run.py,sha256=qcdew4htpIg0sxCXXX3QS_XhoPOGg4_CEPYFjZiMsnA,4343
4
+ wxo_agentic_evaluation/annotate.py,sha256=nxYMc6gwfQ-GuNjCPFtbX_-Es5-9XDdbXpMH89yRDdc,1228
5
+ wxo_agentic_evaluation/arg_configs.py,sha256=_ws5GX43rG8HdIZe5JAgb3heQubzpfOGWsvzT9Zfs2A,2016
6
+ wxo_agentic_evaluation/batch_annotate.py,sha256=5e-1FpqjSdk3EaGELHhj493fcJKY3_gcv7NfFXxl3pY,6511
7
+ wxo_agentic_evaluation/data_annotator.py,sha256=to8FfIYMx-JzJ5aRmpMb1SiFS1KTXgdZU2qwowdn6BU,7823
8
+ wxo_agentic_evaluation/evaluation_package.py,sha256=RFo5oC2Gydc7wQ28bDSs5nisnRj22GCnjjrFrn4O2L4,21031
9
+ wxo_agentic_evaluation/inference_backend.py,sha256=8nW3LZg6dLTemrHBmDBx8b2NUvBjvSC4bLLxJX9yPiY,25754
10
+ wxo_agentic_evaluation/llm_matching.py,sha256=ISRMvZq-oC0amVBCPNlghPpMDCPxIjVGgm9DtZWTU40,1501
11
+ wxo_agentic_evaluation/llm_rag_eval.py,sha256=n1e0nffYlgxqu7i4Ef_j1zmo9zXcqc2zfTfNYuMsopc,1675
12
+ wxo_agentic_evaluation/llm_user.py,sha256=Ppc-iKtPGaf6tPoHGYFLqXX-vy8LoQneLTQBVOXLHiA,1422
13
+ wxo_agentic_evaluation/main.py,sha256=H6sLZrCUBfqjh9gPoQN6BHxSby-UKrea705cuIiyknc,8042
14
+ wxo_agentic_evaluation/record_chat.py,sha256=Q5w9ouvVfikms_kYyQ6wgqvNN_DxV400I2HriTOdMfg,5969
15
+ wxo_agentic_evaluation/service_instance.py,sha256=yt7XpwheaRRG8Ri4TFIS5G2p5mnCwvNgj6T7bDF5uTU,6494
16
+ wxo_agentic_evaluation/tool_planner.py,sha256=nOwoq_RMBO5ISRFwrKeblgdbMz50qfmVqHkZRAXjP3s,8075
17
+ wxo_agentic_evaluation/type.py,sha256=QbwEedAYnot9WBVIJVSP23s1KHJc7uFQyOhL_MYEdmI,4832
18
+ wxo_agentic_evaluation/watsonx_provider.py,sha256=GH4PhHIZbSRsiQ29CsZmu8wSVt0KX4htNNQKnSltmfA,5983
19
+ wxo_agentic_evaluation/analytics/tools/analyzer.py,sha256=qvkhoyY4JgneE03cmo-KuMAaQB7iM_Lm0C1sexyPFwY,18056
20
+ wxo_agentic_evaluation/analytics/tools/main.py,sha256=ocwPUlEjyK7PMdXBg5OM2DVDQBcaHT4UjR4ZmEhR0C4,6567
21
+ wxo_agentic_evaluation/analytics/tools/types.py,sha256=IFLKI1CCQwPR2iWjif8AqL_TEq--VbLwdwnMqfJujBw,4461
22
+ wxo_agentic_evaluation/analytics/tools/ux.py,sha256=CcnaefAZwzPx3FW0BzlOx7OBPwNfmA5yVtYB-gYci9w,18324
23
+ wxo_agentic_evaluation/external_agent/external_validate.py,sha256=peW89lsf8u8DXmzmRe9z2BWwPsmhUaNcNpVCtM8tkCM,2629
24
+ wxo_agentic_evaluation/external_agent/types.py,sha256=6WmDGetJGSg92HqPW_Q9K7AEorivTraiw8HgdxaiGxs,1481
25
+ wxo_agentic_evaluation/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
+ wxo_agentic_evaluation/metrics/llm_as_judge.py,sha256=bybJQfVWiVh3BoFEZjdBmU9EQO9Ukheu3YWmkI9b1ks,1218
27
+ wxo_agentic_evaluation/metrics/metrics.py,sha256=SdDBWdo3KFycmupQ6mtjwp6WKKNJxGTeM20I_FV9Da0,3913
28
+ wxo_agentic_evaluation/prompt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
+ wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2,sha256=vLrMWce-5HlvniCQdtifnl-YdbJfT8-oixzfwulZs98,3839
30
+ wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2,sha256=PPnfczM_HjCjho8UKFTL9OYRYshpwqkBKBas8C1jMHY,1807
31
+ wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2,sha256=DW9OdjeZJbOWrngRqTAVD4w0va_HtA2FR4G1POIIamM,2524
32
+ wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2,sha256=7mTkSrppjgPluUAIMTWaT30K7M4J4hyR_LjSjW1Ofq0,1290
33
+ wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2,sha256=PiCjr1ag44Jk5xD3F24fLD_bOGYh2sF0i5miY4OrVlc,1890
34
+ wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2,sha256=nDfCD0o9cRYmsgIjzD-RZNQxotlvuqrzdsZIY-vT794,684
35
+ wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2,sha256=MltPfEXYyOwEC2xNLl7UsFTxNbr8CwHaEcPqtvKE2r8,2749
36
+ wxo_agentic_evaluation/prompt/template_render.py,sha256=arMtlNcbRIczRwpfZWlITrgDlDsaDCbwGaNPGaJyBko,3080
37
+ wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2,sha256=9RcIjLYoOvtFsf-RgyMfMcj2Fe8fq1wGkE4nG1zamYY,297
38
+ wxo_agentic_evaluation/prompt/tool_planner.jinja2,sha256=gpI7wrcIL6ytCcllr_ZdWwjVLtC_06at3-JkAcAl2HE,1243
39
+ wxo_agentic_evaluation/prompt/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
+ wxo_agentic_evaluation/prompt/examples/data_simple.json,sha256=O0GeKBf2s9d-7TxkDFFmCEV2sl3e3HcpT11cN0DYFjw,2354
41
+ wxo_agentic_evaluation/utils/__init__.py,sha256=QMxk6hx1CDvCBLFh40WpPZmqFNJtDqwXP7S7cXD6NQE,145
42
+ wxo_agentic_evaluation/utils/utils.py,sha256=8mD6_L_qP-2jQtRkA3Njtg2HFCSQ4FX2NgO4oZq-gow,7994
43
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/METADATA,sha256=Nx-ZE-egxcobYevowbspUR0hLyo0RKo4UQ-Bz0F5dD8,16276
44
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
45
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/top_level.txt,sha256=2okpqtpxyqHoLyb2msio4pzqSg7yPSzwI7ekks96wYE,23
46
+ ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,22 @@
1
+ (The MIT License)
2
+
3
+ Copyright (c) 2024, 2025 IBM Corporation
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ 'Software'), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1 @@
1
+ wxo_agentic_evaluation
File without changes