ibm-watsonx-orchestrate-evaluation-framework 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (61)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info/METADATA +34 -0
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/RECORD +60 -60
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +36 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +18 -7
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +69 -48
  8. wxo_agentic_evaluation/annotate.py +6 -4
  9. wxo_agentic_evaluation/arg_configs.py +8 -2
  10. wxo_agentic_evaluation/batch_annotate.py +78 -25
  11. wxo_agentic_evaluation/data_annotator.py +18 -13
  12. wxo_agentic_evaluation/description_quality_checker.py +20 -14
  13. wxo_agentic_evaluation/evaluation_package.py +114 -70
  14. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  15. wxo_agentic_evaluation/external_agent/external_validate.py +46 -35
  16. wxo_agentic_evaluation/external_agent/performance_test.py +32 -20
  17. wxo_agentic_evaluation/external_agent/types.py +12 -5
  18. wxo_agentic_evaluation/inference_backend.py +158 -73
  19. wxo_agentic_evaluation/llm_matching.py +4 -3
  20. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  21. wxo_agentic_evaluation/llm_user.py +7 -3
  22. wxo_agentic_evaluation/main.py +175 -67
  23. wxo_agentic_evaluation/metrics/llm_as_judge.py +2 -2
  24. wxo_agentic_evaluation/metrics/metrics.py +26 -12
  25. wxo_agentic_evaluation/prompt/template_render.py +32 -11
  26. wxo_agentic_evaluation/quick_eval.py +49 -23
  27. wxo_agentic_evaluation/record_chat.py +70 -33
  28. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +58 -18
  29. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -18
  30. wxo_agentic_evaluation/red_teaming/attack_runner.py +43 -27
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +3 -1
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +23 -15
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +13 -8
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +41 -13
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +26 -16
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +17 -11
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +44 -29
  38. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +13 -5
  39. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +16 -5
  40. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +8 -3
  41. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +6 -2
  42. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +5 -1
  43. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +16 -3
  44. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +23 -12
  45. wxo_agentic_evaluation/resource_map.py +2 -1
  46. wxo_agentic_evaluation/service_instance.py +24 -11
  47. wxo_agentic_evaluation/service_provider/__init__.py +33 -13
  48. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +129 -26
  49. wxo_agentic_evaluation/service_provider/ollama_provider.py +10 -11
  50. wxo_agentic_evaluation/service_provider/provider.py +0 -1
  51. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +34 -21
  52. wxo_agentic_evaluation/service_provider/watsonx_provider.py +50 -22
  53. wxo_agentic_evaluation/tool_planner.py +128 -44
  54. wxo_agentic_evaluation/type.py +12 -9
  55. wxo_agentic_evaluation/utils/__init__.py +1 -0
  56. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +41 -20
  57. wxo_agentic_evaluation/utils/rich_utils.py +23 -9
  58. wxo_agentic_evaluation/utils/utils.py +83 -52
  59. ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/METADATA +0 -385
  60. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/WHEEL +0 -0
  61. {ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.2.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/main.py

@@ -1,49 +1,64 @@
-from wxo_agentic_evaluation.service_provider import get_provider
-from wxo_agentic_evaluation.resource_map import ResourceMap
-from wxo_agentic_evaluation.llm_user import LLMUser
-from wxo_agentic_evaluation.prompt.template_render import LlamaUserTemplateRenderer
+import csv
+import dataclasses
+import glob
+import json
+import os
+import traceback
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from typing import List
+
+import rich
+import yaml
+from jsonargparse import CLI
+from rich.progress import Progress
+
+from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
 from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
+    WXOInferenceBackend,
     get_wxo_client,
-    WXOInferenceBackend
 )
-from typing import List
-from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.metrics.metrics import (
+    KnowledgeBaseMetricSummary,
+    TextMatchType,
+    ToolCallAndRoutingMetrics,
+)
+from wxo_agentic_evaluation.prompt.template_render import (
+    LlamaUserTemplateRenderer,
+)
+from wxo_agentic_evaluation.resource_map import ResourceMap
+from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.type import EvaluationData
-
-from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.utils import json_dump
 from wxo_agentic_evaluation.utils.utils import (
-    create_table,
     SummaryPanel,
-    safe_divide
+    create_table,
+    safe_divide,
 )
-from wxo_agentic_evaluation.utils import json_dump
-from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary, ToolCallAndRoutingMetrics, TextMatchType
-import os
-import json
-import traceback
-import yaml
-import dataclasses
-import glob
-import rich
-import csv
-from rich.progress import Progress
-from pathlib import Path
-from concurrent.futures import ThreadPoolExecutor
-from jsonargparse import CLI


-def process_test_case(task_n, test_case, config, inference_backend, resource_map, llm_user):
+def process_test_case(
+    task_n, test_case, config, inference_backend, resource_map, llm_user
+):
     summary_results_for_path = []
     tc_name = os.path.basename(test_case).replace(".json", "")
     with open(test_case, "r") as f:
         test_case: EvaluationData = EvaluationData.model_validate(json.load(f))

     evaluation_controller = EvaluationController(
-        wxo_inference_backend=inference_backend, llm_user=llm_user, config=config
+        wxo_inference_backend=inference_backend,
+        llm_user=llm_user,
+        config=config,
     )
     rich.print(f"[bold magenta]Running test case: {tc_name}[/bold magenta]")
-    history, call_tracker, conversational_search_data = evaluation_controller.run(
+    (
+        history,
+        call_tracker,
+        conversational_search_data,
+    ) = evaluation_controller.run(
         task_n,
         test_case.story,
         agent_name=test_case.agent,
@@ -54,7 +69,8 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
         result.append(message.model_dump())

     json_dump(
-        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"), result
+        os.path.join(config.output_dir, "messages", tc_name + ".messages.json"),
+        result,
     )

     if len(conversational_search_data) > 0:
@@ -73,7 +89,7 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
         messages=history,
         ground_truth=test_case,
         conversational_search_data=conversational_search_data,
-        resource_map=resource_map
+        resource_map=resource_map,
     )
     (
         keyword_semantic_matches,
@@ -85,7 +101,9 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
     for message in messages_with_reason:
         temp.append(message.model_dump())
     json_dump(
-        os.path.join(config.output_dir, "messages", tc_name + ".messages.analyze.json"),
+        os.path.join(
+            config.output_dir, "messages", tc_name + ".messages.analyze.json"
+        ),
         temp,
     )

@@ -108,18 +126,29 @@ def process_test_case(task_n, test_case, config, inference_backend, resource_map
 def main(config: TestConfig):
     executor = ThreadPoolExecutor(max_workers=config.num_workers)
     if config.num_workers > 1 and config.enable_manual_user_input:
-        rich.print("[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]")
-        config.enable_manual_user_input = False  # disable manual user input for parallel execution
+        rich.print(
+            "[bold yellow]Warning ⚠️: Manual user input is disabled for parallel execution.[/bold yellow]"
+        )
+        config.enable_manual_user_input = (
+            False  # disable manual user input for parallel execution
+        )
     # reason: threads continue to stream messages while waiting for user input, which is not desired
     # and the manual input prompt is not labelled properly in the UI
     wxo_client = get_wxo_client(
-        config.auth_config.url, config.auth_config.tenant_name, config.auth_config.token
+        config.auth_config.url,
+        config.auth_config.tenant_name,
+        config.auth_config.token,
     )
     resource_map = ResourceMap(wxo_client)
     inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
     llm_user = LLMUser(
-        wai_client=get_provider(config=config.provider_config, model_id=config.llm_user_config.model_id),
-        template=LlamaUserTemplateRenderer(config.llm_user_config.prompt_config),
+        wai_client=get_provider(
+            config=config.provider_config,
+            model_id=config.llm_user_config.model_id,
+        ),
+        template=LlamaUserTemplateRenderer(
+            config.llm_user_config.prompt_config
+        ),
         user_response_style=config.llm_user_config.user_response_style,
     )

@@ -127,7 +156,9 @@ def main(config: TestConfig):

     results_list = []

-    knowledge_base_output_folder = Path(config.output_dir) / "knowledge_base_metrics"
+    knowledge_base_output_folder = (
+        Path(config.output_dir) / "knowledge_base_metrics"
+    )
     knowledge_base_output_folder.mkdir(exist_ok=True, parents=True)
     detailed_rag_output_file = (
         knowledge_base_output_folder / "knowledge_base_detailed_metrics.json"
@@ -143,7 +174,9 @@ def main(config: TestConfig):
         [
             os.path.basename(f).replace(".messages", "")
             for f in glob.glob(
-                os.path.join(config.output_dir, "messages", "*.messages.json")
+                os.path.join(
+                    config.output_dir, "messages", "*.messages.json"
+                )
             )
         ]
     )
@@ -153,7 +186,7 @@ def main(config: TestConfig):
         if os.path.isdir(test_path):
             test_path = os.path.join(test_path, "*.json")
         test_cases.extend(sorted(glob.glob(test_path)))
-
+
     futures = []
     task_n = 0
     for test_case in test_cases:
@@ -161,7 +194,9 @@ def main(config: TestConfig):
             continue
         if config.skip_available_results:
             if test_case in available_res:
-                print(f"Skipping test case {test_case} as results already exist.")
+                print(
+                    f"Skipping test case {test_case} as results already exist."
+                )
                 continue

         future = executor.submit(
@@ -180,7 +215,8 @@ def main(config: TestConfig):
     if futures:
         with Progress() as progress:
             task1 = progress.add_task(
-                f"[purple]Evaluating {len(futures)} tasks...", total=len(futures)
+                f"[purple]Evaluating {len(futures)} tasks...",
+                total=len(futures),
             )
             for test_case, future in futures:
                 try:
@@ -200,27 +236,55 @@ def main(config: TestConfig):
     SummaryPanel(rag_metric_summary).print()

     with open(detailed_rag_output_file, "w+", encoding="utf-8") as f:
-        json.dump(rag_metric_summary.model_dump(by_alias=True)["detailed"], f, indent=4)
+        json.dump(
+            rag_metric_summary.model_dump(by_alias=True)["detailed"],
+            f,
+            indent=4,
+        )

     with open(summary_rag_output_file, "w+", encoding="utf-8") as f:
-        json.dump(rag_metric_summary.model_dump(by_alias=True)["summary"], f, indent=4)
+        json.dump(
+            rag_metric_summary.model_dump(by_alias=True)["summary"], f, indent=4
+        )

     if len(tool_call_metrics) > 0:
         # remove the average row if exist
         tool_call_metrics = [
-            row for row in tool_call_metrics if row.dataset_name != "Summary (Average)"
+            row
+            for row in tool_call_metrics
+            if row.dataset_name != "Summary (Average)"
         ]

-        def filter_display_only_values(tool_call_metric: ToolCallAndRoutingMetrics):
-            row = {"Dataset": tool_call_metric.dataset_name, "Total Steps": tool_call_metric.total_steps,
-                   "LLM Steps": tool_call_metric.llm_step, "Total Tool Calls": tool_call_metric.total_tool_calls, "Tool Call Precision": tool_call_metric.tool_call_precision, "Tool Call Recall": tool_call_metric.tool_call_recall,
-                   "Agent Routing Accuracy": tool_call_metric.agent_routing_accuracy, "Text Match": tool_call_metric.text_match, "Journey Success": tool_call_metric.is_success, "Avg Resp Time (sec)": tool_call_metric.avg_resp_time}
+        def filter_display_only_values(
+            tool_call_metric: ToolCallAndRoutingMetrics,
+        ):
+            row = {
+                "Dataset": tool_call_metric.dataset_name,
+                "Total Steps": tool_call_metric.total_steps,
+                "LLM Steps": tool_call_metric.llm_step,
+                "Total Tool Calls": tool_call_metric.total_tool_calls,
+                "Tool Call Precision": tool_call_metric.tool_call_precision,
+                "Tool Call Recall": tool_call_metric.tool_call_recall,
+                "Agent Routing Accuracy": tool_call_metric.agent_routing_accuracy,
+                "Text Match": tool_call_metric.text_match,
+                "Journey Success": tool_call_metric.is_success,
+                "Avg Resp Time (sec)": tool_call_metric.avg_resp_time,
+            }
             return row

         def create_avg_row(metrics: List[dict]):
-            avg_row = {"Dataset": "Summary (Average)", "Total Steps": 0,
-                       "LLM Steps": 0, "Total Tool Calls": 0, "Tool Call Precision": 0, "Tool Call Recall": 0, "Agent Routing Accuracy": 0,
-                       "Text Match": 0, "Journey Success": 0, "Avg Resp Time (sec)": 0}
+            avg_row = {
+                "Dataset": "Summary (Average)",
+                "Total Steps": 0,
+                "LLM Steps": 0,
+                "Total Tool Calls": 0,
+                "Tool Call Precision": 0,
+                "Tool Call Recall": 0,
+                "Agent Routing Accuracy": 0,
+                "Text Match": 0,
+                "Journey Success": 0,
+                "Avg Resp Time (sec)": 0,
+            }
             if metrics:
                 for row in metrics:
                     avg_row["Total Steps"] += row["Total Steps"]
@@ -228,33 +292,77 @@ def main(config: TestConfig):
                     avg_row["Total Tool Calls"] += row["Total Tool Calls"]
                     avg_row["Tool Call Precision"] += row["Tool Call Precision"]
                     avg_row["Tool Call Recall"] += row["Tool Call Recall"]
-                    avg_row["Agent Routing Accuracy"] += row["Agent Routing Accuracy"]
-                    avg_row["Text Match"] += row["Text Match"] == TextMatchType.text_match.value
+                    avg_row["Agent Routing Accuracy"] += row[
+                        "Agent Routing Accuracy"
+                    ]
+                    avg_row["Text Match"] += (
+                        row["Text Match"] == TextMatchType.text_match.value
+                    )
                     avg_row["Journey Success"] += row["Journey Success"]
                     avg_row["Avg Resp Time (sec)"] += row["Avg Resp Time (sec)"]

-                avg_row["Total Steps"] = round(safe_divide(avg_row["Total Steps"], len(metrics)), 2)
-                avg_row["LLM Steps"] = round(safe_divide(avg_row["LLM Steps"], len(metrics)), 2)
-                avg_row["Total Tool Calls"] = round(safe_divide(avg_row["Total Tool Calls"], len(metrics)), 2)
-                avg_row["Tool Call Precision"] = round(safe_divide(avg_row["Tool Call Precision"], len(metrics)), 2)
-                avg_row["Tool Call Recall"] = round(safe_divide(avg_row["Tool Call Recall"], len(metrics)), 2)
-                avg_row["Agent Routing Accuracy"] = round(safe_divide(avg_row["Agent Routing Accuracy"], len(metrics)), 2)
-                avg_row["Text Match"] = round(safe_divide(avg_row["Text Match"], len([row for row in metrics if row["Text Match"] != TextMatchType.text_match.na])), 2)
-                avg_row["Journey Success"] = round(safe_divide(avg_row["Journey Success"], len(metrics)), 2)
-                avg_row["Avg Resp Time (sec)"] = round(safe_divide(avg_row["Avg Resp Time (sec)"], len(metrics)), 2)
+                avg_row["Total Steps"] = round(
+                    safe_divide(avg_row["Total Steps"], len(metrics)), 2
+                )
+                avg_row["LLM Steps"] = round(
+                    safe_divide(avg_row["LLM Steps"], len(metrics)), 2
+                )
+                avg_row["Total Tool Calls"] = round(
+                    safe_divide(avg_row["Total Tool Calls"], len(metrics)), 2
+                )
+                avg_row["Tool Call Precision"] = round(
+                    safe_divide(avg_row["Tool Call Precision"], len(metrics)), 2
+                )
+                avg_row["Tool Call Recall"] = round(
+                    safe_divide(avg_row["Tool Call Recall"], len(metrics)), 2
+                )
+                avg_row["Agent Routing Accuracy"] = round(
+                    safe_divide(
+                        avg_row["Agent Routing Accuracy"], len(metrics)
+                    ),
+                    2,
+                )
+                avg_row["Text Match"] = round(
+                    safe_divide(
+                        avg_row["Text Match"],
+                        len(
+                            [
+                                row
+                                for row in metrics
+                                if row["Text Match"]
+                                != TextMatchType.text_match.na
+                            ]
+                        ),
+                    ),
+                    2,
+                )
+                avg_row["Journey Success"] = round(
+                    safe_divide(avg_row["Journey Success"], len(metrics)), 2
+                )
+                avg_row["Avg Resp Time (sec)"] = round(
+                    safe_divide(avg_row["Avg Resp Time (sec)"], len(metrics)), 2
+                )
             return avg_row

         tool_call_metrics_for_display = []
         for row in tool_call_metrics:
-            tool_call_metrics_for_display.append(filter_display_only_values(row))
-        tool_call_metrics_for_display.append(create_avg_row(tool_call_metrics_for_display))
-        tool_call_table_for_display = create_table(tool_call_metrics_for_display)
-
+            tool_call_metrics_for_display.append(
+                filter_display_only_values(row)
+            )
+        tool_call_metrics_for_display.append(
+            create_avg_row(tool_call_metrics_for_display)
+        )
+        tool_call_table_for_display = create_table(
+            tool_call_metrics_for_display
+        )
+
         if tool_call_table_for_display:
             tool_call_table_for_display.print()

         if len(tool_call_metrics) > 0:
-            tool_call_metrics = [metric.model_dump() for metric in tool_call_metrics]
+            tool_call_metrics = [
+                metric.model_dump() for metric in tool_call_metrics
+            ]
             output_file = os.path.join(config.output_dir, "summary_metrics.csv")
             header = list(tool_call_metrics[0].keys())
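For reference, create_avg_row above averages each column with safe_divide and rounds to two decimals, dividing "Text Match" only by the rows that are not marked as not-applicable. Below is a minimal stand-in sketch of that pattern; the safe_divide shown here is an assumption (returning 0.0 on a zero denominator, consistent with how the helper is called), not the packaged wxo_agentic_evaluation.utils.utils.safe_divide.

# Hedged stand-in mirroring the averaging pattern in create_avg_row above.
# safe_divide is assumed to return 0.0 when the denominator is zero.
def safe_divide(numerator, denominator):
    return numerator / denominator if denominator else 0.0

rows = [
    {"Tool Call Precision": 1.0, "Journey Success": 1},
    {"Tool Call Precision": 0.5, "Journey Success": 0},
]
avg_precision = round(
    safe_divide(sum(r["Tool Call Precision"] for r in rows), len(rows)), 2
)
avg_success = round(
    safe_divide(sum(r["Journey Success"] for r in rows), len(rows)), 2
)
print(avg_precision, avg_success)  # 0.75 0.5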
wxo_agentic_evaluation/metrics/llm_as_judge.py

@@ -1,8 +1,8 @@
-from pydantic import BaseModel, computed_field
-
 from abc import abstractmethod
 from functools import cached_property

+from pydantic import BaseModel, computed_field
+

 class BaseLLMJudgeMetric(BaseModel):
     @abstractmethod
wxo_agentic_evaluation/metrics/metrics.py

@@ -1,10 +1,13 @@
 import math
-from typing import List, Mapping, Any, Tuple, Optional
 from enum import Enum
+from typing import Any, List, Mapping, Optional, Tuple

 from pydantic import BaseModel, computed_field

-from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness, AnswerRelevancy
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerRelevancy,
+    Faithfulness,
+)
 from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore


@@ -13,7 +16,7 @@ def average(array):
         return math.nan

     else:
-        return sum(array)/len(array)
+        return sum(array) / len(array)


 class KnowledgeBaseMetrics(BaseModel):
@@ -54,7 +57,9 @@ class KnowledgeBaseMetricSummary(BaseModel):
                 }
             else:
                 values = groupby[name]
-                values.get("knowledge_base_name").append(knowledge_base_name)
+                values.get("knowledge_base_name").append(
+                    knowledge_base_name
+                )
                 values.get("faithfulness").append(faithfulness)
                 values.get("answer_relevancy").append(answer_relevancy)
                 values.get("confidence_scores").append(confidence_scores)
@@ -109,6 +114,7 @@ class KeywordSemanticSearchMetric(BaseModel):
     message: str
     goal_detail: str

+
 class TextMatchType(Enum):
     text_match = "Summary Matched"
     text_mismatch = "Summary MisMatched"
@@ -117,12 +123,14 @@ class TextMatchType(Enum):

 class ToolCallAndRoutingMetrics(BaseModel):
     dataset_name: str = ""
-    total_steps: int=0
-    llm_step: int =0
+    total_steps: int = 0
+    llm_step: int = 0
     total_tool_calls: int = 0
     expected_tool_calls: int = 0
     correct_tool_calls: int = 0
-    relevant_tool_calls: int = 0  # calls with the same function but different args
+    relevant_tool_calls: int = (
+        0  # calls with the same function but different args
+    )
     total_routing_calls: int = 0
     relevant_routing_calls: int = 0
     tool_calls_with_incorrect_parameter: int = 0
@@ -135,7 +143,7 @@ class ToolCallAndRoutingMetrics(BaseModel):
     def tool_call_recall(self) -> float:
         return round(
             (
-                self.correct_tool_calls/self.expected_tool_calls
+                self.correct_tool_calls / self.expected_tool_calls
                 if self.expected_tool_calls > 0
                 else 0.0
             ),
@@ -147,8 +155,7 @@ class ToolCallAndRoutingMetrics(BaseModel):
     def tool_call_precision(self) -> float:
         return round(
             (
-                (self.correct_tool_calls)
-                / self.total_tool_calls
+                (self.correct_tool_calls) / self.total_tool_calls
                 if self.total_tool_calls > 0
                 else 0.0
             ),
@@ -167,11 +174,13 @@ class ToolCallAndRoutingMetrics(BaseModel):
             2,
         )

+
 class FailedStaticTestCases(BaseModel):
     metric_name: str
     description: str
     explanation: str

+
 class FailedSemanticTestCases(BaseModel):
     metric_name: str
     evidence: str
@@ -179,11 +188,16 @@ class FailedSemanticTestCases(BaseModel):
     output: int
     confidence: float

+
 class ReferenceLessEvalMetrics(BaseModel):
     dataset_name: str
     number_of_tool_calls: int
     number_of_successful_tool_calls: int
     number_of_static_failed_tool_calls: int
     number_of_semantic_failed_tool_calls: int
-    failed_static_tool_calls: Optional[List[Tuple[int, List[FailedStaticTestCases]]]]
-    failed_semantic_tool_calls: Optional[List[Tuple[int, List[FailedSemanticTestCases]]]]
+    failed_static_tool_calls: Optional[
+        List[Tuple[int, List[FailedStaticTestCases]]]
+    ]
+    failed_semantic_tool_calls: Optional[
+        List[Tuple[int, List[FailedSemanticTestCases]]]
+    ]
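The computed fields above spell out the two headline metrics: tool_call_recall divides correct_tool_calls by expected_tool_calls, and tool_call_precision divides correct_tool_calls by total_tool_calls, each guarded against a zero denominator and rounded to two decimals. A standalone sketch of the same arithmetic, written as plain functions rather than the pydantic computed fields in the diff:

# Plain-function restatement of the tool_call_recall / tool_call_precision
# formulas shown in the ToolCallAndRoutingMetrics diff above.
def tool_call_recall(correct_tool_calls: int, expected_tool_calls: int) -> float:
    return round(
        correct_tool_calls / expected_tool_calls if expected_tool_calls > 0 else 0.0,
        2,
    )

def tool_call_precision(correct_tool_calls: int, total_tool_calls: int) -> float:
    return round(
        correct_tool_calls / total_tool_calls if total_tool_calls > 0 else 0.0,
        2,
    )

print(tool_call_recall(3, 4))     # 0.75
print(tool_call_precision(3, 5))  # 0.6
print(tool_call_recall(0, 0))     # 0.0, zero-denominator guard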
wxo_agentic_evaluation/prompt/template_render.py

@@ -1,7 +1,10 @@
-import jinja2
 from typing import List
+
+import jinja2
+
 from wxo_agentic_evaluation.type import ToolDefinition

+
 class JinjaTemplateRenderer:
     def __init__(self, template_path: str):
         self._template_env = jinja2.Environment(
@@ -20,7 +23,11 @@ class JinjaTemplateRenderer:

 class LlamaUserTemplateRenderer(JinjaTemplateRenderer):
     def render(
-        self, user_story: str, user_response_style: List, conversation_history: List, attack_instructions: str = None
+        self,
+        user_story: str,
+        user_response_style: List,
+        conversation_history: List,
+        attack_instructions: str = None,
     ) -> str:
         return super().render(
             user_story=user_story,
@@ -32,12 +39,17 @@ class LlamaUserTemplateRenderer(JinjaTemplateRenderer):

 class KeywordMatchingTemplateRenderer(JinjaTemplateRenderer):
     def render(self, keywords_text: str, response_text: str) -> str:
-        return super().render(keywords_text=keywords_text, response_text=response_text)
+        return super().render(
+            keywords_text=keywords_text, response_text=response_text
+        )


 class SemanticMatchingTemplateRenderer(JinjaTemplateRenderer):
     def render(self, expected_text: str, actual_text: str) -> str:
-        return super().render(expected_text=expected_text, actual_text=actual_text)
+        return super().render(
+            expected_text=expected_text, actual_text=actual_text
+        )
+


 class BadToolDescriptionRenderer(JinjaTemplateRenderer):
@@ -51,7 +63,9 @@ class LlamaKeywordsGenerationTemplateRenderer(JinjaTemplateRenderer):

 class FaithfulnessTemplateRenderer(JinjaTemplateRenderer):
     def render(self, claim, retrieval_context):
-        return super().render(claim=claim, supporting_evidence=retrieval_context)
+        return super().render(
+            claim=claim, supporting_evidence=retrieval_context
+        )


 class AnswerRelevancyTemplateRenderer(JinjaTemplateRenderer):
@@ -60,13 +74,16 @@ class AnswerRelevancyTemplateRenderer(JinjaTemplateRenderer):


 class ToolPlannerTemplateRenderer(JinjaTemplateRenderer):
-    def render(self, user_story: str, agent_name: str, available_tools: str) -> str:
+    def render(
+        self, user_story: str, agent_name: str, available_tools: str
+    ) -> str:
         return super().render(
             user_story=user_story,
             agent_name=agent_name,
             available_tools=available_tools,
         )
-
+
+
 class ArgsExtractorTemplateRenderer(JinjaTemplateRenderer):
     def render(self, tool_signature: str, step: dict, inputs: dict) -> str:
         return super().render(
@@ -75,8 +92,9 @@ class ArgsExtractorTemplateRenderer(JinjaTemplateRenderer):
             inputs=inputs,
         )

+
 class ToolChainAgentTemplateRenderer(JinjaTemplateRenderer):
-    def render(self, tool_call_history: List, available_tools:str) -> str:
+    def render(self, tool_call_history: List, available_tools: str) -> str:
         return super().render(
             tool_call_history=tool_call_history,
             available_tools=available_tools,
@@ -102,6 +120,7 @@ class BatchTestCaseGeneratorTemplateRenderer(JinjaTemplateRenderer):
             example_str=example_str,
         )

+
 class StoryGenerationTemplateRenderer(JinjaTemplateRenderer):
     def render(
         self,
@@ -110,7 +129,8 @@ class StoryGenerationTemplateRenderer(JinjaTemplateRenderer):
         return super().render(
             input_data=input_data,
         )
-
+
+
 class OnPolicyAttackGeneratorTemplateRenderer(JinjaTemplateRenderer):
     def render(
         self,
@@ -125,7 +145,8 @@ class OnPolicyAttackGeneratorTemplateRenderer(JinjaTemplateRenderer):
             original_story=original_story,
             original_starting_sentence=original_starting_sentence,
         )
-
+
+
 class OffPolicyAttackGeneratorTemplateRenderer(JinjaTemplateRenderer):
     def render(
         self,
@@ -135,4 +156,4 @@ class OffPolicyAttackGeneratorTemplateRenderer(JinjaTemplateRenderer):
         return super().render(
             original_story=original_story,
             original_starting_sentence=original_starting_sentence,
-        )
+        )
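All of the renderers above share the JinjaTemplateRenderer base, whose constructor takes a template path and whose subclasses only rename keyword arguments before delegating to super().render(...). A hedged usage sketch of that pattern follows; the base-class internals, template file name, and template contents here are assumptions for illustration, since the package ships its own prompt templates and may configure jinja2 differently.

import jinja2

# Minimal stand-in that mirrors the renderer pattern shown in the diff:
# subclasses pass named values through to a Jinja template.
class JinjaTemplateRenderer:
    def __init__(self, template_path: str):
        # Assumption: the real class builds a jinja2.Environment in a similar way.
        self._template_env = jinja2.Environment(
            loader=jinja2.FileSystemLoader(searchpath=".")
        )
        self._template = self._template_env.get_template(template_path)

    def render(self, **kwargs) -> str:
        return self._template.render(**kwargs)

class SemanticMatchingTemplateRenderer(JinjaTemplateRenderer):
    def render(self, expected_text: str, actual_text: str) -> str:
        return super().render(
            expected_text=expected_text, actual_text=actual_text
        )

# Usage with a hypothetical template file "semantic_matching.jinja2" containing,
# for example: "Expected: {{ expected_text }} / Actual: {{ actual_text }}"
renderer = SemanticMatchingTemplateRenderer("semantic_matching.jinja2")
print(renderer.render(expected_text="refund issued", actual_text="refund was issued"))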