ibm-watsonx-orchestrate-evaluation-framework 1.1.5__py3-none-any.whl → 1.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (49)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/METADATA +4 -1
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/RECORD +49 -39
  3. wxo_agentic_evaluation/analyze_run.py +822 -344
  4. wxo_agentic_evaluation/arg_configs.py +39 -2
  5. wxo_agentic_evaluation/data_annotator.py +22 -4
  6. wxo_agentic_evaluation/description_quality_checker.py +29 -4
  7. wxo_agentic_evaluation/evaluation_package.py +197 -18
  8. wxo_agentic_evaluation/external_agent/external_validate.py +3 -1
  9. wxo_agentic_evaluation/external_agent/types.py +1 -1
  10. wxo_agentic_evaluation/inference_backend.py +105 -108
  11. wxo_agentic_evaluation/llm_matching.py +104 -2
  12. wxo_agentic_evaluation/llm_user.py +2 -2
  13. wxo_agentic_evaluation/main.py +147 -38
  14. wxo_agentic_evaluation/metrics/__init__.py +5 -0
  15. wxo_agentic_evaluation/metrics/evaluations.py +124 -0
  16. wxo_agentic_evaluation/metrics/llm_as_judge.py +4 -3
  17. wxo_agentic_evaluation/metrics/metrics.py +64 -1
  18. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  19. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  20. wxo_agentic_evaluation/prompt/template_render.py +20 -2
  21. wxo_agentic_evaluation/quick_eval.py +23 -11
  22. wxo_agentic_evaluation/record_chat.py +18 -10
  23. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +169 -100
  24. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  25. wxo_agentic_evaluation/red_teaming/attack_list.py +78 -8
  26. wxo_agentic_evaluation/red_teaming/attack_runner.py +71 -14
  27. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  28. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  29. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  30. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +103 -39
  31. wxo_agentic_evaluation/resource_map.py +3 -1
  32. wxo_agentic_evaluation/service_instance.py +12 -3
  33. wxo_agentic_evaluation/service_provider/__init__.py +129 -9
  34. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  35. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +415 -17
  36. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  37. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  38. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  39. wxo_agentic_evaluation/service_provider/watsonx_provider.py +480 -52
  40. wxo_agentic_evaluation/type.py +15 -5
  41. wxo_agentic_evaluation/utils/__init__.py +44 -3
  42. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  43. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  44. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  45. wxo_agentic_evaluation/utils/parsers.py +71 -0
  46. wxo_agentic_evaluation/utils/utils.py +140 -20
  47. wxo_agentic_evaluation/wxo_client.py +81 -0
  48. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/WHEEL +0 -0
  49. {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.7.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/main.py

@@ -1,3 +1,4 @@
+import copy
 import csv
 import dataclasses
 import glob
@@ -7,6 +8,7 @@ import re
 import traceback
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
+from dataclasses import asdict
 from datetime import datetime
 from pathlib import Path
 from typing import List
@@ -16,15 +18,16 @@ import yaml
 from jsonargparse import CLI
 from rich.progress import Progress
 
-from wxo_agentic_evaluation.arg_configs import TestConfig
+from wxo_agentic_evaluation.arg_configs import ProviderConfig, TestConfig
 from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
 from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
     WXOInferenceBackend,
-    get_wxo_client,
 )
 from wxo_agentic_evaluation.llm_user import LLMUser
+from wxo_agentic_evaluation.metrics.evaluations import Extractor
 from wxo_agentic_evaluation.metrics.metrics import (
+    CustomEvalMetrics,
     KnowledgeBaseMetricSummary,
     TextMatchType,
     ToolCallAndRoutingMetrics,
@@ -33,46 +36,61 @@ from wxo_agentic_evaluation.prompt.template_render import (
     LlamaUserTemplateRenderer,
 )
 from wxo_agentic_evaluation.resource_map import ResourceMap
-from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.service_provider import (
+    LOGGING_ENABLED,
+    get_provider,
+)
+from wxo_agentic_evaluation.service_provider.provider import Provider
 from wxo_agentic_evaluation.type import EvaluationData
 from wxo_agentic_evaluation.utils import json_dump
+from wxo_agentic_evaluation.utils.evaluation_discovery import (
+    find_evaluation_subclasses,
+)
 from wxo_agentic_evaluation.utils.utils import (
     SummaryPanel,
     create_table,
     safe_divide,
 )
+from wxo_agentic_evaluation.wxo_client import get_wxo_client
 
 
 def process_test_case(
-    task_n,
-    test_case,
-    config,
-    inference_backend,
-    resource_map,
-    llm_user,
+    task_n: int,
+    test_case: str,
+    config: TestConfig,
+    inference_backend: WXOInferenceBackend,
+    resource_map: ResourceMap,
+    llm_user: LLMUser,
+    llmaaj_provider: Provider,
     run_idx: int = 0,
 ):
     summary_results_for_path = []
-    tc_name = os.path.basename(test_case).replace(".json", "")
-    run_tag = f".run{run_idx+1}" if getattr(config, "n_runs", 1) > 1 else ""
+    test_case_name = os.path.basename(test_case).replace(".json", "")
+    run_tag = f".run{run_idx+1}" if config.n_runs > 1 else ""
+
     with open(test_case, "r") as f:
-        test_case: EvaluationData = EvaluationData.model_validate(json.load(f))
+        evaluation_data = EvaluationData.model_validate(json.load(f))
 
     evaluation_controller = EvaluationController(
         wxo_inference_backend=inference_backend,
         llm_user=llm_user,
         config=config,
     )
-    rich.print(f"[bold magenta]Running test case: {tc_name}[/bold magenta]")
+
+    rich.print(
+        f"[bold magenta]Running test case: {test_case_name}[/bold magenta]"
+    )
+
     (
         history,
         call_tracker,
         conversational_search_data,
     ) = evaluation_controller.run(
         task_n,
-        test_case.story,
-        agent_name=test_case.agent,
-        starting_user_input=test_case.starting_sentence,
+        evaluation_data.story,
+        agent_name=evaluation_data.agent,
+        starting_user_input=evaluation_data.starting_sentence,
+        max_user_turns=evaluation_data.max_user_turns,
     )
     result = list()
     for message in history:
@@ -80,13 +98,15 @@ def process_test_case(
 
     json_dump(
         os.path.join(
-            config.output_dir, "messages", tc_name + run_tag + ".messages.json"
+            config.output_dir,
+            "messages",
+            f"{test_case_name}{run_tag}.messages.json",
         ),
         result,
     )
 
     if len(conversational_search_data) > 0:
-        fn = tc_name + run_tag + ".retrieval_context.json"
+        fn = f"{test_case_name}{run_tag}.retrieval_context.json"
         out_folder = Path(config.output_dir) / "knowledge_base_metrics"
         out_folder.mkdir(exist_ok=True)
         rc = [context.model_dump() for context in conversational_search_data]
@@ -96,25 +116,51 @@
     if config.data_annotation_run:
         return summary_results_for_path  # empty result set, skip summary
 
+    # Handle custom extractions
+    all_extractors = []
+    if config.extrators_config.paths is not None:
+        for path in config.extrators_config.paths:
+            extractors = find_evaluation_subclasses(
+                directory=path, base_class_name="Extractor"
+            )
+            for extractor_class in extractors:
+                extractor: Extractor = extractor_class()
+                all_extractors.append(extractor)
+
+    # Handle custom evaluations
+    all_custom_evals = []
+    if config.custom_metrics_config.paths is not None:
+        for path in config.custom_metrics_config.paths:
+            custom_eval_classes = find_evaluation_subclasses(path)
+            for _class in custom_eval_classes:
+                custom_eval = _class(llm_client=llmaaj_provider)
+                all_custom_evals.append(custom_eval)
+
     evaluation_package = EvaluationPackage(
-        test_case_name=tc_name,
+        test_case_name=test_case_name,
         messages=history,
-        ground_truth=test_case,
+        ground_truth=evaluation_data,
         conversational_search_data=conversational_search_data,
         resource_map=resource_map,
+        config=config,
+        custom_evals=all_custom_evals,
+        extractors=all_extractors,
+        similarity_threshold=config.similarity_threshold,
+        enable_fuzzy_matching=config.enable_fuzzy_matching,
     )
     (
         keyword_semantic_matches,
         knowledge_base_metrics,
         messages_with_reason,
         metrics,
+        custom_metrics,
     ) = evaluation_package.generate_summary()
     temp = []
     for message in messages_with_reason:
         temp.append(message.model_dump())
     expected_tools = [
         gd.tool_name
-        for gd in test_case.goal_details
+        for gd in evaluation_data.goal_details
         if getattr(gd, "type", None) == "tool_call"
     ]
 
@@ -157,25 +203,29 @@
         os.path.join(
             config.output_dir,
             "messages",
-            tc_name + run_tag + ".messages.analyze.json",
+            f"{test_case_name}{run_tag}.messages.analyze.json",
         ),
         temp,
     )
 
     json_dump(
         os.path.join(
-            config.output_dir, "messages", tc_name + run_tag + ".metrics.json"
+            config.output_dir,
+            "messages",
+            f"{test_case_name}{run_tag}.metrics.json",
         ),
         metrics.model_dump(),
     )
 
-    metrics.dataset_name = tc_name
+    metrics.dataset_name = test_case_name
     metrics.avg_resp_time = (
         sum(call_tracker.generic) + sum(call_tracker.tool_call)
     ) / (len(call_tracker.generic) + len(call_tracker.tool_call))
     metrics.avg_resp_time = round(metrics.avg_resp_time, 2)
 
-    summary_results_for_path.append((metrics, knowledge_base_metrics))
+    summary_results_for_path.append(
+        (metrics, knowledge_base_metrics, custom_metrics)
+    )
 
     return summary_results_for_path
 
@@ -199,19 +249,49 @@ def main(config: TestConfig):
         config.auth_config.tenant_name,
         config.auth_config.token,
     )
+
     resource_map = ResourceMap(wxo_client)
     inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
+    original_provider_config = config.provider_config
+    provider_config_dict = asdict(original_provider_config)
+
+    provider_kwargs = {
+        "config": ProviderConfig(**provider_config_dict),
+        "model_id": config.llm_user_config.model_id,
+    }
+
+    if provider_config_dict.get("provider", "gateway") == "gateway":
+        provider_kwargs.update(
+            token=config.auth_config.token or wxo_client.api_key,
+            instance_url=wxo_client.service_url,
+        )
+        config.auth_config.token = (
+            config.auth_config.token or wxo_client.api_key
+        )
+        config.auth_config.url = (
+            config.auth_config.url or wxo_client.service_url
+        )
+
     llm_user = LLMUser(
-        wai_client=get_provider(
-            config=config.provider_config,
-            model_id=config.llm_user_config.model_id,
-        ),
+        wai_client=get_provider(**provider_kwargs),
         template=LlamaUserTemplateRenderer(
             config.llm_user_config.prompt_config
         ),
         user_response_style=config.llm_user_config.user_response_style,
     )
 
+    llamaj_provider_kwargs = copy.deepcopy(provider_kwargs)
+    llamaj_config_dict = asdict(llamaj_provider_kwargs["config"])
+
+    llamaj_config_dict["model_id"] = (
+        config.custom_metrics_config.llmaaj_config.model_id
+    )
+    llamaj_config_dict["embedding_model_id"] = (
+        config.custom_metrics_config.llmaaj_config.embedding_model_id
+    )
+    llamaj_provider_kwargs["config"] = ProviderConfig(**llamaj_config_dict)
+    llmaaj_provider = get_provider(**llamaj_provider_kwargs)
+
     print(f"Running evaluation with tenant {config.auth_config.tenant_name}")
 
     results_list = []
@@ -247,7 +327,7 @@
             run_num = int(m.group("run") or 1)  # no suffix ⇒ run 1
             available_runs[stem].add(run_num)
 
-    test_cases = []
+    test_cases: list[str] = []
     for test_path in config.test_paths:
         if os.path.isdir(test_path):
            test_path = os.path.join(test_path, "*.json")
@@ -256,9 +336,11 @@
     futures = []
     task_n = 0
     n_runs = getattr(config, "n_runs", 1)
+
     for test_case in test_cases:
         if not test_case.endswith(".json") or test_case.endswith("agent.json"):
             continue
+
         stem = Path(test_case).stem
 
         for run_idx in range(n_runs):
@@ -272,6 +354,7 @@
                     f"Skipping {stem} run {run_number} as results already exist."
                 )
                 continue
+
             future = executor.submit(
                 process_test_case,
                 task_n,
@@ -280,28 +363,42 @@
                 inference_backend,
                 resource_map,
                 llm_user,
+                llmaaj_provider,
                 run_idx,  # 👈 pass run index
             )
             futures.append(((test_case, run_idx), future))
             task_n += 1
 
     if futures:
-        with Progress() as progress:
-            task1 = progress.add_task(
-                f"[purple]Evaluating {len(futures)} tasks...",
-                total=len(futures),
-            )
+
+        if LOGGING_ENABLED:
+            # No progress bar when logging - just process tasks
             for (test_case, run_idx), future in futures:
                 try:
                     results_list.extend(future.result())
                 except Exception as e:
                     rich.print(f"test case {test_case} fails with {e}")
                     traceback.print_exc()
-                finally:
-                    progress.update(task1, advance=1)
+        else:
+            with Progress() as progress:
+                task1 = progress.add_task(
+                    f"[purple]Evaluating {len(futures)} tasks...",
+                    total=len(futures),
+                )
+                for (test_case, run_idx), future in futures:
+                    try:
+                        results_list.extend(future.result())
+                    except Exception as e:
+                        rich.print(f"test case {test_case} fails with {e}")
+                        traceback.print_exc()
+                    finally:
+                        progress.update(task1, advance=1)
 
     tool_call_metrics = [metric[0] for metric in results_list]
     knowledge_base_metrics = [metric[1] for metric in results_list]
+    custom_metrics: List[CustomEvalMetrics] = [
+        metric[2] for metric in results_list
+    ]
 
     rag_metric_summary = KnowledgeBaseMetricSummary(
         knowledge_base_metrics=knowledge_base_metrics
@@ -502,11 +599,23 @@
     output_file = os.path.join(config.output_dir, "summary_metrics.csv")
     header = list(tool_call_metrics[0].keys())
 
-    with open(output_file, "w") as file:
+    with open(output_file, "w", newline="") as file:
         csv_writer = csv.writer(file)
         csv_writer.writerow(header)
         for entry in tool_call_metrics:
             csv_writer.writerow([entry[name] for name in header])
+    # Check if any custom metrics have been calculated
+    if any([m.custom_metrics for m in custom_metrics]):
+        custom_metrics_display_data = []
+        for metric in custom_metrics:
+            row = {}
+            row["dataset_name"] = metric.dataset_name
+            for metric in metric.custom_metrics:
+                row[metric.eval_name] = metric.value
+            custom_metrics_display_data.append(row)
+        create_table(
+            custom_metrics_display_data, title="Custom Metrics"
+        ).print()
 
     with open(
         os.path.join(config.output_dir, "config.yml"), "w", encoding="utf-8"
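
Note: the custom-metric hooks above are driven entirely by drop-in classes. process_test_case scans the directories listed under config.custom_metrics_config.paths and config.extrators_config.paths with find_evaluation_subclasses, instantiates each class it finds (passing llmaaj_provider as llm_client), and hands the instances to EvaluationPackage. Below is a minimal sketch of what such a drop-in evaluation module might look like; the file name, class name, and the Message "role" attribute are illustrative assumptions, and the Evaluation and Metric base classes it relies on appear later in this diff.

# Hypothetical drop-in module, e.g. my_evals/turn_count.py, placed in a
# directory listed under config.custom_metrics_config.paths.
from typing import Any, Dict, Optional

from wxo_agentic_evaluation.metrics.evaluations import Evaluation
from wxo_agentic_evaluation.metrics.metrics import Metric
from wxo_agentic_evaluation.type import EvaluationData, Message


class AssistantTurnCount(Evaluation):
    """Counts assistant turns; needs no LLM client, so the injected one is unused."""

    @property
    def name(self) -> str:
        return "assistant_turn_count"

    def evaluate(
        self,
        messages: list[Message],
        ground_truth: EvaluationData,
        extracted_context: Dict[str, Any],
    ) -> Optional[Metric]:
        # Message is assumed to expose a `role` field; hedge with getattr.
        n_turns = sum(
            1 for m in messages if getattr(m, "role", None) == "assistant"
        )
        return Metric(eval_name=self.name, value=n_turns)

With such a module in place, main.py would surface the returned value as an "assistant_turn_count" column in the Custom Metrics table written alongside summary_metrics.csv.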
wxo_agentic_evaluation/metrics/__init__.py

@@ -0,0 +1,5 @@
+from wxo_agentic_evaluation.metrics.metrics import (
+    Annotation,
+    FailedSemanticTestCases,
+    FailedStaticTestCases,
+)
wxo_agentic_evaluation/metrics/evaluations.py

@@ -0,0 +1,124 @@
+import os
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional
+
+from wxo_agentic_evaluation.metrics.metrics import Metric
+from wxo_agentic_evaluation.prompt.template_render import LLMaaJTemplateRenderer
+from wxo_agentic_evaluation.service_provider.provider import Provider
+from wxo_agentic_evaluation.type import EvaluationData, Message
+from wxo_agentic_evaluation.utils.messages_parser import ParsedMessages
+
+root_dir: str = os.path.dirname(os.path.dirname(__file__))
+LLMAAJ_PROMPT_PATH = os.path.join(root_dir, "prompt", "llmaaj_prompt.jinja2")
+
+
+class Extractor(ABC):
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Unique name for the extractor."""
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def extract(
+        messages: list[Message],
+        **kwargs,
+    ) -> Any:
+        """Extract data from messages."""
+        raise NotImplementedError
+
+
+class Evaluation(ABC):
+    """Abstract base class for all evaluations."""
+
+    def __init__(self, llm_client: Optional[Provider] = None) -> None:
+        self._llm_client = llm_client
+
+    @property
+    def llm_client(self) -> Any:
+        """Access client, require it if used."""
+        if self._llm_client is None:
+            raise RuntimeError(
+                f"{self.__class__.__name__} requires a client, but none was provided"
+            )
+        return self._llm_client
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Unique name for the evaluator."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def evaluate(
+        self,
+        messages: list[Message],
+        ground_truth: EvaluationData,
+        extracted_context: Dict[str, Any],
+    ) -> Optional[Metric]:
+        """
+        Evaluation method.
+
+        Args:
+            messages: agent and user conversational messages (includes tool calls)
+            ground_truth: ground truth data
+            extracted_context: dictionary containing data derived from the messages
+
+        Returns:
+            Metic
+        """
+        raise NotImplementedError
+
+
+class LLMaaJEvaluation(Evaluation, ABC):
+    """Evaluation metric for LLMaaJ."""
+
+    @property
+    @abstractmethod
+    def llmaaj_instructions(self) -> str:
+        """LLMaaJ instructions for the evaluator."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def format_llm_output(self, string: str) -> int | float | bool | str:
+        """Format the output of the LLMaaJ query."""
+        raise NotImplementedError
+
+    @property
+    def selected_context_keys(self) -> set[str]:
+        """Override to implement context keys to pass to the prompt."""
+        return set()
+
+    def select_context(
+        self, extracted_context: Dict[str, Any]
+    ) -> dict[str, Any]:
+        """Additional context to be added to the prompt."""
+        selected_context = {
+            key: value
+            for key, value in extracted_context.items()
+            if key in self.selected_context_keys
+        }
+
+        return selected_context
+
+    def evaluate(
+        self,
+        messages: list[Message],
+        ground_truth: EvaluationData,
+        extracted_context: Dict[str, Any],
+    ) -> Optional[Metric]:
+        renderer = LLMaaJTemplateRenderer(LLMAAJ_PROMPT_PATH)
+        parsed = ParsedMessages(messages=messages)
+        if parsed.user_input is None or parsed.agent_response is None:
+            return None
+        context = str(self.select_context(extracted_context))
+        prompt = renderer.render(
+            user_input=parsed.user_input,
+            agent_answer=parsed.agent_response,
+            llmaaj_instructions=self.llmaaj_instructions,
+            context=context,
+        )
+        score_str = self.llm_client.query(prompt)
+        value = self.format_llm_output(score_str)
+        return Metric(eval_name=self.name, value=value)
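
To make the contract concrete, here is a hedged sketch of an LLMaaJEvaluation subclass. Only name, llmaaj_instructions, and format_llm_output are required by the base class above; selected_context_keys is optional and controls which extractor outputs reach the {{context}} slot of llmaaj_prompt.jinja2 (shown at the end of this diff). The class name, instructions, and context key below are hypothetical.

from wxo_agentic_evaluation.metrics.evaluations import LLMaaJEvaluation


class PolitenessJudge(LLMaaJEvaluation):  # hypothetical example
    @property
    def name(self) -> str:
        return "politeness"

    @property
    def llmaaj_instructions(self) -> str:
        return (
            "Rate how polite the agent's answer is on a scale of 1 to 5. "
            "Respond with the number only."
        )

    @property
    def selected_context_keys(self) -> set[str]:
        # Assumes an extractor registered under the name "tool_calls";
        # purely illustrative.
        return {"tool_calls"}

    def format_llm_output(self, string: str) -> int | float | bool | str:
        # The judge is asked for a bare number; fall back to the raw text
        # if parsing fails.
        try:
            return int(string.strip())
        except ValueError:
            return string.strip()

The inherited evaluate() then renders the prompt, queries the injected provider, and wraps the parsed score in a Metric.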
wxo_agentic_evaluation/metrics/llm_as_judge.py

@@ -53,8 +53,9 @@ class AnswerDerailment(BaseLLMJudgeMetric):
 
     def table(self):
         return {
-            "statement": ",".join(self.statement),
+            "statement": self.statement,
             "reason": self.reason,
+            "on_topic_score": str(self.in_scope),
         }
 
 
@@ -65,7 +66,7 @@ class AnswerUnsafeTopic(BaseLLMJudgeMetric):
 
     def table(self):
         return {
-            "statement": ",".join(self.statement),
+            "statement": self.statement,
             "reason": self.reason,
-            "unsafe_topic_score": str(self.is_safe),
+            "safe_topic_score": str(self.is_safe),
         }
wxo_agentic_evaluation/metrics/metrics.py

@@ -1,8 +1,9 @@
 import math
-from enum import Enum
+from enum import Enum, StrEnum
 from typing import Any, List, Mapping, Optional, Tuple
 
 from pydantic import BaseModel, computed_field
+from pydantic.fields import Field
 
 from wxo_agentic_evaluation.metrics.llm_as_judge import (
     AnswerRelevancy,
@@ -19,6 +20,36 @@ def average(array):
     return sum(array) / len(array)
 
 
+class DescriptionQuality(StrEnum):
+    GOOD = "GOOD"
+    BAD = "BAD"
+    MISSING = "MISSING"
+
+
+class DescriptionQualityMetric(BaseModel):
+    tool_name: str = None
+    description_score: float | None = None
+    threshold: float | None = None
+
+    @computed_field
+    @property
+    def is_bad_description(self) -> Optional[bool]:
+        if self.description_score and self.threshold:
+            return self.description_score >= self.threshold
+
+        return None
+
+    @computed_field
+    @property
+    def description_quality(self) -> str:
+        if self.description_score is None:
+            return DescriptionQuality.MISSING
+        elif self.is_bad_description:
+            return DescriptionQuality.BAD
+        else:
+            return DescriptionQuality.GOOD
+
+
 class KnowledgeBaseMetrics(BaseModel):
     dataset_name: str = None
     knowledge_base_name: str = (
@@ -175,6 +206,13 @@ class ToolCallAndRoutingMetrics(BaseModel):
     )
 
 
+class Annotation(BaseModel):
+    recommendation: str
+    details: str
+    quote: str
+    parameter_name: Optional[str]
+
+
 class FailedStaticTestCases(BaseModel):
     metric_name: str
     description: str
@@ -187,6 +225,15 @@ class FailedSemanticTestCases(BaseModel):
     explanation: str
     output: int
     confidence: float
+    annotations: Optional[List[Annotation]] = None
+
+
+class EnhancedAnalyzeMetrics(BaseModel):
+    test_case_name: str
+    tool_names: List[str]
+    parameter_annotations: List[List[FailedSemanticTestCases]] = [[]]
+    tool_annotations: List[List[FailedSemanticTestCases]] = [[]]
+    static_metrics: List[List[FailedStaticTestCases]] = [[]]
 
 
 class ReferenceLessEvalMetrics(BaseModel):
@@ -201,3 +248,19 @@ class ReferenceLessEvalMetrics(BaseModel):
     failed_semantic_tool_calls: Optional[
         List[Tuple[int, List[FailedSemanticTestCases]]]
     ]
+
+
+class Metric(BaseModel):
+    """Generic metric result."""
+
+    eval_name: str = Field(description="name of eval that produce metric")
+    value: int | float | bool | str = Field(description="metric value")
+    metadata: Optional[dict] = Field(
+        default=None,
+        description="metadata that was generated along side the metric. example: llmaaj reason, retrieval score",
+    )
+
+
+class CustomEvalMetrics(BaseModel):
+    dataset_name: str
+    custom_metrics: list[Metric]
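
A short sketch of the new result models in use, mirroring how main.py builds its Custom Metrics table; the eval names and values below are made up.

from wxo_agentic_evaluation.metrics.metrics import CustomEvalMetrics, Metric

per_test_case = CustomEvalMetrics(
    dataset_name="example_test_case",  # hypothetical test case name
    custom_metrics=[
        Metric(eval_name="politeness", value=4, metadata={"reason": "..."}),
        Metric(eval_name="assistant_turn_count", value=3),
    ],
)

# One table row per test case, one column per eval, as in main.py:
row = {"dataset_name": per_test_case.dataset_name}
for metric in per_test_case.custom_metrics:
    row[metric.eval_name] = metric.value
# row == {"dataset_name": "example_test_case", "politeness": 4,
#         "assistant_turn_count": 3}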
wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2

@@ -0,0 +1,15 @@
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+
+{{llmaaj_instructions}}
+
+<|start_header_id|>user<|end_header_id|>
+
+User question: {{user_input}}
+
+Answer: {{agent_answer}}
+
+Additional Conversationl Context: {{context}}
+
+<|eot_id|>
+
+<|start_header_id|>assistant<|end_header_id|>
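
For reference, this template is what LLMaaJEvaluation.evaluate renders before querying the judge model. A minimal sketch of that call, with made-up conversation text:

from wxo_agentic_evaluation.prompt.template_render import LLMaaJTemplateRenderer

renderer = LLMaaJTemplateRenderer(
    "wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2"
)
prompt = renderer.render(
    user_input="What is my PTO balance?",        # hypothetical user turn
    agent_answer="You have 12 days remaining.",  # hypothetical agent answer
    llmaaj_instructions="Answer 1 if the response is grounded, else 0.",
    context="{}",
)
# `prompt` is the filled-in Llama-style chat template above, ready to be
# passed to the provider's query() method.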