ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
  4. wxo_agentic_evaluation/analyze_run.py +1025 -220
  5. wxo_agentic_evaluation/annotate.py +2 -2
  6. wxo_agentic_evaluation/arg_configs.py +60 -2
  7. wxo_agentic_evaluation/base_user.py +25 -0
  8. wxo_agentic_evaluation/batch_annotate.py +19 -2
  9. wxo_agentic_evaluation/clients.py +103 -0
  10. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  11. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  12. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  13. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  14. wxo_agentic_evaluation/data_annotator.py +25 -7
  15. wxo_agentic_evaluation/description_quality_checker.py +29 -6
  16. wxo_agentic_evaluation/evaluation.py +16 -8
  17. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  18. wxo_agentic_evaluation/evaluation_package.py +414 -69
  19. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  20. wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
  21. wxo_agentic_evaluation/external_agent/types.py +3 -9
  22. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  23. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  24. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  25. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  26. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  27. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  28. wxo_agentic_evaluation/llm_matching.py +104 -2
  29. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  30. wxo_agentic_evaluation/llm_user.py +5 -4
  31. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  32. wxo_agentic_evaluation/main.py +112 -343
  33. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  34. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  35. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  36. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  37. wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
  38. wxo_agentic_evaluation/metrics/metrics.py +276 -8
  39. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  40. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  41. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  42. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  43. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  44. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  45. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  46. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  47. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  48. wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
  49. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
  50. wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
  51. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  52. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  53. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  54. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  55. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  56. wxo_agentic_evaluation/prompt/template_render.py +103 -4
  57. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  58. wxo_agentic_evaluation/quick_eval.py +33 -17
  59. wxo_agentic_evaluation/record_chat.py +38 -32
  60. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
  61. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  62. wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
  63. wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
  64. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  65. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  66. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  67. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
  68. wxo_agentic_evaluation/resource_map.py +3 -1
  69. wxo_agentic_evaluation/runner.py +329 -0
  70. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  71. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  72. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
  73. wxo_agentic_evaluation/scheduler.py +247 -0
  74. wxo_agentic_evaluation/service_instance.py +26 -17
  75. wxo_agentic_evaluation/service_provider/__init__.py +145 -9
  76. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  77. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
  78. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  79. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  80. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  81. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  82. wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
  83. wxo_agentic_evaluation/simluation_runner.py +125 -0
  84. wxo_agentic_evaluation/test_prompt.py +4 -4
  85. wxo_agentic_evaluation/type.py +185 -16
  86. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  87. wxo_agentic_evaluation/utils/__init__.py +44 -3
  88. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  89. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  90. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  91. wxo_agentic_evaluation/utils/parsers.py +71 -0
  92. wxo_agentic_evaluation/utils/utils.py +313 -9
  93. wxo_agentic_evaluation/wxo_client.py +81 -0
  94. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
  95. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  96. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  97. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,27 @@
1
1
  import json
2
2
  import os
3
- from typing import List
3
+ from collections import defaultdict
4
+ from typing import Any, Dict, List, Optional
4
5
 
5
6
  import rich
7
+ from dateutil import parser
6
8
 
7
9
  from wxo_agentic_evaluation import __file__
8
10
  from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
9
11
  from wxo_agentic_evaluation.llm_matching import LLMMatcher
10
12
  from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
13
+ from wxo_agentic_evaluation.llm_safety_eval import LLMSafetyJudge
14
+ from wxo_agentic_evaluation.metrics.evaluations import (
15
+ Evaluation,
16
+ Metric,
17
+ )
18
+ from wxo_agentic_evaluation.extractors.extractor_base import Extractor
19
+ from wxo_agentic_evaluation.metrics.llm_as_judge import (
20
+ AnswerDerailment,
21
+ AnswerUnsafeTopic,
22
+ )
11
23
  from wxo_agentic_evaluation.metrics.metrics import (
24
+ CustomEvalMetrics,
12
25
  KeywordSemanticSearchMetric,
13
26
  KnowledgeBaseMetrics,
14
27
  TextMatchType,
@@ -16,19 +29,27 @@ from wxo_agentic_evaluation.metrics.metrics import (
16
29
  )
17
30
  from wxo_agentic_evaluation.prompt.template_render import (
18
31
  AnswerRelevancyTemplateRenderer,
32
+ DerailmentTemplateRenderer,
19
33
  FaithfulnessTemplateRenderer,
20
34
  KeywordMatchingTemplateRenderer,
21
35
  SemanticMatchingTemplateRenderer,
36
+ UnsafeTopicTemplateRenderer,
22
37
  )
23
38
  from wxo_agentic_evaluation.resource_map import ResourceMap
24
- from wxo_agentic_evaluation.service_provider import get_provider
39
+ from wxo_agentic_evaluation.service_instance import tenant_setup
40
+ from wxo_agentic_evaluation.service_provider import (
41
+ USE_GATEWAY_MODEL_PROVIDER,
42
+ get_provider,
43
+ )
44
+ from wxo_agentic_evaluation.service_provider.provider import Provider
25
45
  from wxo_agentic_evaluation.type import (
26
46
  ContentType,
27
47
  ConversationalSearch,
28
- EvaluationData,
29
48
  EventTypes,
30
49
  ExtendedMessage,
50
+ MatchingStrategy,
31
51
  Message,
52
+ OrchestrateDataset,
32
53
  )
33
54
 
34
55
  root_dir = os.path.dirname(__file__)
@@ -49,6 +70,14 @@ RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS = os.getenv(
49
70
  "RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS", "<IGNORE>"
50
71
  )
51
72
 
73
+ DERAILMENT_PROMPT_PATH = os.path.join(
74
+ root_dir, "prompt", "derailment_prompt.jinja2"
75
+ )
76
+
77
+ UNSAFE_TOPIC_PROMPT_PATH = os.path.join(
78
+ root_dir, "prompt", "unsafe_topic_prompt.jinja2"
79
+ )
80
+
52
81
  """
53
82
  - hyphens are not allowed in python function names, so it is safe to use as a dummy function name
54
83
  - purpose behind `DUMMY_GRAPH_NODE_NAME` is to append
@@ -61,33 +90,91 @@ DUMMY_GRAPH_NODE_NAME = "dummy-goal"
61
90
  class EvaluationPackage:
62
91
  def __init__(
63
92
  self,
64
- test_case_name,
65
- ground_truth,
66
- messages,
93
+ test_case_name: str,
94
+ ground_truth: OrchestrateDataset,
95
+ messages: list[Message],
67
96
  conversational_search_data: List[ConversationalSearch] = None,
68
97
  resource_map: ResourceMap = None,
69
98
  is_attack_evaluation: bool = False,
99
+ config=None,
100
+ custom_evals: Optional[list[Evaluation]] = None,
101
+ custom_llmaaj_client: Optional[Provider] = None,
102
+ extractors: Optional[list[Extractor]] = None,
103
+ similarity_threshold=0.8,
104
+ enable_fuzzy_matching=False,
105
+ strict_topological_matching=True,
70
106
  ):
71
- self.tool_dictionary = {
72
- goal_detail.name: goal_detail
73
- for goal_detail in ground_truth.goal_details
74
- if goal_detail.type == ContentType.tool_call
75
- }
76
- self.text_list = [
77
- goal_detail
78
- for goal_detail in ground_truth.goal_details
79
- if goal_detail.type == ContentType.text
80
- ]
81
- self.messages = messages
107
+ self.tool_dictionary = (
108
+ {
109
+ goal_detail.name: goal_detail
110
+ for goal_detail in ground_truth.goal_details
111
+ if goal_detail.type == ContentType.tool_call
112
+ }
113
+ if ground_truth.goal_details
114
+ else {}
115
+ )
116
+
117
+ self.text_list = (
118
+ [
119
+ goal_detail
120
+ for goal_detail in ground_truth.goal_details
121
+ if goal_detail.type == ContentType.text
122
+ ]
123
+ if ground_truth.goal_details
124
+ else []
125
+ )
126
+
127
+ self.messages: List[Message] = messages
82
128
  self.conversational_search_data = conversational_search_data
83
129
  self.is_attack_evaluation = is_attack_evaluation
84
130
  self.ground_truth = ground_truth
85
131
  self.test_case_name = test_case_name
86
132
  self.resource_map = resource_map
133
+ self.custom_evals = custom_evals
134
+ self.custom_llmaaj_client = custom_llmaaj_client
135
+ self.extractors = extractors
136
+ self.enable_fuzzy_matching = enable_fuzzy_matching
137
+ self.strict_topological_matching = strict_topological_matching
87
138
 
88
139
  if not self.is_attack_evaluation:
89
140
  self.validate_ground_truth(self.ground_truth, self.test_case_name)
90
141
 
142
+ extra_kwargs = {}
143
+
144
+ if USE_GATEWAY_MODEL_PROVIDER:
145
+
146
+ if resource_map and hasattr(resource_map, "wxo_client"):
147
+ wxo_client = resource_map.wxo_client
148
+
149
+ if hasattr(wxo_client, "service_url"):
150
+ extra_kwargs["instance_url"] = wxo_client.service_url
151
+
152
+ if hasattr(wxo_client, "api_key"):
153
+ extra_kwargs["token"] = wxo_client.api_key
154
+
155
+ elif config:
156
+ auth = getattr(config, "auth_config", None)
157
+
158
+ if auth:
159
+ instance_url = getattr(auth, "url", None)
160
+ token = getattr(auth, "token", None)
161
+
162
+ if instance_url:
163
+ extra_kwargs["instance_url"] = instance_url
164
+
165
+ if token:
166
+ extra_kwargs["token"] = token
167
+ else:
168
+ token, instance_url, env = tenant_setup(
169
+ service_url=None, tenant_name="local"
170
+ )
171
+ if instance_url:
172
+ extra_kwargs["instance_url"] = instance_url
173
+
174
+ if token:
175
+ extra_kwargs["token"] = token
176
+
177
+ # output response matching
91
178
  self.matcher = LLMMatcher(
92
179
  llm_client=get_provider(
93
180
  model_id="meta-llama/llama-3-405b-instruct",
@@ -96,6 +183,8 @@ class EvaluationPackage:
96
183
  "decoding_method": "greedy",
97
184
  "max_new_tokens": 10,
98
185
  },
186
+ embedding_model_id="sentence-transformers/all-minilm-l6-v2",
187
+ **extra_kwargs,
99
188
  ),
100
189
  keyword_template=KeywordMatchingTemplateRenderer(
101
190
  KEYWORD_MATCHING_PROMPT_PATH
@@ -103,7 +192,10 @@ class EvaluationPackage:
103
192
  semantic_template=SemanticMatchingTemplateRenderer(
104
193
  SEMANTIC_MATCHING_PROMPT_PATH
105
194
  ),
195
+ similarity_threshold=similarity_threshold,
196
+ enable_fuzzy_matching=enable_fuzzy_matching,
106
197
  )
198
+ # only used for RAG evaluation
107
199
  self.rag_llm_as_a_judge = LLMJudge(
108
200
  llm_client=get_provider(
109
201
  model_id="meta-llama/llama-3-405b-instruct",
@@ -112,57 +204,102 @@ class EvaluationPackage:
112
204
  "decoding_method": "greedy",
113
205
  "max_new_tokens": 4096,
114
206
  },
207
+ **extra_kwargs,
115
208
  ),
116
209
  faithfulness=FaithfulnessTemplateRenderer(FAITHFULNESS_PROMPT_PATH),
117
210
  answer_relevancy=AnswerRelevancyTemplateRenderer(
118
211
  ANSWER_RELEVANCY_PROMPT_PATH
119
212
  ),
120
213
  )
214
+ self.safety_llm_as_a_judge = LLMSafetyJudge(
215
+ llm_client=get_provider(
216
+ model_id="meta-llama/llama-3-405b-instruct",
217
+ params={
218
+ "min_new_tokens": 0,
219
+ "decoding_method": "greedy",
220
+ "max_new_tokens": 4096,
221
+ },
222
+ **extra_kwargs,
223
+ ),
224
+ answer_derailment=DerailmentTemplateRenderer(
225
+ DERAILMENT_PROMPT_PATH
226
+ ),
227
+ answer_unsafe_topic=UnsafeTopicTemplateRenderer(
228
+ UNSAFE_TOPIC_PROMPT_PATH
229
+ ),
230
+ )
121
231
 
122
232
  @staticmethod
123
- def find_ground_node(graph, start_node):
124
- """Simple implementation. Should be fixed in the future
233
+ def find_terminal_nodes(graph: dict[str, list[str]]) -> set[str]:
234
+ """Finds terminal nodes (nodes with no outgoing edges).
235
+
236
+ Args:
237
+ graph: the input graph
125
238
 
126
- Assumes that there is a single graph node that does not have children
239
+ Returns:
240
+ a set of the terminal nodes
127
241
  """
128
242
 
129
- stack = [start_node]
130
- visited_set = set()
243
+ seen_nodes = set() # track seen nodes
244
+ non_terminal_nodes = set() # track nodes with children
131
245
 
132
- while stack:
133
- node = stack.pop()
134
- if node not in visited_set:
135
- visited_set.add(node)
246
+ for node in graph:
247
+ seen_nodes.add(node)
248
+ if graph[node]:
249
+ non_terminal_nodes.add(node)
250
+ for n in graph[node]:
251
+ seen_nodes.add(n)
252
+ return seen_nodes - non_terminal_nodes
136
253
 
137
- # check for children
138
- # improvement for future: add the ground nodes here
139
- # right now, just return the first one
140
- if not graph.get(node):
141
- return node
254
+ @staticmethod
255
+ def is_topological_sort(
256
+ graph: dict[str, list[str]], ordering: list[str], is_strict: bool = True
257
+ ) -> bool:
258
+ """Graph traversal to check if every node in `graph` is visited in `ordering` only after all its dependencies are visited.
142
259
 
143
- stack.extend(graph[node])
260
+ Args:
261
+ graph: the graph representing the ground truth, where keys represent nodes and values represent its dependent nodes
262
+ ordering: the nodes visited, in order
144
263
 
145
- return None
264
+ Returns:
265
+ Boolean representing if `ordering` visits all nodes in a valid order based on the dependencies in graph.
266
+ """
267
+ # No keyword match or goal details were achieved
268
+ if not ordering:
269
+ return False
146
270
 
147
- @staticmethod
148
- def is_topological_sort(graph, ordering):
149
- position = {node: i for i, node in enumerate(ordering)}
150
- ground_node = EvaluationPackage.find_ground_node(
151
- graph, list(graph.keys())[0]
271
+ if is_strict:
272
+ # strict matching: only consider most recent tool call
273
+ position = {node: [i] for i, node in enumerate(ordering)}
274
+ else:
275
+ # lenient matching: consider all tool calls (account for all indexes of the node)
276
+ position = defaultdict(list)
277
+ for i, node in enumerate(ordering):
278
+ position[node].append(i)
279
+
280
+ terminal_nodes = EvaluationPackage.find_terminal_nodes(graph)
281
+ # adds a dummy node for each terminal node
282
+ next_idx = (
283
+ max(val for values in position.values() for val in values) + 1
152
284
  )
153
285
 
154
- if ground_node is not None:
155
- graph[ground_node] = [DUMMY_GRAPH_NODE_NAME]
286
+ for n in terminal_nodes:
287
+ graph[n] = [DUMMY_GRAPH_NODE_NAME]
156
288
  graph[DUMMY_GRAPH_NODE_NAME] = []
289
+ position[DUMMY_GRAPH_NODE_NAME] = [next_idx]
290
+ next_idx += 1
157
291
 
158
- next_idx = len(position)
159
- position[DUMMY_GRAPH_NODE_NAME] = next_idx
160
-
161
- for u in graph:
162
- for v in graph[u]:
163
- if u not in position or v not in position:
292
+ for node in graph:
293
+ for child_nodes in graph[node]:
294
+ # Current node/children doesn't show up in made calls
295
+ if node not in position or child_nodes not in position:
164
296
  return False
165
- if position[u] >= position[v]:
297
+ # Current node doesn't show up before any of its child
298
+ # all index in current nodes are larger than every child nodes' index
299
+ if all(
300
+ curr >= max(position[child_nodes])
301
+ for curr in position[node]
302
+ ):
166
303
  return False
167
304
  return True
168
305
 
@@ -238,32 +375,151 @@ class EvaluationPackage:
238
375
  f"[green][SUCCESS] Text message matched: Summary - {keyword_semantic_match.message}[/green]"
239
376
  )
240
377
 
241
- @staticmethod
242
- def _check_if_args_match_with_ignore(
243
- actual_args: dict[str, str], expected_args: dict[str, str]
378
+ def argument_matching(
379
+ self,
380
+ expected: dict[str, str],
381
+ actual: dict[str, str],
382
+ matching_strategy: dict[str, MatchingStrategy],
244
383
  ) -> bool:
245
- """
246
- This function checks if a registered tool call matches with the goal node when:
247
- - the arg value marked as wrong is labelled with the "<IGNORE>" value in the corresponding ground truth
384
+ """Handles argument matching for expected and actual arguments and values.
385
+
248
386
  Args:
249
- actual_args (dict): Made during inference.
250
- expected_args (dict): Defined in the test case/ground truth.
387
+ expected: Expected ground truth arguments.
388
+ actual: Actual arguments in tool call
389
+ matching_strategy: Matching mode for each argument. Defaults to strict if not specified.
390
+
251
391
  Returns:
252
- bool: True if match with keyword parameters ignored | False otherwise (improper tool call).
392
+ True if all arguments match according to their matching strategy.
253
393
  """
394
+ # ignore arg matching
395
+ if expected == {"IGNORE": None}:
396
+ return True
254
397
 
255
- if set(actual_args.keys()) != set(expected_args.keys()):
256
- return False
398
+ for field in actual:
399
+ if field not in expected:
400
+ return False
401
+
402
+ for field in expected:
403
+ strategy = matching_strategy.get(
404
+ field, MatchingStrategy.strict.value
405
+ )
406
+
407
+ norm_actual_val = EvaluationPackage.normalize_args(
408
+ actual.get(field)
409
+ )
410
+ norm_expected_val = EvaluationPackage.normalize_args(
411
+ expected.get(field)
412
+ )
257
413
 
258
- for key in actual_args:
414
+ # field must exist if not using optional matching
259
415
  if (
260
- actual_args[key] != expected_args[key]
261
- and expected_args[key] != RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS
416
+ field not in actual
417
+ and strategy != MatchingStrategy.optional.value
262
418
  ):
263
419
  return False
420
+ # continue to next if it's an ignored keyword
421
+ if norm_expected_val == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
422
+ continue
423
+ # optional matching
424
+ if strategy == MatchingStrategy.optional.value:
425
+ # continue to next it's not called
426
+ if field not in actual:
427
+ continue
428
+ # must match if called
429
+ if actual[field] != expected[field]:
430
+ return False
431
+ elif strategy == MatchingStrategy.fuzzy.value:
432
+ # check date/number conversion
433
+ conversion_succeeded, values_match = (
434
+ EvaluationPackage._compare_as_date_or_number(
435
+ norm_actual_val, norm_expected_val
436
+ )
437
+ )
438
+ # If conversion succeeded and values match, continue to next parameter
439
+ if conversion_succeeded and values_match:
440
+ continue
441
+ # If conversion succeeded but values don't match, return False
442
+ if conversion_succeeded and not values_match:
443
+ return False
444
+
445
+ # try cosine matching
446
+ x = self.matcher.cosine_similarity_semantic_match(
447
+ norm_actual_val, norm_expected_val
448
+ )
449
+ print(norm_actual_val, norm_expected_val, x)
450
+ if not x:
451
+ return False
452
+ # TODO szhang 10/24/25: Decide if strict comparison must be exact or may allow normalized values.
453
+ elif strategy == MatchingStrategy.strict.value:
454
+ # must match
455
+ if norm_actual_val != norm_expected_val:
456
+ return False
457
+ else:
458
+ print(f"Warning: undefined matching strategy found: {strategy}")
264
459
 
265
460
  return True
266
461
 
462
+ @staticmethod
463
+ def normalize_args(data):
464
+ if isinstance(data, dict):
465
+ # normalize keys (case-sensitive) and values
466
+ return {
467
+ str(k): EvaluationPackage.normalize_args(v)
468
+ for k, v in data.items()
469
+ }
470
+
471
+ elif isinstance(data, list):
472
+ normalized_list = [
473
+ EvaluationPackage.normalize_args(v) for v in data
474
+ ]
475
+ return sorted(
476
+ normalized_list, key=lambda v: json.dumps(v, sort_keys=True)
477
+ )
478
+
479
+ else:
480
+ # don’t lowercase reserved keyword
481
+ if str(data) == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
482
+ return str(data)
483
+ return str(data).lower()
484
+
485
+ @staticmethod
486
+ def _compare_as_date_or_number(normalized_actual, normalized_expected):
487
+ """
488
+ Attempts to compare two normalized values as dates or numbers.
489
+
490
+ Args:
491
+ normalized_actual: The actual value from tool call
492
+ normalized_expected: The expected value from ground truth
493
+
494
+ Returns:
495
+ tuple: (conversion_succeeded, values_match)
496
+ - conversion_succeeded: True if values could be converted to numbers or dates
497
+ - values_match: True if converted values match
498
+ """
499
+ # Try to convert to numbers
500
+ try:
501
+ num_actual = float(normalized_actual)
502
+ num_expected = float(normalized_expected)
503
+ # Conversion succeeded, check if values match
504
+ return (
505
+ True,
506
+ abs(num_actual - num_expected) <= 0.001,
507
+ ) # Small epsilon for float comparison
508
+ except (ValueError, TypeError):
509
+ pass
510
+
511
+ # Try to convert to dates
512
+ try:
513
+ date_actual = parser.parse(normalized_actual)
514
+ date_expected = parser.parse(normalized_expected)
515
+ # Conversion succeeded, check if values match
516
+ return True, date_actual == date_expected
517
+ except (ValueError, TypeError):
518
+ pass
519
+
520
+ # If we get here, neither number nor date conversion worked
521
+ return False, False
522
+
267
523
  def traverse(self):
268
524
  labelled_messages = []
269
525
  message_outcomes = []
@@ -325,12 +581,12 @@ class EvaluationPackage:
325
581
  possible_ground_truth_for_analysis = []
326
582
  for goal_detail in matching_goal_details:
327
583
  # {"IGNORE": None} is set in red teaming attack ground truth to ignore parameter matching
328
- if goal_detail.args == {"IGNORE": None} or (
329
- msg_tool_call["args"] == goal_detail.args
330
- or self._check_if_args_match_with_ignore(
331
- msg_tool_call["args"], goal_detail.args
332
- )
584
+ if self.argument_matching(
585
+ expected=goal_detail.args,
586
+ actual=msg_tool_call["args"],
587
+ matching_strategy=goal_detail.arg_matching,
333
588
  ):
589
+
334
590
  labelled_messages.append(goal_detail.name)
335
591
  labelled_messages_without_text_step.append(
336
592
  goal_detail.name
@@ -399,6 +655,7 @@ class EvaluationPackage:
399
655
  if message.event == EventTypes.message_created
400
656
  and message.role == "assistant"
401
657
  ]
658
+
402
659
  keyword_semantic_list = []
403
660
  for message in assistant_responses:
404
661
  for goal_detail in self.text_list:
@@ -407,7 +664,10 @@ class EvaluationPackage:
407
664
  message.content, goal_detail.keywords
408
665
  )
409
666
  semantic_match: bool = self.matcher.semantic_match(
410
- message.content, goal_detail.response
667
+ self.messages[0].content,
668
+ prediction=message.content,
669
+ ground_truth=goal_detail.response,
670
+ enable_fuzzy_matching=self.enable_fuzzy_matching,
411
671
  )
412
672
  keyword_semantic_match = KeywordSemanticSearchMetric(
413
673
  keyword_match=keyword_match,
@@ -442,6 +702,29 @@ class EvaluationPackage:
442
702
  else:
443
703
  return TextMatchType.text_mismatch.value
444
704
 
705
+ def generate_custom_metrics(
706
+ self, extracted_context: Dict[str, Any]
707
+ ) -> Optional[CustomEvalMetrics]:
708
+ if self.custom_evals is None:
709
+ return None
710
+
711
+ results: list[Metric] = []
712
+ for evaluation in self.custom_evals:
713
+ # TODO: cleanup. The compute method returns a Metric but pydantic thinks it is different.
714
+ # Probably because of some path issue when we auto-discover metrics
715
+ evaluate_result = evaluation.evaluate(
716
+ messages=self.messages,
717
+ ground_truth=self.ground_truth,
718
+ extracted_context=extracted_context,
719
+ )
720
+ if evaluate_result is not None:
721
+ results.append(Metric(**evaluate_result.model_dump()))
722
+
723
+ custom_eval_results = CustomEvalMetrics(
724
+ dataset_name=self.test_case_name, custom_metrics=results
725
+ )
726
+ return custom_eval_results
727
+
445
728
  def generate_summary(self):
446
729
  llm_steps = 0
447
730
  total_step = 0
@@ -454,8 +737,20 @@ class EvaluationPackage:
454
737
  message_with_reasons,
455
738
  ) = self.traverse()
456
739
 
740
+ extracted_context = {}
741
+ if self.extractors is not None and self.custom_evals is not None:
742
+ for extractor in self.extractors:
743
+ context = extractor.extract(
744
+ messages=self.messages,
745
+ ground_truth=self.ground_truth,
746
+ matcher=self.matcher,
747
+ )
748
+ extracted_context[extractor.name] = context
749
+
457
750
  is_success = self.is_topological_sort(
458
- self.ground_truth.goals, labelled_messages
751
+ graph=self.ground_truth.goals,
752
+ ordering=labelled_messages,
753
+ is_strict=self.strict_topological_matching,
459
754
  )
460
755
  match = self._is_text_match(matches)
461
756
 
@@ -474,6 +769,10 @@ class EvaluationPackage:
474
769
  knowledge_base_metric_summary = (
475
770
  self.generate_knowledge_base_metric_summary()
476
771
  )
772
+
773
+ custom_metric_summary = self.generate_custom_metrics(
774
+ extracted_context=extracted_context
775
+ )
477
776
  # TO-DO: the table is not printing properly anymore with the new columns introduced
478
777
  # we need to introduce a separate table for these.
479
778
 
@@ -487,6 +786,7 @@ class EvaluationPackage:
487
786
  knowledge_base_metric_summary,
488
787
  message_with_reasons,
489
788
  metrics,
789
+ custom_metric_summary,
490
790
  )
491
791
 
492
792
  def _get_messages_by_role_before_cs(
@@ -591,6 +891,51 @@ class EvaluationPackage:
591
891
 
592
892
  return metrics
593
893
 
894
+ def evaluate_derailment(
895
+ self, instructions: str = None
896
+ ) -> List[AnswerDerailment]:
897
+ derailments = []
898
+ last_user_message = None
899
+ for message in self.messages:
900
+ if message.role == "user" and message.type == ContentType.text:
901
+ last_user_message = message
902
+ if message.role == "assistant" and message.type == ContentType.text:
903
+ derailment = (
904
+ self.safety_llm_as_a_judge.judge_derailment_in_answer(
905
+ question=last_user_message.content,
906
+ instructions=instructions if instructions else "N/A",
907
+ answer=message.content,
908
+ )
909
+ )
910
+ derailments.append(derailment)
911
+ if derailment.in_scope == "no":
912
+ return (
913
+ derailments # short-circuit if any derailment is found
914
+ )
915
+ return derailments
916
+
917
+ def evaluate_unsafe_topics(
918
+ self, instructions: str = None
919
+ ) -> List[AnswerUnsafeTopic]:
920
+ unsafe_topics = []
921
+ last_user_message = None
922
+ for message in self.messages:
923
+ if message.role == "user" and message.type == ContentType.text:
924
+ last_user_message = message
925
+ if message.role == "assistant" and message.type == ContentType.text:
926
+ unsafe_topic = (
927
+ self.safety_llm_as_a_judge.judge_unsafe_topic_in_answer(
928
+ question=last_user_message.content,
929
+ instructions=instructions if instructions else "N/A",
930
+ answer=message.content,
931
+ )
932
+ )
933
+ unsafe_topics.append(unsafe_topic)
934
+ if unsafe_topic.is_safe == "no":
935
+ return unsafe_topics # short-circuit if any unsafe topic is found
936
+
937
+ return unsafe_topics
938
+
594
939
 
595
940
  if __name__ == "__main__":
596
941
 
@@ -616,7 +961,7 @@ if __name__ == "__main__":
616
961
  rich.print("[orange3]WXO:[/orange3]", message.content)
617
962
 
618
963
  with open("./benchmarks/workday_tools/data/data18.json", "r") as f:
619
- ground_truth = EvaluationData.model_validate(json.load(f))
964
+ ground_truth = OrchestrateDataset.model_validate(json.load(f))
620
965
 
621
966
  evaluate_package = EvaluationPackage(
622
967
  test_case_name="data1.messages.json",
@@ -7,7 +7,7 @@ from wxo_agentic_evaluation import prompt
7
7
  from wxo_agentic_evaluation.prompt.template_render import (
8
8
  StoryGenerationTemplateRenderer,
9
9
  )
10
- from wxo_agentic_evaluation.service_provider import ProviderConfig, get_provider
10
+ from wxo_agentic_evaluation.service_provider import get_provider
11
11
 
12
12
  console = rich.console.Console()
13
13