ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -1,41 +1,82 @@
- from typing import List
  import json
- import os
+ import os
+ from collections import defaultdict
+ from typing import Any, Dict, List, Optional
+
  import rich
+ from dateutil import parser

+ from wxo_agentic_evaluation import __file__
  from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
-
- from wxo_agentic_evaluation.type import (
- ContentType,
- Message,
- EvaluationData,
- EventTypes,
- ConversationalSearch,
- ExtendedMessage,
+ from wxo_agentic_evaluation.llm_matching import LLMMatcher
+ from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
+ from wxo_agentic_evaluation.llm_safety_eval import LLMSafetyJudge
+ from wxo_agentic_evaluation.metrics.evaluations import (
+ Evaluation,
+ Metric,
+ )
+ from wxo_agentic_evaluation.extractors.extractor_base import Extractor
+ from wxo_agentic_evaluation.metrics.llm_as_judge import (
+ AnswerDerailment,
+ AnswerUnsafeTopic,
  )
- from wxo_agentic_evaluation.resource_map import ResourceMap
- from wxo_agentic_evaluation.service_provider import get_provider
  from wxo_agentic_evaluation.metrics.metrics import (
- KnowledgeBaseMetrics,
+ CustomEvalMetrics,
  KeywordSemanticSearchMetric,
+ KnowledgeBaseMetrics,
+ TextMatchType,
  ToolCallAndRoutingMetrics,
- TextMatchType
  )
  from wxo_agentic_evaluation.prompt.template_render import (
+ AnswerRelevancyTemplateRenderer,
+ DerailmentTemplateRenderer,
+ FaithfulnessTemplateRenderer,
  KeywordMatchingTemplateRenderer,
  SemanticMatchingTemplateRenderer,
- FaithfulnessTemplateRenderer,
- AnswerRelevancyTemplateRenderer,
+ UnsafeTopicTemplateRenderer,
+ )
+ from wxo_agentic_evaluation.resource_map import ResourceMap
+ from wxo_agentic_evaluation.service_instance import tenant_setup
+ from wxo_agentic_evaluation.service_provider import (
+ USE_GATEWAY_MODEL_PROVIDER,
+ get_provider,
+ )
+ from wxo_agentic_evaluation.service_provider.provider import Provider
+ from wxo_agentic_evaluation.type import (
+ ContentType,
+ ConversationalSearch,
+ EventTypes,
+ ExtendedMessage,
+ MatchingStrategy,
+ Message,
+ OrchestrateDataset,
  )
- from wxo_agentic_evaluation.llm_matching import LLMMatcher
- from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
- from wxo_agentic_evaluation import __file__

  root_dir = os.path.dirname(__file__)
- KEYWORD_MATCHING_PROMPT_PATH = os.path.join(root_dir, "prompt", "keyword_matching_prompt.jinja2")
- SEMANTIC_MATCHING_PROMPT_PATH = os.path.join(root_dir, "prompt", "semantic_matching_prompt.jinja2")
- FAITHFULNESS_PROMPT_PATH = os.path.join(root_dir, "prompt", "faithfulness_prompt.jinja2")
- ANSWER_RELEVANCY_PROMPT_PATH = os.path.join(root_dir, "prompt", "answer_relevancy_prompt.jinja2")
+ KEYWORD_MATCHING_PROMPT_PATH = os.path.join(
+ root_dir, "prompt", "keyword_matching_prompt.jinja2"
+ )
+ SEMANTIC_MATCHING_PROMPT_PATH = os.path.join(
+ root_dir, "prompt", "semantic_matching_prompt.jinja2"
+ )
+ FAITHFULNESS_PROMPT_PATH = os.path.join(
+ root_dir, "prompt", "faithfulness_prompt.jinja2"
+ )
+ ANSWER_RELEVANCY_PROMPT_PATH = os.path.join(
+ root_dir, "prompt", "answer_relevancy_prompt.jinja2"
+ )
+
+ RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS = os.getenv(
+ "RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS", "<IGNORE>"
+ )
+
+ DERAILMENT_PROMPT_PATH = os.path.join(
+ root_dir, "prompt", "derailment_prompt.jinja2"
+ )
+
+ UNSAFE_TOPIC_PROMPT_PATH = os.path.join(
+ root_dir, "prompt", "unsafe_topic_prompt.jinja2"
+ )

  """
  - hyphens are not allowed in python function names, so it is safe to use as a dummy function name
@@ -45,37 +86,105 @@ single, summary step goals.
  """
  DUMMY_GRAPH_NODE_NAME = "dummy-goal"

+
  class EvaluationPackage:
  def __init__(
  self,
- test_case_name,
- ground_truth,
- messages,
+ test_case_name: str,
+ ground_truth: OrchestrateDataset,
+ messages: list[Message],
  conversational_search_data: List[ConversationalSearch] = None,
- is_analyze_run=False,
  resource_map: ResourceMap = None,
+ is_attack_evaluation: bool = False,
+ config=None,
+ custom_evals: Optional[list[Evaluation]] = None,
+ custom_llmaaj_client: Optional[Provider] = None,
+ extractors: Optional[list[Extractor]] = None,
+ similarity_threshold=0.8,
+ enable_fuzzy_matching=False,
+ strict_topological_matching=True,
  ):
- self.tool_dictionary = {
- goal_detail.name: goal_detail
- for goal_detail in ground_truth.goal_details
- if goal_detail.type == ContentType.tool_call
- }
- self.text_list = [
- goal_detail
- for goal_detail in ground_truth.goal_details
- if goal_detail.type == ContentType.text
- ]
- self.messages = messages
+ self.tool_dictionary = (
+ {
+ goal_detail.name: goal_detail
+ for goal_detail in ground_truth.goal_details
+ if goal_detail.type == ContentType.tool_call
+ }
+ if ground_truth.goal_details
+ else {}
+ )
+
+ self.text_list = (
+ [
+ goal_detail
+ for goal_detail in ground_truth.goal_details
+ if goal_detail.type == ContentType.text
+ ]
+ if ground_truth.goal_details
+ else []
+ )
+
+ self.messages: List[Message] = messages
  self.conversational_search_data = conversational_search_data
- self.validate_ground_truth(ground_truth, test_case_name)
+ self.is_attack_evaluation = is_attack_evaluation
  self.ground_truth = ground_truth
  self.test_case_name = test_case_name
- self.is_analyze_run = is_analyze_run
+ self.resource_map = resource_map
+ self.custom_evals = custom_evals
+ self.custom_llmaaj_client = custom_llmaaj_client
+ self.extractors = extractors
+ self.enable_fuzzy_matching = enable_fuzzy_matching
+ self.strict_topological_matching = strict_topological_matching
+
+ if not self.is_attack_evaluation:
+ self.validate_ground_truth(self.ground_truth, self.test_case_name)
+
+ extra_kwargs = {}
+
+ if USE_GATEWAY_MODEL_PROVIDER:
+
+ if resource_map and hasattr(resource_map, "wxo_client"):
+ wxo_client = resource_map.wxo_client
+
+ if hasattr(wxo_client, "service_url"):
+ extra_kwargs["instance_url"] = wxo_client.service_url
+
+ if hasattr(wxo_client, "api_key"):
+ extra_kwargs["token"] = wxo_client.api_key
+
+ elif config:
+ auth = getattr(config, "auth_config", None)
+
+ if auth:
+ instance_url = getattr(auth, "url", None)
+ token = getattr(auth, "token", None)

+ if instance_url:
+ extra_kwargs["instance_url"] = instance_url
+
+ if token:
+ extra_kwargs["token"] = token
+ else:
+ token, instance_url, env = tenant_setup(
+ service_url=None, tenant_name="local"
+ )
+ if instance_url:
+ extra_kwargs["instance_url"] = instance_url
+
+ if token:
+ extra_kwargs["token"] = token
+
+ # output response matching
  self.matcher = LLMMatcher(
  llm_client=get_provider(
  model_id="meta-llama/llama-3-405b-instruct",
- params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 10},
+ params={
+ "min_new_tokens": 0,
+ "decoding_method": "greedy",
+ "max_new_tokens": 10,
+ },
+ embedding_model_id="sentence-transformers/all-minilm-l6-v2",
+ **extra_kwargs,
  ),
  keyword_template=KeywordMatchingTemplateRenderer(
  KEYWORD_MATCHING_PROMPT_PATH
@@ -83,62 +192,114 @@ class EvaluationPackage:
  semantic_template=SemanticMatchingTemplateRenderer(
  SEMANTIC_MATCHING_PROMPT_PATH
  ),
+ similarity_threshold=similarity_threshold,
+ enable_fuzzy_matching=enable_fuzzy_matching,
  )
+ # only used for RAG evaluation
  self.rag_llm_as_a_judge = LLMJudge(
  llm_client=get_provider(
- model_id="meta-llama/llama-3-405b-instruct",
- params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 4096},
- ),
+ model_id="meta-llama/llama-3-405b-instruct",
+ params={
+ "min_new_tokens": 0,
+ "decoding_method": "greedy",
+ "max_new_tokens": 4096,
+ },
+ **extra_kwargs,
+ ),
  faithfulness=FaithfulnessTemplateRenderer(FAITHFULNESS_PROMPT_PATH),
  answer_relevancy=AnswerRelevancyTemplateRenderer(
  ANSWER_RELEVANCY_PROMPT_PATH
  ),
  )
+ self.safety_llm_as_a_judge = LLMSafetyJudge(
+ llm_client=get_provider(
+ model_id="meta-llama/llama-3-405b-instruct",
+ params={
+ "min_new_tokens": 0,
+ "decoding_method": "greedy",
+ "max_new_tokens": 4096,
+ },
+ **extra_kwargs,
+ ),
+ answer_derailment=DerailmentTemplateRenderer(
+ DERAILMENT_PROMPT_PATH
+ ),
+ answer_unsafe_topic=UnsafeTopicTemplateRenderer(
+ UNSAFE_TOPIC_PROMPT_PATH
+ ),
+ )

- self.resource_map = resource_map
-
  @staticmethod
- def find_ground_node(graph, start_node):
- """ Simple implementation. Should be fixed in the future
+ def find_terminal_nodes(graph: dict[str, list[str]]) -> set[str]:
+ """Finds terminal nodes (nodes with no outgoing edges).
+
+ Args:
+ graph: the input graph

- Assumes that there is a single graph node that does not have children
+ Returns:
+ a set of the terminal nodes
  """

- stack = [start_node]
- visited_set = set()
-
- while stack:
- node = stack.pop()
- if node not in visited_set:
- visited_set.add(node)
-
- # check for children
- # improvement for future: add the ground nodes here
- # right now, just return the first one
- if not graph.get(node):
- return node
-
- stack.extend(graph[node])
-
- return None
+ seen_nodes = set() # track seen nodes
+ non_terminal_nodes = set() # track nodes with children
+
+ for node in graph:
+ seen_nodes.add(node)
+ if graph[node]:
+ non_terminal_nodes.add(node)
+ for n in graph[node]:
+ seen_nodes.add(n)
+ return seen_nodes - non_terminal_nodes

  @staticmethod
- def is_topological_sort(graph, ordering):
- position = {node: i for i, node in enumerate(ordering)}
- ground_node = EvaluationPackage.find_ground_node(graph, list(graph.keys())[0])
+ def is_topological_sort(
+ graph: dict[str, list[str]], ordering: list[str], is_strict: bool = True
+ ) -> bool:
+ """Graph traversal to check if every node in `graph` is visited in `ordering` only after all its dependencies are visited.

- if ground_node is not None:
- graph[ground_node] = [DUMMY_GRAPH_NODE_NAME]
- graph[DUMMY_GRAPH_NODE_NAME] = []
+ Args:
+ graph: the graph representing the ground truth, where keys represent nodes and values represent its dependent nodes
+ ordering: the nodes visited, in order
+
+ Returns:
+ Boolean representing if `ordering` visits all nodes in a valid order based on the dependencies in graph.
+ """
+ # No keyword match or goal details were achieved
+ if not ordering:
+ return False

- next_idx = len(position)
- position[DUMMY_GRAPH_NODE_NAME] = next_idx
+ if is_strict:
+ # strict matching: only consider most recent tool call
+ position = {node: [i] for i, node in enumerate(ordering)}
+ else:
+ # lenient matching: consider all tool calls (account for all indexes of the node)
+ position = defaultdict(list)
+ for i, node in enumerate(ordering):
+ position[node].append(i)
+
+ terminal_nodes = EvaluationPackage.find_terminal_nodes(graph)
+ # adds a dummy node for each terminal node
+ next_idx = (
+ max(val for values in position.values() for val in values) + 1
+ )

- for u in graph:
- for v in graph[u]:
- if u not in position or v not in position:
+ for n in terminal_nodes:
+ graph[n] = [DUMMY_GRAPH_NODE_NAME]
+ graph[DUMMY_GRAPH_NODE_NAME] = []
+ position[DUMMY_GRAPH_NODE_NAME] = [next_idx]
+ next_idx += 1
+
+ for node in graph:
+ for child_nodes in graph[node]:
+ # Current node/children doesn't show up in made calls
+ if node not in position or child_nodes not in position:
  return False
- if position[u] >= position[v]:
+ # Current node doesn't show up before any of its child
+ # all index in current nodes are larger than every child nodes' index
+ if all(
+ curr >= max(position[child_nodes])
+ for curr in position[node]
+ ):
  return False
  return True

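Note: a minimal sketch (not part of this diff) of how the reworked ordering check behaves, assuming this hunk belongs to wxo_agentic_evaluation/evaluation_package.py (item 21 in the file list above). Strict matching credits only the most recent call of each tool; lenient matching credits any earlier call:

from wxo_agentic_evaluation.evaluation_package import EvaluationPackage

# Ground truth: tool_a must run before tool_b; the agent then re-calls tool_a.
# is_topological_sort mutates the graph it is given (it appends "dummy-goal"
# edges), so a fresh dict is passed to each call.
ordering = ["tool_a", "tool_b", "tool_a"]

strict = EvaluationPackage.is_topological_sort(
    {"tool_a": ["tool_b"], "tool_b": []}, ordering, is_strict=True
)
lenient = EvaluationPackage.is_topological_sort(
    {"tool_a": ["tool_b"], "tool_b": []}, ordering, is_strict=False
)
print(strict, lenient)  # False True: strict only looks at the last tool_a call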
@@ -181,7 +342,11 @@ class EvaluationPackage:
  f"Goal detail '{goal_detail.name}' does not match any goals: {goals}. test_case_name: {test_case_name}"
  )
  if goal_detail.name == "summarize":
- if (not goal_detail.keywords or len(goal_detail.keywords) == 0) and (not goal_detail.response or len(goal_detail.response) == 0):
+ if (
+ not goal_detail.keywords or len(goal_detail.keywords) == 0
+ ) and (
+ not goal_detail.response or len(goal_detail.response) == 0
+ ):
  rich.print(
  f"Summarize goal should have keywords or final response. test_case_name: {test_case_name}"
  )
@@ -210,23 +375,176 @@ class EvaluationPackage:
  f"[green][SUCCESS] Text message matched: Summary - {keyword_semantic_match.message}[/green]"
  )

+ def argument_matching(
+ self,
+ expected: dict[str, str],
+ actual: dict[str, str],
+ matching_strategy: dict[str, MatchingStrategy],
+ ) -> bool:
+ """Handles argument matching for expected and actual arguments and values.
+
+ Args:
+ expected: Expected ground truth arguments.
+ actual: Actual arguments in tool call
+ matching_strategy: Matching mode for each argument. Defaults to strict if not specified.
+
+ Returns:
+ True if all arguments match according to their matching strategy.
+ """
+ # ignore arg matching
+ if expected == {"IGNORE": None}:
+ return True
+
+ for field in actual:
+ if field not in expected:
+ return False
+
+ for field in expected:
+ strategy = matching_strategy.get(
+ field, MatchingStrategy.strict.value
+ )
+
+ norm_actual_val = EvaluationPackage.normalize_args(
+ actual.get(field)
+ )
+ norm_expected_val = EvaluationPackage.normalize_args(
+ expected.get(field)
+ )
+
+ # field must exist if not using optional matching
+ if (
+ field not in actual
+ and strategy != MatchingStrategy.optional.value
+ ):
+ return False
+ # continue to next if it's an ignored keyword
+ if norm_expected_val == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
+ continue
+ # optional matching
+ if strategy == MatchingStrategy.optional.value:
+ # continue to next it's not called
+ if field not in actual:
+ continue
+ # must match if called
+ if actual[field] != expected[field]:
+ return False
+ elif strategy == MatchingStrategy.fuzzy.value:
+ # check date/number conversion
+ conversion_succeeded, values_match = (
+ EvaluationPackage._compare_as_date_or_number(
+ norm_actual_val, norm_expected_val
+ )
+ )
+ # If conversion succeeded and values match, continue to next parameter
+ if conversion_succeeded and values_match:
+ continue
+ # If conversion succeeded but values don't match, return False
+ if conversion_succeeded and not values_match:
+ return False
+
+ # try cosine matching
+ x = self.matcher.cosine_similarity_semantic_match(
+ norm_actual_val, norm_expected_val
+ )
+ print(norm_actual_val, norm_expected_val, x)
+ if not x:
+ return False
+ # TODO szhang 10/24/25: Decide if strict comparison must be exact or may allow normalized values.
+ elif strategy == MatchingStrategy.strict.value:
+ # must match
+ if norm_actual_val != norm_expected_val:
+ return False
+ else:
+ print(f"Warning: undefined matching strategy found: {strategy}")
+
+ return True
+
+ @staticmethod
+ def normalize_args(data):
+ if isinstance(data, dict):
+ # normalize keys (case-sensitive) and values
+ return {
+ str(k): EvaluationPackage.normalize_args(v)
+ for k, v in data.items()
+ }
+
+ elif isinstance(data, list):
+ normalized_list = [
+ EvaluationPackage.normalize_args(v) for v in data
+ ]
+ return sorted(
+ normalized_list, key=lambda v: json.dumps(v, sort_keys=True)
+ )
+
+ else:
+ # don’t lowercase reserved keyword
+ if str(data) == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
+ return str(data)
+ return str(data).lower()
+
+ @staticmethod
+ def _compare_as_date_or_number(normalized_actual, normalized_expected):
+ """
+ Attempts to compare two normalized values as dates or numbers.
+
+ Args:
+ normalized_actual: The actual value from tool call
+ normalized_expected: The expected value from ground truth
+
+ Returns:
+ tuple: (conversion_succeeded, values_match)
+ - conversion_succeeded: True if values could be converted to numbers or dates
+ - values_match: True if converted values match
+ """
+ # Try to convert to numbers
+ try:
+ num_actual = float(normalized_actual)
+ num_expected = float(normalized_expected)
+ # Conversion succeeded, check if values match
+ return (
+ True,
+ abs(num_actual - num_expected) <= 0.001,
+ ) # Small epsilon for float comparison
+ except (ValueError, TypeError):
+ pass
+
+ # Try to convert to dates
+ try:
+ date_actual = parser.parse(normalized_actual)
+ date_expected = parser.parse(normalized_expected)
+ # Conversion succeeded, check if values match
+ return True, date_actual == date_expected
+ except (ValueError, TypeError):
+ pass
+
+ # If we get here, neither number nor date conversion worked
+ return False, False
+
  def traverse(self):
  labelled_messages = []
  message_outcomes = []
  labelled_messages_without_text_step = []
  # Counters for tool-calling related metrics
- tool_call_and_routing_metrics = ToolCallAndRoutingMetrics(
+ tool_call_and_routing_metrics = ToolCallAndRoutingMetrics()
+ tool_call_and_routing_metrics.expected_tool_calls = len(
+ self.tool_dictionary
  )
- tool_call_and_routing_metrics.expected_tool_calls = len(self.tool_dictionary)
-
+ correct_tool_calls = (
+ set()
+ ) # sometimes, tool with the same signature can be called more than once
  for message in self.messages:
  if message.type == ContentType.tool_call:

  msg_tool_call = json.loads(message.content)
- if self.resource_map and msg_tool_call["name"] in self.resource_map.agent2tools:
+ if (
+ self.resource_map
+ and msg_tool_call["name"] in self.resource_map.agent2tools
+ ):
  tool_call_and_routing_metrics.total_routing_calls += 1
  relevant = False
- for tool in self.resource_map.agent2tools[msg_tool_call["name"]]:
+ for tool in self.resource_map.agent2tools[
+ msg_tool_call["name"]
+ ]:
  for goal_detail in self.tool_dictionary.values():
  if goal_detail.tool_name == tool:
  relevant = True
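Note: the per-argument matching introduced in the hunk above (strict / fuzzy / optional, plus the reserved <IGNORE> keyword and the {"IGNORE": None} escape hatch) builds on two static helpers that can be exercised in isolation. A minimal sketch, assuming the same module path as above; the fuzzy strategy additionally falls back to LLMMatcher cosine similarity when neither a number nor a date interpretation applies:

from wxo_agentic_evaluation.evaluation_package import EvaluationPackage

# normalize_args lowercases scalar values and sorts lists, so casing and
# ordering differences alone do not fail a comparison.
print(EvaluationPackage.normalize_args(["Boston", "NYC"]))   # ['boston', 'nyc']
print(EvaluationPackage.normalize_args({"City": "Boston"}))  # {'City': 'boston'}

# The fuzzy strategy first tries a number or date interpretation.
print(EvaluationPackage._compare_as_date_or_number("5.00", "5"))
# (True, True): both parse as numbers and agree within the 0.001 epsilon
print(EvaluationPackage._compare_as_date_or_number("jan 5, 2024", "2024-01-05"))
# (True, True): both parse to the same date via dateutil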
@@ -235,7 +553,9 @@ class EvaluationPackage:
  break

  if relevant:
- tool_call_and_routing_metrics.relevant_routing_calls += 1
+ tool_call_and_routing_metrics.relevant_routing_calls += (
+ 1
+ )
  else:
  message_outcome = ExtendedMessage(message=message)
  message_outcome.reason = {
@@ -244,6 +564,7 @@ class EvaluationPackage:

  continue

+ # TO-DO: re-think how deduplication works in the context of precision & recall
  tool_call_and_routing_metrics.total_tool_calls += 1

  # evaluating more than once is fine
@@ -259,19 +580,32 @@ class EvaluationPackage:
  found = False
  possible_ground_truth_for_analysis = []
  for goal_detail in matching_goal_details:
- if msg_tool_call["args"] == goal_detail.args:
- labelled_messages.append(goal_detail.name)
- labelled_messages_without_text_step.append(goal_detail.name)
+ # {"IGNORE": None} is set in red teaming attack ground truth to ignore parameter matching
+ if self.argument_matching(
+ expected=goal_detail.args,
+ actual=msg_tool_call["args"],
+ matching_strategy=goal_detail.arg_matching,
+ ):

- tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
+ labelled_messages.append(goal_detail.name)
+ labelled_messages_without_text_step.append(
+ goal_detail.name
+ )
+ correct_tool_calls.add(goal_detail.name)
+ # tool_call_and_routing_metrics.correct_tool_calls += 1 # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
  found = True
  message_outcome = ExtendedMessage(message=message)
  message_outcomes.append(message_outcome)
  break
  else:
- possible_ground_truth_for_analysis.append(goal_detail.args)
+ possible_ground_truth_for_analysis.append(
+ goal_detail.args
+ )

  if not found:
+ tool_call_and_routing_metrics.tool_calls_with_incorrect_parameter += (
+ 1
+ )
  message_outcome = ExtendedMessage(message=message)
  message_outcome.reason = {
  "reason": "incorrect parameter",
@@ -279,15 +613,17 @@ class EvaluationPackage:
  "expected": possible_ground_truth_for_analysis,
  }
  message_outcomes.append(message_outcome)
- rich.print(
- f"[red][ERROR] Wrong parameters for function: {msg_tool_call['name']}. "
- f"Expected one of {[g.args for g in matching_goal_details]}, Received={msg_tool_call['args']}[/red]"
- )
+ if not self.is_attack_evaluation:
+ rich.print(
+ f"[red][ERROR] Wrong parameters for function: {msg_tool_call['name']}. "
+ f"Expected one of {[g.args for g in matching_goal_details]}, Received={msg_tool_call['args']}[/red]"
+ )
  else:

- rich.print(
- f"[yellow][WARNING] Unexpected function call: {msg_tool_call['name']}[/yellow]"
- )
+ if not self.is_attack_evaluation:
+ rich.print(
+ f"[yellow][WARNING] Unexpected function call: {msg_tool_call['name']}[/yellow]"
+ )
  # note: this is incorrect after the 1.6 change
  message_outcome = ExtendedMessage(message=message)
  message_outcome.reason = {"reason": "irrelevant tool call"}
@@ -308,12 +644,18 @@ class EvaluationPackage:
  else:
  message_outcome = ExtendedMessage(message=message)
  message_outcomes.append(message_outcome)
+
+ tool_call_and_routing_metrics.correct_tool_calls = len(
+ correct_tool_calls
+ )
+
  assistant_responses = [
  message
  for message in self.messages
  if message.event == EventTypes.message_created
  and message.role == "assistant"
  ]
+
  keyword_semantic_list = []
  for message in assistant_responses:
  for goal_detail in self.text_list:
@@ -322,7 +664,10 @@ class EvaluationPackage:
  message.content, goal_detail.keywords
  )
  semantic_match: bool = self.matcher.semantic_match(
- message.content, goal_detail.response
+ self.messages[0].content,
+ prediction=message.content,
+ ground_truth=goal_detail.response,
+ enable_fuzzy_matching=self.enable_fuzzy_matching,
  )
  keyword_semantic_match = KeywordSemanticSearchMetric(
  keyword_match=keyword_match,
@@ -357,6 +702,29 @@ class EvaluationPackage:
  else:
  return TextMatchType.text_mismatch.value

+ def generate_custom_metrics(
+ self, extracted_context: Dict[str, Any]
+ ) -> Optional[CustomEvalMetrics]:
+ if self.custom_evals is None:
+ return None
+
+ results: list[Metric] = []
+ for evaluation in self.custom_evals:
+ # TODO: cleanup. The compute method returns a Metric but pydantic thinks it is different.
+ # Probably because of some path issue when we auto-discover metrics
+ evaluate_result = evaluation.evaluate(
+ messages=self.messages,
+ ground_truth=self.ground_truth,
+ extracted_context=extracted_context,
+ )
+ if evaluate_result is not None:
+ results.append(Metric(**evaluate_result.model_dump()))
+
+ custom_eval_results = CustomEvalMetrics(
+ dataset_name=self.test_case_name, custom_metrics=results
+ )
+ return custom_eval_results
+
  def generate_summary(self):
  llm_steps = 0
  total_step = 0
@@ -368,11 +736,21 @@ class EvaluationPackage:
  metrics,
  message_with_reasons,
  ) = self.traverse()
- if self.is_analyze_run:
- print(labelled_messages)
+
+ extracted_context = {}
+ if self.extractors is not None and self.custom_evals is not None:
+ for extractor in self.extractors:
+ context = extractor.extract(
+ messages=self.messages,
+ ground_truth=self.ground_truth,
+ matcher=self.matcher,
+ )
+ extracted_context[extractor.name] = context

  is_success = self.is_topological_sort(
- self.ground_truth.goals, labelled_messages
+ graph=self.ground_truth.goals,
+ ordering=labelled_messages,
+ is_strict=self.strict_topological_matching,
  )
  match = self._is_text_match(matches)

@@ -388,7 +766,13 @@ class EvaluationPackage:
  llm_steps += 1
  total_step += 1

- knowledge_base_metric_summary = self.generate_knowledge_base_metric_summary()
+ knowledge_base_metric_summary = (
+ self.generate_knowledge_base_metric_summary()
+ )
+
+ custom_metric_summary = self.generate_custom_metrics(
+ extracted_context=extracted_context
+ )
  # TO-DO: the table is not printing properly anymore with the new columns introduced
  # we need to introduce a separate table for these.

@@ -402,6 +786,7 @@ class EvaluationPackage:
  knowledge_base_metric_summary,
  message_with_reasons,
  metrics,
+ custom_metric_summary,
  )

  def _get_messages_by_role_before_cs(
@@ -433,7 +818,11 @@ class EvaluationPackage:
  for message in self.messages:
  if message.type == ContentType.tool_call:
  content = json.loads(message.content)
- id = content.get("tool_call_id", "")
+ """
+ - In ADK 1.9, for tool call events, the "tool_call_id" is now "id"
+ - still parse out "tool_call_id" for backwards compatibility
+ """
+ id = content.get("tool_call_id") or content.get("id")
  if id == tool_call_id:
  return content.get("name")

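Note: a small illustration of the ADK 1.9 compatibility fallback above. The event payloads here are hypothetical; the lookup logic is exactly the one shown in the diff:

# Pre-1.9 tool-call events carry "tool_call_id"; newer ADK events carry "id".
legacy_event = {"tool_call_id": "call-123", "name": "get_weather"}
adk_19_event = {"id": "call-123", "name": "get_weather"}

for content in (legacy_event, adk_19_event):
    tool_call_id = content.get("tool_call_id") or content.get("id")
    print(tool_call_id)  # "call-123" in both cases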
@@ -478,7 +867,8 @@ class EvaluationPackage:
  ) # name of knowledge base

  search_results = [
- result.body for result in conversational_search_data.search_results
+ result.body
+ for result in conversational_search_data.search_results
  ]
  faithfulness = self.rag_llm_as_a_judge.faithfulness(
  conversational_search_data.text, search_results
@@ -501,6 +891,51 @@ class EvaluationPackage:

  return metrics

+ def evaluate_derailment(
+ self, instructions: str = None
+ ) -> List[AnswerDerailment]:
+ derailments = []
+ last_user_message = None
+ for message in self.messages:
+ if message.role == "user" and message.type == ContentType.text:
+ last_user_message = message
+ if message.role == "assistant" and message.type == ContentType.text:
+ derailment = (
+ self.safety_llm_as_a_judge.judge_derailment_in_answer(
+ question=last_user_message.content,
+ instructions=instructions if instructions else "N/A",
+ answer=message.content,
+ )
+ )
+ derailments.append(derailment)
+ if derailment.in_scope == "no":
+ return (
+ derailments # short-circuit if any derailment is found
+ )
+ return derailments
+
+ def evaluate_unsafe_topics(
+ self, instructions: str = None
+ ) -> List[AnswerUnsafeTopic]:
+ unsafe_topics = []
+ last_user_message = None
+ for message in self.messages:
+ if message.role == "user" and message.type == ContentType.text:
+ last_user_message = message
+ if message.role == "assistant" and message.type == ContentType.text:
+ unsafe_topic = (
+ self.safety_llm_as_a_judge.judge_unsafe_topic_in_answer(
+ question=last_user_message.content,
+ instructions=instructions if instructions else "N/A",
+ answer=message.content,
+ )
+ )
+ unsafe_topics.append(unsafe_topic)
+ if unsafe_topic.is_safe == "no":
+ return unsafe_topics # short-circuit if any unsafe topic is found
+
+ return unsafe_topics
+

  if __name__ == "__main__":

@@ -519,17 +954,19 @@ if __name__ == "__main__":

  for message in messages:
  if message.role == "user":
- rich.print("[yellow]GENERATED_USER_MESSAGE:[/yellow]", message.content)
+ rich.print(
+ "[yellow]GENERATED_USER_MESSAGE:[/yellow]", message.content
+ )
  else:
  rich.print("[orange3]WXO:[/orange3]", message.content)

  with open("./benchmarks/workday_tools/data/data18.json", "r") as f:
- ground_truth = EvaluationData.model_validate(json.load(f))
+ ground_truth = OrchestrateDataset.model_validate(json.load(f))

  evaluate_package = EvaluationPackage(
  test_case_name="data1.messages.json",
  ground_truth=ground_truth,
- messages=messages
+ messages=messages,
  )
  print(evaluate_package.generate_summary())
  # print(evaluate_package.traverse())