ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
- wxo_agentic_evaluation/analyze_run.py +1025 -220
- wxo_agentic_evaluation/annotate.py +2 -2
- wxo_agentic_evaluation/arg_configs.py +60 -2
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +19 -2
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +25 -7
- wxo_agentic_evaluation/description_quality_checker.py +29 -6
- wxo_agentic_evaluation/evaluation.py +16 -8
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +414 -69
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
- wxo_agentic_evaluation/external_agent/types.py +3 -9
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +104 -2
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +5 -4
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +112 -343
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
- wxo_agentic_evaluation/metrics/metrics.py +276 -8
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
- wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +103 -4
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +33 -17
- wxo_agentic_evaluation/record_chat.py +38 -32
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
- wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
- wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
- wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +26 -17
- wxo_agentic_evaluation/service_provider/__init__.py +145 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/type.py +185 -16
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +313 -9
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/evaluation_package.py

@@ -1,14 +1,27 @@
 import json
 import os
-from
+from collections import defaultdict
+from typing import Any, Dict, List, Optional

 import rich
+from dateutil import parser

 from wxo_agentic_evaluation import __file__
 from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
 from wxo_agentic_evaluation.llm_matching import LLMMatcher
 from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
+from wxo_agentic_evaluation.llm_safety_eval import LLMSafetyJudge
+from wxo_agentic_evaluation.metrics.evaluations import (
+    Evaluation,
+    Metric,
+)
+from wxo_agentic_evaluation.extractors.extractor_base import Extractor
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerDerailment,
+    AnswerUnsafeTopic,
+)
 from wxo_agentic_evaluation.metrics.metrics import (
+    CustomEvalMetrics,
     KeywordSemanticSearchMetric,
     KnowledgeBaseMetrics,
     TextMatchType,
@@ -16,19 +29,27 @@ from wxo_agentic_evaluation.metrics.metrics import (
 )
 from wxo_agentic_evaluation.prompt.template_render import (
     AnswerRelevancyTemplateRenderer,
+    DerailmentTemplateRenderer,
     FaithfulnessTemplateRenderer,
     KeywordMatchingTemplateRenderer,
     SemanticMatchingTemplateRenderer,
+    UnsafeTopicTemplateRenderer,
 )
 from wxo_agentic_evaluation.resource_map import ResourceMap
-from wxo_agentic_evaluation.
+from wxo_agentic_evaluation.service_instance import tenant_setup
+from wxo_agentic_evaluation.service_provider import (
+    USE_GATEWAY_MODEL_PROVIDER,
+    get_provider,
+)
+from wxo_agentic_evaluation.service_provider.provider import Provider
 from wxo_agentic_evaluation.type import (
     ContentType,
     ConversationalSearch,
-    EvaluationData,
     EventTypes,
     ExtendedMessage,
+    MatchingStrategy,
     Message,
+    OrchestrateDataset,
 )

 root_dir = os.path.dirname(__file__)
@@ -49,6 +70,14 @@ RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS = os.getenv(
     "RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS", "<IGNORE>"
 )

+DERAILMENT_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "derailment_prompt.jinja2"
+)
+
+UNSAFE_TOPIC_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "unsafe_topic_prompt.jinja2"
+)
+
 """
 - hyphens are not allowed in python function names, so it is safe to use as a dummy function name
 - purpose behind `DUMMY_GRAPH_NODE_NAME` is to append
@@ -61,33 +90,91 @@ DUMMY_GRAPH_NODE_NAME = "dummy-goal"
 class EvaluationPackage:
     def __init__(
         self,
-        test_case_name,
-        ground_truth,
-        messages,
+        test_case_name: str,
+        ground_truth: OrchestrateDataset,
+        messages: list[Message],
         conversational_search_data: List[ConversationalSearch] = None,
         resource_map: ResourceMap = None,
         is_attack_evaluation: bool = False,
+        config=None,
+        custom_evals: Optional[list[Evaluation]] = None,
+        custom_llmaaj_client: Optional[Provider] = None,
+        extractors: Optional[list[Extractor]] = None,
+        similarity_threshold=0.8,
+        enable_fuzzy_matching=False,
+        strict_topological_matching=True,
     ):
-        self.tool_dictionary =
-
-
-
-
-
-
-
-
-
-        self.
+        self.tool_dictionary = (
+            {
+                goal_detail.name: goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.tool_call
+            }
+            if ground_truth.goal_details
+            else {}
+        )
+
+        self.text_list = (
+            [
+                goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.text
+            ]
+            if ground_truth.goal_details
+            else []
+        )
+
+        self.messages: List[Message] = messages
         self.conversational_search_data = conversational_search_data
         self.is_attack_evaluation = is_attack_evaluation
         self.ground_truth = ground_truth
         self.test_case_name = test_case_name
         self.resource_map = resource_map
+        self.custom_evals = custom_evals
+        self.custom_llmaaj_client = custom_llmaaj_client
+        self.extractors = extractors
+        self.enable_fuzzy_matching = enable_fuzzy_matching
+        self.strict_topological_matching = strict_topological_matching

         if not self.is_attack_evaluation:
             self.validate_ground_truth(self.ground_truth, self.test_case_name)

+        extra_kwargs = {}
+
+        if USE_GATEWAY_MODEL_PROVIDER:
+
+            if resource_map and hasattr(resource_map, "wxo_client"):
+                wxo_client = resource_map.wxo_client
+
+                if hasattr(wxo_client, "service_url"):
+                    extra_kwargs["instance_url"] = wxo_client.service_url
+
+                if hasattr(wxo_client, "api_key"):
+                    extra_kwargs["token"] = wxo_client.api_key
+
+            elif config:
+                auth = getattr(config, "auth_config", None)
+
+                if auth:
+                    instance_url = getattr(auth, "url", None)
+                    token = getattr(auth, "token", None)
+
+                    if instance_url:
+                        extra_kwargs["instance_url"] = instance_url
+
+                    if token:
+                        extra_kwargs["token"] = token
+            else:
+                token, instance_url, env = tenant_setup(
+                    service_url=None, tenant_name="local"
+                )
+                if instance_url:
+                    extra_kwargs["instance_url"] = instance_url
+
+                if token:
+                    extra_kwargs["token"] = token
+
+        # output response matching
         self.matcher = LLMMatcher(
             llm_client=get_provider(
                 model_id="meta-llama/llama-3-405b-instruct",
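The constructor now takes matching and extensibility knobs alongside the dataset and transcript. A minimal sketch of how they might be passed; the file paths and loading lines below are assumptions for illustration, only the keyword names and the `OrchestrateDataset.model_validate` call come from this diff:

```python
import json

from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
from wxo_agentic_evaluation.type import Message, OrchestrateDataset

# hypothetical input files; any dataset/messages pair in the expected schema works
with open("data/data18.json") as f:
    ground_truth = OrchestrateDataset.model_validate(json.load(f))
with open("data/data18.messages.json") as f:
    messages = [Message.model_validate(m) for m in json.load(f)]

package = EvaluationPackage(
    test_case_name="data18.messages.json",
    ground_truth=ground_truth,
    messages=messages,
    similarity_threshold=0.8,           # cosine threshold forwarded to LLMMatcher
    enable_fuzzy_matching=True,         # allow date/number and embedding-based matching
    strict_topological_matching=False,  # count every occurrence of a repeated tool call
)
```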
@@ -96,6 +183,8 @@ class EvaluationPackage:
                     "decoding_method": "greedy",
                     "max_new_tokens": 10,
                 },
+                embedding_model_id="sentence-transformers/all-minilm-l6-v2",
+                **extra_kwargs,
             ),
             keyword_template=KeywordMatchingTemplateRenderer(
                 KEYWORD_MATCHING_PROMPT_PATH
@@ -103,7 +192,10 @@ class EvaluationPackage:
             semantic_template=SemanticMatchingTemplateRenderer(
                 SEMANTIC_MATCHING_PROMPT_PATH
             ),
+            similarity_threshold=similarity_threshold,
+            enable_fuzzy_matching=enable_fuzzy_matching,
         )
+        # only used for RAG evaluation
         self.rag_llm_as_a_judge = LLMJudge(
             llm_client=get_provider(
                 model_id="meta-llama/llama-3-405b-instruct",
@@ -112,57 +204,102 @@ class EvaluationPackage:
                     "decoding_method": "greedy",
                     "max_new_tokens": 4096,
                 },
+                **extra_kwargs,
             ),
             faithfulness=FaithfulnessTemplateRenderer(FAITHFULNESS_PROMPT_PATH),
             answer_relevancy=AnswerRelevancyTemplateRenderer(
                 ANSWER_RELEVANCY_PROMPT_PATH
             ),
         )
+        self.safety_llm_as_a_judge = LLMSafetyJudge(
+            llm_client=get_provider(
+                model_id="meta-llama/llama-3-405b-instruct",
+                params={
+                    "min_new_tokens": 0,
+                    "decoding_method": "greedy",
+                    "max_new_tokens": 4096,
+                },
+                **extra_kwargs,
+            ),
+            answer_derailment=DerailmentTemplateRenderer(
+                DERAILMENT_PROMPT_PATH
+            ),
+            answer_unsafe_topic=UnsafeTopicTemplateRenderer(
+                UNSAFE_TOPIC_PROMPT_PATH
+            ),
+        )

     @staticmethod
-    def
-    """
+    def find_terminal_nodes(graph: dict[str, list[str]]) -> set[str]:
+        """Finds terminal nodes (nodes with no outgoing edges).
+
+        Args:
+            graph: the input graph

-
+        Returns:
+            a set of the terminal nodes
         """

-
-
+        seen_nodes = set()  # track seen nodes
+        non_terminal_nodes = set()  # track nodes with children

-
-
-        if node
-
+        for node in graph:
+            seen_nodes.add(node)
+            if graph[node]:
+                non_terminal_nodes.add(node)
+                for n in graph[node]:
+                    seen_nodes.add(n)
+        return seen_nodes - non_terminal_nodes

-
-
-
-
-
+    @staticmethod
+    def is_topological_sort(
+        graph: dict[str, list[str]], ordering: list[str], is_strict: bool = True
+    ) -> bool:
+        """Graph traversal to check if every node in `graph` is visited in `ordering` only after all its dependencies are visited.

-
+        Args:
+            graph: the graph representing the ground truth, where keys represent nodes and values represent its dependent nodes
+            ordering: the nodes visited, in order

-
+        Returns:
+            Boolean representing if `ordering` visits all nodes in a valid order based on the dependencies in graph.
+        """
+        # No keyword match or goal details were achieved
+        if not ordering:
+            return False

-
-
-
-
-
+        if is_strict:
+            # strict matching: only consider most recent tool call
+            position = {node: [i] for i, node in enumerate(ordering)}
+        else:
+            # lenient matching: consider all tool calls (account for all indexes of the node)
+            position = defaultdict(list)
+            for i, node in enumerate(ordering):
+                position[node].append(i)
+
+        terminal_nodes = EvaluationPackage.find_terminal_nodes(graph)
+        # adds a dummy node for each terminal node
+        next_idx = (
+            max(val for values in position.values() for val in values) + 1
         )

-
-        graph[
+        for n in terminal_nodes:
+            graph[n] = [DUMMY_GRAPH_NODE_NAME]
         graph[DUMMY_GRAPH_NODE_NAME] = []
+        position[DUMMY_GRAPH_NODE_NAME] = [next_idx]
+        next_idx += 1

-
-
-
-
-        for v in graph[u]:
-            if u not in position or v not in position:
+        for node in graph:
+            for child_nodes in graph[node]:
+                # Current node/children doesn't show up in made calls
+                if node not in position or child_nodes not in position:
                     return False
-
+                # Current node doesn't show up before any of its child
+                # all index in current nodes are larger than every child nodes' index
+                if all(
+                    curr >= max(position[child_nodes])
+                    for curr in position[node]
+                ):
                     return False
         return True

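The ordering check walks the ground-truth goal graph: keys are goal or tool names, values are the nodes that must come after them, and a dummy terminal goal is appended internally. A small sketch of the strict behaviour with hypothetical goal names; note the method mutates the graph it receives, so a copy is passed:

```python
from wxo_agentic_evaluation.evaluation_package import EvaluationPackage

# "get_profile" must be called before "get_payslips" (names are illustrative)
goals = {"get_profile": ["get_payslips"], "get_payslips": []}

# dependencies respected -> True
print(EvaluationPackage.is_topological_sort(
    graph=dict(goals), ordering=["get_profile", "get_payslips"], is_strict=True
))

# prerequisite called after its dependant -> False
print(EvaluationPackage.is_topological_sort(
    graph=dict(goals), ordering=["get_payslips", "get_profile"], is_strict=True
))
```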
@@ -238,32 +375,151 @@ class EvaluationPackage:
                 f"[green][SUCCESS] Text message matched: Summary - {keyword_semantic_match.message}[/green]"
             )

-
-
-
+    def argument_matching(
+        self,
+        expected: dict[str, str],
+        actual: dict[str, str],
+        matching_strategy: dict[str, MatchingStrategy],
     ) -> bool:
-        """
-
-        - the arg value marked as wrong is labelled with the "<IGNORE>" value in the corresponding ground truth
+        """Handles argument matching for expected and actual arguments and values.
+
         Args:
-
-
+            expected: Expected ground truth arguments.
+            actual: Actual arguments in tool call
+            matching_strategy: Matching mode for each argument. Defaults to strict if not specified.
+
         Returns:
-
+            True if all arguments match according to their matching strategy.
         """
+        # ignore arg matching
+        if expected == {"IGNORE": None}:
+            return True

-
-
+        for field in actual:
+            if field not in expected:
+                return False
+
+        for field in expected:
+            strategy = matching_strategy.get(
+                field, MatchingStrategy.strict.value
+            )
+
+            norm_actual_val = EvaluationPackage.normalize_args(
+                actual.get(field)
+            )
+            norm_expected_val = EvaluationPackage.normalize_args(
+                expected.get(field)
+            )

-
+            # field must exist if not using optional matching
             if (
-
-                and
+                field not in actual
+                and strategy != MatchingStrategy.optional.value
             ):
                 return False
+            # continue to next if it's an ignored keyword
+            if norm_expected_val == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
+                continue
+            # optional matching
+            if strategy == MatchingStrategy.optional.value:
+                # continue to next it's not called
+                if field not in actual:
+                    continue
+                # must match if called
+                if actual[field] != expected[field]:
+                    return False
+            elif strategy == MatchingStrategy.fuzzy.value:
+                # check date/number conversion
+                conversion_succeeded, values_match = (
+                    EvaluationPackage._compare_as_date_or_number(
+                        norm_actual_val, norm_expected_val
+                    )
+                )
+                # If conversion succeeded and values match, continue to next parameter
+                if conversion_succeeded and values_match:
+                    continue
+                # If conversion succeeded but values don't match, return False
+                if conversion_succeeded and not values_match:
+                    return False
+
+                # try cosine matching
+                x = self.matcher.cosine_similarity_semantic_match(
+                    norm_actual_val, norm_expected_val
+                )
+                print(norm_actual_val, norm_expected_val, x)
+                if not x:
+                    return False
+            # TODO szhang 10/24/25: Decide if strict comparison must be exact or may allow normalized values.
+            elif strategy == MatchingStrategy.strict.value:
+                # must match
+                if norm_actual_val != norm_expected_val:
+                    return False
+            else:
+                print(f"Warning: undefined matching strategy found: {strategy}")

         return True

+    @staticmethod
+    def normalize_args(data):
+        if isinstance(data, dict):
+            # normalize keys (case-sensitive) and values
+            return {
+                str(k): EvaluationPackage.normalize_args(v)
+                for k, v in data.items()
+            }
+
+        elif isinstance(data, list):
+            normalized_list = [
+                EvaluationPackage.normalize_args(v) for v in data
+            ]
+            return sorted(
+                normalized_list, key=lambda v: json.dumps(v, sort_keys=True)
+            )
+
+        else:
+            # don’t lowercase reserved keyword
+            if str(data) == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
+                return str(data)
+            return str(data).lower()
+
+    @staticmethod
+    def _compare_as_date_or_number(normalized_actual, normalized_expected):
+        """
+        Attempts to compare two normalized values as dates or numbers.
+
+        Args:
+            normalized_actual: The actual value from tool call
+            normalized_expected: The expected value from ground truth
+
+        Returns:
+            tuple: (conversion_succeeded, values_match)
+                - conversion_succeeded: True if values could be converted to numbers or dates
+                - values_match: True if converted values match
+        """
+        # Try to convert to numbers
+        try:
+            num_actual = float(normalized_actual)
+            num_expected = float(normalized_expected)
+            # Conversion succeeded, check if values match
+            return (
+                True,
+                abs(num_actual - num_expected) <= 0.001,
+            )  # Small epsilon for float comparison
+        except (ValueError, TypeError):
+            pass
+
+        # Try to convert to dates
+        try:
+            date_actual = parser.parse(normalized_actual)
+            date_expected = parser.parse(normalized_expected)
+            # Conversion succeeded, check if values match
+            return True, date_actual == date_expected
+        except (ValueError, TypeError):
+            pass
+
+        # If we get here, neither number nor date conversion worked
+        return False, False
+
     def traverse(self):
         labelled_messages = []
         message_outcomes = []
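A quick sketch of what the normalization and the date/number fallback do before fuzzy matching falls back to embedding similarity; the argument values below are illustrative only:

```python
from wxo_agentic_evaluation.evaluation_package import EvaluationPackage

# values are stringified and lower-cased and lists are sorted, so ordering and
# case differences alone do not fail a match
print(EvaluationPackage.normalize_args({"ids": [3, 1, 2], "City": "Boston"}))
# {'ids': ['1', '2', '3'], 'City': 'boston'}

# numbers compare within a 0.001 epsilon, dates compare after dateutil parsing
print(EvaluationPackage._compare_as_date_or_number("3.0", "3"))                   # (True, True)
print(EvaluationPackage._compare_as_date_or_number("2024-01-05", "jan 5, 2024"))  # (True, True)
print(EvaluationPackage._compare_as_date_or_number("hello", "world"))             # (False, False)
```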
@@ -325,12 +581,12 @@ class EvaluationPackage:
             possible_ground_truth_for_analysis = []
             for goal_detail in matching_goal_details:
                 # {"IGNORE": None} is set in red teaming attack ground truth to ignore parameter matching
-                if
-
-
-
-                )
+                if self.argument_matching(
+                    expected=goal_detail.args,
+                    actual=msg_tool_call["args"],
+                    matching_strategy=goal_detail.arg_matching,
                 ):
+
                     labelled_messages.append(goal_detail.name)
                     labelled_messages_without_text_step.append(
                         goal_detail.name
@@ -399,6 +655,7 @@ class EvaluationPackage:
             if message.event == EventTypes.message_created
             and message.role == "assistant"
         ]
+
         keyword_semantic_list = []
         for message in assistant_responses:
             for goal_detail in self.text_list:
@@ -407,7 +664,10 @@ class EvaluationPackage:
                     message.content, goal_detail.keywords
                 )
                 semantic_match: bool = self.matcher.semantic_match(
-
+                    self.messages[0].content,
+                    prediction=message.content,
+                    ground_truth=goal_detail.response,
+                    enable_fuzzy_matching=self.enable_fuzzy_matching,
                 )
                 keyword_semantic_match = KeywordSemanticSearchMetric(
                     keyword_match=keyword_match,
@@ -442,6 +702,29 @@ class EvaluationPackage:
         else:
             return TextMatchType.text_mismatch.value

+    def generate_custom_metrics(
+        self, extracted_context: Dict[str, Any]
+    ) -> Optional[CustomEvalMetrics]:
+        if self.custom_evals is None:
+            return None
+
+        results: list[Metric] = []
+        for evaluation in self.custom_evals:
+            # TODO: cleanup. The compute method returns a Metric but pydantic thinks it is different.
+            # Probably because of some path issue when we auto-discover metrics
+            evaluate_result = evaluation.evaluate(
+                messages=self.messages,
+                ground_truth=self.ground_truth,
+                extracted_context=extracted_context,
+            )
+            if evaluate_result is not None:
+                results.append(Metric(**evaluate_result.model_dump()))
+
+        custom_eval_results = CustomEvalMetrics(
+            dataset_name=self.test_case_name, custom_metrics=results
+        )
+        return custom_eval_results
+
     def generate_summary(self):
         llm_steps = 0
         total_step = 0
@@ -454,8 +737,20 @@ class EvaluationPackage:
             message_with_reasons,
         ) = self.traverse()

+        extracted_context = {}
+        if self.extractors is not None and self.custom_evals is not None:
+            for extractor in self.extractors:
+                context = extractor.extract(
+                    messages=self.messages,
+                    ground_truth=self.ground_truth,
+                    matcher=self.matcher,
+                )
+                extracted_context[extractor.name] = context
+
         is_success = self.is_topological_sort(
-            self.ground_truth.goals,
+            graph=self.ground_truth.goals,
+            ordering=labelled_messages,
+            is_strict=self.strict_topological_matching,
         )
         match = self._is_text_match(matches)

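Custom metrics plug in through the new `extractors` and `custom_evals` constructor arguments: each extractor's output is stored under its `name`, and each evaluation turns that context into a metric. A hedged sketch of one matching pair; the base classes are the ones imported in this diff, but their exact abstract interfaces and the `Metric` field names are assumptions inferred from the call sites above:

```python
from typing import Optional

from wxo_agentic_evaluation.extractors.extractor_base import Extractor
from wxo_agentic_evaluation.metrics.evaluations import Evaluation, Metric


class AssistantTurnExtractor(Extractor):
    name = "assistant_turns"  # key under which the extracted context is stored

    def extract(self, messages, ground_truth, matcher) -> int:
        # count assistant turns in the recorded conversation
        return sum(1 for m in messages if m.role == "assistant")


class TurnBudgetEvaluation(Evaluation):
    def evaluate(self, messages, ground_truth, extracted_context) -> Optional[Metric]:
        turns = extracted_context.get("assistant_turns", 0)
        # Metric field names below are assumed; adapt to the real schema
        return Metric(name="within_turn_budget", value=float(turns <= 10))
```

These would then be passed as `extractors=[AssistantTurnExtractor()]` and `custom_evals=[TurnBudgetEvaluation()]` when constructing `EvaluationPackage`.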
@@ -474,6 +769,10 @@ class EvaluationPackage:
         knowledge_base_metric_summary = (
             self.generate_knowledge_base_metric_summary()
         )
+
+        custom_metric_summary = self.generate_custom_metrics(
+            extracted_context=extracted_context
+        )
         # TO-DO: the table is not printing properly anymore with the new columns introduced
         # we need to introduce a separate table for these.

@@ -487,6 +786,7 @@ class EvaluationPackage:
             knowledge_base_metric_summary,
             message_with_reasons,
             metrics,
+            custom_metric_summary,
         )

     def _get_messages_by_role_before_cs(
@@ -591,6 +891,51 @@ class EvaluationPackage:

         return metrics

+    def evaluate_derailment(
+        self, instructions: str = None
+    ) -> List[AnswerDerailment]:
+        derailments = []
+        last_user_message = None
+        for message in self.messages:
+            if message.role == "user" and message.type == ContentType.text:
+                last_user_message = message
+            if message.role == "assistant" and message.type == ContentType.text:
+                derailment = (
+                    self.safety_llm_as_a_judge.judge_derailment_in_answer(
+                        question=last_user_message.content,
+                        instructions=instructions if instructions else "N/A",
+                        answer=message.content,
+                    )
+                )
+                derailments.append(derailment)
+                if derailment.in_scope == "no":
+                    return (
+                        derailments  # short-circuit if any derailment is found
+                    )
+        return derailments
+
+    def evaluate_unsafe_topics(
+        self, instructions: str = None
+    ) -> List[AnswerUnsafeTopic]:
+        unsafe_topics = []
+        last_user_message = None
+        for message in self.messages:
+            if message.role == "user" and message.type == ContentType.text:
+                last_user_message = message
+            if message.role == "assistant" and message.type == ContentType.text:
+                unsafe_topic = (
+                    self.safety_llm_as_a_judge.judge_unsafe_topic_in_answer(
+                        question=last_user_message.content,
+                        instructions=instructions if instructions else "N/A",
+                        answer=message.content,
+                    )
+                )
+                unsafe_topics.append(unsafe_topic)
+                if unsafe_topic.is_safe == "no":
+                    return unsafe_topics  # short-circuit if any unsafe topic is found
+
+        return unsafe_topics
+

 if __name__ == "__main__":

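The two new safety judges walk the transcript turn by turn and stop at the first failing assistant turn. A minimal sketch of driving them, assuming `package` is an `EvaluationPackage` instance and `agent_instructions` is the agent's instruction text (both hypothetical names):

```python
def run_safety_checks(package, agent_instructions: str) -> None:
    derailments = package.evaluate_derailment(instructions=agent_instructions)
    unsafe_topics = package.evaluate_unsafe_topics(instructions=agent_instructions)

    # both judges short-circuit after the first failing assistant turn
    if any(d.in_scope == "no" for d in derailments):
        print("assistant derailed from its instructions")
    if any(t.is_safe == "no" for t in unsafe_topics):
        print("assistant discussed an unsafe topic")
```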
@@ -616,7 +961,7 @@ if __name__ == "__main__":
         rich.print("[orange3]WXO:[/orange3]", message.content)

     with open("./benchmarks/workday_tools/data/data18.json", "r") as f:
-        ground_truth =
+        ground_truth = OrchestrateDataset.model_validate(json.load(f))

     evaluate_package = EvaluationPackage(
         test_case_name="data1.messages.json",
@@ -7,7 +7,7 @@ from wxo_agentic_evaluation import prompt
 from wxo_agentic_evaluation.prompt.template_render import (
     StoryGenerationTemplateRenderer,
 )
-from wxo_agentic_evaluation.service_provider import
+from wxo_agentic_evaluation.service_provider import get_provider

 console = rich.console.Console()
