ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0

wxo_agentic_evaluation/evaluation_package.py

@@ -1,41 +1,82 @@
-from typing import List
 import json
-import os
+import os
+from collections import defaultdict
+from typing import Any, Dict, List, Optional
+
 import rich
+from dateutil import parser

+from wxo_agentic_evaluation import __file__
 from wxo_agentic_evaluation.data_annotator import ERROR_KEYWORDS
-
-from wxo_agentic_evaluation.
-
-
-
-
-
-
+from wxo_agentic_evaluation.llm_matching import LLMMatcher
+from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
+from wxo_agentic_evaluation.llm_safety_eval import LLMSafetyJudge
+from wxo_agentic_evaluation.metrics.evaluations import (
+    Evaluation,
+    Metric,
+)
+from wxo_agentic_evaluation.extractors.extractor_base import Extractor
+from wxo_agentic_evaluation.metrics.llm_as_judge import (
+    AnswerDerailment,
+    AnswerUnsafeTopic,
 )
-from wxo_agentic_evaluation.resource_map import ResourceMap
-from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.metrics.metrics import (
-
+    CustomEvalMetrics,
     KeywordSemanticSearchMetric,
+    KnowledgeBaseMetrics,
+    TextMatchType,
     ToolCallAndRoutingMetrics,
-    TextMatchType
 )
 from wxo_agentic_evaluation.prompt.template_render import (
+    AnswerRelevancyTemplateRenderer,
+    DerailmentTemplateRenderer,
+    FaithfulnessTemplateRenderer,
     KeywordMatchingTemplateRenderer,
     SemanticMatchingTemplateRenderer,
-
-
+    UnsafeTopicTemplateRenderer,
+)
+from wxo_agentic_evaluation.resource_map import ResourceMap
+from wxo_agentic_evaluation.service_instance import tenant_setup
+from wxo_agentic_evaluation.service_provider import (
+    USE_GATEWAY_MODEL_PROVIDER,
+    get_provider,
+)
+from wxo_agentic_evaluation.service_provider.provider import Provider
+from wxo_agentic_evaluation.type import (
+    ContentType,
+    ConversationalSearch,
+    EventTypes,
+    ExtendedMessage,
+    MatchingStrategy,
+    Message,
+    OrchestrateDataset,
 )
-from wxo_agentic_evaluation.llm_matching import LLMMatcher
-from wxo_agentic_evaluation.llm_rag_eval import LLMJudge
-from wxo_agentic_evaluation import __file__

 root_dir = os.path.dirname(__file__)
-KEYWORD_MATCHING_PROMPT_PATH = os.path.join(
-
-
-
+KEYWORD_MATCHING_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "keyword_matching_prompt.jinja2"
+)
+SEMANTIC_MATCHING_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "semantic_matching_prompt.jinja2"
+)
+FAITHFULNESS_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "faithfulness_prompt.jinja2"
+)
+ANSWER_RELEVANCY_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "answer_relevancy_prompt.jinja2"
+)
+
+RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS = os.getenv(
+    "RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS", "<IGNORE>"
+)
+
+DERAILMENT_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "derailment_prompt.jinja2"
+)
+
+UNSAFE_TOPIC_PROMPT_PATH = os.path.join(
+    root_dir, "prompt", "unsafe_topic_prompt.jinja2"
+)

 """
 - hyphens are not allowed in python function names, so it is safe to use as a dummy function name

@@ -45,37 +86,105 @@ single, summary step goals.
 """
 DUMMY_GRAPH_NODE_NAME = "dummy-goal"

+
 class EvaluationPackage:
     def __init__(
         self,
-        test_case_name,
-        ground_truth,
-        messages,
+        test_case_name: str,
+        ground_truth: OrchestrateDataset,
+        messages: list[Message],
         conversational_search_data: List[ConversationalSearch] = None,
-        is_analyze_run=False,
         resource_map: ResourceMap = None,
+        is_attack_evaluation: bool = False,
+        config=None,
+        custom_evals: Optional[list[Evaluation]] = None,
+        custom_llmaaj_client: Optional[Provider] = None,
+        extractors: Optional[list[Extractor]] = None,
+        similarity_threshold=0.8,
+        enable_fuzzy_matching=False,
+        strict_topological_matching=True,
     ):
-        self.tool_dictionary =
-
-
-
-
-
-
-
-
-
-        self.
+        self.tool_dictionary = (
+            {
+                goal_detail.name: goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.tool_call
+            }
+            if ground_truth.goal_details
+            else {}
+        )
+
+        self.text_list = (
+            [
+                goal_detail
+                for goal_detail in ground_truth.goal_details
+                if goal_detail.type == ContentType.text
+            ]
+            if ground_truth.goal_details
+            else []
+        )
+
+        self.messages: List[Message] = messages
         self.conversational_search_data = conversational_search_data
-        self.
+        self.is_attack_evaluation = is_attack_evaluation
         self.ground_truth = ground_truth
         self.test_case_name = test_case_name
-        self.
+        self.resource_map = resource_map
+        self.custom_evals = custom_evals
+        self.custom_llmaaj_client = custom_llmaaj_client
+        self.extractors = extractors
+        self.enable_fuzzy_matching = enable_fuzzy_matching
+        self.strict_topological_matching = strict_topological_matching
+
+        if not self.is_attack_evaluation:
+            self.validate_ground_truth(self.ground_truth, self.test_case_name)
+
+        extra_kwargs = {}
+
+        if USE_GATEWAY_MODEL_PROVIDER:
+
+            if resource_map and hasattr(resource_map, "wxo_client"):
+                wxo_client = resource_map.wxo_client
+
+                if hasattr(wxo_client, "service_url"):
+                    extra_kwargs["instance_url"] = wxo_client.service_url
+
+                if hasattr(wxo_client, "api_key"):
+                    extra_kwargs["token"] = wxo_client.api_key
+
+            elif config:
+                auth = getattr(config, "auth_config", None)
+
+                if auth:
+                    instance_url = getattr(auth, "url", None)
+                    token = getattr(auth, "token", None)

+                    if instance_url:
+                        extra_kwargs["instance_url"] = instance_url
+
+                    if token:
+                        extra_kwargs["token"] = token
+            else:
+                token, instance_url, env = tenant_setup(
+                    service_url=None, tenant_name="local"
+                )
+                if instance_url:
+                    extra_kwargs["instance_url"] = instance_url
+
+                if token:
+                    extra_kwargs["token"] = token
+
+        # output response matching
         self.matcher = LLMMatcher(
             llm_client=get_provider(
                 model_id="meta-llama/llama-3-405b-instruct",
-                params={
+                params={
+                    "min_new_tokens": 0,
+                    "decoding_method": "greedy",
+                    "max_new_tokens": 10,
+                },
+                embedding_model_id="sentence-transformers/all-minilm-l6-v2",
+                **extra_kwargs,
             ),
             keyword_template=KeywordMatchingTemplateRenderer(
                 KEYWORD_MATCHING_PROMPT_PATH

@@ -83,62 +192,114 @@ class EvaluationPackage:
             semantic_template=SemanticMatchingTemplateRenderer(
                 SEMANTIC_MATCHING_PROMPT_PATH
             ),
+            similarity_threshold=similarity_threshold,
+            enable_fuzzy_matching=enable_fuzzy_matching,
         )
+        # only used for RAG evaluation
        self.rag_llm_as_a_judge = LLMJudge(
             llm_client=get_provider(
-
-
-
+                model_id="meta-llama/llama-3-405b-instruct",
+                params={
+                    "min_new_tokens": 0,
+                    "decoding_method": "greedy",
+                    "max_new_tokens": 4096,
+                },
+                **extra_kwargs,
+            ),
             faithfulness=FaithfulnessTemplateRenderer(FAITHFULNESS_PROMPT_PATH),
             answer_relevancy=AnswerRelevancyTemplateRenderer(
                 ANSWER_RELEVANCY_PROMPT_PATH
             ),
         )
+        self.safety_llm_as_a_judge = LLMSafetyJudge(
+            llm_client=get_provider(
+                model_id="meta-llama/llama-3-405b-instruct",
+                params={
+                    "min_new_tokens": 0,
+                    "decoding_method": "greedy",
+                    "max_new_tokens": 4096,
+                },
+                **extra_kwargs,
+            ),
+            answer_derailment=DerailmentTemplateRenderer(
+                DERAILMENT_PROMPT_PATH
+            ),
+            answer_unsafe_topic=UnsafeTopicTemplateRenderer(
+                UNSAFE_TOPIC_PROMPT_PATH
+            ),
+        )

-        self.resource_map = resource_map
-
     @staticmethod
-    def
-        """
+    def find_terminal_nodes(graph: dict[str, list[str]]) -> set[str]:
+        """Finds terminal nodes (nodes with no outgoing edges).
+
+        Args:
+            graph: the input graph

-
+        Returns:
+            a set of the terminal nodes
         """

-
-
-
-
-
-            if node
-
-
-
-
-            # right now, just return the first one
-            if not graph.get(node):
-                return node
-
-            stack.extend(graph[node])
-
-        return None
+        seen_nodes = set()  # track seen nodes
+        non_terminal_nodes = set()  # track nodes with children
+
+        for node in graph:
+            seen_nodes.add(node)
+            if graph[node]:
+                non_terminal_nodes.add(node)
+                for n in graph[node]:
+                    seen_nodes.add(n)
+        return seen_nodes - non_terminal_nodes

     @staticmethod
-    def is_topological_sort(
-
-
+    def is_topological_sort(
+        graph: dict[str, list[str]], ordering: list[str], is_strict: bool = True
+    ) -> bool:
+        """Graph traversal to check if every node in `graph` is visited in `ordering` only after all its dependencies are visited.

-
-        graph
-
+        Args:
+            graph: the graph representing the ground truth, where keys represent nodes and values represent its dependent nodes
+            ordering: the nodes visited, in order
+
+        Returns:
+            Boolean representing if `ordering` visits all nodes in a valid order based on the dependencies in graph.
+        """
+        # No keyword match or goal details were achieved
+        if not ordering:
+            return False

-
-
+        if is_strict:
+            # strict matching: only consider most recent tool call
+            position = {node: [i] for i, node in enumerate(ordering)}
+        else:
+            # lenient matching: consider all tool calls (account for all indexes of the node)
+            position = defaultdict(list)
+            for i, node in enumerate(ordering):
+                position[node].append(i)
+
+        terminal_nodes = EvaluationPackage.find_terminal_nodes(graph)
+        # adds a dummy node for each terminal node
+        next_idx = (
+            max(val for values in position.values() for val in values) + 1
+        )

-        for
-
-
+        for n in terminal_nodes:
+            graph[n] = [DUMMY_GRAPH_NODE_NAME]
+        graph[DUMMY_GRAPH_NODE_NAME] = []
+        position[DUMMY_GRAPH_NODE_NAME] = [next_idx]
+        next_idx += 1
+
+        for node in graph:
+            for child_nodes in graph[node]:
+                # Current node/children doesn't show up in made calls
+                if node not in position or child_nodes not in position:
                     return False
-
+                # Current node doesn't show up before any of its child
+                # all index in current nodes are larger than every child nodes' index
+                if all(
+                    curr >= max(position[child_nodes])
+                    for curr in position[node]
+                ):
                     return False
         return True

@@ -181,7 +342,11 @@ class EvaluationPackage:
                     f"Goal detail '{goal_detail.name}' does not match any goals: {goals}. test_case_name: {test_case_name}"
                 )
             if goal_detail.name == "summarize":
-                if (
+                if (
+                    not goal_detail.keywords or len(goal_detail.keywords) == 0
+                ) and (
+                    not goal_detail.response or len(goal_detail.response) == 0
+                ):
                     rich.print(
                         f"Summarize goal should have keywords or final response. test_case_name: {test_case_name}"
                     )

@@ -210,23 +375,176 @@ class EvaluationPackage:
                 f"[green][SUCCESS] Text message matched: Summary - {keyword_semantic_match.message}[/green]"
             )

+    def argument_matching(
+        self,
+        expected: dict[str, str],
+        actual: dict[str, str],
+        matching_strategy: dict[str, MatchingStrategy],
+    ) -> bool:
+        """Handles argument matching for expected and actual arguments and values.
+
+        Args:
+            expected: Expected ground truth arguments.
+            actual: Actual arguments in tool call
+            matching_strategy: Matching mode for each argument. Defaults to strict if not specified.
+
+        Returns:
+            True if all arguments match according to their matching strategy.
+        """
+        # ignore arg matching
+        if expected == {"IGNORE": None}:
+            return True
+
+        for field in actual:
+            if field not in expected:
+                return False
+
+        for field in expected:
+            strategy = matching_strategy.get(
+                field, MatchingStrategy.strict.value
+            )
+
+            norm_actual_val = EvaluationPackage.normalize_args(
+                actual.get(field)
+            )
+            norm_expected_val = EvaluationPackage.normalize_args(
+                expected.get(field)
+            )
+
+            # field must exist if not using optional matching
+            if (
+                field not in actual
+                and strategy != MatchingStrategy.optional.value
+            ):
+                return False
+            # continue to next if it's an ignored keyword
+            if norm_expected_val == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
+                continue
+            # optional matching
+            if strategy == MatchingStrategy.optional.value:
+                # continue to next it's not called
+                if field not in actual:
+                    continue
+                # must match if called
+                if actual[field] != expected[field]:
+                    return False
+            elif strategy == MatchingStrategy.fuzzy.value:
+                # check date/number conversion
+                conversion_succeeded, values_match = (
+                    EvaluationPackage._compare_as_date_or_number(
+                        norm_actual_val, norm_expected_val
+                    )
+                )
+                # If conversion succeeded and values match, continue to next parameter
+                if conversion_succeeded and values_match:
+                    continue
+                # If conversion succeeded but values don't match, return False
+                if conversion_succeeded and not values_match:
+                    return False
+
+                # try cosine matching
+                x = self.matcher.cosine_similarity_semantic_match(
+                    norm_actual_val, norm_expected_val
+                )
+                print(norm_actual_val, norm_expected_val, x)
+                if not x:
+                    return False
+            # TODO szhang 10/24/25: Decide if strict comparison must be exact or may allow normalized values.
+            elif strategy == MatchingStrategy.strict.value:
+                # must match
+                if norm_actual_val != norm_expected_val:
+                    return False
+            else:
+                print(f"Warning: undefined matching strategy found: {strategy}")
+
+        return True
+
+    @staticmethod
+    def normalize_args(data):
+        if isinstance(data, dict):
+            # normalize keys (case-sensitive) and values
+            return {
+                str(k): EvaluationPackage.normalize_args(v)
+                for k, v in data.items()
+            }
+
+        elif isinstance(data, list):
+            normalized_list = [
+                EvaluationPackage.normalize_args(v) for v in data
+            ]
+            return sorted(
+                normalized_list, key=lambda v: json.dumps(v, sort_keys=True)
+            )
+
+        else:
+            # don't lowercase reserved keyword
+            if str(data) == RESERVED_KEYWORD_FOR_GROUND_TRUTH_ARGS:
+                return str(data)
+            return str(data).lower()
+
+    @staticmethod
+    def _compare_as_date_or_number(normalized_actual, normalized_expected):
+        """
+        Attempts to compare two normalized values as dates or numbers.
+
+        Args:
+            normalized_actual: The actual value from tool call
+            normalized_expected: The expected value from ground truth
+
+        Returns:
+            tuple: (conversion_succeeded, values_match)
+                - conversion_succeeded: True if values could be converted to numbers or dates
+                - values_match: True if converted values match
+        """
+        # Try to convert to numbers
+        try:
+            num_actual = float(normalized_actual)
+            num_expected = float(normalized_expected)
+            # Conversion succeeded, check if values match
+            return (
+                True,
+                abs(num_actual - num_expected) <= 0.001,
+            )  # Small epsilon for float comparison
+        except (ValueError, TypeError):
+            pass
+
+        # Try to convert to dates
+        try:
+            date_actual = parser.parse(normalized_actual)
+            date_expected = parser.parse(normalized_expected)
+            # Conversion succeeded, check if values match
+            return True, date_actual == date_expected
+        except (ValueError, TypeError):
+            pass
+
+        # If we get here, neither number nor date conversion worked
+        return False, False
+
     def traverse(self):
         labelled_messages = []
         message_outcomes = []
         labelled_messages_without_text_step = []
         # Counters for tool-calling related metrics
-        tool_call_and_routing_metrics = ToolCallAndRoutingMetrics(
+        tool_call_and_routing_metrics = ToolCallAndRoutingMetrics()
+        tool_call_and_routing_metrics.expected_tool_calls = len(
+            self.tool_dictionary
         )
-
-
+        correct_tool_calls = (
+            set()
+        )  # sometimes, tool with the same signature can be called more than once
         for message in self.messages:
             if message.type == ContentType.tool_call:

                 msg_tool_call = json.loads(message.content)
-                if
+                if (
+                    self.resource_map
+                    and msg_tool_call["name"] in self.resource_map.agent2tools
+                ):
                     tool_call_and_routing_metrics.total_routing_calls += 1
                     relevant = False
-                    for tool in self.resource_map.agent2tools[
+                    for tool in self.resource_map.agent2tools[
+                        msg_tool_call["name"]
+                    ]:
                         for goal_detail in self.tool_dictionary.values():
                             if goal_detail.tool_name == tool:
                                 relevant = True

@@ -235,7 +553,9 @@ class EvaluationPackage:
                                 break

                     if relevant:
-                        tool_call_and_routing_metrics.relevant_routing_calls +=
+                        tool_call_and_routing_metrics.relevant_routing_calls += (
+                            1
+                        )
                     else:
                         message_outcome = ExtendedMessage(message=message)
                         message_outcome.reason = {

@@ -244,6 +564,7 @@ class EvaluationPackage:

                     continue

+                # TO-DO: re-think how deduplication works in the context of precision & recall
                 tool_call_and_routing_metrics.total_tool_calls += 1

                 # evaluating more than once is fine

@@ -259,19 +580,32 @@ class EvaluationPackage:
                 found = False
                 possible_ground_truth_for_analysis = []
                 for goal_detail in matching_goal_details:
-
-
-
+                    # {"IGNORE": None} is set in red teaming attack ground truth to ignore parameter matching
+                    if self.argument_matching(
+                        expected=goal_detail.args,
+                        actual=msg_tool_call["args"],
+                        matching_strategy=goal_detail.arg_matching,
+                    ):

-
+                        labelled_messages.append(goal_detail.name)
+                        labelled_messages_without_text_step.append(
+                            goal_detail.name
+                        )
+                        correct_tool_calls.add(goal_detail.name)
+                        # tool_call_and_routing_metrics.correct_tool_calls += 1  # correct tool call (no erroneous response) + expected arguments, as defined in the ground truth
                         found = True
                         message_outcome = ExtendedMessage(message=message)
                         message_outcomes.append(message_outcome)
                         break
                     else:
-                        possible_ground_truth_for_analysis.append(
+                        possible_ground_truth_for_analysis.append(
+                            goal_detail.args
+                        )

                 if not found:
+                    tool_call_and_routing_metrics.tool_calls_with_incorrect_parameter += (
+                        1
+                    )
                     message_outcome = ExtendedMessage(message=message)
                     message_outcome.reason = {
                         "reason": "incorrect parameter",

@@ -279,15 +613,17 @@ class EvaluationPackage:
                         "expected": possible_ground_truth_for_analysis,
                     }
                     message_outcomes.append(message_outcome)
-
-
-
-
+                    if not self.is_attack_evaluation:
+                        rich.print(
+                            f"[red][ERROR] Wrong parameters for function: {msg_tool_call['name']}. "
+                            f"Expected one of {[g.args for g in matching_goal_details]}, Received={msg_tool_call['args']}[/red]"
+                        )
             else:

-
-
-
+                if not self.is_attack_evaluation:
+                    rich.print(
+                        f"[yellow][WARNING] Unexpected function call: {msg_tool_call['name']}[/yellow]"
+                    )
                 # note: this is incorrect after the 1.6 change
                 message_outcome = ExtendedMessage(message=message)
                 message_outcome.reason = {"reason": "irrelevant tool call"}

@@ -308,12 +644,18 @@ class EvaluationPackage:
             else:
                 message_outcome = ExtendedMessage(message=message)
                 message_outcomes.append(message_outcome)
+
+        tool_call_and_routing_metrics.correct_tool_calls = len(
+            correct_tool_calls
+        )
+
         assistant_responses = [
             message
             for message in self.messages
             if message.event == EventTypes.message_created
             and message.role == "assistant"
         ]
+
         keyword_semantic_list = []
         for message in assistant_responses:
             for goal_detail in self.text_list:

@@ -322,7 +664,10 @@ class EvaluationPackage:
                     message.content, goal_detail.keywords
                 )
                 semantic_match: bool = self.matcher.semantic_match(
-
+                    self.messages[0].content,
+                    prediction=message.content,
+                    ground_truth=goal_detail.response,
+                    enable_fuzzy_matching=self.enable_fuzzy_matching,
                 )
                 keyword_semantic_match = KeywordSemanticSearchMetric(
                     keyword_match=keyword_match,

@@ -357,6 +702,29 @@ class EvaluationPackage:
         else:
             return TextMatchType.text_mismatch.value

+    def generate_custom_metrics(
+        self, extracted_context: Dict[str, Any]
+    ) -> Optional[CustomEvalMetrics]:
+        if self.custom_evals is None:
+            return None
+
+        results: list[Metric] = []
+        for evaluation in self.custom_evals:
+            # TODO: cleanup. The compute method returns a Metric but pydantic thinks it is different.
+            # Probably because of some path issue when we auto-discover metrics
+            evaluate_result = evaluation.evaluate(
+                messages=self.messages,
+                ground_truth=self.ground_truth,
+                extracted_context=extracted_context,
+            )
+            if evaluate_result is not None:
+                results.append(Metric(**evaluate_result.model_dump()))
+
+        custom_eval_results = CustomEvalMetrics(
+            dataset_name=self.test_case_name, custom_metrics=results
+        )
+        return custom_eval_results
+
     def generate_summary(self):
         llm_steps = 0
         total_step = 0

@@ -368,11 +736,21 @@ class EvaluationPackage:
             metrics,
             message_with_reasons,
         ) = self.traverse()
-
-
+
+        extracted_context = {}
+        if self.extractors is not None and self.custom_evals is not None:
+            for extractor in self.extractors:
+                context = extractor.extract(
+                    messages=self.messages,
+                    ground_truth=self.ground_truth,
+                    matcher=self.matcher,
+                )
+                extracted_context[extractor.name] = context

         is_success = self.is_topological_sort(
-            self.ground_truth.goals,
+            graph=self.ground_truth.goals,
+            ordering=labelled_messages,
+            is_strict=self.strict_topological_matching,
         )
         match = self._is_text_match(matches)

@@ -388,7 +766,13 @@ class EvaluationPackage:
                 llm_steps += 1
             total_step += 1

-        knowledge_base_metric_summary =
+        knowledge_base_metric_summary = (
+            self.generate_knowledge_base_metric_summary()
+        )
+
+        custom_metric_summary = self.generate_custom_metrics(
+            extracted_context=extracted_context
+        )
         # TO-DO: the table is not printing properly anymore with the new columns introduced
         # we need to introduce a separate table for these.

@@ -402,6 +786,7 @@ class EvaluationPackage:
             knowledge_base_metric_summary,
             message_with_reasons,
             metrics,
+            custom_metric_summary,
         )

     def _get_messages_by_role_before_cs(

@@ -433,7 +818,11 @@ class EvaluationPackage:
         for message in self.messages:
             if message.type == ContentType.tool_call:
                 content = json.loads(message.content)
-
+                """
+                - In ADK 1.9, for tool call events, the "tool_call_id" is now "id"
+                - still parse out "tool_call_id" for backwards compatibility
+                """
+                id = content.get("tool_call_id") or content.get("id")
                 if id == tool_call_id:
                     return content.get("name")

@@ -478,7 +867,8 @@ class EvaluationPackage:
             )  # name of knowledge base

             search_results = [
-                result.body
+                result.body
+                for result in conversational_search_data.search_results
             ]
             faithfulness = self.rag_llm_as_a_judge.faithfulness(
                 conversational_search_data.text, search_results

@@ -501,6 +891,51 @@ class EvaluationPackage:

         return metrics

+    def evaluate_derailment(
+        self, instructions: str = None
+    ) -> List[AnswerDerailment]:
+        derailments = []
+        last_user_message = None
+        for message in self.messages:
+            if message.role == "user" and message.type == ContentType.text:
+                last_user_message = message
+            if message.role == "assistant" and message.type == ContentType.text:
+                derailment = (
+                    self.safety_llm_as_a_judge.judge_derailment_in_answer(
+                        question=last_user_message.content,
+                        instructions=instructions if instructions else "N/A",
+                        answer=message.content,
+                    )
+                )
+                derailments.append(derailment)
+                if derailment.in_scope == "no":
+                    return (
+                        derailments  # short-circuit if any derailment is found
+                    )
+        return derailments
+
+    def evaluate_unsafe_topics(
+        self, instructions: str = None
+    ) -> List[AnswerUnsafeTopic]:
+        unsafe_topics = []
+        last_user_message = None
+        for message in self.messages:
+            if message.role == "user" and message.type == ContentType.text:
+                last_user_message = message
+            if message.role == "assistant" and message.type == ContentType.text:
+                unsafe_topic = (
+                    self.safety_llm_as_a_judge.judge_unsafe_topic_in_answer(
+                        question=last_user_message.content,
+                        instructions=instructions if instructions else "N/A",
+                        answer=message.content,
+                    )
+                )
+                unsafe_topics.append(unsafe_topic)
+                if unsafe_topic.is_safe == "no":
+                    return unsafe_topics  # short-circuit if any unsafe topic is found
+
+        return unsafe_topics
+

 if __name__ == "__main__":

@@ -519,17 +954,19 @@ if __name__ == "__main__":

     for message in messages:
         if message.role == "user":
-            rich.print(
+            rich.print(
+                "[yellow]GENERATED_USER_MESSAGE:[/yellow]", message.content
+            )
         else:
             rich.print("[orange3]WXO:[/orange3]", message.content)

     with open("./benchmarks/workday_tools/data/data18.json", "r") as f:
-        ground_truth =
+        ground_truth = OrchestrateDataset.model_validate(json.load(f))

     evaluate_package = EvaluationPackage(
         test_case_name="data1.messages.json",
         ground_truth=ground_truth,
-        messages=messages
+        messages=messages,
     )
     print(evaluate_package.generate_summary())
     # print(evaluate_package.traverse())
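
Below is a small, hypothetical usage sketch (not part of the package) showing how the `custom_evals` and `extractors` hooks added to `EvaluationPackage` in this release might be wired together. The `Extractor`/`Evaluation` subclass shapes and the `Metric` field names are assumptions inferred from how `generate_summary()` and `generate_custom_metrics()` call them in the diff above.

```python
# Hypothetical sketch; class and field names below are assumptions, not the documented package API.
from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
from wxo_agentic_evaluation.extractors.extractor_base import Extractor
from wxo_agentic_evaluation.metrics.evaluations import Evaluation, Metric


class TurnCountExtractor(Extractor):
    name = "turn_count"  # key under which the extracted context is stored

    def extract(self, messages, ground_truth, matcher):
        # count assistant turns; signature mirrors the extractor.extract(...) call in generate_summary()
        return sum(1 for m in messages if m.role == "assistant")


class TurnCountEvaluation(Evaluation):
    def evaluate(self, messages, ground_truth, extracted_context):
        # field names on Metric are illustrative; generate_custom_metrics() only needs a Metric-like result
        return Metric(name="assistant_turns", value=extracted_context["turn_count"])


# ground_truth (an OrchestrateDataset) and messages (a list of Message) come from a test case,
# as in the __main__ block above
package = EvaluationPackage(
    test_case_name="data1.messages.json",
    ground_truth=ground_truth,
    messages=messages,
    custom_evals=[TurnCountEvaluation()],
    extractors=[TurnCountExtractor()],
)
print(package.generate_summary())
```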