ibm-watsonx-orchestrate-evaluation-framework 1.1.5__py3-none-any.whl → 1.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/METADATA +1 -1
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/RECORD +33 -29
- wxo_agentic_evaluation/analyze_run.py +805 -344
- wxo_agentic_evaluation/arg_configs.py +10 -1
- wxo_agentic_evaluation/description_quality_checker.py +11 -2
- wxo_agentic_evaluation/evaluation_package.py +8 -3
- wxo_agentic_evaluation/inference_backend.py +46 -79
- wxo_agentic_evaluation/llm_matching.py +14 -2
- wxo_agentic_evaluation/main.py +1 -1
- wxo_agentic_evaluation/metrics/__init__.py +1 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +4 -3
- wxo_agentic_evaluation/metrics/metrics.py +43 -1
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +4 -2
- wxo_agentic_evaluation/quick_eval.py +7 -9
- wxo_agentic_evaluation/record_chat.py +2 -5
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +139 -100
- wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -34
- wxo_agentic_evaluation/red_teaming/attack_list.py +89 -18
- wxo_agentic_evaluation/red_teaming/attack_runner.py +51 -11
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +77 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/service_instance.py +7 -0
- wxo_agentic_evaluation/type.py +1 -1
- wxo_agentic_evaluation/utils/__init__.py +3 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +131 -16
- wxo_agentic_evaluation/wxo_client.py +80 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.5.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/top_level.txt +0 -0
```diff
@@ -1,6 +1,7 @@
 import os
 from dataclasses import dataclass, field
 from typing import List, Optional, Union
+from enum import StrEnum
 
 from wxo_agentic_evaluation import __file__
 

@@ -59,22 +60,30 @@ class AttackConfig:
     enable_verbose_logging: bool = True
     enable_manual_user_input: bool = False
     num_workers: int = 2
+    skip_available_results: bool = True
 
 
 @dataclass
 class AttackGeneratorConfig:
     attacks_list: Union[List[str], str]
     datasets_path: Union[List[str], str]
-
+    agents_list_or_path: Union[List[str], str]
     target_agent_name: str
+    auth_config: AuthConfig
     output_dir: str = None
     max_variants: int = None
 
+class AnalyzeMode(StrEnum):
+    default = "default"
+    enhanced = "enhanced"
 
 @dataclass
 class AnalyzeConfig:
     data_path: str
     tool_definition_path: Optional[str] = None
+    mode: str = AnalyzeMode.default
+    num_workers: int = 10
+    run: int = -1
 
 
 @dataclass
```
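The new analyze options are plain dataclass fields, so they can be exercised directly. A minimal sketch, assuming the hunk above is `wxo_agentic_evaluation/arg_configs.py` (consistent with the +10/-1 summary for that file) and that the remaining fields keep their defaults:

```python
# Minimal sketch; field names and defaults are taken from the hunk above.
from wxo_agentic_evaluation.arg_configs import AnalyzeConfig, AnalyzeMode

config = AnalyzeConfig(
    data_path="./results",         # existing required field
    mode=AnalyzeMode.enhanced,     # new in 1.1.6, defaults to AnalyzeMode.default
    num_workers=4,                 # new in 1.1.6, defaults to 10
    run=-1,                        # new in 1.1.6; -1 presumably selects the latest run
)
print(config.mode)                 # StrEnum members print as their string value: "enhanced"
```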
```diff
@@ -16,6 +16,7 @@ from wxo_agentic_evaluation.tool_planner import (
 )
 from wxo_agentic_evaluation.type import ToolDefinition
 from wxo_agentic_evaluation.utils.utils import safe_divide
+from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric
 
 
 class ToolDescriptionIssue(Enum):

@@ -106,7 +107,7 @@ class DescriptionQualityInspector:
         )
         return tool_definitions
 
-    def detect_bad_description(self, tool_definition: ToolDefinition) -> bool:
+    def detect_bad_description(self, tool_definition: ToolDefinition) -> DescriptionQualityMetric:
         """
         Detects if a tool description is 'bad' using an LLM judge.
         A 'bad' description is one that:

@@ -119,6 +120,10 @@ class DescriptionQualityInspector:
         Returns:
             bool: True if the description is 'bad', False otherwise.
         """
+
+        if tool_definition.tool_description is None:
+            return DescriptionQualityMetric(tool_name=tool_definition.tool_name)
+
         prompt = self.template.render(tool_definition=tool_definition)
         response = self.llm_client.query(prompt)
 

@@ -137,7 +142,11 @@ class DescriptionQualityInspector:
             response_data=response_data
         )
 
-        return
+        return DescriptionQualityMetric(
+            tool_name=tool_definition.tool_name,
+            description_score=final_description_score,
+            threshold=self.CLASSIFICATION_SCORE_THRESHOLD,
+        )
 
     def _calculate_score(self, response_data: dict) -> float:
         """
```
```diff
@@ -77,7 +77,7 @@ class EvaluationPackage:
     def __init__(
         self,
         test_case_name,
-        ground_truth,
+        ground_truth: EvaluationData,
         messages,
         conversational_search_data: List[ConversationalSearch] = None,
         resource_map: ResourceMap = None,

@@ -103,7 +103,7 @@ class EvaluationPackage:
             else []
         )
 
-        self.messages = messages
+        self.messages: List[Message] = messages
         self.conversational_search_data = conversational_search_data
         self.is_attack_evaluation = is_attack_evaluation
         self.ground_truth = ground_truth

@@ -113,6 +113,7 @@ class EvaluationPackage:
         if not self.is_attack_evaluation:
             self.validate_ground_truth(self.ground_truth, self.test_case_name)
 
+        # output response matching
         self.matcher = LLMMatcher(
             llm_client=get_provider(
                 model_id="meta-llama/llama-3-405b-instruct",

@@ -129,6 +130,7 @@ class EvaluationPackage:
                 SEMANTIC_MATCHING_PROMPT_PATH
             ),
         )
+        # only used for RAG evaluation
         self.rag_llm_as_a_judge = LLMJudge(
             llm_client=get_provider(
                 model_id="meta-llama/llama-3-405b-instruct",

@@ -470,6 +472,7 @@ class EvaluationPackage:
             if message.event == EventTypes.message_created
             and message.role == "assistant"
         ]
+
         keyword_semantic_list = []
         for message in assistant_responses:
             for goal_detail in self.text_list:

@@ -478,7 +481,9 @@ class EvaluationPackage:
                     message.content, goal_detail.keywords
                 )
                 semantic_match: bool = self.matcher.semantic_match(
-
+                    self.messages[0].content,
+                    prediction=message.content,
+                    ground_truth=goal_detail.response,
                 )
                 keyword_semantic_match = KeywordSemanticSearchMetric(
                     keyword_match=keyword_match,
```
```diff
@@ -3,21 +3,15 @@ import os
 import time
 from collections import deque
 from enum import Enum
-from typing import Any, Dict, Generator, List, Mapping,
+from typing import Any, Dict, Generator, List, Mapping, Tuple
 
 import requests
 import rich
-import urllib3
 import yaml
 from pydantic import BaseModel
-from urllib3.exceptions import InsecureRequestWarning
 
 from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.llm_user import LLMUser
-from wxo_agentic_evaluation.service_instance import (
-    get_env_settings,
-    tenant_setup,
-)
 from wxo_agentic_evaluation.service_provider.watsonx_provider import (
     WatsonXProvider,
 )

@@ -36,6 +30,7 @@ from wxo_agentic_evaluation.utils.utils import (
     is_saas_url,
     safe_divide,
 )
+from wxo_agentic_evaluation.wxo_client import WXOClient
 
 tokenizer = Tokenizer()
 

@@ -82,63 +77,6 @@ class CallTracker(BaseModel):
     generic: List = []
 
 
-class WXOClient:
-    def __init__(
-        self, service_url, api_key, env: Optional[Dict[str, Any]] = None
-    ):
-        self.service_url = service_url
-        self.api_key = api_key
-
-        ov = os.getenv("WO_SSL_VERIFY")
-        if ov and ov.strip().lower() in ("true", "false"):
-            self._verify_ssl = ov.strip().lower() == "true"
-        else:
-            v, bs = (env.get("verify") if env else None), (
-                env.get("bypass_ssl") if env else None
-            )
-            self._verify_ssl = (
-                False
-                if (
-                    (bs is True)
-                    or (isinstance(bs, str) and bs.strip().lower() == "true")
-                    or (v is None)
-                    or (
-                        isinstance(v, str)
-                        and v.strip().lower() in {"none", "null"}
-                    )
-                )
-                else (v if isinstance(v, bool) else True)
-            )
-
-        if not self._verify_ssl:
-            urllib3.disable_warnings(InsecureRequestWarning)
-
-    def _get_headers(self) -> dict:
-        headers = {}
-        if self.api_key:
-            headers["Authorization"] = f"Bearer {self.api_key}"
-        return headers
-
-    def post(self, payload: dict, path: str, stream=False):
-        url = f"{self.service_url}/{path}"
-        return requests.post(
-            url=url,
-            headers=self._get_headers(),
-            json=payload,
-            stream=stream,
-            verify=self._verify_ssl,
-        )
-
-    def get(self, path: str, params: dict = None):
-        url = f"{self.service_url}/{path}"
-        return requests.get(
-            url,
-            params=params,
-            headers=self._get_headers(),
-            verify=self._verify_ssl,
-        )
-
-
 class WXOInferenceBackend:
     def __init__(self, wxo_client):
         self.wxo_client = wxo_client
```
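The `WXOClient` removed here is relocated rather than deleted: the file summary shows a new `wxo_agentic_evaluation/wxo_client.py` (+80 lines), and `main.py` and `quick_eval.py` now import `get_wxo_client` from it. A rough usage sketch of the relocated client, using the constructor and `get` signature visible in the removed code; the URL, key, and request path below are placeholders:

```python
# Sketch only: service URL, API key, and request path are placeholder values.
from wxo_agentic_evaluation.wxo_client import WXOClient

client = WXOClient(service_url="https://wxo.example.com", api_key="***")
resp = client.get("v1/orchestrate/agents", params={"limit": 5})  # hypothetical path
print(resp.status_code)
```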
```diff
@@ -721,6 +659,19 @@ class EvaluationController:
                     message.content,
                 )
 
+                # hook for subclasses
+                if self._post_message_hook(
+                    task_n=task_n,
+                    step=step,
+                    message=message,
+                    conversation_history=conversation_history,
+                ):
+                    return (
+                        conversation_history,
+                        call_tracker,
+                        conversational_search_history_data,
+                    )
+
             conversation_history.extend(messages)
             conversational_search_history_data.extend(
                 conversational_search_data

@@ -733,6 +684,13 @@ class EvaluationController:
             conversational_search_history_data,
         )
 
+    def _post_message_hook(self, **kwargs) -> bool:
+        """
+        Hook for subclasses to extend behavior.
+        Return True to break the loop early.
+        """
+        return False
+
     def _is_looping(self, messages: deque) -> bool:
         """Checks whether the user or assistant is stuck in a loop.
         Args:

@@ -786,21 +744,30 @@ class EvaluationController:
 
         return False  # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS
 
-
-def
-
-
-
-
-
-
-
-
-
-
-
-
+class AttackEvaluationController(EvaluationController):
+    def __init__(self, *args, attack_data=None, attack_evaluator=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.attack_data = attack_data
+        self.attack_evaluator = attack_evaluator
+
+    def _post_message_hook(self, task_n, step, message, conversation_history) -> bool:
+        """Override hook to add live attack evaluation."""
+        if self.attack_evaluator and self.attack_data:
+            success = self.attack_evaluator.evaluate(
+                self.attack_data, conversation_history + [message]
+            )
+            if success:
+                rich.print(
+                    f"[bold green]Attack for [Task-{task_n}] succeeded early at step {step}! Stopping simulation.[/bold green]"
+                )
+                # persist the live result so the aggregator can pick it up later
+                try:
+                    self.attack_evaluator.save_evaluation_result(self.attack_data, True)
+                except Exception:
+                    pass
+                conversation_history.append(message)
+                return True
+        return False
 
 
 if __name__ == "__main__":
```
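`AttackEvaluationController` is the first consumer of the new hook, but any controller subclass can use `_post_message_hook` to short-circuit the simulation. A hypothetical sketch of another override (the `EvaluationController` constructor arguments are not shown in this diff, so only the hook is illustrated):

```python
# Hypothetical subclass: stop the simulated conversation once the assistant
# returns an empty message. Returning True tells the base loop to stop early.
from wxo_agentic_evaluation.inference_backend import EvaluationController

class StopOnEmptyReplyController(EvaluationController):
    def _post_message_hook(self, task_n, step, message, conversation_history) -> bool:
        if not (message.content or "").strip():
            conversation_history.append(message)
            return True
        return False
```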
```diff
@@ -31,9 +31,21 @@ class LLMMatcher:
         result = output.strip().lower()
         return result.startswith("true")
 
-    def semantic_match(
+    def semantic_match(
+        self, context: str, prediction: str, ground_truth: str
+    ) -> bool:
+        """Performs semantic matching for the agent's final response and the expected response using the starting sentence of the conversation as the context
+
+        Args:
+            context: The starting sentence of the conversation. TODO can also consider using the LLM user's story
+            prediction: the predicted string
+            ground_truth: the expected string
+
+        Returns:
+            a boolean indicating if the sentences match.
+        """
         prompt = self.semantic_template.render(
-            expected_text=ground_truth, actual_text=prediction
+            context=context, expected_text=ground_truth, actual_text=prediction
         )
         output: str = self.llm_client.query(prompt)
         result = output.strip().lower()
```
wxo_agentic_evaluation/main.py CHANGED

```diff
@@ -21,8 +21,8 @@ from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
 from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
     WXOInferenceBackend,
-    get_wxo_client,
 )
+from wxo_agentic_evaluation.wxo_client import get_wxo_client
 from wxo_agentic_evaluation.llm_user import LLMUser
 from wxo_agentic_evaluation.metrics.metrics import (
     KnowledgeBaseMetricSummary,
```
```diff
@@ -0,0 +1 @@
+from wxo_agentic_evaluation.metrics.metrics import FailedSemanticTestCases, FailedStaticTestCases, Annotation
```
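With the new package `__init__`, the shared result models can be imported from `wxo_agentic_evaluation.metrics` directly instead of reaching into `metrics.metrics`:

```python
# Both import paths resolve to the same classes after this change.
from wxo_agentic_evaluation.metrics import Annotation, FailedSemanticTestCases, FailedStaticTestCases
from wxo_agentic_evaluation.metrics.metrics import Annotation as AnnotationDirect

assert Annotation is AnnotationDirect
```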
```diff
@@ -53,8 +53,9 @@ class AnswerDerailment(BaseLLMJudgeMetric):
 
     def table(self):
         return {
-            "statement":
+            "statement": self.statement,
             "reason": self.reason,
+            "on_topic_score": str(self.in_scope),
         }
 
 

@@ -65,7 +66,7 @@ class AnswerUnsafeTopic(BaseLLMJudgeMetric):
 
     def table(self):
         return {
-            "statement":
+            "statement": self.statement,
             "reason": self.reason,
-            "
+            "safe_topic_score": str(self.is_safe),
         }
```
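After the fix, both judge metrics emit a complete row. The dicts below sketch the shape of the repaired `table()` output; the keys come from the diff, the values are invented for illustration:

```python
# Illustrative values only; keys mirror the fixed table() methods above.
derailment_row = {
    "statement": "Tell me a joke about my manager.",
    "reason": "The request is unrelated to the agent's configured tasks.",
    "on_topic_score": "False",    # str(self.in_scope)
}
unsafe_topic_row = {
    "statement": "How do I bypass the company firewall?",
    "reason": "The topic is disallowed by policy.",
    "safe_topic_score": "False",  # str(self.is_safe)
}
print(derailment_row, unsafe_topic_row)
```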
```diff
@@ -1,6 +1,6 @@
 import math
-from enum import Enum
 from typing import Any, List, Mapping, Optional, Tuple
+from enum import Enum, StrEnum
 
 from pydantic import BaseModel, computed_field
 

@@ -18,6 +18,33 @@ def average(array):
     else:
         return sum(array) / len(array)
 
+class DescriptionQuality(StrEnum):
+    GOOD = "GOOD"
+    BAD = "BAD"
+    MISSING = "MISSING"
+
+class DescriptionQualityMetric(BaseModel):
+    tool_name: str = None
+    description_score: float | None = None
+    threshold: float | None = None
+
+    @computed_field
+    @property
+    def is_bad_description(self) -> Optional[bool]:
+        if self.description_score and self.threshold:
+            return self.description_score >= self.threshold
+
+        return None
+
+    @computed_field
+    @property
+    def description_quality(self) -> str:
+        if self.description_score is None:
+            return DescriptionQuality.MISSING
+        elif self.is_bad_description:
+            return DescriptionQuality.BAD
+        else:
+            return DescriptionQuality.GOOD
 
 class KnowledgeBaseMetrics(BaseModel):
     dataset_name: str = None
```
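The computed fields make the three quality states easy to check in isolation; a small sketch (the tool name, scores, and threshold below are placeholders):

```python
from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric

# No score recorded -> MISSING (what the inspector returns when tool_description is None).
print(DescriptionQualityMetric(tool_name="get_weather").description_quality)  # MISSING

# Score at or above the threshold -> BAD, otherwise GOOD.
print(DescriptionQualityMetric(tool_name="get_weather", description_score=0.9, threshold=0.7).description_quality)  # BAD
print(DescriptionQualityMetric(tool_name="get_weather", description_score=0.3, threshold=0.7).description_quality)  # GOOD
```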
```diff
@@ -175,6 +202,12 @@ class ToolCallAndRoutingMetrics(BaseModel):
     )
 
 
+class Annotation(BaseModel):
+    recommendation: str
+    details: str
+    quote: str
+    parameter_name: Optional[str]
+
 class FailedStaticTestCases(BaseModel):
     metric_name: str
     description: str

@@ -187,6 +220,15 @@ class FailedSemanticTestCases(BaseModel):
     explanation: str
     output: int
     confidence: float
+    annotations: Optional[List[Annotation]] = None
+
+
+class EnhancedAnalyzeMetrics(BaseModel):
+    test_case_name: str
+    tool_names: List[str]
+    parameter_annotations: List[List[FailedSemanticTestCases]] = [[]]
+    tool_annotations: List[List[FailedSemanticTestCases]] = [[]]
+    static_metrics: List[List[FailedStaticTestCases]] = [[]]
 
 
 class ReferenceLessEvalMetrics(BaseModel):
```
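A quick sketch of the new `Annotation` model that can now be attached to failed semantic test cases; the field values are illustrative, only the field names come from the model above:

```python
from wxo_agentic_evaluation.metrics.metrics import Annotation

note = Annotation(
    recommendation="Spell out the expected date format in the tool description.",
    details="The agent passed '01/02/2024' where the tool expects an ISO 8601 date.",
    quote="start_date='01/02/2024'",
    parameter_name="start_date",
)
print(note.model_dump())
```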
```diff
@@ -1,13 +1,13 @@
 <|begin_of_text|><|start_header_id|>system<|end_header_id|>
-You are an evaluation agent specializing in semantic similarity assessment. Your task is to determine whether two texts express the same factual information and intentions, even when presented differently.
+You are an evaluation agent specializing in semantic similarity assessment. Your task is to determine whether two texts express the same factual information and intentions, even when presented differently, given a context of the situation.
 
 Key evaluation principles:
-1. Focus on whether the core information and outcome is the same
-2. Different phrasings that convey the same result should be considered equivalent
-3.
-4.
-5.
-6.
+1. Focus on whether the core information and outcome is the same.
+2. Different phrasings that convey the same result should be considered equivalent.
+3. Ignore formatting differences in dates (2022-01-01 vs. 1/1/2022 vs 20220101), numbers ($210,000 vs 210000.0 vs $21,0000.0), and IDs.
+4. When specific values (e.g. IDs, dates, amounts, names) appear in both texts, they must match exactly. If they appear only in one text but the other text doesn’t contradict them, consider it equivalent.
+5. Reference IDs that are system-generated (e.g. item IDs, request IDs, confirmation numbers, UUIDs) should be ignored when checking for equivalence.
+6. When checking query results like lists or tables, differences in field values, and rows are acceptable as long as the same entities or items are represented and the query intent, data type, and structure remain the same.
 
 Respond ONLY with:
 - True: if the texts convey the same essential information and outcomes

@@ -20,16 +20,30 @@ DO NOT provide explanations or commentary - only respond with "True" or "False"
 Evaluate the following examples:
 
 ### Example 1
+Context:
+Get me a list of all active machines.
+
 Expected:
-
+Here are all the active machines:
+| id | name | number | status |
+|----|-----------|--------|----------|
+| 43 | NNM1 | | active |
+| 01 | XYZ2 | | active |
+| 44 | RRX | | active |
 
 Actual:
-
+Here are all the active machines:
+| id | name | number | status |
+|----|-----------|--------|----------|
+| 1280 | ABC | | active |
 
 Answer:
 True
 
 ### Example 2
+Context:
+Give me information about Ontario.
+
 Expected:
 Ontario is a province in Canada.
 

@@ -40,6 +54,9 @@ Answer:
 False
 
 ### Example 3
+Context:
+Find payslip details for user 12345.
+
 Expected:
 No payslips found for user with ID 12345.
 

@@ -50,6 +67,9 @@ Answer:
 True
 
 ### Example 4
+Context:
+I'd like to create a new time off request.
+
 Expected:
 Your time off request from 2024-11-01 to 2024-11-01 for TRAVEL has been successfully submitted. The request ID is c705878eb6584e9b910b8db3907a31da.
 

@@ -60,6 +80,9 @@ Answer:
 True
 
 ### Example 5
+Context:
+What's my compensation details?
+
 Expected:
 Your compensation details are as follows:
 * Currency: USD

@@ -72,6 +95,9 @@ Answer:
 True
 
 ### Example 6
+Context:
+Show me my visa details.
+
 Expected:
 Your visa details are as follows:
 - Country: 44

@@ -88,6 +114,9 @@ Answer:
 False
 
 ### Example 7
+Context:
+Update my preferred name and my starting date.
+
 Expected:
 I successfully updated your personal information.
 

@@ -101,6 +130,9 @@ True
 
 ### Now, evaluate the following:
 
+Context:
+{{ context }}
+
 Expected:
 {{ expected_text }}
 
```
```diff
@@ -45,9 +45,11 @@ class KeywordMatchingTemplateRenderer(JinjaTemplateRenderer):
 
 
 class SemanticMatchingTemplateRenderer(JinjaTemplateRenderer):
-    def render(self, expected_text: str, actual_text: str) -> str:
+    def render(self, context: str, expected_text: str, actual_text: str) -> str:
         return super().render(
-
+            context=context,
+            expected_text=expected_text,
+            actual_text=actual_text,
         )
 
 
```
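A sketch of rendering the updated semantic-matching prompt with the new `context` argument. It assumes the renderer is constructed from the prompt template path, mirroring how `evaluation_package.py` passes `SEMANTIC_MATCHING_PROMPT_PATH`; that constructor detail is not shown in this hunk, so treat the path below as a placeholder:

```python
# Assumption: JinjaTemplateRenderer subclasses take the template path at construction.
from wxo_agentic_evaluation.prompt.template_render import SemanticMatchingTemplateRenderer

renderer = SemanticMatchingTemplateRenderer("prompt/semantic_matching_prompt.jinja2")  # placeholder path
prompt = renderer.render(
    context="Get me a list of all active machines.",
    expected_text="Here are all the active machines: ...",
    actual_text="The currently active machines are listed below: ...",
)
print(prompt[:200])
```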
```diff
@@ -14,8 +14,8 @@ from wxo_agentic_evaluation.arg_configs import QuickEvalConfig
 from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
     WXOInferenceBackend,
-    get_wxo_client,
 )
+from wxo_agentic_evaluation.wxo_client import get_wxo_client
 from wxo_agentic_evaluation.llm_user import LLMUser
 from wxo_agentic_evaluation.metrics.metrics import (
     FailedSemanticTestCases,

@@ -115,14 +115,16 @@ class QuickEvalController(EvaluationController):
     ) -> Tuple[ReferenceLessEvalMetrics, List[ExtendedMessage]]:
         # run reference-less evaluation
         rich.print(f"[b][Task-{task_n}] Starting Quick Evaluation")
+        processed_data = ReferencelessEvaluation.fmt_msgs_referenceless(
+            messages
+        )
         te = ReferencelessEvaluation(
             tools,
-            messages,
             MODEL_ID,
             task_n,
             self.test_case_name,
         )
-        referenceless_results = te.run()
+        referenceless_results = te.run(examples=processed_data)
         rich.print(f"[b][Task-{task_n}] Finished Quick Evaluation")
 
         summary_metrics = self.compute_metrics(referenceless_results)

@@ -167,13 +169,13 @@
 
             extended_messages.append(extended_message)
 
-        # return summary_metrics, referenceless_results
         return summary_metrics, extended_messages
 
     def failed_static_metrics_for_tool_call(
         self, static_metrics: Mapping[str, Mapping[str, Any]]
     ) -> Optional[List[FailedStaticTestCases]]:
         """
+        # TODO: in future PR, use the ReferencelessParser library
         static.metrics
         """
 

@@ -195,6 +197,7 @@ class QuickEvalController(EvaluationController):
         self, semantic_metrics: Mapping[str, Mapping[str, Any]]
     ) -> Optional[List[FailedSemanticTestCases]]:
         """
+        # TODO: in future PR, use the ReferencelessParser library
         semantic.general
         semantic.function_selection
 

@@ -257,11 +260,6 @@ class QuickEvalController(EvaluationController):
             []
         )  # keep track of tool calls that failed for semantic reason
 
-        from pprint import pprint
-
-        # pprint("quick eval results: ")
-        # pprint(quick_eval_results)
-
         for tool_call_idx, result in enumerate(quick_eval_results):
             static_passed = result.get("static", {}).get(
                 "final_decision", False
```
```diff
@@ -15,11 +15,8 @@ from wxo_agentic_evaluation.arg_configs import (
     KeywordsGenerationConfig,
 )
 from wxo_agentic_evaluation.data_annotator import DataAnnotator
-from wxo_agentic_evaluation.inference_backend import
-
-    WXOInferenceBackend,
-    get_wxo_client,
-)
+from wxo_agentic_evaluation.inference_backend import WXOInferenceBackend
+from wxo_agentic_evaluation.wxo_client import WXOClient, get_wxo_client
 from wxo_agentic_evaluation.prompt.template_render import (
     StoryGenerationTemplateRenderer,
 )
```