ibm-watsonx-orchestrate-evaluation-framework 1.1.4__py3-none-any.whl → 1.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (35)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/METADATA +1 -1
  2. {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/RECORD +35 -31
  3. wxo_agentic_evaluation/analyze_run.py +805 -344
  4. wxo_agentic_evaluation/arg_configs.py +10 -1
  5. wxo_agentic_evaluation/description_quality_checker.py +11 -2
  6. wxo_agentic_evaluation/evaluation_package.py +8 -3
  7. wxo_agentic_evaluation/external_agent/external_validate.py +5 -5
  8. wxo_agentic_evaluation/external_agent/types.py +3 -9
  9. wxo_agentic_evaluation/inference_backend.py +46 -79
  10. wxo_agentic_evaluation/llm_matching.py +14 -2
  11. wxo_agentic_evaluation/main.py +1 -1
  12. wxo_agentic_evaluation/metrics/__init__.py +1 -0
  13. wxo_agentic_evaluation/metrics/llm_as_judge.py +4 -3
  14. wxo_agentic_evaluation/metrics/metrics.py +43 -1
  15. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  16. wxo_agentic_evaluation/prompt/template_render.py +4 -2
  17. wxo_agentic_evaluation/quick_eval.py +7 -9
  18. wxo_agentic_evaluation/record_chat.py +22 -29
  19. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +139 -100
  20. wxo_agentic_evaluation/red_teaming/attack_generator.py +38 -34
  21. wxo_agentic_evaluation/red_teaming/attack_list.py +89 -18
  22. wxo_agentic_evaluation/red_teaming/attack_runner.py +51 -11
  23. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  24. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  25. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  26. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +77 -39
  27. wxo_agentic_evaluation/resource_map.py +3 -1
  28. wxo_agentic_evaluation/service_instance.py +7 -0
  29. wxo_agentic_evaluation/type.py +1 -1
  30. wxo_agentic_evaluation/utils/__init__.py +3 -0
  31. wxo_agentic_evaluation/utils/parsers.py +71 -0
  32. wxo_agentic_evaluation/utils/utils.py +131 -16
  33. wxo_agentic_evaluation/wxo_client.py +80 -0
  34. {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/WHEEL +0 -0
  35. {ibm_watsonx_orchestrate_evaluation_framework-1.1.4.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.6.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/arg_configs.py

@@ -1,6 +1,7 @@
 import os
 from dataclasses import dataclass, field
 from typing import List, Optional, Union
+from enum import StrEnum

 from wxo_agentic_evaluation import __file__

@@ -59,22 +60,30 @@ class AttackConfig:
     enable_verbose_logging: bool = True
     enable_manual_user_input: bool = False
     num_workers: int = 2
+    skip_available_results: bool = True


 @dataclass
 class AttackGeneratorConfig:
     attacks_list: Union[List[str], str]
     datasets_path: Union[List[str], str]
-    agents_path: str
+    agents_list_or_path: Union[List[str], str]
     target_agent_name: str
+    auth_config: AuthConfig
     output_dir: str = None
     max_variants: int = None

+class AnalyzeMode(StrEnum):
+    default = "default"
+    enhanced = "enhanced"

 @dataclass
 class AnalyzeConfig:
     data_path: str
     tool_definition_path: Optional[str] = None
+    mode: str = AnalyzeMode.default
+    num_workers: int = 10
+    run: int = -1


 @dataclass
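For orientation, a hedged usage sketch of the extended AnalyzeConfig (not part of the diff; the data_path value is a made-up placeholder):

    from wxo_agentic_evaluation.arg_configs import AnalyzeConfig, AnalyzeMode

    config = AnalyzeConfig(
        data_path="./results",        # hypothetical path to saved test results
        mode=AnalyzeMode.enhanced,    # new in 1.1.6; defaults to AnalyzeMode.default
        num_workers=4,                # new field; defaults to 10
        run=-1,                       # new field; default shown in the diff is -1
    )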
wxo_agentic_evaluation/description_quality_checker.py

@@ -16,6 +16,7 @@ from wxo_agentic_evaluation.tool_planner import (
 )
 from wxo_agentic_evaluation.type import ToolDefinition
 from wxo_agentic_evaluation.utils.utils import safe_divide
+from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric


 class ToolDescriptionIssue(Enum):
@@ -106,7 +107,7 @@ class DescriptionQualityInspector:
         )
         return tool_definitions

-    def detect_bad_description(self, tool_definition: ToolDefinition) -> bool:
+    def detect_bad_description(self, tool_definition: ToolDefinition) -> DescriptionQualityMetric:
         """
         Detects if a tool description is 'bad' using an LLM judge.
         A 'bad' description is one that:
@@ -119,6 +120,10 @@ class DescriptionQualityInspector:
         Returns:
             bool: True if the description is 'bad', False otherwise.
         """
+
+        if tool_definition.tool_description is None:
+            return DescriptionQualityMetric(tool_name=tool_definition.tool_name)
+
         prompt = self.template.render(tool_definition=tool_definition)
         response = self.llm_client.query(prompt)

@@ -137,7 +142,11 @@ class DescriptionQualityInspector:
             response_data=response_data
         )

-        return final_description_score >= self.CLASSIFICATION_SCORE_THRESHOLD
+        return DescriptionQualityMetric(
+            tool_name=tool_definition.tool_name,
+            description_score=final_description_score,
+            threshold=self.CLASSIFICATION_SCORE_THRESHOLD,
+        )

     def _calculate_score(self, response_data: dict) -> float:
         """
wxo_agentic_evaluation/evaluation_package.py

@@ -77,7 +77,7 @@ class EvaluationPackage:
     def __init__(
         self,
         test_case_name,
-        ground_truth,
+        ground_truth: EvaluationData,
         messages,
         conversational_search_data: List[ConversationalSearch] = None,
         resource_map: ResourceMap = None,
@@ -103,7 +103,7 @@ class EvaluationPackage:
             else []
         )

-        self.messages = messages
+        self.messages: List[Message] = messages
         self.conversational_search_data = conversational_search_data
         self.is_attack_evaluation = is_attack_evaluation
         self.ground_truth = ground_truth
@@ -113,6 +113,7 @@ class EvaluationPackage:
         if not self.is_attack_evaluation:
             self.validate_ground_truth(self.ground_truth, self.test_case_name)

+        # output response matching
         self.matcher = LLMMatcher(
             llm_client=get_provider(
                 model_id="meta-llama/llama-3-405b-instruct",
@@ -129,6 +130,7 @@ class EvaluationPackage:
                 SEMANTIC_MATCHING_PROMPT_PATH
             ),
         )
+        # only used for RAG evaluation
         self.rag_llm_as_a_judge = LLMJudge(
             llm_client=get_provider(
                 model_id="meta-llama/llama-3-405b-instruct",
@@ -470,6 +472,7 @@ class EvaluationPackage:
            if message.event == EventTypes.message_created
            and message.role == "assistant"
         ]
+
         keyword_semantic_list = []
         for message in assistant_responses:
             for goal_detail in self.text_list:
@@ -478,7 +481,9 @@ class EvaluationPackage:
                     message.content, goal_detail.keywords
                 )
                 semantic_match: bool = self.matcher.semantic_match(
-                    message.content, goal_detail.response
+                    self.messages[0].content,
+                    prediction=message.content,
+                    ground_truth=goal_detail.response,
                 )
                 keyword_semantic_match = KeywordSemanticSearchMetric(
                     keyword_match=keyword_match,
wxo_agentic_evaluation/external_agent/external_validate.py

@@ -41,15 +41,15 @@ class ExternalAgentValidation:
         data = b""
         for chunk in resp:
             for line in chunk.splitlines(True):
-                if line.startswith(b"data:"):
-                    line = line.replace(b"data:", b"")
-                    if line.strip() == b"[DONE]":
-                        return
+                if line.startswith(b"event:"):
+                    continue
                 data += line
                 if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
                     # NOTE: edge case, "data" can be sent in two different chunks
                     if data.startswith(b"data:"):
                         data = data.replace(b"data:", b"")
+                    if data.strip() == b"[DONE]":
+                        return
                     yield data
                     data = b""
         if data:
@@ -74,7 +74,7 @@ class ExternalAgentValidation:
         payload = {"stream": True}
         payload["messages"] = messages
         resp = requests.post(
-            url=self.service_url, headers=self.header, json=payload
+            url=self.service_url, headers=self.header, json=payload,
         )
         success, logged_events = self._validate_streaming_response(resp)

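To make the new parsing behavior concrete, here is a small self-contained sketch that mirrors the reworked loop above on a synthetic byte stream (the chunk contents are invented; the real code consumes a streaming requests response):

    def iter_sse_events(chunks):
        """Standalone rework of the streaming parse loop shown above."""
        data = b""
        for chunk in chunks:
            for line in chunk.splitlines(True):
                if line.startswith(b"event:"):        # event lines are now skipped outright
                    continue
                data += line
                if data.endswith((b"\r\r", b"\n\n", b"\r\n\r\n")):
                    if data.startswith(b"data:"):      # strip the SSE "data:" prefix
                        data = data.replace(b"data:", b"")
                    if data.strip() == b"[DONE]":      # terminator checked on the assembled event
                        return
                    yield data
                    data = b""

    chunks = [b'event: thread.message.delta\ndata: {"text": "hi"}\n\n', b'data: [DONE]\n\n']
    print([event.strip() for event in iter_sse_events(chunks)])   # [b'{"text": "hi"}']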
wxo_agentic_evaluation/external_agent/types.py

@@ -1,4 +1,4 @@
-from typing import Any, List, Literal, Mapping, Union
+from typing import Any, List, Literal, Mapping, Union, Optional

 from pydantic import BaseModel

@@ -46,7 +46,7 @@ class ThreadRunStepDeltaChoice(BaseModel):
 class BaseEventData(BaseModel):
     id: str
     object: str
-    thread_id: str
+    thread_id: Optional[str] = None
     model: str | None = None
     created: int | None = None

@@ -62,13 +62,7 @@ class ThreadRunStepDeltaData(BaseEventData):


 class UniversalData(BaseEventData):
-    object: Union[
-        Literal["thread.message.delta"],
-        Literal["thread.run.step.delta"],
-        Literal["thread.run.step.created"],
-        Literal["thread.run.step.completed"],
-    ]
-    choices: List[ThreadMessageDeltaChoice]
+    object: Optional[str]
     choices: List[Union[ThreadMessageDeltaChoice, dict]]


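A brief, hedged illustration of what the relaxed models accept (field values are invented): an event without a thread_id and with an object string outside the old Literal set should now validate instead of raising a pydantic ValidationError:

    from wxo_agentic_evaluation.external_agent.types import UniversalData

    event = UniversalData(
        id="evt-1",                        # invented id
        object="thread.run.step.failed",   # arbitrary object strings are now allowed
        choices=[],                        # plain dicts are also accepted in this list
    )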
wxo_agentic_evaluation/inference_backend.py

@@ -3,21 +3,15 @@ import os
 import time
 from collections import deque
 from enum import Enum
-from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple
+from typing import Any, Dict, Generator, List, Mapping, Tuple

 import requests
 import rich
-import urllib3
 import yaml
 from pydantic import BaseModel
-from urllib3.exceptions import InsecureRequestWarning

 from wxo_agentic_evaluation.arg_configs import TestConfig
 from wxo_agentic_evaluation.llm_user import LLMUser
-from wxo_agentic_evaluation.service_instance import (
-    get_env_settings,
-    tenant_setup,
-)
 from wxo_agentic_evaluation.service_provider.watsonx_provider import (
     WatsonXProvider,
 )
@@ -36,6 +30,7 @@ from wxo_agentic_evaluation.utils.utils import (
     is_saas_url,
     safe_divide,
 )
+from wxo_agentic_evaluation.wxo_client import WXOClient

 tokenizer = Tokenizer()

@@ -82,63 +77,6 @@ class CallTracker(BaseModel):
     generic: List = []


-class WXOClient:
-    def __init__(
-        self, service_url, api_key, env: Optional[Dict[str, Any]] = None
-    ):
-        self.service_url = service_url
-        self.api_key = api_key
-
-        ov = os.getenv("WO_SSL_VERIFY")
-        if ov and ov.strip().lower() in ("true", "false"):
-            self._verify_ssl = ov.strip().lower() == "true"
-        else:
-            v, bs = (env.get("verify") if env else None), (
-                env.get("bypass_ssl") if env else None
-            )
-            self._verify_ssl = (
-                False
-                if (
-                    (bs is True)
-                    or (isinstance(bs, str) and bs.strip().lower() == "true")
-                    or (v is None)
-                    or (
-                        isinstance(v, str)
-                        and v.strip().lower() in {"none", "null"}
-                    )
-                )
-                else (v if isinstance(v, bool) else True)
-            )
-
-        if not self._verify_ssl:
-            urllib3.disable_warnings(InsecureRequestWarning)
-
-    def _get_headers(self) -> dict:
-        headers = {}
-        if self.api_key:
-            headers["Authorization"] = f"Bearer {self.api_key}"
-        return headers
-
-    def post(self, payload: dict, path: str, stream=False):
-        url = f"{self.service_url}/{path}"
-        return requests.post(
-            url=url,
-            headers=self._get_headers(),
-            json=payload,
-            stream=stream,
-            verify=self._verify_ssl,
-        )
-
-    def get(self, path: str, params: dict = None):
-        url = f"{self.service_url}/{path}"
-        return requests.get(
-            url,
-            params=params,
-            headers=self._get_headers(),
-            verify=self._verify_ssl,
-        )
-
-
 class WXOInferenceBackend:
     def __init__(self, wxo_client):
         self.wxo_client = wxo_client
@@ -721,6 +659,19 @@ class EvaluationController:
                 message.content,
             )

+            # hook for subclasses
+            if self._post_message_hook(
+                task_n=task_n,
+                step=step,
+                message=message,
+                conversation_history=conversation_history,
+            ):
+                return (
+                    conversation_history,
+                    call_tracker,
+                    conversational_search_history_data,
+                )
+
             conversation_history.extend(messages)
             conversational_search_history_data.extend(
                 conversational_search_data
@@ -733,6 +684,13 @@ class EvaluationController:
             conversational_search_history_data,
         )

+    def _post_message_hook(self, **kwargs) -> bool:
+        """
+        Hook for subclasses to extend behavior.
+        Return True to break the loop early.
+        """
+        return False
+
     def _is_looping(self, messages: deque) -> bool:
         """Checks whether the user or assistant is stuck in a loop.
         Args:
@@ -786,21 +744,30 @@ class EvaluationController:

         return False  # Final fallback for termination is in the main inference loop, which defines MAX_CONVERSATION_STEPS

-
-def get_wxo_client(
-    service_url: Optional[str], tenant_name: str, token: Optional[str] = None
-) -> WXOClient:
-
-    token, resolved_url, env = tenant_setup(service_url, tenant_name)
-    service_url = service_url or resolved_url
-
-    if not (service_url and str(service_url).strip()):
-        raise ValueError(
-            f"service_url not provided and not found in config for tenant '{tenant_name}'"
-        )
-
-    wxo_client = WXOClient(service_url=service_url, api_key=token, env=env)
-    return wxo_client
+class AttackEvaluationController(EvaluationController):
+    def __init__(self, *args, attack_data=None, attack_evaluator=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.attack_data = attack_data
+        self.attack_evaluator = attack_evaluator
+
+    def _post_message_hook(self, task_n, step, message, conversation_history) -> bool:
+        """Override hook to add live attack evaluation."""
+        if self.attack_evaluator and self.attack_data:
+            success = self.attack_evaluator.evaluate(
+                self.attack_data, conversation_history + [message]
+            )
+            if success:
+                rich.print(
+                    f"[bold green]Attack for [Task-{task_n}] succeeded early at step {step}! Stopping simulation.[/bold green]"
+                )
+                # persist the live result so the aggregator can pick it up later
+                try:
+                    self.attack_evaluator.save_evaluation_result(self.attack_data, True)
+                except Exception:
+                    pass
+                conversation_history.append(message)
+                return True
+        return False


 if __name__ == "__main__":
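The new _post_message_hook gives subclasses a clean way to stop the simulation loop early; AttackEvaluationController above is the in-package example. As a hedged illustration, a hypothetical subclass (not in the package) could cap the number of steps the same way:

    from wxo_agentic_evaluation.inference_backend import EvaluationController

    class BudgetedController(EvaluationController):
        MAX_STEPS = 5   # invented cutoff, for illustration only

        def _post_message_hook(self, task_n, step, message, conversation_history) -> bool:
            # Returning True makes the main loop return the conversation collected so far.
            return step >= self.MAX_STEPS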
wxo_agentic_evaluation/llm_matching.py

@@ -31,9 +31,21 @@ class LLMMatcher:
         result = output.strip().lower()
         return result.startswith("true")

-    def semantic_match(self, prediction: str, ground_truth: str) -> bool:
+    def semantic_match(
+        self, context: str, prediction: str, ground_truth: str
+    ) -> bool:
+        """Performs semantic matching for the agent's final response and the expected response using the starting sentence of the conversation as the context
+
+        Args:
+            context: The starting sentence of the conversation. TODO can also consider using the LLM user's story
+            prediction: the predicted string
+            ground_truth: the expected string
+
+        Returns:
+            a boolean indicating if the sentences match.
+        """
         prompt = self.semantic_template.render(
-            expected_text=ground_truth, actual_text=prediction
+            context=context, expected_text=ground_truth, actual_text=prediction
         )
         output: str = self.llm_client.query(prompt)
         result = output.strip().lower()
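A hedged calling sketch for the new signature (matcher construction is elided; per the evaluation_package.py hunk above, the first message of the conversation is passed as the context):

    is_match = matcher.semantic_match(
        context="Get me a list of all active machines.",      # first user utterance
        prediction="Here are all the active machines: ...",   # agent's final response
        ground_truth="Here are all the active machines: ...", # expected response
    )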
wxo_agentic_evaluation/main.py

@@ -21,8 +21,8 @@ from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
 from wxo_agentic_evaluation.inference_backend import (
     EvaluationController,
     WXOInferenceBackend,
-    get_wxo_client,
 )
+from wxo_agentic_evaluation.wxo_client import get_wxo_client
 from wxo_agentic_evaluation.llm_user import LLMUser
 from wxo_agentic_evaluation.metrics.metrics import (
     KnowledgeBaseMetricSummary,
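The client helpers now live in their own module (the new wxo_agentic_evaluation/wxo_client.py is not shown in this diff). A hedged sketch of the relocated import, assuming get_wxo_client keeps the interface of the version removed from inference_backend.py:

    from wxo_agentic_evaluation.wxo_client import WXOClient, get_wxo_client

    # tenant name is a made-up example; the service URL is resolved from the tenant config
    client: WXOClient = get_wxo_client(service_url=None, tenant_name="local")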
wxo_agentic_evaluation/metrics/__init__.py

@@ -0,0 +1 @@
+from wxo_agentic_evaluation.metrics.metrics import FailedSemanticTestCases, FailedStaticTestCases, Annotation
wxo_agentic_evaluation/metrics/llm_as_judge.py

@@ -53,8 +53,9 @@ class AnswerDerailment(BaseLLMJudgeMetric):

     def table(self):
         return {
-            "statement": ",".join(self.statement),
+            "statement": self.statement,
             "reason": self.reason,
+            "on_topic_score": str(self.in_scope),
         }


@@ -65,7 +66,7 @@ class AnswerUnsafeTopic(BaseLLMJudgeMetric):

     def table(self):
         return {
-            "statement": ",".join(self.statement),
+            "statement": self.statement,
             "reason": self.reason,
-            "unsafe_topic_score": str(self.is_safe),
+            "safe_topic_score": str(self.is_safe),
         }
wxo_agentic_evaluation/metrics/metrics.py

@@ -1,6 +1,6 @@
 import math
-from enum import Enum
 from typing import Any, List, Mapping, Optional, Tuple
+from enum import Enum, StrEnum

 from pydantic import BaseModel, computed_field

@@ -18,6 +18,33 @@ def average(array):
     else:
         return sum(array) / len(array)

+class DescriptionQuality(StrEnum):
+    GOOD = "GOOD"
+    BAD = "BAD"
+    MISSING = "MISSING"
+
+class DescriptionQualityMetric(BaseModel):
+    tool_name: str = None
+    description_score: float | None = None
+    threshold: float | None = None
+
+    @computed_field
+    @property
+    def is_bad_description(self) -> Optional[bool]:
+        if self.description_score and self.threshold:
+            return self.description_score >= self.threshold
+
+        return None
+
+    @computed_field
+    @property
+    def description_quality(self) -> str:
+        if self.description_score is None:
+            return DescriptionQuality.MISSING
+        elif self.is_bad_description:
+            return DescriptionQuality.BAD
+        else:
+            return DescriptionQuality.GOOD

 class KnowledgeBaseMetrics(BaseModel):
     dataset_name: str = None
@@ -175,6 +202,12 @@ class ToolCallAndRoutingMetrics(BaseModel):
     )


+class Annotation(BaseModel):
+    recommendation: str
+    details: str
+    quote: str
+    parameter_name: Optional[str]
+
 class FailedStaticTestCases(BaseModel):
     metric_name: str
     description: str
@@ -187,6 +220,15 @@ class FailedSemanticTestCases(BaseModel):
     explanation: str
     output: int
     confidence: float
+    annotations: Optional[List[Annotation]] = None
+
+
+class EnhancedAnalyzeMetrics(BaseModel):
+    test_case_name: str
+    tool_names: List[str]
+    parameter_annotations: List[List[FailedSemanticTestCases]] = [[]]
+    tool_annotations: List[List[FailedSemanticTestCases]] = [[]]
+    static_metrics: List[List[FailedStaticTestCases]] = [[]]


 class ReferenceLessEvalMetrics(BaseModel):
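A small, hedged sketch of the new annotation model (field values are invented); annotations of this shape can now be attached to FailedSemanticTestCases via its new annotations field:

    from wxo_agentic_evaluation.metrics.metrics import Annotation

    note = Annotation(
        recommendation="State the expected currency in the description",
        details="The 'amount' parameter description omits its unit.",
        quote="amount: the amount to transfer",
        parameter_name="amount",
    )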
wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2

@@ -1,13 +1,13 @@
 <|begin_of_text|><|start_header_id|>system<|end_header_id|>
-You are an evaluation agent specializing in semantic similarity assessment. Your task is to determine whether two texts express the same factual information and intentions, even when presented differently.
+You are an evaluation agent specializing in semantic similarity assessment. Your task is to determine whether two texts express the same factual information and intentions, even when presented differently, given a context of the situation.

 Key evaluation principles:
-1. Focus on whether the core information and outcome is the same
-2. Different phrasings that convey the same result should be considered equivalent
-3. When specific values (IDs, dates, amounts, names) appear in both texts, they must match exactly
-4. Ignore formatting differences in dates (2022-01-01 vs. 1/1/2022 vs 20220101), numbers ($210,000 vs 210000.0 vs $21,0000.0), and IDs
-5. Different levels of detail are acceptable if they don't contradict each other and the primary information remains intact
-6. Reference IDs that are clearly system-generated (like request IDs, confirmation numbers, UUIDs) may vary and should be ignored
+1. Focus on whether the core information and outcome is the same.
+2. Different phrasings that convey the same result should be considered equivalent.
+3. Ignore formatting differences in dates (2022-01-01 vs. 1/1/2022 vs 20220101), numbers ($210,000 vs 210000.0 vs $21,0000.0), and IDs.
+4. When specific values (e.g. IDs, dates, amounts, names) appear in both texts, they must match exactly. If they appear only in one text but the other text doesn’t contradict them, consider it equivalent.
+5. Reference IDs that are system-generated (e.g. item IDs, request IDs, confirmation numbers, UUIDs) should be ignored when checking for equivalence.
+6. When checking query results like lists or tables, differences in field values, and rows are acceptable as long as the same entities or items are represented and the query intent, data type, and structure remain the same.

 Respond ONLY with:
 - True: if the texts convey the same essential information and outcomes
@@ -20,16 +20,30 @@ DO NOT provide explanations or commentary - only respond with "True" or "False"
 Evaluate the following examples:

 ### Example 1
+Context:
+Get me a list of all active machines.
+
 Expected:
-Your email has been successfully updated.
+Here are all the active machines:
+| id | name | number | status |
+|----|-----------|--------|----------|
+| 43 | NNM1 | | active |
+| 01 | XYZ2 | | active |
+| 44 | RRX | | active |

 Actual:
-You have successfully updated your email.
+Here are all the active machines:
+| id | name | number | status |
+|----|-----------|--------|----------|
+| 1280 | ABC | | active |

 Answer:
 True

 ### Example 2
+Context:
+Give me information about Ontario.
+
 Expected:
 Ontario is a province in Canada.

@@ -40,6 +54,9 @@ Answer:
 False

 ### Example 3
+Context:
+Find payslip details for user 12345.
+
 Expected:
 No payslips found for user with ID 12345.

@@ -50,6 +67,9 @@ Answer:
 True

 ### Example 4
+Context:
+I'd like to create a new time off request.
+
 Expected:
 Your time off request from 2024-11-01 to 2024-11-01 for TRAVEL has been successfully submitted. The request ID is c705878eb6584e9b910b8db3907a31da.

@@ -60,6 +80,9 @@ Answer:
 True

 ### Example 5
+Context:
+What's my compensation details?
+
 Expected:
 Your compensation details are as follows:
 * Currency: USD

@@ -72,6 +95,9 @@ Answer:
 True

 ### Example 6
+Context:
+Show me my visa details.
+
 Expected:
 Your visa details are as follows:
 - Country: 44

@@ -88,6 +114,9 @@ Answer:
 False

 ### Example 7
+Context:
+Update my preferred name and my starting date.
+
 Expected:
 I successfully updated your personal information.

@@ -101,6 +130,9 @@ True

 ### Now, evaluate the following:

+Context:
+{{ context }}
+
 Expected:
 {{ expected_text }}

wxo_agentic_evaluation/prompt/template_render.py

@@ -45,9 +45,11 @@ class KeywordMatchingTemplateRenderer(JinjaTemplateRenderer):


 class SemanticMatchingTemplateRenderer(JinjaTemplateRenderer):
-    def render(self, expected_text: str, actual_text: str) -> str:
+    def render(self, context: str, expected_text: str, actual_text: str) -> str:
         return super().render(
-            expected_text=expected_text, actual_text=actual_text
+            context=context,
+            expected_text=expected_text,
+            actual_text=actual_text,
         )
