unique_toolkit 1.45.5__py3-none-any.whl → 1.45.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. unique_toolkit/agentic/evaluation/config.py +25 -6
  2. unique_toolkit/agentic/evaluation/context_relevancy/prompts/__init__.py +13 -0
  3. unique_toolkit/agentic/evaluation/context_relevancy/{prompts.py → prompts/system_prompt.j2} +11 -43
  4. unique_toolkit/agentic/evaluation/context_relevancy/prompts/user_prompt.j2 +15 -0
  5. unique_toolkit/agentic/evaluation/context_relevancy/service.py +24 -56
  6. unique_toolkit/agentic/evaluation/hallucination/constants.py +26 -15
  7. unique_toolkit/agentic/evaluation/hallucination/prompts/__init__.py +13 -0
  8. unique_toolkit/agentic/evaluation/hallucination/prompts/system_prompt.j2 +35 -0
  9. unique_toolkit/agentic/evaluation/hallucination/prompts/user_prompt.j2 +27 -0
  10. unique_toolkit/agentic/evaluation/hallucination/utils.py +153 -102
  11. unique_toolkit/agentic/evaluation/tests/fixtures.py +102 -0
  12. unique_toolkit/agentic/evaluation/tests/test_config.py +247 -0
  13. unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py +141 -121
  14. unique_toolkit/agentic/evaluation/tests/test_hallucination_constants.py +600 -0
  15. unique_toolkit/agentic/evaluation/tests/test_hallucination_utils.py +1009 -0
  16. unique_toolkit/agentic/evaluation/tests/test_output_parser.py +82 -23
  17. unique_toolkit/agentic/evaluation/tests/test_prompt_loaders.py +348 -0
  18. unique_toolkit/agentic/evaluation/utils.py +8 -0
  19. {unique_toolkit-1.45.5.dist-info → unique_toolkit-1.45.6.dist-info}/METADATA +4 -1
  20. {unique_toolkit-1.45.5.dist-info → unique_toolkit-1.45.6.dist-info}/RECORD +22 -12
  21. unique_toolkit/agentic/evaluation/hallucination/prompts.py +0 -79
  22. {unique_toolkit-1.45.5.dist-info → unique_toolkit-1.45.6.dist-info}/LICENSE +0 -0
  23. {unique_toolkit-1.45.5.dist-info → unique_toolkit-1.45.6.dist-info}/WHEEL +0 -0
@@ -1,9 +1,13 @@
-import logging
-from enum import StrEnum
-from string import Template
+import re
+from logging import getLogger
 
+from unique_toolkit._common.utils.jinja.render import render_template
 from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
 from unique_toolkit.agentic.evaluation.exception import EvaluatorException
+from unique_toolkit.agentic.evaluation.hallucination.constants import (
+    SourceSelectionMode,
+    hallucination_required_input_fields,
+)
 from unique_toolkit.agentic.evaluation.output_parser import parse_eval_metric_result
 from unique_toolkit.agentic.evaluation.schemas import (
     EvaluationMetricInput,
@@ -20,19 +24,7 @@ from unique_toolkit.language_model.schemas import (
 )
 from unique_toolkit.language_model.service import LanguageModelService
 
-from .constants import (
-    SYSTEM_MSG_DEFAULT_KEY,
-    SYSTEM_MSG_KEY,
-    USER_MSG_DEFAULT_KEY,
-    USER_MSG_KEY,
-    hallucination_required_input_fields,
-)
-from .prompts import (
-    HALLUCINATION_METRIC_SYSTEM_MSG,
-    HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
-    HALLUCINATION_METRIC_USER_MSG,
-    HALLUCINATION_METRIC_USER_MSG_DEFAULT,
-)
+_LOGGER = getLogger(__name__)
 
 
 async def check_hallucination(
@@ -72,15 +64,14 @@ async def check_hallucination(
         EvaluatorException: If the context texts are empty, required fields are missing, or an error occurs during the evaluation.
     """
 
-    logger = logging.getLogger(f"check_hallucination.{__name__}")
-
     model_name = config.language_model.name
-    logger.info(f"Analyzing level of hallucination with {model_name}.")
+    _LOGGER.info(f"Analyzing level of hallucination with {model_name}.")
 
     input.validate_required_fields(hallucination_required_input_fields)
 
     try:
-        msgs = _get_msgs(input, config, logger)
+        msgs = _get_msgs(input, config)
+
         result = await LanguageModelService.complete_async_util(
             company_id=company_id, user_id=user_id, messages=msgs, model_name=model_name
         )
@@ -91,10 +82,12 @@ async def check_hallucination(
                 error_message=error_message,
                 user_message=error_message,
             )
-        return parse_eval_metric_result(
+        result = parse_eval_metric_result(
             result_content,  # type: ignore
             EvaluationMetricName.HALLUCINATION,
         )
+
+        return result
     except Exception as e:
         error_message = "Error occurred during hallucination metric analysis"
         raise EvaluatorException(
@@ -107,131 +100,142 @@ async def check_hallucination(
 def _get_msgs(
     input: EvaluationMetricInput,
     config: EvaluationMetricConfig,
-    logger: logging.Logger,
 ):
     """
     Composes the messages for hallucination analysis based on the provided input and configuration.
 
-    This method decides how to compose the messages based on the availability of context texts and history
-    message texts in the `input`
+    This method composes messages with or without context based on the availability of context texts
+    and history message texts in the input.
 
     Args:
         input (EvaluationMetricInput): The input data that includes context texts and history message texts
             for the analysis.
         config (EvaluationMetricConfig): The configuration settings for composing messages.
         logger (Optional[logging.Logger], optional): The logger used for logging debug information.
-            Defaults to the logger for the current module.
 
     Returns:
-        The composed messages as per the provided input and configuration. The exact type and structure
-        depend on the implementation of the `compose_msgs` and `compose_msgs_default` methods.
-
+        The composed messages as per the provided input and configuration.
     """
-    if input.context_texts or input.history_messages:
-        logger.debug("Using context / history for hallucination evaluation.")
-        return _compose_msgs(input, config)
+    has_context = bool(input.context_texts or input.history_messages)
+
+    if has_context:
+        _LOGGER.debug("Using context / history for hallucination evaluation.")
     else:
-        logger.debug("No contexts and history provided for hallucination evaluation.")
-        return _compose_msgs_default(input, config)
+        _LOGGER.debug("No contexts and history provided for hallucination evaluation.")
+
+    return _compose_msgs(input, config, has_context)
 
 
 def _compose_msgs(
     input: EvaluationMetricInput,
     config: EvaluationMetricConfig,
+    has_context: bool,
 ):
     """
-    Composes the hallucination analysis messages.
-    """
-    system_msg_content = _get_system_prompt_with_contexts(config)
-    system_msg = LanguageModelSystemMessage(content=system_msg_content)
-
-    user_msg_templ = Template(_get_user_prompt_with_contexts(config))
-    user_msg_content = user_msg_templ.substitute(
-        input_text=input.input_text,
-        contexts_text=input.get_joined_context_texts(tag_name="reference"),
-        history_messages_text=input.get_joined_history_texts(tag_name="conversation"),
-        output_text=input.output_text,
-    )
-    user_msg = LanguageModelUserMessage(content=user_msg_content)
-    return LanguageModelMessages([system_msg, user_msg])
+    Composes the hallucination analysis messages using Jinja2 templates.
 
+    Args:
+        input (EvaluationMetricInput): The input data for evaluation.
+        config (EvaluationMetricConfig): The configuration settings.
+        has_context (bool): Whether context/history is available.
 
-def _compose_msgs_default(
-    input: EvaluationMetricInput,
-    config: EvaluationMetricConfig,
-):
-    """
-    Composes the hallucination analysis prompt without messages.
+    Returns:
+        LanguageModelMessages: The composed messages for evaluation.
     """
-    system_msg_content = _get_system_prompt_default(config)
+    # Get templates
+    system_template = config.prompts_config.system_prompt_template
+    user_template = config.prompts_config.user_prompt_template
+
+    # Render system message
+    system_msg_content = render_template(
+        system_template,
+        has_context=has_context,
+    )
     system_msg = LanguageModelSystemMessage(content=system_msg_content)
 
-    user_msg_templ = Template(_get_user_prompt_default(config))
-    user_msg_content = user_msg_templ.substitute(
+    # Render user message
+    user_msg_content = render_template(
+        user_template,
         input_text=input.input_text,
+        contexts_text=input.get_joined_context_texts(tag_name="reference")
+        if has_context
+        else None,
+        history_messages_text=input.get_joined_history_texts(tag_name="conversation")
+        if has_context
+        else None,
         output_text=input.output_text,
     )
     user_msg = LanguageModelUserMessage(content=user_msg_content)
-    return LanguageModelMessages([system_msg, user_msg])
-
-
-def _get_system_prompt_with_contexts(config: EvaluationMetricConfig):
-    return config.custom_prompts.setdefault(
-        SYSTEM_MSG_KEY,
-        HALLUCINATION_METRIC_SYSTEM_MSG,
-    )
-
-
-def _get_user_prompt_with_contexts(config: EvaluationMetricConfig):
-    return config.custom_prompts.setdefault(
-        USER_MSG_KEY,
-        HALLUCINATION_METRIC_USER_MSG,
-    )
 
+    return LanguageModelMessages([system_msg, user_msg])
 
-def _get_system_prompt_default(config: EvaluationMetricConfig):
-    return config.custom_prompts.setdefault(
-        SYSTEM_MSG_DEFAULT_KEY,
-        HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
-    )
 
+def context_text_from_stream_response(
+    response: LanguageModelStreamResponse,
+    selected_chunks: list[ContentChunk],
+    source_selection_mode: SourceSelectionMode = SourceSelectionMode.FROM_ORIGINAL_RESPONSE,
+    reference_pattern: str = r"[\[<]?source(\d+)[>\]]?",
+) -> list[str]:
+    """Extract context text from stream response based on selected chunks.
 
-def _get_user_prompt_default(config: EvaluationMetricConfig):
-    return config.custom_prompts.setdefault(
-        USER_MSG_DEFAULT_KEY,
-        HALLUCINATION_METRIC_USER_MSG_DEFAULT,
-    )
+    Args:
+        response: The language model stream response containing references.
+        selected_chunks: List of content chunks to select from.
+        source_selection_mode: Strategy for selecting referenced chunks.
+            - FROM_IDS: Match by chunk IDs (default)
+            - FROM_ORDER: Select by order of appearance
+            - FROM_ORIGINAL_RESPONSE: Extract from original response text using regex
+        ref_pattern: Regex pattern for extracting source numbers (only used with FROM_ORIGINAL_RESPONSE).
 
+    Returns:
+        List of text strings from the referenced chunks.
 
-class SourceSelectionMode(StrEnum):
-    FROM_IDS = "FROM_IDS"
-    FROM_ORDER = "FROM_ORDER"
+    Raises:
+        ValueError: If source_selection_mode is invalid or required data is missing.
+    """
+    response_references = response.message.references
 
+    # Define selection strategies
+    strategies = {
+        SourceSelectionMode.FROM_IDS: lambda: _default_source_selection_mode(
+            response_references, selected_chunks
+        ),
+        SourceSelectionMode.FROM_ORDER: lambda: _from_order_source_selection_mode(
+            response_references, selected_chunks
+        ),
+        SourceSelectionMode.FROM_ORIGINAL_RESPONSE: lambda: _from_original_response_source_selection_mode(
+            response.message.original_text, selected_chunks, reference_pattern
+        ),
+    }
 
-def context_text_from_stream_response(
-    response: LanguageModelStreamResponse,
-    selected_chunks: list[ContentChunk],
-    source_selection_mode: SourceSelectionMode = SourceSelectionMode.FROM_IDS,
-):
-    response_references = response.message.references
-    match source_selection_mode:
-        case SourceSelectionMode.FROM_IDS:
-            referenced_chunks = _default_source_selection_mode(
-                response_references, selected_chunks
-            )
-        case SourceSelectionMode.FROM_ORDER:
-            referenced_chunks = _from_order_source_selection_mode(
-                response_references, selected_chunks
-            )
-        case _:
+    try:
+        if source_selection_mode not in strategies:
             raise ValueError(f"Invalid source selection mode: {source_selection_mode}")
 
+        _LOGGER.info(f"Selecting context text using {source_selection_mode} mode.")
+        referenced_chunks = strategies[source_selection_mode]()
+    except Exception as e:
+        _LOGGER.exception(f"Error selecting context text: {e}")
+        _LOGGER.info("Falling back to default source selection mode.")
+        referenced_chunks = _default_source_selection_mode(
+            response_references, selected_chunks
+        )
+
     return [chunk.text for chunk in referenced_chunks]
 
 
 def _default_source_selection_mode(
     references: list[ContentReference], selected_chunks: list[ContentChunk]
-):
+) -> list[ContentChunk]:
+    """Select chunks by matching reference IDs.
+
+    Args:
+        references: List of content references with source IDs.
+        selected_chunks: List of content chunks to select from.
+
+    Returns:
+        List of referenced content chunks.
+    """
     reference_ids = {reference.source_id for reference in references}
 
     def build_chunk_id(chunk: ContentChunk) -> str:
@@ -246,7 +250,16 @@ def _default_source_selection_mode(
 
 def _from_order_source_selection_mode(
     references: list[ContentReference], selected_chunks: list[ContentChunk]
-):
+) -> list[ContentChunk]:
+    """Select chunks by order of appearance in references.
+
+    Args:
+        references: List of content references with original indices.
+        selected_chunks: List of content chunks to select from.
+
+    Returns:
+        List of referenced content chunks in order of appearance.
+    """
     original_chunks_order: list[int] = []
     for reference in references:
         for original_index in reference.original_index:
@@ -258,3 +271,41 @@ def _from_order_source_selection_mode(
         referenced_chunks.append(selected_chunks[index])
 
     return referenced_chunks
+
+
+def _from_original_response_source_selection_mode(
+    original_text: str | None,
+    selected_chunks: list[ContentChunk],
+    reference_pattern: str,
+) -> list[ContentChunk]:
+    """Extract referenced chunks from original text using regex pattern.
+
+    Args:
+        original_text: The original response text containing source references.
+        selected_chunks: List of content chunks to select from.
+        ref_pattern: Regex pattern for extracting source numbers.
+
+    Returns:
+        List of referenced content chunks.
+    """
+    if original_text is None:
+        raise ValueError("original_text is required for FROM_ORIGINAL_RESPONSE mode")
+    _LOGGER.debug("Processing original text for source extraction")
+    source_number_matches = re.findall(reference_pattern, original_text)
+
+    # Remove duplicates and preserve order
+    source_numbers = list(dict.fromkeys(int(num) for num in source_number_matches))
+
+    # Add bounds checking
+    max_index = len(selected_chunks) - 1
+    valid_source_numbers = [idx for idx in source_numbers if 0 <= idx <= max_index]
+
+    if len(valid_source_numbers) < len(source_numbers):
+        invalid_numbers = set(source_numbers) - set(valid_source_numbers)
+        _LOGGER.warning(
+            f"Some source indices were out of bounds (max index: {max_index}). "
+            f"Valid indices: {sorted(valid_source_numbers)}, Invalid indices: {sorted(invalid_numbers)}"
+        )
+
+    referenced_chunks = [selected_chunks[idx] for idx in valid_source_numbers]
+    return referenced_chunks
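The hunks above (from `unique_toolkit/agentic/evaluation/hallucination/utils.py`, per the file list) switch the default source selection to `FROM_ORIGINAL_RESPONSE`, which maps `[sourceN]`-style markers in the raw response text back to content chunks via a regex, de-duplicates the indices in order of first appearance, and drops out-of-range indices. A minimal, self-contained sketch of that technique follows; the `Chunk` dataclass and `chunks_from_markers` function are hypothetical stand-ins for the toolkit's `ContentChunk` and its private helper, not the shipped code:

```python
import re
from dataclasses import dataclass


@dataclass
class Chunk:
    """Hypothetical stand-in for the toolkit's ContentChunk."""

    text: str


def chunks_from_markers(
    original_text: str,
    chunks: list[Chunk],
    pattern: str = r"[\[<]?source(\d+)[>\]]?",
) -> list[Chunk]:
    # Extract every source number referenced in the text, e.g. "[source2]" -> "2".
    matches = re.findall(pattern, original_text)
    # De-duplicate while preserving the order of first appearance.
    numbers = list(dict.fromkeys(int(n) for n in matches))
    # Keep only indices that actually point into the available chunk list.
    return [chunks[i] for i in numbers if 0 <= i < len(chunks)]


chunks = [Chunk("alpha"), Chunk("beta"), Chunk("gamma")]
text = "See [source2] and <source0>, then [source2] again and [source9]."
print([c.text for c in chunks_from_markers(text, chunks)])  # ['gamma', 'alpha']
```

Note the surrounding behaviour shown in the diff: if the chosen strategy raises, `context_text_from_stream_response` logs the error and falls back to ID-based selection rather than propagating the exception.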
@@ -0,0 +1,102 @@
+"""Centralized fixtures for evaluation tests."""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
+from unique_toolkit.agentic.evaluation.context_relevancy.service import (
+    ContextRelevancyEvaluator,
+)
+from unique_toolkit.agentic.evaluation.schemas import (
+    EvaluationMetricInput,
+    EvaluationMetricName,
+)
+from unique_toolkit.app.schemas import ChatEvent
+from unique_toolkit.chat.service import LanguageModelName
+from unique_toolkit.language_model.infos import LanguageModelInfo
+
+
+@pytest.fixture
+def base_chat_event() -> MagicMock:
+    """
+    Create a base chat event mock for evaluation tests.
+
+    Returns:
+        MagicMock configured with standard test event properties.
+    """
+    event = MagicMock(spec=ChatEvent)
+    event.payload = MagicMock()
+    event.payload.user_message = MagicMock()
+    event.payload.user_message.text = "Test query"
+    event.user_id = "user_0"
+    event.company_id = "company_0"
+    return event
+
+
+@pytest.fixture
+def context_relevancy_evaluator(
+    base_chat_event: MagicMock,
+) -> ContextRelevancyEvaluator:
+    """
+    Create a ContextRelevancyEvaluator instance with base event.
+
+    Args:
+        base_chat_event: Mock chat event fixture.
+
+    Returns:
+        Configured ContextRelevancyEvaluator instance.
+    """
+    return ContextRelevancyEvaluator(base_chat_event)
+
+
+@pytest.fixture
+def basic_evaluation_config() -> EvaluationMetricConfig:
+    """
+    Create a basic evaluation config for context relevancy tests.
+
+    Returns:
+        EvaluationMetricConfig with standard settings.
+    """
+    return EvaluationMetricConfig(
+        enabled=True,
+        name=EvaluationMetricName.CONTEXT_RELEVANCY,
+        language_model=LanguageModelInfo.from_name(
+            LanguageModelName.AZURE_GPT_4o_2024_0806
+        ),
+    )
+
+
+@pytest.fixture
+def structured_evaluation_config(
+    basic_evaluation_config: EvaluationMetricConfig,
+) -> EvaluationMetricConfig:
+    """
+    Create evaluation config with structured output enabled.
+
+    Args:
+        basic_evaluation_config: Base config fixture.
+
+    Returns:
+        EvaluationMetricConfig configured for structured output.
+    """
+    model_info = LanguageModelInfo.from_name(LanguageModelName.AZURE_GPT_4o_2024_0806)
+    return EvaluationMetricConfig(
+        enabled=True,
+        name=EvaluationMetricName.CONTEXT_RELEVANCY,
+        language_model=model_info,
+    )
+
+
+@pytest.fixture
+def sample_evaluation_input() -> EvaluationMetricInput:
+    """
+    Create sample evaluation input with test data.
+
+    Returns:
+        EvaluationMetricInput with test query and contexts.
+    """
+    return EvaluationMetricInput(
+        input_text="test query",
+        context_texts=["test context 1", "test context 2"],
+    )
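The `@@ -0,0 +1,102 @@` hunk above adds the new `tests/fixtures.py` (per the file list). A hypothetical test consuming those fixtures is sketched below; it assumes the fixtures are re-exported through the test package's `conftest.py` so pytest can resolve them by name, which this diff does not show:

```python
# Hypothetical test; assumes the fixtures above are registered via conftest.py.
import pytest

from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
from unique_toolkit.agentic.evaluation.schemas import EvaluationMetricInput


@pytest.mark.ai
def test_sample_input_pairs_with_basic_config(
    basic_evaluation_config: EvaluationMetricConfig,
    sample_evaluation_input: EvaluationMetricInput,
) -> None:
    # The fixtures provide an enabled config plus a query with two context texts.
    assert basic_evaluation_config.enabled is True
    assert sample_evaluation_input.input_text == "test query"
    assert len(sample_evaluation_input.context_texts) == 2
```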
@@ -0,0 +1,247 @@
+"""Tests for evaluation config module."""
+
+import pytest
+
+from unique_toolkit.agentic.evaluation.config import (
+    EvaluationMetricConfig,
+    EvaluationMetricPromptsConfig,
+)
+from unique_toolkit.agentic.evaluation.schemas import EvaluationMetricName
+from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
+from unique_toolkit.language_model.infos import LanguageModelInfo
+
+
+@pytest.mark.ai
+def test_evaluation_metric_prompts_config__initializes_with_empty_strings__by_default() -> (
+    None
+):
+    """
+    Purpose: Verify that EvaluationMetricPromptsConfig initializes with empty template strings.
+    Why this matters: Default initialization should not load templates automatically.
+    Setup summary: Create config with no arguments, assert empty string defaults.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    config: EvaluationMetricPromptsConfig = EvaluationMetricPromptsConfig()
+
+    # Assert
+    assert config.system_prompt_template == ""
+    assert config.user_prompt_template == ""
+
+
+@pytest.mark.ai
+def test_evaluation_metric_prompts_config__accepts_custom_templates__on_initialization() -> (
+    None
+):
+    """
+    Purpose: Verify that EvaluationMetricPromptsConfig accepts custom template values.
+    Why this matters: Allows customization of prompts for different evaluation scenarios.
+    Setup summary: Initialize with custom prompts, assert they are stored correctly.
+    """
+    # Arrange
+    system_prompt: str = "Custom system prompt"
+    user_prompt: str = "Custom user prompt"
+
+    # Act
+    config: EvaluationMetricPromptsConfig = EvaluationMetricPromptsConfig(
+        system_prompt_template=system_prompt,
+        user_prompt_template=user_prompt,
+    )
+
+    # Assert
+    assert config.system_prompt_template == system_prompt
+    assert config.user_prompt_template == user_prompt
+
+
+@pytest.mark.ai
+def test_evaluation_metric_prompts_config__stores_strings__for_template_fields() -> (
+    None
+):
+    """
+    Purpose: Verify that prompt template fields accept and store string values.
+    Why this matters: Type safety for prompt templates is critical for rendering.
+    Setup summary: Create config with string prompts, assert type is string.
+    """
+    # Arrange
+    system_template: str = "Test system prompt"
+    user_template: str = "Test user prompt"
+
+    # Act
+    config: EvaluationMetricPromptsConfig = EvaluationMetricPromptsConfig(
+        system_prompt_template=system_template,
+        user_prompt_template=user_template,
+    )
+
+    # Assert
+    assert isinstance(config.system_prompt_template, str)
+    assert isinstance(config.user_prompt_template, str)
+
+
+@pytest.mark.ai
+def test_evaluation_metric_prompts_config__allows_modification__after_initialization() -> (
+    None
+):
+    """
+    Purpose: Verify that prompt config fields can be modified after creation.
+    Why this matters: Enables dynamic prompt updates during runtime.
+    Setup summary: Create config, modify fields, assert new values.
+    """
+    # Arrange
+    config: EvaluationMetricPromptsConfig = EvaluationMetricPromptsConfig()
+
+    # Act
+    config.system_prompt_template = "New system prompt"
+    config.user_prompt_template = "New user prompt"
+
+    # Assert
+    assert config.system_prompt_template == "New system prompt"
+    assert config.user_prompt_template == "New user prompt"
+
+
+@pytest.mark.ai
+def test_evaluation_metric_config__initializes_with_default_prompts_config__when_not_provided() -> (
+    None
+):
+    """
+    Purpose: Verify that EvaluationMetricConfig creates default prompts config.
+    Why this matters: Ensures config is always in valid state even without explicit prompts.
+    Setup summary: Create config without prompts_config, assert default empty prompts.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    config: EvaluationMetricConfig = EvaluationMetricConfig(
+        enabled=True,
+        name=EvaluationMetricName.CONTEXT_RELEVANCY,
+    )
+
+    # Assert
+    assert isinstance(config.prompts_config, EvaluationMetricPromptsConfig)
+    assert config.prompts_config.system_prompt_template == ""
+    assert config.prompts_config.user_prompt_template == ""
+
+
+@pytest.mark.ai
+def test_evaluation_metric_config__accepts_custom_prompts_config__on_initialization() -> (
+    None
+):
+    """
+    Purpose: Verify that EvaluationMetricConfig accepts custom prompts configuration.
+    Why this matters: Allows full customization of evaluation prompts per metric.
+    Setup summary: Create custom prompts config, pass to metric config, assert values.
+    """
+    # Arrange
+    prompts_config: EvaluationMetricPromptsConfig = EvaluationMetricPromptsConfig(
+        system_prompt_template="Custom system",
+        user_prompt_template="Custom user",
+    )
+
+    # Act
+    config: EvaluationMetricConfig = EvaluationMetricConfig(
+        enabled=True,
+        name=EvaluationMetricName.CONTEXT_RELEVANCY,
+        prompts_config=prompts_config,
+    )
+
+    # Assert
+    assert config.prompts_config.system_prompt_template == "Custom system"
+    assert config.prompts_config.user_prompt_template == "Custom user"
+
+
+@pytest.mark.ai
+def test_evaluation_metric_config__has_all_required_fields__on_initialization() -> None:
+    """
+    Purpose: Verify that EvaluationMetricConfig has all expected configuration fields.
+    Why this matters: Ensures complete config structure for evaluation metrics.
+    Setup summary: Create config with language model, assert all fields exist.
+    """
+    # Arrange
+    language_model: LanguageModelInfo = LanguageModelInfo.from_name(DEFAULT_GPT_4o)
+
+    # Act
+    config: EvaluationMetricConfig = EvaluationMetricConfig(
+        enabled=True,
+        name=EvaluationMetricName.HALLUCINATION,
+        language_model=language_model,
+    )
+
+    # Assert
+    assert hasattr(config, "enabled")
+    assert hasattr(config, "name")
+    assert hasattr(config, "language_model")
+    assert hasattr(config, "additional_llm_options")
+    assert hasattr(config, "prompts_config")
+    assert hasattr(config, "score_to_label")
+    assert hasattr(config, "score_to_title")
+
+
+@pytest.mark.ai
+def test_evaluation_metric_config__defaults_to_empty_dict__for_additional_llm_options() -> (
+    None
+):
+    """
+    Purpose: Verify that additional_llm_options defaults to empty dictionary.
+    Why this matters: Provides safe default for optional LLM configuration.
+    Setup summary: Create config without options, assert empty dict default.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    config: EvaluationMetricConfig = EvaluationMetricConfig(
+        enabled=True,
+        name=EvaluationMetricName.CONTEXT_RELEVANCY,
+    )
+
+    # Assert
+    assert config.additional_llm_options == {}
+    assert isinstance(config.additional_llm_options, dict)
+
+
+@pytest.mark.ai
+def test_evaluation_metric_config__defaults_to_empty_dicts__for_score_mappings() -> (
+    None
+):
+    """
+    Purpose: Verify that score mapping dictionaries default to empty.
+    Why this matters: Allows optional score labeling and titling per metric.
+    Setup summary: Create config without mappings, assert empty dict defaults.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    config: EvaluationMetricConfig = EvaluationMetricConfig(
+        enabled=True,
+        name=EvaluationMetricName.CONTEXT_RELEVANCY,
+    )
+
+    # Assert
+    assert config.score_to_label == {}
+    assert config.score_to_title == {}
+
+
+@pytest.mark.ai
+def test_evaluation_metric_config__serializes_to_dict__with_all_fields() -> None:
+    """
+    Purpose: Verify that config can be serialized to dictionary format.
+    Why this matters: Required for persistence and API serialization.
+    Setup summary: Create config with custom prompts, serialize, assert structure.
+    """
+    # Arrange
+    prompts_config: EvaluationMetricPromptsConfig = EvaluationMetricPromptsConfig(
+        system_prompt_template="System",
+        user_prompt_template="User",
+    )
+
+    # Act
+    config: EvaluationMetricConfig = EvaluationMetricConfig(
+        enabled=True,
+        name=EvaluationMetricName.CONTEXT_RELEVANCY,
+        prompts_config=prompts_config,
+    )
+    config_dict: dict = config.model_dump()
+
+    # Assert
+    assert "prompts_config" in config_dict
+    assert config_dict["prompts_config"]["system_prompt_template"] == "System"
+    assert config_dict["prompts_config"]["user_prompt_template"] == "User"
+ assert config_dict["prompts_config"]["user_prompt_template"] == "User"