unique_toolkit 0.8.14__py3-none-any.whl → 0.8.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. unique_toolkit/_common/default_language_model.py +6 -0
  2. unique_toolkit/_common/token/image_token_counting.py +67 -0
  3. unique_toolkit/_common/token/token_counting.py +196 -0
  4. unique_toolkit/evals/config.py +36 -0
  5. unique_toolkit/evals/context_relevancy/prompts.py +56 -0
  6. unique_toolkit/evals/context_relevancy/schema.py +88 -0
  7. unique_toolkit/evals/context_relevancy/service.py +241 -0
  8. unique_toolkit/evals/hallucination/constants.py +61 -0
  9. unique_toolkit/evals/hallucination/hallucination_evaluation.py +92 -0
  10. unique_toolkit/evals/hallucination/prompts.py +79 -0
  11. unique_toolkit/evals/hallucination/service.py +57 -0
  12. unique_toolkit/evals/hallucination/utils.py +213 -0
  13. unique_toolkit/evals/output_parser.py +48 -0
  14. unique_toolkit/evals/tests/test_context_relevancy_service.py +252 -0
  15. unique_toolkit/evals/tests/test_output_parser.py +80 -0
  16. unique_toolkit/history_manager/history_construction_with_contents.py +307 -0
  17. unique_toolkit/history_manager/history_manager.py +80 -111
  18. unique_toolkit/history_manager/loop_token_reducer.py +457 -0
  19. unique_toolkit/language_model/schemas.py +8 -0
  20. unique_toolkit/reference_manager/reference_manager.py +15 -2
  21. {unique_toolkit-0.8.14.dist-info → unique_toolkit-0.8.16.dist-info}/METADATA +7 -1
  22. {unique_toolkit-0.8.14.dist-info → unique_toolkit-0.8.16.dist-info}/RECORD +24 -7
  23. {unique_toolkit-0.8.14.dist-info → unique_toolkit-0.8.16.dist-info}/LICENSE +0 -0
  24. {unique_toolkit-0.8.14.dist-info → unique_toolkit-0.8.16.dist-info}/WHEEL +0 -0
@@ -0,0 +1,6 @@
+ from unique_toolkit.language_model.infos import LanguageModelName
+
+ DEFAULT_GPT_35_TURBO = LanguageModelName.AZURE_GPT_35_TURBO_0125
+ DEFAULT_GPT_4o = LanguageModelName.AZURE_GPT_4o_2024_1120
+ DEFAULT_GPT_4o_STRUCTURED_OUTPUT = LanguageModelName.AZURE_GPT_4o_2024_0806
+ DEFAULT_GPT_4o_MINI = LanguageModelName.AZURE_GPT_4o_MINI_2024_0718
@@ -0,0 +1,67 @@
+ import base64
+ import math
+ import re
+ from enum import Enum
+ from io import BytesIO
+
+ from PIL import Image
+
+
+ class DetailLevel(Enum):
+     LOW = "low"
+     HIGH = "high"
+
+
+ # https://platform.openai.com/docs/guides/vision/calculating-costs#calculating-costs
+ def calculate_image_tokens(width, height, detail: DetailLevel):
+     """
+     Calculate the token cost of an image based on its dimensions and detail level.
+     NOTE: While we followed the documentation provided by openai to calculate image token cost, in practice,
+     we notice that this function overestimate the number of tokens consumed by the model.
+
+     Parameters:
+     - width (int): The width of the image in pixels.
+     - height (int): The height of the image in pixels.
+     - detail (str): The detail level, either "low" or "high".
+
+     Returns:
+     - int: The token cost of the image.
+     """
+     # Base cost for low detail
+     if detail == DetailLevel.LOW:
+         return 85
+
+     # Scaling for high detail
+     # Scale down to fit within 2048x2048 square
+     max_long_dim = 2048
+     long_dim = max(width, height)
+     if long_dim > max_long_dim:
+         scale_factor = long_dim / max_long_dim
+         width = int(width / scale_factor)
+         height = int(height / scale_factor)
+
+     # Scale down the shortest side to 768
+     max_short_dim = 768
+     short_dim = min(width, height)
+     if short_dim > max_short_dim:
+         scale_factor = short_dim / max_short_dim
+         width = int(width / scale_factor)
+         height = int(height / scale_factor)
+
+     # Step 3: Calculate the number of 512x512 tiles
+     tiles = math.ceil(width / 512) * math.ceil(height / 512)
+     # Step 4: Compute token cost
+     token_cost = (tiles * 170) + 85
+     return token_cost
+
+
+ def calculate_image_tokens_from_base64(base64_string: str):
+     base64_string = remove_base64_header(base64_string)
+     image = Image.open(BytesIO(base64.b64decode(base64_string)))
+     # DETAIL LEVEL HIGH IS THE DEFAULT TO BE ON THE SAFE SIDE
+     return calculate_image_tokens(image.width, image.height, DetailLevel.HIGH)
+
+
+ def remove_base64_header(base64_string: str):
+     header_pattern = r"^data:image/\w+;base64,"
+     return re.sub(header_pattern, "", base64_string)
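
A quick sanity check of the tile arithmetic above (a sketch; the import path is assumed from the file listing, unique_toolkit/_common/token/image_token_counting.py):

    # Assumed import path; illustration only.
    from unique_toolkit._common.token.image_token_counting import (
        DetailLevel,
        calculate_image_tokens,
    )

    # 1024x1024 at high detail: the short side is scaled down to 768x768,
    # which spans 2 x 2 tiles of 512 px, so 4 * 170 + 85 = 765 tokens.
    assert calculate_image_tokens(1024, 1024, DetailLevel.HIGH) == 765

    # Low detail is a flat 85 tokens regardless of image size.
    assert calculate_image_tokens(4000, 3000, DetailLevel.LOW) == 85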
@@ -0,0 +1,196 @@
+ # Original source
+ # https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
+
+ import json
+ from typing import Any, Callable
+
+ from pydantic import BaseModel
+ from unique_toolkit.language_model import (
+     LanguageModelMessage,
+     LanguageModelMessages,
+     LanguageModelName,
+ )
+
+ from _common.utils.token.image_token_counting import (
+     calculate_image_tokens_from_base64,
+ )
+
+
+ class SpecialToolCallingTokens(BaseModel):
+     func_init: int = 0
+     prop_init: int = 0
+     prop_key: int = 0
+     enum_init: int = 0
+     enum_item: int = 0
+     func_end: int = 0
+
+
+ def get_special_token(model: LanguageModelName) -> SpecialToolCallingTokens:
+     special_token = SpecialToolCallingTokens()
+
+     match model:
+         case (
+             LanguageModelName.AZURE_GPT_4o_2024_0513
+             | LanguageModelName.AZURE_GPT_4o_2024_0806
+             | LanguageModelName.AZURE_GPT_4o_MINI_2024_0718
+             | LanguageModelName.AZURE_GPT_4o_2024_1120
+         ):
+             special_token.func_init = 7
+             special_token.prop_init = 3
+             special_token.prop_key = 3
+             special_token.enum_init = -3
+             special_token.enum_item = 3
+             special_token.func_end = 12
+
+         case (
+             LanguageModelName.AZURE_GPT_35_TURBO_0125
+             | LanguageModelName.AZURE_GPT_4_0613
+             | LanguageModelName.AZURE_GPT_4_32K_0613
+             | LanguageModelName.AZURE_GPT_4_TURBO_2024_0409
+         ):
+             special_token.func_init = 10
+             special_token.prop_init = 3
+             special_token.prop_key = 3
+             special_token.enum_init = -3
+             special_token.enum_item = 3
+             special_token.func_end = 12
+
+         case _:
+             raise NotImplementedError(
+                 f"""num_tokens_for_tools() is not implemented for model {model}."""
+             )
+     return special_token
+
+
+ def num_tokens_per_messages(
+     messages: list[dict[str, str]], encode: Callable[[str], list[int]]
+ ) -> list[int]:
+     """Return the number of tokens used by a list of messages."""
+
+     num_token_per_message = []
+     for message in messages:
+         num_tokens = 3  # extra_tokens_per_message
+         for key, value in message.items():
+             if isinstance(value, str):
+                 num_tokens += len(encode(value))
+             elif isinstance(value, list):
+                 # NOTE: The result returned by the function below is not 100% accurate.
+                 num_tokens += handle_message_with_images(value, encode)
+             if key == "name":
+                 num_tokens += 1  # extra_tokens_per_name
+
+         num_token_per_message.append(num_tokens)
+
+     return num_token_per_message
+
+
+ def num_tokens_from_messages(
+     messages: list[dict[str, str]], encode: Callable[[str], list[int]]
+ ) -> int:
+     """Return the number of tokens used by a list of messages."""
+
+     num_tokens_per_message = num_tokens_per_messages(messages, encode)
+     num_tokens = sum(num_tokens_per_message) + 3
+
+     return num_tokens
+
+
+ def num_tokens_for_tools(
+     functions: list[dict[str, Any]],
+     special_token: SpecialToolCallingTokens,
+     encode: Callable[[str], list[int]],
+ ):
+     def num_token_function_enum(
+         properties: dict[str, Any], encode: Callable[[str], list[int]]
+     ):
+         enum_token_count = 0
+         enum_token_count += special_token.enum_init
+         for item in properties[key]["enum"]:
+             enum_token_count += special_token.enum_item
+             enum_token_count += len(encode(item))
+
+         return enum_token_count
+
+     func_token_count = 0
+     if len(functions) > 0:
+         for func in functions:
+             func_token_count += special_token.func_init
+             function = func.get("function", {})
+             func_token_count += len(
+                 encode(
+                     function.get("name", "")
+                     + ":"
+                     + function.get("description", "").rstrip(".").rstrip()
+                 )
+             )
+             if len(function.get("parameters", {}).get("properties", "")) > 0:
+                 properties = function.get("parameters", {}).get(
+                     "properties", ""
+                 )
+                 func_token_count += special_token.prop_init
+
+                 for key in list(properties.keys()):
+                     func_token_count += special_token.prop_key
+
+                     if "enum" in properties[key].keys():
+                         func_token_count += num_token_function_enum(
+                             properties, encode
+                         )
+
+                     func_token_count += len(
+                         encode(
+                             f"{key}:{properties[key]['type']}:{properties[key]['description'].rstrip('.').rstrip()}"
+                         )
+                     )
+
+             func_token_count += special_token.func_end
+
+     return func_token_count
+
+
+ def handle_message_with_images(
+     message: list[dict], encode: Callable[[str], list[int]]
+ ):
+     token_count = 0
+     for item in message:
+         if item.get("type") == "image_url":
+             image_url = item.get("imageUrl", {}).get("url")
+             if image_url:
+                 token_count += calculate_image_tokens_from_base64(image_url)
+         elif item.get("type") == "text":
+             token_count += len(encode(item.get("text", "")))
+     return token_count
+
+
+ def messages_to_openai_messages(
+     messages: LanguageModelMessages | list[LanguageModelMessage],
+ ):
+     if isinstance(messages, list):
+         messages = LanguageModelMessages(messages)
+
+     return [
+         {
+             k: v
+             for k, v in m.items()
+             if (k in ["content", "role"] and v is not None)
+         }
+         for m in json.loads(messages.model_dump_json())
+     ]
+
+
+ def num_tokens_per_language_model_message(
+     messages: LanguageModelMessages | list[LanguageModelMessage],
+     encode: Callable[[str], list[int]],
+ ) -> list[int]:
+     return num_tokens_per_messages(
+         messages=messages_to_openai_messages(messages), encode=encode
+     )
+
+
+ def num_token_for_language_model_messages(
+     messages: LanguageModelMessages | list[LanguageModelMessage],
+     encode: Callable[[str], list[int]],
+ ) -> int:
+     return num_tokens_from_messages(
+         messages_to_openai_messages(messages), encode
+     )
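
A hedged usage sketch: `encode` is any callable mapping a string to a list of token ids; tiktoken (the tokenizer used in the cookbook notebook linked at the top of the file) is one option. The import path is assumed from the file listing.

    import tiktoken

    from unique_toolkit._common.token.token_counting import num_tokens_from_messages

    # Any str -> list[int] tokenizer works as `encode`.
    encode = tiktoken.get_encoding("o200k_base").encode

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Count my tokens, please."},
    ]

    # Each message carries a 3-token framing overhead, plus 3 tokens priming the reply.
    print(num_tokens_from_messages(messages, encode))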
@@ -0,0 +1,36 @@
+ from typing import Any
+
+ from humps import camelize
+ from pydantic import BaseModel, ConfigDict, Field
+
+ from unique_toolkit._common.validators import LMI
+ from unique_toolkit.language_model.infos import LanguageModelInfo, LanguageModelName
+
+
+ from .schemas import (
+     EvaluationMetricName,
+ )
+
+ model_config = ConfigDict(
+     alias_generator=camelize,
+     populate_by_name=True,
+     arbitrary_types_allowed=True,
+     validate_default=True,
+ )
+
+
+ class EvaluationMetricConfig(BaseModel):
+     model_config = model_config
+
+     enabled: bool = False
+     name: EvaluationMetricName
+     language_model: LMI = LanguageModelInfo.from_name(
+         LanguageModelName.AZURE_GPT_35_TURBO_0125,
+     )
+     additional_llm_options: dict[str, Any] = Field(
+         default={},
+         description="Additional options to pass to the language model.",
+     )
+     custom_prompts: dict[str, str] = {}
+     score_to_label: dict[str, str] = {}
+     score_to_title: dict[str, str] = {}
1
+ CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG = """
2
+ You will receive an input and a set of contexts.
3
+ Your task is to evaluate how relevant the contexts are to the input text.
4
+
5
+ Use the following rating scale to generate a score:
6
+ [low] - The contexts are not relevant to the input.
7
+ [medium] - The contexts are somewhat relevant to the input.
8
+ [high] - The contexts are highly relevant to the input.
9
+
10
+ Your answer must be in JSON format:
11
+ {
12
+ "reason": Your explanation of your judgement of the evaluation,
13
+ "value": decision, must be one of the following ["low", "medium", "high"]
14
+ }
15
+ """
16
+
17
+ CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG_STRUCTURED_OUTPUT = """
18
+ You will receive an input and a set of contexts.
19
+ Your task is to evaluate how relevant the contexts are to the input text.
20
+ Further you should extract relevant facts from the contexts.
21
+
22
+ # Output Format
23
+ - Generate data according to the provided data schema.
24
+ - Ensure the output adheres to the format required by the pydantic object.
25
+ - All necessary fields should be populated as per the data schema guidelines.
26
+ """
27
+
28
+ CONTEXT_RELEVANCY_METRIC_USER_MSG = """
29
+ Here is the data:
30
+
31
+ Input:
32
+ '''
33
+ $input_text
34
+ '''
35
+
36
+ Contexts:
37
+ '''
38
+ $context_texts
39
+ '''
40
+
41
+ Answer as JSON:
42
+ """
43
+
44
+ CONTEXT_RELEVANCY_METRIC_USER_MSG_STRUCTURED_OUTPUT = """
45
+ Here is the data:
46
+
47
+ Input:
48
+ '''
49
+ $input_text
50
+ '''
51
+
52
+ Contexts:
53
+ '''
54
+ $context_texts
55
+ '''
56
+ """
@@ -0,0 +1,88 @@
+ from pydantic import BaseModel, Field, create_model
+ from pydantic.json_schema import SkipJsonSchema
+
+
+
+ from pydantic import BaseModel, ConfigDict
+
+ from unique_toolkit.tools.config import get_configuration_dict
+
+
+ class StructuredOutputModel(BaseModel):
+     model_config = ConfigDict(extra="forbid")
+
+
+
+ class StructuredOutputConfig(BaseModel):
+     model_config = get_configuration_dict()
+
+     enabled: bool = Field(
+         default=False,
+         description="Whether to use structured output for the evaluation.",
+     )
+     extract_fact_list: bool = Field(
+         default=False,
+         description="Whether to extract a list of relevant facts from context chunks with structured output.",
+     )
+     reason_description: str = Field(
+         default="A brief explanation justifying your evaluation decision.",
+         description="The description of the reason field for structured output.",
+     )
+     value_description: str = Field(
+         default="Assessment of how relevant the facts are to the query. Must be one of: ['low', 'medium', 'high'].",
+         description="The description of the value field for structured output.",
+     )
+
+     fact_description: str = Field(
+         default="A fact is an information that is directly answers the user's query. Make sure to emphasize the important information from the fact with bold text.",
+         description="The description of the fact field for structured output.",
+     )
+     fact_list_description: str = Field(
+         default="A list of relevant facts extracted from the source that supports or answers the user's query.",
+         description="The description of the fact list field for structured output.",
+     )
+
+
+ class Fact(StructuredOutputModel):
+     fact: str
+
+
+ class EvaluationSchemaStructuredOutput(StructuredOutputModel):
+     reason: str
+     value: str
+
+     fact_list: list[Fact] = Field(default_factory=list[Fact])
+
+     @classmethod
+     def get_with_descriptions(cls, config: StructuredOutputConfig):
+         if config.extract_fact_list:
+             FactWithDescription = create_model(
+                 "Fact",
+                 fact=(str, Field(..., description=config.fact_description)),
+                 __base__=Fact,
+             )
+             fact_list_field = (
+                 list[FactWithDescription],
+                 Field(
+                     description=config.fact_list_description,
+                 ),
+             )
+         else:
+             fact_list_field = (
+                 SkipJsonSchema[list[Fact]],
+                 Field(default_factory=list[Fact]),
+             )
+
+         return create_model(
+             "EvaluationSchemaStructuredOutputWithDescription",
+             reason=(
+                 str,
+                 Field(..., description=config.reason_description),
+             ),
+             value=(
+                 str,
+                 Field(..., description=config.value_description),
+             ),
+             fact_list=fact_list_field,
+             __base__=cls,
+         )
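
A short sketch of how the dynamic schema is assembled (names exactly as defined above; pydantic v2 assumed):

    from unique_toolkit.evals.context_relevancy.schema import (
        EvaluationSchemaStructuredOutput,
        StructuredOutputConfig,
    )

    config = StructuredOutputConfig(enabled=True, extract_fact_list=True)
    Schema = EvaluationSchemaStructuredOutput.get_with_descriptions(config)

    # With extract_fact_list=True the fact_list field and its per-fact description
    # become part of the emitted JSON schema; otherwise SkipJsonSchema drops them.
    print(Schema.model_json_schema())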
@@ -0,0 +1,241 @@
+ import logging
+
+ from pydantic import BaseModel, ValidationError
+ from unique_toolkit.app.schemas import ChatEvent
+ from unique_toolkit.chat.service import ChatService
+ from unique_toolkit.language_model.infos import (
+     LanguageModelInfo,
+     LanguageModelName,
+     ModelCapabilities,
+ )
+ from unique_toolkit.language_model.prompt import Prompt
+ from unique_toolkit.language_model.schemas import (
+     LanguageModelMessages,
+ )
+ from unique_toolkit.language_model.service import (
+     LanguageModelService,
+ )
+ from unique_toolkit.evals.config import EvaluationMetricConfig
+ from unique_toolkit.evals.context_relevancy.schema import (
+     EvaluationSchemaStructuredOutput,
+ )
+ from unique_toolkit.evals.exception import EvaluatorException
+ from unique_toolkit.evals.output_parser import (
+     parse_eval_metric_result,
+     parse_eval_metric_result_structured_output,
+ )
+ from unique_toolkit.evals.schemas import (
+     EvaluationMetricInput,
+     EvaluationMetricInputFieldName,
+     EvaluationMetricName,
+     EvaluationMetricResult,
+ )
+
+
+ from .prompts import (
+     CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
+     CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG_STRUCTURED_OUTPUT,
+     CONTEXT_RELEVANCY_METRIC_USER_MSG,
+     CONTEXT_RELEVANCY_METRIC_USER_MSG_STRUCTURED_OUTPUT,
+ )
+
+ SYSTEM_MSG_KEY = "systemPrompt"
+ USER_MSG_KEY = "userPrompt"
+
+ default_config = EvaluationMetricConfig(
+     enabled=False,
+     name=EvaluationMetricName.CONTEXT_RELEVANCY,
+     language_model=LanguageModelInfo.from_name(
+         LanguageModelName.AZURE_GPT_4o_2024_1120
+     ),
+     custom_prompts={
+         SYSTEM_MSG_KEY: CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
+         USER_MSG_KEY: CONTEXT_RELEVANCY_METRIC_USER_MSG,
+     },
+ )
+
+ relevancy_required_input_fields = [
+     EvaluationMetricInputFieldName.INPUT_TEXT,
+     EvaluationMetricInputFieldName.CONTEXT_TEXTS,
+ ]
+
+
+ class ContextRelevancyEvaluator:
+     def __init__(
+         self,
+         event: ChatEvent,
+     ):
+         self.chat_service = ChatService(event)
+         self.language_model_service = LanguageModelService(event)
+         self.logger = logging.getLogger(f"ContextRelevancyEvaluator.{__name__}")
+
+     async def analyze(
+         self,
+         input: EvaluationMetricInput,
+         config: EvaluationMetricConfig = default_config,
+         structured_output_schema: type[BaseModel] | None = None,
+     ) -> EvaluationMetricResult | None:
+         """
+         Analyzes the level of relevancy of a context by comparing
+         it with the input text.
+
+         Args:
+             input (EvaluationMetricInput): The input for the metric.
+             config (EvaluationMetricConfig): The configuration for the metric.
+
+         Returns:
+             EvaluationMetricResult | None
+
+         Raises:
+             EvaluatorException: If the context texts are empty or required fields are missing or error occurred during evaluation.
+         """
+         if config.enabled is False:
+             self.logger.info("Hallucination metric is not enabled.")
+             return None
+
+         input.validate_required_fields(relevancy_required_input_fields)
+
+         if len(input.context_texts) == 0:  # type: ignore
+             error_message = "No context texts provided."
+             raise EvaluatorException(
+                 user_message=error_message,
+                 error_message=error_message,
+             )
+
+         try:
+             # Handle structured output if enabled and supported by the model
+             if (
+                 structured_output_schema
+                 and ModelCapabilities.STRUCTURED_OUTPUT
+                 in config.language_model.capabilities
+             ):
+                 return await self._handle_structured_output(
+                     input, config, structured_output_schema
+                 )
+
+             # Handle regular output
+             return await self._handle_regular_output(input, config)
+
+         except Exception as e:
+             error_message = (
+                 "Unknown error occurred during context relevancy metric analysis"
+             )
+             raise EvaluatorException(
+                 error_message=f"{error_message}: {e}",
+                 user_message=error_message,
+                 exception=e,
+             )
+
+     async def _handle_structured_output(
+         self,
+         input: EvaluationMetricInput,
+         config: EvaluationMetricConfig,
+         structured_output_schema: type[BaseModel],
+     ) -> EvaluationMetricResult:
+         """Handle the structured output case for context relevancy evaluation."""
+         self.logger.info("Using structured output for context relevancy evaluation.")
+         msgs = self._compose_msgs(input, config, enable_structured_output=True)
+         result = await self.language_model_service.complete_async(
+             messages=msgs,
+             model_name=config.language_model.name,
+             structured_output_model=structured_output_schema,
+             structured_output_enforce_schema=True,
+             other_options=config.additional_llm_options,
+         )
+
+         try:
+             result_content = EvaluationSchemaStructuredOutput.model_validate(
+                 result.choices[0].message.parsed
+             )
+         except ValidationError as e:
+             error_message = "Error occurred during structured output validation of the context relevancy evaluation."
+             raise EvaluatorException(
+                 error_message=error_message,
+                 user_message=error_message,
+                 exception=e,
+             )
+
+         return parse_eval_metric_result_structured_output(
+             result_content, EvaluationMetricName.CONTEXT_RELEVANCY
+         )
+
+     async def _handle_regular_output(
+         self,
+         input: EvaluationMetricInput,
+         config: EvaluationMetricConfig,
+     ) -> EvaluationMetricResult:
+         """Handle the regular output case for context relevancy evaluation."""
+         msgs = self._compose_msgs(input, config, enable_structured_output=False)
+         result = await self.language_model_service.complete_async(
+             messages=msgs,
+             model_name=config.language_model.name,
+             other_options=config.additional_llm_options,
+         )
+
+         result_content = result.choices[0].message.content
+         if not result_content or not isinstance(result_content, str):
+             error_message = "Context relevancy evaluation did not return a result."
+             raise EvaluatorException(
+                 error_message=error_message,
+                 user_message=error_message,
+             )
+
+         return parse_eval_metric_result(
+             result_content, EvaluationMetricName.CONTEXT_RELEVANCY
+         )
+
+     def _compose_msgs(
+         self,
+         input: EvaluationMetricInput,
+         config: EvaluationMetricConfig,
+         enable_structured_output: bool,
+     ) -> LanguageModelMessages:
+         """
+         Composes the messages for the relevancy metric.
+         """
+         system_msg_content = self._get_system_prompt(config, enable_structured_output)
+         system_msg = Prompt(system_msg_content).to_system_msg()
+
+         user_msg = Prompt(
+             self._get_user_prompt(config, enable_structured_output),
+             input_text=input.input_text,
+             context_texts=input.get_joined_context_texts(),
+         ).to_user_msg()
+
+         return LanguageModelMessages([system_msg, user_msg])
+
+     def _get_system_prompt(
+         self,
+         config: EvaluationMetricConfig,
+         enable_structured_output: bool,
+     ):
+         if (
+             enable_structured_output
+             and ModelCapabilities.STRUCTURED_OUTPUT
+             in config.language_model.capabilities
+         ):
+             return config.custom_prompts.setdefault(
+                 SYSTEM_MSG_KEY,
+                 CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG_STRUCTURED_OUTPUT,
+             )
+         else:
+             return config.custom_prompts.setdefault(
+                 SYSTEM_MSG_KEY,
+                 CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
+             )
+
+     def _get_user_prompt(
+         self,
+         config: EvaluationMetricConfig,
+         enable_structured_output: bool,
+     ):
+         if enable_structured_output:
+             return config.custom_prompts.setdefault(
+                 USER_MSG_KEY,
+                 CONTEXT_RELEVANCY_METRIC_USER_MSG_STRUCTURED_OUTPUT,
+             )
+         else:
+             return config.custom_prompts.setdefault(
+                 USER_MSG_KEY,
+                 CONTEXT_RELEVANCY_METRIC_USER_MSG,
+             )
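
A hypothetical end-to-end sketch. EvaluationMetricInput is defined in unique_toolkit.evals.schemas (not shown in this diff), so its field names below are assumptions inferred from how analyze() reads input.input_text and input.context_texts:

    from unique_toolkit.app.schemas import ChatEvent
    from unique_toolkit.evals.context_relevancy.service import (
        ContextRelevancyEvaluator,
        default_config,
    )
    from unique_toolkit.evals.schemas import EvaluationMetricInput


    async def evaluate_relevancy(event: ChatEvent):
        evaluator = ContextRelevancyEvaluator(event)
        metric_input = EvaluationMetricInput(  # field names assumed, see note above
            input_text="What is the notice period?",
            context_texts=["The notice period is three months."],
        )
        # default_config ships with enabled=False; analyze() returns None unless enabled.
        config = default_config.model_copy(update={"enabled": True})
        return await evaluator.analyze(metric_input, config)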