unique_toolkit 0.8.14__py3-none-any.whl → 0.8.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. unique_toolkit/_common/default_language_model.py +6 -0
  2. unique_toolkit/_common/token/image_token_counting.py +67 -0
  3. unique_toolkit/_common/token/token_counting.py +196 -0
  4. unique_toolkit/evals/config.py +36 -0
  5. unique_toolkit/evals/context_relevancy/prompts.py +56 -0
  6. unique_toolkit/evals/context_relevancy/schema.py +88 -0
  7. unique_toolkit/evals/context_relevancy/service.py +241 -0
  8. unique_toolkit/evals/hallucination/constants.py +61 -0
  9. unique_toolkit/evals/hallucination/hallucination_evaluation.py +92 -0
  10. unique_toolkit/evals/hallucination/prompts.py +79 -0
  11. unique_toolkit/evals/hallucination/service.py +57 -0
  12. unique_toolkit/evals/hallucination/utils.py +213 -0
  13. unique_toolkit/evals/output_parser.py +48 -0
  14. unique_toolkit/evals/tests/test_context_relevancy_service.py +252 -0
  15. unique_toolkit/evals/tests/test_output_parser.py +80 -0
  16. unique_toolkit/history_manager/history_construction_with_contents.py +307 -0
  17. unique_toolkit/history_manager/history_manager.py +80 -111
  18. unique_toolkit/history_manager/loop_token_reducer.py +457 -0
  19. unique_toolkit/language_model/schemas.py +8 -0
  20. unique_toolkit/reference_manager/reference_manager.py +15 -2
  21. {unique_toolkit-0.8.14.dist-info → unique_toolkit-0.8.16.dist-info}/METADATA +7 -1
  22. {unique_toolkit-0.8.14.dist-info → unique_toolkit-0.8.16.dist-info}/RECORD +24 -7
  23. {unique_toolkit-0.8.14.dist-info → unique_toolkit-0.8.16.dist-info}/LICENSE +0 -0
  24. {unique_toolkit-0.8.14.dist-info → unique_toolkit-0.8.16.dist-info}/WHEEL +0 -0
unique_toolkit/evals/hallucination/constants.py
@@ -0,0 +1,61 @@
+from typing import Any
+
+from pydantic import Field
+
+from unique_toolkit._common.validators import LMI
+from unique_toolkit.evals.config import EvaluationMetricConfig
+from unique_toolkit.evals.hallucination.prompts import (
+    HALLUCINATION_METRIC_SYSTEM_MSG,
+    HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
+    HALLUCINATION_METRIC_USER_MSG,
+    HALLUCINATION_METRIC_USER_MSG_DEFAULT,
+)
+from unique_toolkit.evals.schemas import (
+    EvaluationMetricInputFieldName,
+    EvaluationMetricName,
+)
+from unique_toolkit.language_model.infos import LanguageModelInfo, LanguageModelName
+
+
+SYSTEM_MSG_KEY = "systemPrompt"
+USER_MSG_KEY = "userPrompt"
+SYSTEM_MSG_DEFAULT_KEY = "systemPromptDefault"
+USER_MSG_DEFAULT_KEY = "userPromptDefault"
+
+
+class HallucinationConfig(EvaluationMetricConfig):
+    enabled: bool = False
+    name: EvaluationMetricName = EvaluationMetricName.HALLUCINATION
+    language_model: LMI = LanguageModelInfo.from_name(
+        LanguageModelName.AZURE_GPT_35_TURBO_0125,
+    )
+    additional_llm_options: dict[str, Any] = Field(
+        default={},
+        description="Additional options to pass to the language model.",
+    )
+    custom_prompts: dict = {
+        SYSTEM_MSG_KEY: HALLUCINATION_METRIC_SYSTEM_MSG,
+        USER_MSG_KEY: HALLUCINATION_METRIC_USER_MSG,
+        SYSTEM_MSG_DEFAULT_KEY: HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
+        USER_MSG_DEFAULT_KEY: HALLUCINATION_METRIC_USER_MSG_DEFAULT,
+    }
+    score_to_label: dict = {
+        "LOW": "GREEN",
+        "MEDIUM": "YELLOW",
+        "HIGH": "RED",
+    }
+    score_to_title: dict = {
+        "LOW": "No Hallucination Detected",
+        "MEDIUM": "Hallucination Warning",
+        "HIGH": "High Hallucination",
+    }
+
+
+hallucination_metric_default_config = HallucinationConfig()
+
+hallucination_required_input_fields = [
+    EvaluationMetricInputFieldName.INPUT_TEXT,
+    EvaluationMetricInputFieldName.CONTEXT_TEXTS,
+    EvaluationMetricInputFieldName.HISTORY_MESSAGES,
+    EvaluationMetricInputFieldName.OUTPUT_TEXT,
+]
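
For orientation, enabling the metric and overriding one of the default titles could look like the sketch below. The field names come from `HallucinationConfig` above; the override values are purely illustrative, not part of the package.

```python
# Sketch only: field names are taken from HallucinationConfig above; values are illustrative.
from unique_toolkit.evals.hallucination.constants import HallucinationConfig

config = HallucinationConfig(
    enabled=True,  # the metric ships disabled by default
    score_to_title={
        "LOW": "No Hallucination Detected",
        "MEDIUM": "Possible Hallucination",  # hypothetical override of the default title
        "HIGH": "High Hallucination",
    },
)
```
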
unique_toolkit/evals/hallucination/hallucination_evaluation.py
@@ -0,0 +1,92 @@
+from typing import Any
+
+from unique_toolkit.app.schemas import ChatEvent
+from unique_toolkit.chat.schemas import (
+    ChatMessageAssessmentLabel,
+    ChatMessageAssessmentStatus,
+    ChatMessageAssessmentType,
+)
+from unique_toolkit.evals.config import EvaluationMetricConfig
+from unique_toolkit.evals.evaluation_manager import Evaluation
+from unique_toolkit.evals.hallucination.utils import check_hallucination
+from unique_toolkit.evals.schemas import (
+    EvaluationAssessmentMessage,
+    EvaluationMetricInput,
+    EvaluationMetricName,
+    EvaluationMetricResult,
+)
+from unique_toolkit.evals.hallucination.constants import (
+    HallucinationConfig,
+)
+from unique_toolkit.reference_manager.reference_manager import (
+    ReferenceManager,
+)
+
+from unique_toolkit.language_model.schemas import (
+    LanguageModelStreamResponse,
+)
+
+
+class HallucinationEvaluation(Evaluation):
+    def __init__(
+        self,
+        config: HallucinationConfig,
+        event: ChatEvent,
+        reference_manager: ReferenceManager,
+    ):
+        self.config = config
+        self._company_id = event.company_id
+        self._user_id = event.user_id
+        self._reference_manager = reference_manager
+        self._user_message = event.payload.user_message.text
+        super().__init__(EvaluationMetricName.HALLUCINATION)
+
+    async def run(
+        self, loop_response: LanguageModelStreamResponse
+    ) -> EvaluationMetricResult:  # type: ignore
+        chunks = self._reference_manager.get_chunks()
+
+        evaluation_result: EvaluationMetricResult = await check_hallucination(
+            company_id=self._company_id,
+            input=EvaluationMetricInput(
+                input_text=self._user_message,
+                context_texts=[context.text for context in chunks],
+                history_messages=[],  # TODO include loop_history messages
+                output_text=loop_response.message.text,
+            ),
+            config=self.config,
+        )
+
+        score_to_label = self.config.score_to_label
+        evaluation_result.is_positive = (
+            score_to_label.get(evaluation_result.value.upper(), "RED") != "RED"
+        )
+        return evaluation_result
+
+    def get_assessment_type(self) -> ChatMessageAssessmentType:
+        return ChatMessageAssessmentType.HALLUCINATION
+
+    async def evaluation_metric_to_assessment(
+        self, evaluation_result: EvaluationMetricResult
+    ) -> EvaluationAssessmentMessage:
+        title = self.config.score_to_title.get(
+            evaluation_result.value.upper(), evaluation_result.value
+        )
+        label = ChatMessageAssessmentLabel(
+            self.config.score_to_label.get(
+                evaluation_result.value.upper(), evaluation_result.value.upper()
+            )
+        )
+        status = (
+            ChatMessageAssessmentStatus.DONE
+            if not evaluation_result.error
+            else ChatMessageAssessmentStatus.ERROR
+        )
+
+        return EvaluationAssessmentMessage(
+            status=status,
+            title=title,
+            explanation=evaluation_result.reason,
+            label=label,
+            type=self.get_assessment_type(),
+        )
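
The `run` method above treats any score whose mapped label is not "RED" as positive. A minimal sketch of that mapping, using the default `score_to_label` from `HallucinationConfig`:

```python
# Mirrors the is_positive logic in HallucinationEvaluation.run with the default mapping.
score_to_label = {"LOW": "GREEN", "MEDIUM": "YELLOW", "HIGH": "RED"}

for value in ("low", "medium", "high", "unexpected"):
    is_positive = score_to_label.get(value.upper(), "RED") != "RED"
    print(value, is_positive)  # low True, medium True, high False, unexpected False
```
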
unique_toolkit/evals/hallucination/prompts.py
@@ -0,0 +1,79 @@
+HALLUCINATION_METRIC_SYSTEM_MSG = """
+You will receive a question, references, a conversation between a user and an agent, and an output.
+The output is the answer to the question.
+Your task is to evaluate if the output is fully supported by the information provided in the references and conversation, and provide an explanation of your judgement in 2 sentences.
+
+Use the following entailment scale to generate a score:
+[low] - All information in output is supported by the references/conversation, or extractions from the references/conversation.
+[medium] - The output is supported by the references/conversation to some extent, but there is at least some information in the output that is not discussed in the references/conversation. For example, if an instruction asks about two concepts and the references/conversation only discusses one of them, it should be considered a [medium] hallucination level.
+[high] - The output contains information that is not part of the references/conversation, is unrelated to the references/conversation, or contradicts the references/conversation.
+
+Make sure to not use any external information/knowledge to judge whether the output is true or not. Only check whether the output is supported by the references/conversation, and not whether the output is correct or not. Also do not evaluate if the references/conversation contain further information that is not part of the output but could be relevant to the question. If the output mentions a plot or chart, ignore this information in your evaluation.
+
+Your answer must be in JSON format:
+{
+"reason": Your explanation of your judgement of the evaluation,
+"value": decision, must be one of the following: ["high", "medium", "low"]
+}
+"""
+
+HALLUCINATION_METRIC_USER_MSG = """
+Here is the data:
+
+Input:
+'''
+$input_text
+'''
+
+References:
+'''
+$contexts_text
+'''
+
+Conversation:
+'''
+$history_messages_text
+'''
+
+Output:
+'''
+$output_text
+'''
+
+Answer as JSON:
+"""
+
+HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT = """
+You will receive a question and an output.
+The output is the answer to the question.
+The situation is that no references could be found to answer the question. Your task is to evaluate if the output contains any information to answer the question,
+and provide a short explanation of your reasoning in 2 sentences. Also mention in your explanation that no references were provided to answer the question.
+
+Use the following entailment scale to generate a score:
+[low] - The output does not contain any information to answer the question.
+[medium] - The output contains some information to answer the question, but does not answer the question entirely.
+[high] - The output answers the question.
+
+It is not considered an answer when the output relates to the question's subject. Make sure to not use any external information/knowledge to judge whether the output is true or not. Only check that the output does not answer the question, and not whether the output is correct or not.
+Your answer must be in JSON format:
+{
+"reason": Your explanation of your reasoning of the evaluation,
+"value": decision, must be one of the following: ["low", "medium", "high"]
+}
+"""
+
+HALLUCINATION_METRIC_USER_MSG_DEFAULT = """
+Here is the data:
+
+Input:
+'''
+$input_text
+'''
+
+Output:
+'''
+$output_text
+'''
+
+Answer as JSON:
+"""
unique_toolkit/evals/hallucination/service.py
@@ -0,0 +1,57 @@
+import logging
+
+from unique_toolkit.app.schemas import ChatEvent
+from unique_toolkit.evals.config import EvaluationMetricConfig
+from unique_toolkit.evals.schemas import EvaluationMetricInput, EvaluationMetricResult
+
+
+from .constants import hallucination_metric_default_config
+from .utils import check_hallucination
+
+SYSTEM_MSG_KEY = "systemPrompt"
+USER_MSG_KEY = "userPrompt"
+SYSTEM_MSG_DEFAULT_KEY = "systemPromptDefault"
+USER_MSG_DEFAULT_KEY = "userPromptDefault"
+
+
+class HallucinationEvaluator:
+    def __init__(self, event: ChatEvent):
+        self.event = event
+
+        self.logger = logging.getLogger(f"HallucinationEvaluator.{__name__}")
+
+    async def analyze(
+        self,
+        input: EvaluationMetricInput,
+        config: EvaluationMetricConfig = hallucination_metric_default_config,
+    ) -> EvaluationMetricResult | None:
+        """
+        Analyzes the level of hallucination in the generated output by comparing it with the input
+        and the provided contexts or history. The analysis classifies the hallucination level as:
+        - low
+        - medium
+        - high
+
+        If no contexts or history are referenced in the generated output, the method verifies
+        that the output does not contain any relevant information to answer the question.
+
+        This method calls `check_hallucination` to perform the actual analysis. The `check_hallucination`
+        function handles the evaluation using the company ID from the event, the provided input, and the configuration.
+
+        Args:
+            input (EvaluationMetricInput): The input data used for evaluation, including the generated output and reference information.
+            config (EvaluationMetricConfig, optional): Configuration settings for the evaluation. Defaults to `hallucination_metric_default_config`.
+
+        Returns:
+            EvaluationMetricResult | None: The result of the evaluation, indicating the level of hallucination. Returns `None` if the analysis cannot be performed.
+
+        Raises:
+            EvaluatorException: If the context texts are empty, required fields are missing, or an error occurs during the evaluation.
+        """
+        if config.enabled is False:
+            self.logger.info("Hallucination metric is not enabled.")
+            return None
+
+        return await check_hallucination(
+            company_id=self.event.company_id, input=input, config=config
+        )
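
Calling the evaluator could look roughly like the sketch below. It assumes a valid `ChatEvent` named `event` from the running module, a reachable backend, and that `EvaluationMetricInput` accepts the fields listed in `hallucination_required_input_fields`; the text values are illustrative.

```python
# Sketch only: `event` is assumed to be a ChatEvent provided by the surrounding app.
from unique_toolkit.evals.hallucination.constants import HallucinationConfig
from unique_toolkit.evals.hallucination.service import HallucinationEvaluator
from unique_toolkit.evals.schemas import EvaluationMetricInput


async def evaluate(event):
    evaluator = HallucinationEvaluator(event)
    result = await evaluator.analyze(
        input=EvaluationMetricInput(
            input_text="What is the notice period?",
            context_texts=["The notice period is 30 days."],
            history_messages=[],
            output_text="The notice period is 30 days.",
        ),
        config=HallucinationConfig(enabled=True),
    )
    return result  # None when the metric is disabled in the config
```
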
unique_toolkit/evals/hallucination/utils.py
@@ -0,0 +1,213 @@
+import logging
+from string import Template
+
+from unique_toolkit.content.schemas import ContentChunk
+from unique_toolkit.language_model.schemas import (
+    LanguageModelMessages,
+    LanguageModelStreamResponse,
+    LanguageModelSystemMessage,
+    LanguageModelUserMessage,
+)
+from unique_toolkit.language_model.service import LanguageModelService
+from unique_toolkit.evals.config import EvaluationMetricConfig
+from unique_toolkit.evals.exception import EvaluatorException
+from unique_toolkit.evals.output_parser import parse_eval_metric_result
+from unique_toolkit.evals.schemas import (
+    EvaluationMetricInput,
+    EvaluationMetricName,
+    EvaluationMetricResult,
+)
+
+
+from .constants import (
+    SYSTEM_MSG_DEFAULT_KEY,
+    SYSTEM_MSG_KEY,
+    USER_MSG_DEFAULT_KEY,
+    USER_MSG_KEY,
+    hallucination_required_input_fields,
+)
+from .prompts import (
+    HALLUCINATION_METRIC_SYSTEM_MSG,
+    HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
+    HALLUCINATION_METRIC_USER_MSG,
+    HALLUCINATION_METRIC_USER_MSG_DEFAULT,
+)
+
+
+async def check_hallucination(
+    company_id: str,
+    input: EvaluationMetricInput,
+    config: EvaluationMetricConfig,
+) -> EvaluationMetricResult:
+    """
+    Analyzes the level of hallucination in the generated output by comparing it with the provided input
+    and the contexts or history. The analysis classifies the hallucination level as:
+    - low
+    - medium
+    - high
+
+    If no contexts or history are referenced in the generated output, the method checks that the output
+    does not contain any relevant information to answer the question.
+
+    This method performs the following steps:
+    1. Logs the language model used for the analysis.
+    2. Validates the required fields in the `input` data.
+    3. Composes the evaluation messages via `_get_msgs`.
+    4. Calls `LanguageModelService.complete_async_util` to get a completion result.
+    5. Raises `EvaluatorException` if the completion result has no content.
+    6. Parses the result content with `parse_eval_metric_result`.
+    7. Returns the evaluation metric result.
+
+    Args:
+        company_id (str): The company ID for the analysis.
+        input (EvaluationMetricInput): The input data used for evaluation, including the generated output and reference information.
+        config (EvaluationMetricConfig): Configuration settings for the evaluation.
+
+    Returns:
+        EvaluationMetricResult: The result of the evaluation, indicating the level of hallucination.
+
+    Raises:
+        EvaluatorException: If the context texts are empty, required fields are missing, or an error occurs during the evaluation.
+    """
+
+    logger = logging.getLogger(f"check_hallucination.{__name__}")
+
+    model_name = config.language_model.name
+    logger.info(f"Analyzing level of hallucination with {model_name}.")
+
+    input.validate_required_fields(hallucination_required_input_fields)
+
+    try:
+        msgs = _get_msgs(input, config, logger)
+        result = await LanguageModelService.complete_async_util(
+            company_id=company_id, messages=msgs, model_name=model_name
+        )
+        result_content = result.choices[0].message.content
+        if not result_content:
+            error_message = "Hallucination evaluation did not return a result."
+            raise EvaluatorException(
+                error_message=error_message,
+                user_message=error_message,
+            )
+        return parse_eval_metric_result(
+            result_content,  # type: ignore
+            EvaluationMetricName.HALLUCINATION,
+        )
+    except Exception as e:
+        error_message = "Error occurred during hallucination metric analysis"
+        raise EvaluatorException(
+            error_message=f"{error_message}: {e}",
+            user_message=error_message,
+            exception=e,
+        )
+
+
+def _get_msgs(
+    input: EvaluationMetricInput,
+    config: EvaluationMetricConfig,
+    logger: logging.Logger,
+):
+    """
+    Composes the messages for hallucination analysis based on the provided input and configuration.
+
+    This method decides how to compose the messages based on the availability of context texts and history
+    message texts in the `input`.
+
+    Args:
+        input (EvaluationMetricInput): The input data that includes context texts and history message texts
+            for the analysis.
+        config (EvaluationMetricConfig): The configuration settings for composing messages.
+        logger (logging.Logger): The logger used for logging debug information about which prompt
+            variant is selected.
+
+    Returns:
+        The composed messages as per the provided input and configuration. The exact type and structure
+        depend on the implementation of the `_compose_msgs` and `_compose_msgs_default` helpers.
+
+    """
+    if input.context_texts or input.history_messages:
+        logger.debug("Using context / history for hallucination evaluation.")
+        return _compose_msgs(input, config)
+    else:
+        logger.debug("No contexts and history provided for hallucination evaluation.")
+        return _compose_msgs_default(input, config)
+
+
+def _compose_msgs(
+    input: EvaluationMetricInput,
+    config: EvaluationMetricConfig,
+):
+    """
+    Composes the hallucination analysis messages.
+    """
+    system_msg_content = _get_system_prompt_with_contexts(config)
+    system_msg = LanguageModelSystemMessage(content=system_msg_content)
+
+    user_msg_templ = Template(_get_user_prompt_with_contexts(config))
+    user_msg_content = user_msg_templ.substitute(
+        input_text=input.input_text,
+        contexts_text=input.get_joined_context_texts(tag_name="reference"),
+        history_messages_text=input.get_joined_history_texts(tag_name="conversation"),
+        output_text=input.output_text,
+    )
+    user_msg = LanguageModelUserMessage(content=user_msg_content)
+    return LanguageModelMessages([system_msg, user_msg])
+
+
+def _compose_msgs_default(
+    input: EvaluationMetricInput,
+    config: EvaluationMetricConfig,
+):
+    """
+    Composes the hallucination analysis messages without contexts or history.
+    """
+    system_msg_content = _get_system_prompt_default(config)
+    system_msg = LanguageModelSystemMessage(content=system_msg_content)
+
+    user_msg_templ = Template(_get_user_prompt_default(config))
+    user_msg_content = user_msg_templ.substitute(
+        input_text=input.input_text,
+        output_text=input.output_text,
+    )
+    user_msg = LanguageModelUserMessage(content=user_msg_content)
+    return LanguageModelMessages([system_msg, user_msg])
+
+
+def _get_system_prompt_with_contexts(config: EvaluationMetricConfig):
+    return config.custom_prompts.setdefault(
+        SYSTEM_MSG_KEY,
+        HALLUCINATION_METRIC_SYSTEM_MSG,
+    )
+
+
+def _get_user_prompt_with_contexts(config: EvaluationMetricConfig):
+    return config.custom_prompts.setdefault(
+        USER_MSG_KEY,
+        HALLUCINATION_METRIC_USER_MSG,
+    )
+
+
+def _get_system_prompt_default(config: EvaluationMetricConfig):
+    return config.custom_prompts.setdefault(
+        SYSTEM_MSG_DEFAULT_KEY,
+        HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
+    )
+
+
+def _get_user_prompt_default(config: EvaluationMetricConfig):
+    return config.custom_prompts.setdefault(
+        USER_MSG_DEFAULT_KEY,
+        HALLUCINATION_METRIC_USER_MSG_DEFAULT,
+    )
+
+
+def context_text_from_stream_response(
+    response: LanguageModelStreamResponse, selected_chunks: list[ContentChunk]
+):
+    response_references = response.message.references
+    reference_ids = [reference.source_id for reference in response_references]
+    filtered_contexts: list[str] = []
+    for chunk in selected_chunks:
+        if f"{chunk.id}_{chunk.chunk_id}" in reference_ids:
+            filtered_contexts.append(chunk.text)
+    return filtered_contexts
unique_toolkit/evals/output_parser.py
@@ -0,0 +1,48 @@
+from unique_toolkit.language_model.utils import convert_string_to_json
+from unique_toolkit.evals.context_relevancy.schema import (
+    EvaluationSchemaStructuredOutput,
+)
+from unique_toolkit.evals.exception import EvaluatorException
+from unique_toolkit.evals.schemas import (
+    EvaluationMetricName,
+    EvaluationMetricResult,
+)
+
+
+def parse_eval_metric_result(
+    result: str,
+    metric_name: EvaluationMetricName,
+):
+    """
+    Parses a JSON string returned by the evaluation into an EvaluationMetricResult.
+    """
+
+    try:
+        parsed_result = convert_string_to_json(result)
+    except Exception as e:
+        error_message = "Error occurred during parsing the evaluation metric result"
+        raise EvaluatorException(
+            user_message=f"{error_message}.",
+            error_message=f"{error_message}: {str(e)}",
+        )
+
+    return EvaluationMetricResult(
+        name=metric_name,
+        value=parsed_result.get("value", "None"),
+        reason=parsed_result.get("reason", "None"),
+    )
+
+
+def parse_eval_metric_result_structured_output(
+    result: EvaluationSchemaStructuredOutput,
+    metric_name: EvaluationMetricName,
+) -> EvaluationMetricResult:
+    """
+    Converts a structured-output evaluation result into an EvaluationMetricResult.
+    """
+    return EvaluationMetricResult(
+        name=metric_name,
+        value=result.value,
+        reason=result.reason,
+        fact_list=[item.fact for item in result.fact_list],
+    )
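
A quick sketch of the string-based parser, fed a result shaped like the JSON the hallucination prompts above ask for (the reason text is illustrative):

```python
# Sketch only: the JSON shape follows the hallucination prompt; the content is illustrative.
from unique_toolkit.evals.output_parser import parse_eval_metric_result
from unique_toolkit.evals.schemas import EvaluationMetricName

raw = '{"reason": "All statements are supported by the references.", "value": "low"}'
result = parse_eval_metric_result(raw, EvaluationMetricName.HALLUCINATION)
print(result.value, result.reason)  # low All statements are supported by the references.
```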