unique_toolkit 0.8.46__py3-none-any.whl → 0.8.47__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@ from typing import Any
 
 from pydantic import Field
 
+from unique_toolkit._common.default_language_model import DEFAULT_GPT_4o
 from unique_toolkit._common.validators import LMI
 from unique_toolkit.evals.config import EvaluationMetricConfig
 from unique_toolkit.evals.hallucination.prompts import (
@@ -14,7 +15,7 @@ from unique_toolkit.evals.schemas import (
     EvaluationMetricInputFieldName,
     EvaluationMetricName,
 )
-from unique_toolkit.language_model.infos import LanguageModelInfo, LanguageModelName
+from unique_toolkit.language_model.infos import LanguageModelInfo
 
 SYSTEM_MSG_KEY = "systemPrompt"
 USER_MSG_KEY = "userPrompt"
@@ -26,7 +27,7 @@ class HallucinationConfig(EvaluationMetricConfig):
     enabled: bool = False
     name: EvaluationMetricName = EvaluationMetricName.HALLUCINATION
     language_model: LMI = LanguageModelInfo.from_name(
-        LanguageModelName.AZURE_GPT_35_TURBO_0125,
+        DEFAULT_GPT_4o,
     )
     additional_llm_options: dict[str, Any] = Field(
         default={},
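Note: the net effect of the two hunks above is that a default-constructed HallucinationConfig now resolves its language model via DEFAULT_GPT_4o instead of the hard-coded AZURE_GPT_35_TURBO_0125. A minimal sketch of how a caller could observe this, assuming HallucinationConfig is importable from the module shown in the hunk header and that DEFAULT_GPT_4o names a GPT-4o model:

    # Sketch only; the import path is inferred from this diff, not verified against the package.
    from unique_toolkit.evals.hallucination.constants import HallucinationConfig

    config = HallucinationConfig()  # no overrides
    print(config.language_model.name)  # expected: a GPT-4o entry, no longer AZURE_GPT_35_TURBO_0125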
@@ -36,7 +36,7 @@ class UploadedContentConfig(BaseModel):
     percent_for_uploaded_content: float = Field(
         default=0.6,
         ge=0.0,
-        lt=1.0,
+        le=1.0,
         description="The fraction of the max input tokens that will be reserved for the uploaded content.",
     )
 
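Note: switching the Pydantic constraint from lt=1.0 to le=1.0 makes the boundary value itself legal, i.e. the full input-token budget may now be reserved for uploaded content. A self-contained sketch of the same validator behaviour (the class name mirrors the hunk header; the real model lives inside the package):

    from pydantic import BaseModel, Field, ValidationError

    class UploadedContentConfigSketch(BaseModel):
        # le=1.0 accepts the boundary; the previous lt=1.0 rejected it
        percent_for_uploaded_content: float = Field(default=0.6, ge=0.0, le=1.0)

    UploadedContentConfigSketch(percent_for_uploaded_content=1.0)  # now valid
    try:
        UploadedContentConfigSketch(percent_for_uploaded_content=1.5)
    except ValidationError:
        pass  # values above 1.0 are still rejected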
@@ -10,7 +10,6 @@ from pydantic import BaseModel
 
 from unique_toolkit.chat.schemas import ChatMessage, ChatMessageRole
 from unique_toolkit.content.schemas import ContentChunk, ContentReference
-from unique_toolkit.evaluators import DOMAIN_NAME
 from unique_toolkit.language_model import (
     LanguageModelMessageRole,
     LanguageModelMessages,
@@ -34,7 +33,7 @@ from .constants import (
     DEFAULT_COMPLETE_TIMEOUT,
 )
 
-logger = logging.getLogger(f"toolkit.{DOMAIN_NAME}.{__name__}")
+logger = logging.getLogger(f"toolkit.language_model.{__name__}")
 
 
 def complete(
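Note: DOMAIN_NAME resolved to "evaluators" (see the deleted unique_toolkit/evaluators/constants.py further down), so this module's log records move from the toolkit.evaluators.* logger hierarchy to toolkit.language_model.*. Handlers or filters keyed on the old name would need to follow; a minimal sketch, assuming the new name shape shown in the hunk:

    import logging

    # Attach a handler under the new hierarchy; anything configured for
    # "toolkit.evaluators" no longer receives records from this module.
    logging.getLogger("toolkit.language_model").addHandler(logging.StreamHandler())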
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unique_toolkit
-Version: 0.8.46
+Version: 0.8.47
 Summary:
 License: Proprietary
 Author: Cedric Klinkert
@@ -117,6 +117,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.8.47] - 2025-09-05
+- Removed old code
+- Fixed small bugs in history manager & set the hallucination to use gpt4o as default.
+
 ## [0.8.46] - 2025-09-04
 - Bugfix for hostname identification inside Unique cluster in `unique_settings.py`
 
@@ -53,7 +53,7 @@ unique_toolkit/evals/context_relevancy/schema.py,sha256=ILA0ClTBFJbtZavkVIrqPz2v
 unique_toolkit/evals/context_relevancy/service.py,sha256=xoe0ezA4xy-KaPjSO22xhiqo_WKMuYjlYFKEYJWl25A,9601
 unique_toolkit/evals/evaluation_manager.py,sha256=g-8qa_6_p53C9Okx8iNkuoIXYSJrf-6sQ-xku7bo9kI,7895
 unique_toolkit/evals/exception.py,sha256=7lcVbCyoN4Md1chNJDFxpUYyWbVrcr9dcc3TxWykJTc,115
-unique_toolkit/evals/hallucination/constants.py,sha256=FLcXl5XU07jCvS8YPX9l6UjTaqyQ8YvnSKpx4Z6wZ2Y,1997
+unique_toolkit/evals/hallucination/constants.py,sha256=gQTZdU5cawBcZXn71FZ1sazaN-ihGvBcgBkZHYrdy0I,2024
 unique_toolkit/evals/hallucination/hallucination_evaluation.py,sha256=TuZ88jeVn0tVr9d0GhWyJSxKNA16nhvr2xRPo-yK8OM,3063
 unique_toolkit/evals/hallucination/prompts.py,sha256=O3Hi_rOzZlujvnO2wn2jhoPmrYLjzVtRWwxn5Q81m9Y,3405
 unique_toolkit/evals/hallucination/service.py,sha256=FsNAwC7tU1SXrge1gLBxGbEO433Tr3v1-udn0i4b8qM,2408
@@ -62,20 +62,6 @@ unique_toolkit/evals/output_parser.py,sha256=y9-mLC3ny40cNUZvvu_RK8pGIMGKWLFpJFs
 unique_toolkit/evals/schemas.py,sha256=oDj_mNfzDrETZWwv0dOelya4TMOaJm8U6Gd_dFnzVjE,3115
 unique_toolkit/evals/tests/test_context_relevancy_service.py,sha256=izDbtU3g9Th4LOQMcN2yylB-0f-H_5Ky70Q-0-0vgOY,7899
 unique_toolkit/evals/tests/test_output_parser.py,sha256=J73mG6Ly3nwLil8MjiCTRk_f_yjV6ud0Jh9W2iJjvlY,3042
-unique_toolkit/evaluators/__init__.py,sha256=3Rfpnowm7MUXHWmeU4UV4s_3Hk-sw3V20oBwQCYlejQ,50
-unique_toolkit/evaluators/config.py,sha256=_DIXToJ-hGNpDAdWa7Q6GMjAsxiC_DquLF-SS5s9rTE,717
-unique_toolkit/evaluators/constants.py,sha256=1oI93jsh0R_TjX_8OenliiiywVe3vTooSnaMqtq6R18,27
-unique_toolkit/evaluators/context_relevancy/constants.py,sha256=QG2x32LzV42kAkeWTPuLvOX9NlTSxJlsAgDyxomUBmY,1158
-unique_toolkit/evaluators/context_relevancy/prompts.py,sha256=gTlWP7fDuxhrXhCYNCqXMbCey_DalZMdi5l-a6RHgk0,713
-unique_toolkit/evaluators/context_relevancy/service.py,sha256=9hzdMuF4A4T97-3X3zcXgrDISLn1bleZ6tTL1bHa9dQ,1722
-unique_toolkit/evaluators/context_relevancy/utils.py,sha256=qwTkKah6S2hkEGOHxVdQ6RvV6OcjKj4eyd09TcJZlho,5813
-unique_toolkit/evaluators/exception.py,sha256=7lcVbCyoN4Md1chNJDFxpUYyWbVrcr9dcc3TxWykJTc,115
-unique_toolkit/evaluators/hallucination/constants.py,sha256=KDhmSlRBnUkfEAFQLaD80rKtj6p-ZJ3L98hqNmNL7xI,1458
-unique_toolkit/evaluators/hallucination/prompts.py,sha256=9yCpO_WGLDvYfPWKL1VuRA-jt0P_-A-qvLUOmuv-Nks,3320
-unique_toolkit/evaluators/hallucination/service.py,sha256=k8qro5Lw4Ak58m4HYp3G4HPLIaexeFySIIVvW6fAdeA,2408
-unique_toolkit/evaluators/hallucination/utils.py,sha256=gO2AOzDQwVTev2_5vDKgJ9A6A9e0himJyAta_wglVG8,8326
-unique_toolkit/evaluators/output_parser.py,sha256=eI72qkzK1dZyUvnfP2SOAQCGBj_-PwX5wy_aLPMsJMY,883
-unique_toolkit/evaluators/schemas.py,sha256=Jaue6Uhx75X1CyHKWj8sT3RE1JZXTqoLtfLt2xQNCX8,2507
 unique_toolkit/framework_utilities/__init__.py,sha256=fvAn9y4MRL1JgoO14ufQtLVRPRHn4jP07XRqt-TItCA,68
 unique_toolkit/framework_utilities/langchain/client.py,sha256=9LDRS2l9XGxL0HoFLh0ZrFUXrlt8o_J-o-1rU8j-uMQ,1432
 unique_toolkit/framework_utilities/langchain/history.py,sha256=R9RuCeSFNaUO3OZ0G_LmIC4gmOCIANcl91MfyWLnZ1c,650
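Note: the 14 removed RECORD entries above are the deprecated unique_toolkit.evaluators package, whose full sources appear as deletions at the end of this diff; the parallel unique_toolkit.evals package remains in place. A hedged migration sketch, using only a name this diff confirms exists in evals.schemas:

    # 0.8.46 and earlier (module removed in 0.8.47):
    #   from unique_toolkit.evaluators.schemas import EvaluationMetricName
    # 0.8.47 (the retained hallucination constants above already import from here):
    from unique_toolkit.evals.schemas import EvaluationMetricName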
@@ -84,13 +70,13 @@ unique_toolkit/framework_utilities/openai/client.py,sha256=ct1cqPcIK1wPl11G9sJV3
 unique_toolkit/framework_utilities/openai/message_builder.py,sha256=VU6mJm_upLcarJQKFft_t1RlLRncWDxDuLC5LIJ5lQQ,4339
 unique_toolkit/framework_utilities/utils.py,sha256=JK7g2yMfEx3eMprug26769xqNpS5WJcizf8n2zWMBng,789
 unique_toolkit/history_manager/history_construction_with_contents.py,sha256=c8Zy3erSbHGT8AdICRRlSK91T_FN6tNpTznvUzpLbWk,9023
-unique_toolkit/history_manager/history_manager.py,sha256=e18ldkof6ZRSzD8FefodazNFNHvmQIxP9cG0j6Yrl18,8350
+unique_toolkit/history_manager/history_manager.py,sha256=7mdT8li4Oo-t0d1q0pCwJksdal-Y0wLzZj-YnIlJ6xQ,8350
 unique_toolkit/history_manager/loop_token_reducer.py,sha256=9kqJioUehfoYs5-XMoCv-b_5JpNRrhOz62QwhC3LF3E,17899
 unique_toolkit/history_manager/utils.py,sha256=iu4LsYOElx8HlZjcx3ZC75I-TmEYBiEP9q2J93Q63Mg,5606
 unique_toolkit/language_model/__init__.py,sha256=lRQyLlbwHbNFf4-0foBU13UGb09lwEeodbVsfsSgaCk,1971
 unique_toolkit/language_model/builder.py,sha256=4OKfwJfj3TrgO1ezc_ewIue6W7BCQ2ZYQXUckWVPPTA,3369
 unique_toolkit/language_model/constants.py,sha256=B-topqW0r83dkC_25DeQfnPk3n53qzIHUCBS7YJ0-1U,119
-unique_toolkit/language_model/functions.py,sha256=4-zOzLsdjcfeTy6alqkYEBl-oVWptz9xLi8C5vdbWEg,16769
+unique_toolkit/language_model/functions.py,sha256=PNCmbYovhgMSkY89p7-3DunG6jIekaZPvhh3iplG1Vg,16720
 unique_toolkit/language_model/infos.py,sha256=eHln--Y5f6znFxknV6A8m-fRaEpH5-kmRh9m-ZWqco4,57188
 unique_toolkit/language_model/prompt.py,sha256=JSawaLjQg3VR-E2fK8engFyJnNdk21zaO8pPIodzN4Q,3991
 unique_toolkit/language_model/reference.py,sha256=nkX2VFz-IrUz8yqyc3G5jUMNwrNpxITBrMEKkbqqYoI,8583
@@ -132,7 +118,7 @@ unique_toolkit/tools/utils/execution/execution.py,sha256=vjG2Y6awsGNtlvyQAGCTthQ
 unique_toolkit/tools/utils/source_handling/schema.py,sha256=vzAyf6ZWNexjMO0OrnB8y2glGkvAilmGGQXd6zcDaKw,870
 unique_toolkit/tools/utils/source_handling/source_formatting.py,sha256=C7uayNbdkNVJdEARA5CENnHtNY1SU6etlaqbgHNyxaQ,9152
 unique_toolkit/tools/utils/source_handling/tests/test_source_formatting.py,sha256=oM5ZxEgzROrnX1229KViCAFjRxl9wCTzWZoinYSHleM,6979
-unique_toolkit-0.8.46.dist-info/LICENSE,sha256=GlN8wHNdh53xwOPg44URnwag6TEolCjoq3YD_KrWgss,193
-unique_toolkit-0.8.46.dist-info/METADATA,sha256=Si5jc1037n0xGa7msbfCjbXlZn_-MPI6vC2WIAADycg,30796
-unique_toolkit-0.8.46.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-unique_toolkit-0.8.46.dist-info/RECORD,,
+unique_toolkit-0.8.47.dist-info/LICENSE,sha256=GlN8wHNdh53xwOPg44URnwag6TEolCjoq3YD_KrWgss,193
+unique_toolkit-0.8.47.dist-info/METADATA,sha256=PKdBvaQu4A3agFQ6_HmD4qJH1jdFriLCBAgV8fjARCI,30928
+unique_toolkit-0.8.47.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+unique_toolkit-0.8.47.dist-info/RECORD,,
@@ -1 +0,0 @@
-from .constants import DOMAIN_NAME as DOMAIN_NAME
@@ -1,26 +0,0 @@
-from humps import camelize
-from pydantic import BaseModel, ConfigDict
-
-from unique_toolkit._common.validators import LMI, LanguageModelInfo
-from unique_toolkit.evaluators.schemas import (
-    EvaluationMetricName,
-)
-from unique_toolkit.language_model.infos import (
-    LanguageModelName,
-)
-
-
-class EvaluationMetricConfig(BaseModel):
-    model_config = ConfigDict(
-        alias_generator=camelize,
-        populate_by_name=True,
-        validate_default=True,
-    )
-
-    enabled: bool = False
-    name: EvaluationMetricName
-    language_model: LMI = LanguageModelInfo.from_name(
-        LanguageModelName.AZURE_GPT_35_TURBO_0125,
-    )
-    custom_prompts: dict[str, str] = {}
-    score_to_emoji: dict[str, str] = {}
@@ -1 +0,0 @@
-DOMAIN_NAME = "evaluators"
@@ -1,34 +0,0 @@
-from unique_toolkit.evaluators.config import EvaluationMetricConfig
-from unique_toolkit.evaluators.context_relevancy.prompts import (
-    CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
-    CONTEXT_RELEVANCY_METRIC_USER_MSG,
-)
-from unique_toolkit.evaluators.schemas import (
-    EvaluationMetricInputFieldName,
-    EvaluationMetricName,
-)
-from unique_toolkit.language_model.infos import LanguageModelInfo
-from unique_toolkit.language_model.service import LanguageModelName
-
-SYSTEM_MSG_KEY = "systemPrompt"
-USER_MSG_KEY = "userPrompt"
-
-# Required input fields for context relevancy evaluation
-context_relevancy_required_input_fields = [
-    EvaluationMetricInputFieldName.INPUT_TEXT,
-    EvaluationMetricInputFieldName.CONTEXT_TEXTS,
-]
-
-
-default_config = EvaluationMetricConfig(
-    enabled=False,
-    name=EvaluationMetricName.CONTEXT_RELEVANCY,
-    language_model=LanguageModelInfo.from_name(
-        LanguageModelName.AZURE_GPT_35_TURBO_0125
-    ),
-    score_to_emoji={"LOW": "🟢", "MEDIUM": "🟡", "HIGH": "🔴"},
-    custom_prompts={
-        SYSTEM_MSG_KEY: CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
-        USER_MSG_KEY: CONTEXT_RELEVANCY_METRIC_USER_MSG,
-    },
-)
@@ -1,31 +0,0 @@
-CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG = """
-You will receive an input and a set of contexts.
-Your task is to evaluate how relevant the contexts are to the input text.
-
-Use the following rating scale to generate a score:
-[low] - The contexts are not relevant to the input.
-[medium] - The contexts are somewhat relevant to the input.
-[high] - The contexts are highly relevant to the input.
-
-Your answer must be in JSON format:
-{
-    "reason": Your explanation of your judgement of the evaluation,
-    "value": decision, must be one of the following ["low", "medium", "high"]
-}
-"""
-
-CONTEXT_RELEVANCY_METRIC_USER_MSG = """
-Here is the data:
-
-Input:
-'''
-$input_text
-'''
-
-Contexts:
-'''
-$context_texts
-'''
-
-Answer as JSON:
-"""
@@ -1,53 +0,0 @@
-from logging import Logger
-
-from unique_toolkit.app.schemas import Event
-from unique_toolkit.evaluators.config import EvaluationMetricConfig
-from unique_toolkit.evaluators.context_relevancy.constants import default_config
-from unique_toolkit.evaluators.context_relevancy.utils import (
-    check_context_relevancy_async,
-)
-from unique_toolkit.evaluators.schemas import (
-    EvaluationMetricInput,
-    EvaluationMetricResult,
-)
-
-
-class ContextRelevancyEvaluator:
-    def __init__(
-        self,
-        event: Event,
-        logger: Logger,
-    ):
-        self.event = event
-        self.logger = logger
-
-    async def run(
-        self,
-        input: EvaluationMetricInput,
-        config: EvaluationMetricConfig = default_config,
-    ) -> EvaluationMetricResult | None:
-        """
-        Analyzes the level of relevancy of a context by comparing
-        it with the input text.
-
-        Args:
-            input (EvaluationMetricInput): The input for the metric.
-            config (EvaluationMetricConfig): The configuration for the metric.
-
-        Returns:
-            EvaluationMetricResult | None: The result of the evaluation, indicating the level of context relevancy.
-            Returns None if the metric is not enabled.
-
-        Raises:
-            EvaluatorException: If required fields are missing or an error occurs during evaluation.
-        """
-        if config.enabled is False:
-            self.logger.info("Context relevancy metric is not enabled.")
-            return None
-
-        return await check_context_relevancy_async(
-            company_id=self.event.company_id,
-            input=input,
-            config=config,
-            logger=self.logger,
-        )
@@ -1,156 +0,0 @@
-import logging
-from string import Template
-
-from unique_toolkit.evaluators.config import (
-    EvaluationMetricConfig,
-)
-from unique_toolkit.evaluators.context_relevancy.constants import (
-    SYSTEM_MSG_KEY,
-    USER_MSG_KEY,
-    context_relevancy_required_input_fields,
-)
-from unique_toolkit.evaluators.context_relevancy.prompts import (
-    CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
-    CONTEXT_RELEVANCY_METRIC_USER_MSG,
-)
-from unique_toolkit.evaluators.exception import EvaluatorException
-from unique_toolkit.evaluators.output_parser import (
-    parse_eval_metric_result,
-)
-from unique_toolkit.evaluators.schemas import (
-    EvaluationMetricInput,
-    EvaluationMetricName,
-    EvaluationMetricResult,
-)
-from unique_toolkit.language_model import LanguageModelName
-from unique_toolkit.language_model.schemas import (
-    LanguageModelMessages,
-    LanguageModelSystemMessage,
-    LanguageModelUserMessage,
-)
-from unique_toolkit.language_model.service import LanguageModelService
-
-logger = logging.getLogger(__name__)
-
-
-async def check_context_relevancy_async(
-    company_id: str,
-    evaluation_metric_input: EvaluationMetricInput,
-    config: EvaluationMetricConfig,
-    logger: logging.Logger = logger,
-) -> EvaluationMetricResult | None:
-    """Analyzes the relevancy of the context provided for the given evaluation_metric_input and output.
-
-    The analysis classifies the context relevancy level as:
-    - low
-    - medium
-    - high
-
-    This method performs the following steps:
-    1. Logs the start of the analysis using the provided `logger`.
-    2. Validates the required fields in the `evaluation_metric_input` data.
-    3. Retrieves the messages using the `_get_msgs` method.
-    4. Calls `LanguageModelService.complete_async_util` to get a completion result.
-    5. Parses and returns the evaluation metric result based on the content of the completion result.
-
-    Args:
-        company_id (str): The company ID for the analysis.
-        evaluation_metric_input (EvaluationMetricInput): The evaluation_metric_input data used for evaluation, including the generated output and reference information.
-        config (EvaluationMetricConfig): Configuration settings for the evaluation.
-        logger (Optional[logging.Logger], optional): The logger used for logging information and errors. Defaults to the logger for the current module.
-
-    Returns:
-        EvaluationMetricResult | None: The result of the evaluation, indicating the level of context relevancy. Returns `None` if an error occurs.
-
-    Raises:
-        EvaluatorException: If required fields are missing or an error occurs during the evaluation.
-
-    """
-    model_group_name = (
-        config.language_model.name.value
-        if isinstance(config.language_model.name, LanguageModelName)
-        else config.language_model.name
-    )
-    logger.info(f"Analyzing context relevancy with {model_group_name}.")
-
-    evaluation_metric_input.validate_required_fields(
-        context_relevancy_required_input_fields,
-    )
-
-    if (
-        evaluation_metric_input.context_texts
-        and len(evaluation_metric_input.context_texts) == 0
-    ):
-        error_message = "No context texts provided."
-        raise EvaluatorException(
-            user_message=error_message,
-            error_message=error_message,
-        )
-
-    try:
-        msgs = _get_msgs(evaluation_metric_input, config)
-        result = await LanguageModelService.complete_async_util(
-            company_id=company_id,
-            messages=msgs,
-            model_name=model_group_name,
-        )
-        result_content = result.choices[0].message.content
-        if not result_content:
-            error_message = "Context relevancy evaluation did not return a result."
-            raise EvaluatorException(
-                error_message=error_message,
-                user_message=error_message,
-            )
-        return parse_eval_metric_result(
-            result_content,  # type: ignore
-            EvaluationMetricName.CONTEXT_RELEVANCY,
-        )
-    except Exception as e:
-        error_message = "Error occurred during context relevancy metric analysis"
-        raise EvaluatorException(
-            error_message=f"{error_message}: {e}",
-            user_message=error_message,
-            exception=e,
-        )
-
-
-def _get_msgs(
-    evaluation_metric_input: EvaluationMetricInput,
-    config: EvaluationMetricConfig,
-) -> LanguageModelMessages:
-    """Composes the messages for context relevancy analysis.
-
-    The messages are based on the provided evaluation_metric_input and configuration.
-
-    Args:
-        evaluation_metric_input (EvaluationMetricInput): The evaluation_metric_input data that includes context texts for the analysis.
-        config (EvaluationMetricConfig): The configuration settings for composing messages.
-
-    Returns:
-        LanguageModelMessages: The composed messages as per the provided evaluation_metric_input and configuration.
-
-    """
-    system_msg_content = _get_system_prompt(config)
-    system_msg = LanguageModelSystemMessage(content=system_msg_content)
-
-    user_msg_templ = Template(_get_user_prompt(config))
-    user_msg_content = user_msg_templ.substitute(
-        evaluation_metric_input_text=evaluation_metric_input.evaluation_metric_input_text,
-        contexts_text=evaluation_metric_input.get_joined_context_texts(),
-    )
-    user_msg = LanguageModelUserMessage(content=user_msg_content)
-    return LanguageModelMessages([system_msg, user_msg])
-
-
-def _get_system_prompt(config: EvaluationMetricConfig):
-    return config.custom_prompts.setdefault(
-        SYSTEM_MSG_KEY,
-        CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
-    )
-
-
-def _get_user_prompt(config: EvaluationMetricConfig):
-    return config.custom_prompts.setdefault(
-        USER_MSG_KEY,
-        CONTEXT_RELEVANCY_METRIC_USER_MSG,
-    )
@@ -1,5 +0,0 @@
-from unique_toolkit._common.exception import CommonException
-
-
-class EvaluatorException(CommonException):
-    pass
@@ -1,41 +0,0 @@
-from unique_toolkit.evaluators.config import EvaluationMetricConfig
-from unique_toolkit.evaluators.hallucination.prompts import (
-    HALLUCINATION_METRIC_SYSTEM_MSG,
-    HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
-    HALLUCINATION_METRIC_USER_MSG,
-    HALLUCINATION_METRIC_USER_MSG_DEFAULT,
-)
-from unique_toolkit.evaluators.schemas import (
-    EvaluationMetricInputFieldName,
-    EvaluationMetricName,
-)
-from unique_toolkit.language_model.infos import (
-    LanguageModelInfo,
-    LanguageModelName,
-)
-
-SYSTEM_MSG_KEY = "systemPrompt"
-USER_MSG_KEY = "userPrompt"
-SYSTEM_MSG_DEFAULT_KEY = "systemPromptDefault"
-USER_MSG_DEFAULT_KEY = "userPromptDefault"
-
-
-hallucination_metric_default_config = EvaluationMetricConfig(
-    enabled=False,
-    name=EvaluationMetricName.HALLUCINATION,
-    language_model=LanguageModelInfo.from_name(LanguageModelName.AZURE_GPT_4_0613),
-    score_to_emoji={"LOW": "🟢", "MEDIUM": "🟡", "HIGH": "🔴"},
-    custom_prompts={
-        SYSTEM_MSG_KEY: HALLUCINATION_METRIC_SYSTEM_MSG,
-        USER_MSG_KEY: HALLUCINATION_METRIC_USER_MSG,
-        SYSTEM_MSG_DEFAULT_KEY: HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
-        USER_MSG_DEFAULT_KEY: HALLUCINATION_METRIC_USER_MSG_DEFAULT,
-    },
-)
-
-hallucination_required_input_fields = [
-    EvaluationMetricInputFieldName.INPUT_TEXT,
-    EvaluationMetricInputFieldName.CONTEXT_TEXTS,
-    EvaluationMetricInputFieldName.HISTORY_MESSAGES,
-    EvaluationMetricInputFieldName.OUTPUT_TEXT,
-]
@@ -1,79 +0,0 @@
-HALLUCINATION_METRIC_SYSTEM_MSG = """
-You will receive a question, references, a conversation between a user and an agent, and an output.
-The output is the answer to the question.
-Your task is to evaluate if the output is fully supported by the information provided in the references and conversation, and provide explanations on your judgement in 2 sentences.
-
-Use the following entailment scale to generate a score:
-[low] - All information in output is supported by the references/conversation, or extractions from the references/conversation.
-[medium] - The output is supported by the references/conversation to some extent, but there is at least some information in the output that is not discussed in the references/conversation. For example, if an instruction asks about two concepts and the references/conversation only discusses either of them, it should be considered a [medium] hallucination level.
-[high] - The output contains information that is not part of the references/conversation, is unrelated to the references/conversation, or contradicts the references/conversation.
-
-Make sure to not use any external information/knowledge to judge whether the output is true or not. Only check whether the output is supported by the references/conversation, and not whether the output is correct or not. Also do not evaluate if the references/conversation contain further information that is not part of the output but could be relevant to the qestion.
-
-Your answer must be in JSON format:
-{
-    "reason": Your explanation of your judgement of the evaluation,
-    "value": decision, must be one of the following: ["high", "medium", "low"]
-}
-"""
-
-HALLUCINATION_METRIC_USER_MSG = """
-Here is the data:
-
-Input:
-'''
-$input_text
-'''
-
-References:
-'''
-$contexts_text
-'''
-
-Conversation:
-'''
-$history_messages_text
-'''
-
-Output:
-'''
-$output_text
-'''
-
-Answer as JSON:
-"""
-
-HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT = """
-You will receive a question and an output.
-The output is the answer to the question.
-The situation is that no references could be found to answer the question. Your task is to evaluate if the output contains any information to answer the question,
-and provide a short explanations of your reasoning in 2 sentences. Also mention in your explanation that no references were provided to answer the question.
-
-Use the following entailment scale to generate a score:
-[low] - The output does not contain any information to answer the question.
-[medium] - The output contains some information to answer the question, but does not answer the question entirely.
-[high] - The output answers the question.
-
-It is not considered an answer when the output relates to the questions subject. Make sure to not use any external information/knowledge to judge whether the output is true or not. Only check that the output does not answer the question, and not whether the output is correct or not.
-Your answer must be in JSON format:
-{
-    "reason": Your explanation of your reasoning of the evaluation,
-    "value": decision, must be one of the following: ["low", "medium", "high"]
-}
-"""
-
-HALLUCINATION_METRIC_USER_MSG_DEFAULT = """
-Here is the data:
-
-Input:
-'''
-$input_text
-'''
-
-Output:
-'''
-$output_text
-'''
-
-Answer as JSON:
-"""
@@ -1,58 +0,0 @@
-import logging
-
-from unique_toolkit.app.schemas import Event
-from unique_toolkit.evaluators.config import (
-    EvaluationMetricConfig,
-)
-from unique_toolkit.evaluators.hallucination.constants import (
-    hallucination_metric_default_config,
-)
-from unique_toolkit.evaluators.hallucination.utils import check_hallucination_async
-from unique_toolkit.evaluators.schemas import (
-    EvaluationMetricInput,
-    EvaluationMetricResult,
-)
-
-logger = logging.getLogger(__name__)
-
-
-class HallucinationEvaluator:
-    def __init__(self, event: Event, logger: logging.Logger = logger):
-        self.event = event
-        self.logger = logger
-
-    async def run(
-        self,
-        input: EvaluationMetricInput,
-        config: EvaluationMetricConfig = hallucination_metric_default_config,
-    ) -> EvaluationMetricResult | None:
-        """
-        Analyzes the level of hallucination in the generated output by comparing it with the input
-        and the provided contexts or history. The analysis classifies the hallucination level as:
-        - low
-        - medium
-        - high
-
-        If no contexts or history are referenced in the generated output, the method verifies
-        that the output does not contain any relevant information to answer the question.
-
-        This method calls `check_hallucination_async` to perform the actual analysis. The `check_hallucination_async`
-        function handles the evaluation using the company ID from the event, the provided input, and the configuration.
-
-        Args:
-            input (EvaluationMetricInput): The input data used for evaluation, including the generated output and reference information.
-            config (EvaluationMetricConfig, optional): Configuration settings for the evaluation. Defaults to `hallucination_metric_default_config`.
-
-        Returns:
-            EvaluationMetricResult | None: The result of the evaluation, indicating the level of hallucination. Returns `None` if the analysis cannot be performed.
-
-        Raises:
-            EvaluatorException: If the context texts are empty, required fields are missing, or an error occurs during the evaluation.
-        """
-        if config.enabled is False:
-            self.logger.info("Hallucination metric is not enabled.")
-            return None
-
-        return await check_hallucination_async(
-            company_id=self.event.company_id, input=input, config=config
-        )
@@ -1,212 +0,0 @@
-import logging
-from string import Template
-
-from unique_toolkit.evaluators.config import (
-    EvaluationMetricConfig,
-)
-from unique_toolkit.evaluators.exception import EvaluatorException
-from unique_toolkit.evaluators.hallucination.constants import (
-    SYSTEM_MSG_DEFAULT_KEY,
-    SYSTEM_MSG_KEY,
-    USER_MSG_DEFAULT_KEY,
-    USER_MSG_KEY,
-    hallucination_required_input_fields,
-)
-from unique_toolkit.evaluators.output_parser import (
-    parse_eval_metric_result,
-)
-from unique_toolkit.evaluators.schemas import (
-    EvaluationMetricInput,
-    EvaluationMetricName,
-    EvaluationMetricResult,
-)
-from unique_toolkit.language_model import LanguageModelName
-from unique_toolkit.language_model.schemas import (
-    LanguageModelMessages,
-    LanguageModelSystemMessage,
-    LanguageModelUserMessage,
-)
-from unique_toolkit.language_model.service import LanguageModelService
-
-from .prompts import (
-    HALLUCINATION_METRIC_SYSTEM_MSG,
-    HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
-    HALLUCINATION_METRIC_USER_MSG,
-    HALLUCINATION_METRIC_USER_MSG_DEFAULT,
-)
-
-logger = logging.getLogger(__name__)
-
-
-async def check_hallucination_async(
-    company_id: str,
-    input: EvaluationMetricInput,
-    config: EvaluationMetricConfig,
-    logger: logging.Logger = logger,
-) -> EvaluationMetricResult | None:
-    """Analyze the level of hallucination in the generated output.
-
-    by comparing it with the provided input
-    and the contexts or history. The analysis classifies the hallucination level as:
-    - low
-    - medium
-    - high
-
-    If no contexts or history are referenced in the generated output, the method checks that the output
-    does not contain any relevant information to answer the question.
-
-    This method performs the following steps:
-    1. Checks if the hallucination metric is enabled using the provided `config`.
-    2. Logs the start of the analysis using the provided `logger`.
-    3. Validates the required fields in the `input` data.
-    4. Retrieves the messages using the `_get_msgs` method.
-    5. Calls `LanguageModelService.complete_async_util` to get a completion result.
-    6. Parses and returns the evaluation metric result based on the content of the completion result.
-
-    Args:
-        company_id (str): The company ID for the analysis.
-        input (EvaluationMetricInput): The input data used for evaluation, including the generated output and reference information.
-        config (EvaluationMetricConfig, optional): Configuration settings for the evaluation. Defaults to `hallucination_metric_default_config`.
-        logger (Optional[logging.Logger], optional): The logger used for logging information and errors. Defaults to the logger for the current module.
-
-    Returns:
-        EvaluationMetricResult | None: The result of the evaluation, indicating the level of hallucination. Returns `None` if the metric is not enabled or if an error occurs.
-
-    Raises:
-        EvaluatorException: If the context texts are empty, required fields are missing, or an error occurs during the evaluation.
-
-    """
-    model_group_name = (
-        config.language_model.name.value
-        if isinstance(config.language_model.name, LanguageModelName)
-        else config.language_model.name
-    )
-    logger.info(f"Analyzing level of hallucination with {model_group_name}.")
-
-    input.validate_required_fields(hallucination_required_input_fields)
-
-    try:
-        msgs = _get_msgs(input, config, logger)
-        result = await LanguageModelService.complete_async_util(
-            company_id=company_id,
-            messages=msgs,
-            model_name=model_group_name,
-        )
-        result_content = result.choices[0].message.content
-        if not result_content:
-            error_message = "Hallucination evaluation did not return a result."
-            raise EvaluatorException(
-                error_message=error_message,
-                user_message=error_message,
-            )
-        return parse_eval_metric_result(
-            result_content,  # type: ignore
-            EvaluationMetricName.HALLUCINATION,
-        )
-    except Exception as e:
-        error_message = "Error occurred during hallucination metric analysis"
-        raise EvaluatorException(
-            error_message=f"{error_message}: {e}",
-            user_message=error_message,
-            exception=e,
-        )
-
-
-def _get_msgs(
-    evaluation_metric_input: EvaluationMetricInput,
-    config: EvaluationMetricConfig,
-    logger: logging.Logger,
-):
-    """Composes the messages for hallucination analysis based on the provided evaluation_metric_input and configuration.
-
-    This method decides how to compose the messages based on the availability of context texts and history
-    message texts in the `evaluation_metric_input`
-
-    Args:
-        evaluation_metric_input (EvaluationMetricInput): The evaluation_metric_input data that includes context texts and history message texts
-            for the analysis.
-        config (EvaluationMetricConfig): The configuration settings for composing messages.
-        logger (Optional[logging.Logger], optional): The logger used for logging debug information.
-            Defaults to the logger for the current module.
-
-    Returns:
-        The composed messages as per the provided evaluation_metric_input and configuration. The exact type and structure
-        depend on the implementation of the `compose_msgs` and `compose_msgs_default` methods.
-
-    """
-    if (
-        evaluation_metric_input.context_texts
-        or evaluation_metric_input.history_messages
-    ):
-        logger.debug("Using context / history for hallucination evaluation.")
-        return _compose_msgs(evaluation_metric_input, config)
-    logger.debug("No contexts and history provided for hallucination evaluation.")
-    return _compose_msgs_default(evaluation_metric_input, config)
-
-
-def _compose_msgs(
-    evaluation_metric_input: EvaluationMetricInput,
-    config: EvaluationMetricConfig,
-):
-    """Composes the hallucination analysis messages."""
-    system_msg_content = _get_system_prompt_with_contexts(config)
-    system_msg = LanguageModelSystemMessage(content=system_msg_content)
-
-    user_msg_templ = Template(_get_user_prompt_with_contexts(config))
-    user_msg_content = user_msg_templ.substitute(
-        evaluation_metric_input_text=evaluation_metric_input.evaluation_metric_input_text,
-        contexts_text=evaluation_metric_input.get_joined_context_texts(
-            tag_name="reference",
-        ),
-        history_messages_text=evaluation_metric_input.get_joined_history_texts(
-            tag_name="conversation",
-        ),
-        output_text=evaluation_metric_input.output_text,
-    )
-    user_msg = LanguageModelUserMessage(content=user_msg_content)
-    return LanguageModelMessages([system_msg, user_msg])
-
-
-def _compose_msgs_default(
-    evaluation_metric_input: EvaluationMetricInput,
-    config: EvaluationMetricConfig,
-):
-    """Composes the hallucination analysis prompt without messages."""
-    system_msg_content = _get_system_prompt_default(config)
-    system_msg = LanguageModelSystemMessage(content=system_msg_content)
-
-    user_msg_templ = Template(_get_user_prompt_default(config))
-    user_msg_content = user_msg_templ.substitute(
-        evaluation_metric_input_text=evaluation_metric_input.evaluation_metric_input_text,
-        output_text=evaluation_metric_input.output_text,
-    )
-    user_msg = LanguageModelUserMessage(content=user_msg_content)
-    return LanguageModelMessages([system_msg, user_msg])
-
-
-def _get_system_prompt_with_contexts(config: EvaluationMetricConfig):
-    return config.custom_prompts.setdefault(
-        SYSTEM_MSG_KEY,
-        HALLUCINATION_METRIC_SYSTEM_MSG,
-    )
-
-
-def _get_user_prompt_with_contexts(config: EvaluationMetricConfig):
-    return config.custom_prompts.setdefault(
-        USER_MSG_KEY,
-        HALLUCINATION_METRIC_USER_MSG,
-    )
-
-
-def _get_system_prompt_default(config: EvaluationMetricConfig):
-    return config.custom_prompts.setdefault(
-        SYSTEM_MSG_DEFAULT_KEY,
-        HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
-    )
-
-
-def _get_user_prompt_default(config: EvaluationMetricConfig):
-    return config.custom_prompts.setdefault(
-        USER_MSG_DEFAULT_KEY,
-        HALLUCINATION_METRIC_USER_MSG_DEFAULT,
-    )
@@ -1,30 +0,0 @@
-from unique_toolkit.evaluators.exception import EvaluatorException
-from unique_toolkit.evaluators.schemas import (
-    EvaluationMetricName,
-    EvaluationMetricResult,
-)
-from unique_toolkit.language_model.utils import convert_string_to_json
-
-
-def parse_eval_metric_result(
-    result: str,
-    metric_name: EvaluationMetricName,
-):
-    """
-    Parses the evaluation metric result.
-    """
-
-    try:
-        parsed_result = convert_string_to_json(result)
-    except Exception as e:
-        error_message = "Error occurred during parsing the evaluation metric result"
-        raise EvaluatorException(
-            user_message=f"{error_message}.",
-            error_message=f"{error_message}: {str(e)}",
-        )
-
-    return EvaluationMetricResult(
-        name=metric_name,
-        value=parsed_result.get("value", "None"),
-        reason=parsed_result.get("reason", "None"),
-    )
@@ -1,82 +0,0 @@
-from enum import Enum
-from typing import Optional
-
-from pydantic import BaseModel
-
-from unique_toolkit.chat import ChatMessage
-from unique_toolkit.evaluators.exception import EvaluatorException
-
-
-class EvaluationMetricName(Enum):
-    HALLUCINATION = "hallucination"
-    CONTEXT_RELEVANCY = "relevancy"
-
-
-class EvaluationMetricInputFieldName(str, Enum):
-    INPUT_TEXT = "input_text"
-    CONTEXT_TEXTS = "context_texts"
-    HISTORY_MESSAGES = "history_messages"
-    OUTPUT_TEXT = "output_text"
-
-
-class EvaluationMetricInput(BaseModel):
-    """
-    Input for any metric evaluation. Depending on the metric, the input can be different.
-    """
-
-    input_text: Optional[str] = None
-    context_texts: Optional[list[str]] = None
-    history_messages: Optional[list[ChatMessage]] = None
-    output_text: Optional[str] = None
-
-    def get_joined_context_texts(self, tag_name: str = "reference") -> str:
-        """
-        Concatenates context_texts.
-        """
-        if not self.context_texts:
-            return f"<No {tag_name} texts provided>"
-
-        return "\n".join(
-            [
-                f"<{tag_name}-{index}>{text}</{tag_name}-{index}>"
-                for index, text in enumerate(self.context_texts)
-            ]
-        )
-
-    def get_history_message_text(self, chat_message: ChatMessage):
-        return f"{chat_message.role.value}: {chat_message.content}"
-
-    def get_history_message_texts(self):
-        if not self.history_messages:
-            return []
-        return [self.get_history_message_text(msg) for msg in self.history_messages]
-
-    def get_joined_history_texts(self, tag_name: str = "conversation") -> str:
-        """
-        Concatenates history message texts.
-        """
-        if not self.history_messages:
-            return f"<No {tag_name} texts provided>"
-
-        return "\n".join(self.get_history_message_texts())
-
-    def validate_required_fields(
-        self, required_fields: list[EvaluationMetricInputFieldName]
-    ):
-        """
-        Validates the input fields for the hallucination metric.
-        """
-        for field in required_fields:
-            value = getattr(self, field)
-            if value is None:
-                error_message = f"Missing required input field: {field}"
-                raise EvaluatorException(
-                    user_message=error_message,
-                    error_message=error_message,
-                )
-
-
-class EvaluationMetricResult(BaseModel):
-    name: EvaluationMetricName
-    value: str
-    reason: str